In [63]:
from selenium import webdriver
from selenium.common import exceptions
from selenium.webdriver.common.by import By
import pandas as pd
import numpy as np
from pprint import PrettyPrinter
import sys

In [5]:
# Pretty-printer with a 2-space indent for inspecting nested structures.
# NOTE(review): `pp` is not referenced by any other cell shown here - confirm it is still needed.
pp = PrettyPrinter(indent=2)

In [None]:
# Load a CSV into `sample` and preview its first rows.
# NOTE(review): judging by the file name this is presumably an earlier export of
# this scraper - confirm; the path is relative to the notebook's working directory.
sample = pd.read_csv('GIIRScraper_04_14_2022__12_15_27.csv')
sample.head()

In [None]:
# Column names, non-null counts and dtypes of the loaded sample.
sample.info()

In [102]:
# Column order of the exported table. Passing it to the DataFrame constructor
# also guarantees the columns exist when the listing page yields no links.
_GIIR_COLUMNS = [
    'Published_Date',
    'Category',
    'Report_Title',
    'Summary',
    'No_of_Pages',
    'Table_of_Contents',
    'List_of_Tables',
]

_GIIR_LISTING_URL = 'https://www.giiresearch.com/material_report.shtml'


def _as_date_bound(value):
    """Return `value` as a pandas Timestamp, or None when no bound was given.

    None, blank strings and NaT all mean "no bound". Anything else is parsed
    with `pd.to_datetime`, which raises ValueError for unparseable text.
    """
    if value is None or (isinstance(value, str) and not value.strip()):
        return None
    stamp = pd.to_datetime(value)
    return None if pd.isna(stamp) else stamp


def _collect_report_links(browser):
    """Return the report URLs listed on the page currently loaded in `browser`."""
    links = []
    for item in browser.find_elements(by=By.CLASS_NAME, value='plist_item'):
        anchor = (item.find_element(By.CLASS_NAME, 'plist_title')
                      .find_element(By.CLASS_NAME, 'plist_t_box')
                      .find_element(By.CLASS_NAME, 'list_title')
                      .find_element(By.TAG_NAME, 'a'))
        links.append(anchor.get_attribute('href'))
    return links


def _optional_tab_text(browser, tab_index, panel_id):
    """Click the tab at `tab_index` and return the text of element `panel_id`.

    Returns NaN when the tab or the panel is missing or cannot be clicked.
    Only selenium errors and a missing tab index are treated as "not
    available"; KeyboardInterrupt and programming errors still propagate.
    """
    try:
        browser.find_element(By.ID, 'Tab').find_elements(By.TAG_NAME, 'li')[tab_index].click()
        return browser.find_element(By.ID, panel_id).text
    except (exceptions.WebDriverException, IndexError):
        return np.nan


def _scrape_report(browser, link):
    """Open one report page and return its fields as a dict keyed by column name."""
    browser.get(link)

    # The always-visible fields are read before any tab is clicked.
    date = browser.find_element(
        By.CSS_SELECTOR,
        '#Content_Body > div.prodinfo_body > div.prod_info_box > nobr:nth-child(1) > span > time',
    ).text
    title = browser.find_element(
        By.CSS_SELECTOR,
        '#Content_Body > div.prodinfo_body > table > tbody > tr > td.prdinfo_title > h1 > span',
    ).text
    industry = browser.find_element(By.CSS_SELECTOR, '#Body_Bread > div > a:nth-child(3)').text
    summary = browser.find_element(By.CSS_SELECTOR, '#INTRODUCTION > div.cntSecContent').text

    # The page count is optional: the element may be absent or may not start
    # with an integer.
    try:
        pages = int(
            browser.find_element(
                By.CSS_SELECTOR,
                '#Content_Body > div.prodinfo_body > div.prod_info_box > nobr:nth-child(5) > span',
            ).text.split(' ')[0]
        )
    except (ValueError, exceptions.NoSuchElementException):
        pages = np.nan

    # Tab order matters: table of contents (2nd tab) first, then list of tables (3rd tab).
    table_of_contents = _optional_tab_text(browser, 1, 'TOC')
    list_of_tables = _optional_tab_text(browser, 2, 'LOT')

    return {
        'Published_Date': date,
        'Category': industry,
        'Report_Title': title,
        'Summary': summary,
        'No_of_Pages': pages,
        'Table_of_Contents': table_of_contents,
        'List_of_Tables': list_of_tables,
    }


def _filter_by_date(df, lower, upper):
    """Keep rows with lower <= Published_Date <= upper; a None bound is open."""
    # Each mask is computed from the frame it is applied to, so the boolean
    # indexer always shares the frame's index.
    if lower is not None:
        df = df[df['Published_Date'] >= lower]
    if upper is not None:
        df = df[df['Published_Date'] <= upper]
    return df


def scrape_data(lower='', upper='', output_path='GIIR_records'):
    """Scrape the GII Research listing page and save the reports as CSV.

    Every report linked from the listing page is opened in Chrome and its
    publication date, category, title, summary, page count, table of contents
    and list of tables are collected. The rows are optionally restricted to a
    publication-date range and written to `output_path`.

    Parameters
    ----------
    lower, upper : str, datetime-like, or None
        Inclusive date bounds. '' (the default), None or NaT means the bound
        is open, so either bound may be given on its own.
    output_path : str
        Destination of the CSV. The default keeps the file name this function
        has always written (no extension; the index is written as well).

    Raises
    ------
    ValueError
        If a bound cannot be parsed as a date (raised before Chrome starts).
    selenium.common.exceptions.NoSuchElementException
        If a report page lacks one of the mandatory fields (date, title,
        category, summary).
    """
    # Validate the bounds first so bad input fails before a browser is opened.
    lower_bound = _as_date_bound(lower)
    upper_bound = _as_date_bound(upper)

    browser = webdriver.Chrome()
    try:
        browser.get(_GIIR_LISTING_URL)
        links = _collect_report_links(browser)
        records = [_scrape_report(browser, link) for link in links]
    finally:
        # Always end the driver session, even when a page fails to scrape.
        browser.quit()

    df = pd.DataFrame(records, columns=_GIIR_COLUMNS)
    df['Published_Date'] = pd.to_datetime(df['Published_Date'])

    df = _filter_by_date(df, lower_bound, upper_bound)

    # Back to the textual form before export; `assign` returns a new frame, so
    # nothing is written into a filtered slice.
    df = df.assign(Published_Date=df['Published_Date'].dt.strftime('%B %d, %Y'))
    df.to_csv(output_path)
    
    

In [105]:
# Prompt for an optional date range and run the scraper (scrape_data is defined
# in an earlier cell). A blank answer leaves that side of the range open.
if '__main__' == __name__:
    print('Enter dates below | Enter q to quit')
    low = input('Low e.g. January 1, 2022: ').strip()
    # Skip the second prompt once the user has already asked to quit.
    high = '' if low.lower() == 'q' else input('High e.g. January 1, 2022: ').strip()

    if 'q' in (low.lower(), high.lower()):
        print('Quit requested; nothing was scraped.')
    else:
        try:
            # A blank bound becomes None rather than NaT: NaT is truthy and
            # compares False against every date, which would drop every row.
            low = pd.to_datetime(low) if low else None
            high = pd.to_datetime(high) if high else None
        except ValueError:
            print('Input correct data format')
        else:
            scrape_data(low, high)
            print('GIIR csv file saved!')
    # No sys.exit() here: inside a notebook it only raises SystemExit in the
    # kernel, and a script ends with status 0 after the last statement anyway.

Enter dates below | Enter q to quit


Low e.g. January 1, 2022:  
High e.g. January 1, 2022:  


KeyboardInterrupt: 

In [45]:
# Column names, non-null counts and dtypes of `df`.
# NOTE(review): no visible cell assigns `df`; it presumably survives from an
# earlier kernel session - confirm, or load it explicitly before this cell.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Published_Date     25 non-null     datetime64[ns]
 1   Category           25 non-null     object        
 2   Report_Title       25 non-null     object        
 3   Summary            25 non-null     object        
 4   No_of_Pages        24 non-null     float64       
 5   Table_of_Contents  25 non-null     object        
 6   List_of_Tables     16 non-null     object        
dtypes: datetime64[ns](1), float64(1), object(5)
memory usage: 1.5+ KB


In [93]:
# Parse Published_Date with the explicit "Month day, Year" format
# (e.g. "April 13, 2022") and store the result back in the same column of `df`.
df.Published_Date = pd.to_datetime(df.Published_Date, format='%B %d, %Y')

In [95]:
# Rows published after April 10, 2022 and no later than April 13, 2022.
# Both conditions are evaluated on the full column and combined with `&`, so the
# boolean indexer has the same index as `df`. Comparing an already-filtered
# column instead yields a shorter mask, which pandas rejects as unalignable as
# soon as the first condition drops a row.
df[(df.Published_Date > pd.to_datetime('April 10, 2022'))
   & (df.Published_Date <= pd.to_datetime('April 13, 2022'))]


Unnamed: 0,Published_Date,Category,Report_Title,Summary,No_of_Pages,Table_of_Contents,List_of_Tables
19,2022-04-13,Industrial Machinery,"Hollow Fiber Filtration Market Size, Share & T...",Global Hollow Fiber Filtration Market projecte...,213.0,Product Code: BIOT2215\nTable of Contents\n1. ...,
20,2022-04-13,Advanced Material,"Aerospace Wires & Cables Market Size, Share, T...",Market Insights\nWires & cables are often cons...,138.0,Product Code: SRAD242\nTable of Contents\nRepo...,
21,2022-04-13,Advanced Material,Additive Technology Markets for Engineered Sol...,Engineering Solid Dosage Forms via Additive Te...,,Product Code: SDT654F\nTable of Contents\nExec...,
22,2022-04-12,Advanced Material,"Pine-Derived Chemicals Market by Type (TOFA, T...","According to MarketsandMarkets, the pine-deriv...",256.0,Product Code: FB 3604\nTABLE OF CONTENTS\n1 IN...,
23,2022-04-12,Advanced Material,"Synthetic Paper Market by Raw Material (BOPP, ...",The growth of the global synthetic paper marke...,193.0,Product Code: CH 3712\nTABLE OF CONTENTS\n1 IN...,
24,2022-04-12,Advanced Material,"Surface Disinfectant Market by Composition, Fo...",Surface Disinfectants Market by Composition (A...,245.0,Product Code: MRHC -104564\nTABLE OF CONTENTS\...,List of Tables\nTable 1 Global Surface Disinfe...


In [84]:
# Keep prompting until both dates parse, or until the user enters q.
while True:
    print('Enter dates | Enter q to quit')
    low = input('Low e.g. January 1, 2022: ').strip()
    high = input('High e.g. January 1, 2022: ').strip()

    # Compare each answer with 'q' separately: `low or high == 'q'` parses as
    # `low or (high == 'q')` and is true for any non-empty `low`.
    if low.lower() == 'q' or high.lower() == 'q':
        # Leave no half-entered range behind for later cells.
        low = high = None
        break

    # Both bounds are required here; say so instead of re-prompting silently.
    if not (low and high):
        print('Input correct data format')
        continue

    try:
        low = pd.to_datetime(low)
        high = pd.to_datetime(high)
    except ValueError:
        # Unparseable date text; other errors (e.g. KeyboardInterrupt) propagate.
        print('Input correct data format')
    else:
        break

Enter dates | Enter q to quit


Low e.g. January 1, 2022:  dd
High e.g. January 1, 2022:  dd


Enter dates | Enter q to quit


Low e.g. January 1, 2022:  dd
High e.g. January 1, 2022:  dd


Enter dates | Enter q to quit


Low e.g. January 1, 2022:  q
High e.g. January 1, 2022:  


Enter dates | Enter q to quit


Low e.g. January 1, 2022:  ii
High e.g. January 1, 2022:  


Enter dates | Enter q to quit


KeyboardInterrupt: Interrupted by user

In [96]:
# Preview the first rows of `df`.
df.head()

Unnamed: 0,Published_Date,Category,Report_Title,Summary,No_of_Pages,Table_of_Contents,List_of_Tables
0,2022-04-18,Advanced Material,"Global Synthetic Fibres (Polyester, Nylon, Acr...",The global synthetic fibres market is expected...,105.0,Table of Contents\n1. Market Overview\n1.1 Int...,
1,2022-04-18,Advanced Material,Global Helium Market: Insights & Forecast with...,The global helium market is forecasted to reac...,87.0,Table of Contents\n1. Market Overview\n1.1 Hel...,
2,2022-04-18,Advanced Material,Global Petrochemical Market (By Type & Region)...,The global petrochemical market is forecasted ...,104.0,Table of Contents\n1. Market Overview\n1.1 Pet...,
3,2022-04-18,Advanced Material,"Global Surfactants Market (Non-ionic, Anionic ...",The global surfactants market value is forecas...,116.0,Table of Contents\n1. Market Overview\n1.1 Int...,
4,2022-04-14,Advanced Material,Re-Refined Base Oil Market - Global Outlook & ...,The In-depth Analysis and Data-driven Insights...,294.0,Product Code: ARZ220303\nTABLE OF CONTENTS\n1 ...,LIST OF EXHIBITS\nEXHIBIT 1 SEGMENTATION OF GL...


In [92]:
# Write `df` to GIIR_records.csv in the working directory, without the index column.
df.to_csv('GIIR_records.csv', index=False)

In [101]:
# Scratch check: parse a date string and format it back with '%B %d, %Y'.
# %d zero-pads the day, so the result is 'April 05, 2022', not the input text.
pd.to_datetime('April 5, 2022').strftime('%B %d, %Y')

str

In [103]:
# Scratch check of how pd.to_datetime handles None; the date-prompt cell passes
# None through it when both inputs are left blank.
pd.to_datetime(None)