In [1]:
import os
from datetime import datetime
from random import randint
from time import sleep
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import regex
import requests
from bs4 import BeautifulSoup
from dateutil.relativedelta import relativedelta
from GoogleNews import GoogleNews
from newspaper import Article
from newspaper import Config
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
class en_webscraper():
    """
    Extracts English article data from GoogleNews

    Attributes
    ----------
    search_term : str
        Term to search in Google
    start_date : str
        Date of earliest article in format MM/DD/YYYY
        (stored with the slashes removed, for use in file names)
    end_date : str
        Date of latest article in format MM/DD/YYYY (default today)
        (stored with the slashes removed, for use in file names)
    df : pandas.DataFrame
        One row per search result: title, media, google_date,
        converted_date, link, article and lang
    lang : str
        Language tag used as the prefix of the exported file names
    """

    # Abbreviated units that may appear in relative dates, mapped to the
    # keyword names understood by relativedelta
    _UNIT_ALIASES = {
        "sec": "seconds", "secs": "seconds",
        "min": "minutes", "mins": "minutes",
        "hr": "hours", "hrs": "hours",
    }

    def __init__(self, search_term, start_date, end_date=None):
        # Resolve "today" at call time: a default computed in the signature
        # is frozen when the class is defined and goes stale in a long-lived kernel
        if end_date is None:
            end_date = datetime.today().strftime('%m/%d/%Y')

        self.df = pd.DataFrame(columns=['title', 'media', 'google_date', "converted_date", 'link', 'article'])
        self.search_term = search_term

        # Slash-separated dates are what gets sent to GoogleNews
        self._query_start = start_date
        self._query_end = end_date

        # Set up your user agent for making HTTP requests
        user_agent = "INSERT HERE" #https://www.whatismybrowser.com/detect/what-is-my-user-agent
        self.config = Config()
        self.config.browser_user_agent = user_agent

        self.lang = "en"
        # Slash-free dates are only used to build the export file names
        self.start_date = "".join(start_date.split("/"))
        self.end_date = "".join(end_date.split("/"))

    def search_google(self, max_pages=30):
        """
        Retrieves all the search results from Google page by page,
        then downloads the articles and exports the data

        Input:
        max_pages (int): Maximum number of result pages to request (default 30)
        """

        googlenews = GoogleNews(start=self._query_start, end=self._query_end)
        googlenews.search(self.search_term)

        records = []
        for page in range(1, max_pages + 1):
            result = googlenews.page_at(page)
            if not result:
                break
            records.extend(result)
            sleep(randint(10, 30))  # Random pause between page requests

        # GoogleNews calls the displayed date "date"; store it under the
        # declared "google_date" column and discard the unused fields
        scraped = (
            pd.DataFrame(records)
            .rename(columns={"date": "google_date"})
            .drop(columns=["desc", "img", "datetime"], errors="ignore")
        )

        declared = list(self.df.columns)
        if self.df.empty:
            combined = scraped
        else:
            combined = pd.concat([self.df, scraped], ignore_index=True)

        # Keep the declared columns first (and present even with no results)
        extras = [col for col in combined.columns if col not in declared]
        self.df = combined.reindex(columns=declared + extras).reset_index(drop=True)
        self.df["lang"] = "EN"

        self.get_articles()

    def string_to_date(self, s):
        """
        Converts string to a date string in format YYYY-MM-DD

        Input:
        s (str): Date as string, either relative ("3 days ago")
                 or absolute ("12 Sep 2015")

        Raises ValueError when an absolute date does not match '%d %b %Y'
        """

        if "Sept" in s:
            s = s.replace("Sept", "Sep")  # %b only understands "Sep"

        if "ago" in s:
            amount, unit = s.split()[:2]
            unit = unit.lower()
            unit = self._UNIT_ALIASES.get(unit, unit)
            if not unit.endswith("s"):
                unit += "s"  # relativedelta takes plural keywords for offsets
            moment = datetime.now() - relativedelta(**{unit: float(amount)})
            return moment.strftime('%Y-%m-%d')

        return datetime.strptime(s, '%d %b %Y').strftime('%Y-%m-%d')

    def find_date(self, url):
        """
        Searches the script tags of url for the first YYYY-MM-DD date

        Returns the date as a string, or NaN when the page cannot be
        fetched or contains no such date
        """

        try:
            response = requests.get(url, timeout=30)
        except requests.RequestException:
            return np.nan

        soup = BeautifulSoup(response.text, "html.parser")
        match = regex.search(r'\d{4}-\d{2}-\d{2}', str(soup.find_all('script')))
        return match[0] if match else np.nan

    def _fetch_text(self, url):
        """Returns the article text of url on one line, or NaN if it cannot be retrieved"""

        try:
            article = Article(url, config=self.config)
            article.download()
            article.parse()
        except Exception:
            # Best effort: one unreadable article must not stop the scrape
            return np.nan
        return article.text.replace("\n", " ")

    def _resolve_date(self, url, google_date):
        """Converts the date shown by Google, falling back to the page's own metadata"""

        # Week/month offsets are too coarse to convert, so read the page instead
        if not isinstance(google_date, str) or "month" in google_date or "week" in google_date:
            return self.find_date(url)
        try:
            return self.string_to_date(google_date)
        except (ValueError, TypeError):
            return self.find_date(url)

    def get_articles(self):
        """Extracts the article text and date for every row, then exports the data"""

        texts, dates = [], []
        for link, google_date in zip(self.df["link"], self.df["google_date"]):
            texts.append(self._fetch_text(link))
            dates.append(self._resolve_date(link, google_date))

        # Assign whole columns at once rather than cell by cell
        self.df["article"] = texts
        self.df["converted_date"] = dates

        self.export_df()

    def export_df(self):
        """Saves scraped article data as a csv and pkl file in the working directory"""

        term = "_".join(self.search_term.split(" "))
        stem = f'{self.lang}_articles_{term}_{self.start_date}_{self.end_date}'

        self.df.to_csv(f'{stem}.csv', index = False)
        self.df.to_pickle(f'{stem}.pkl')


In [57]:
# Scrape English articles for the search term between the two dates (MM/DD/YYYY);
# search_google() also downloads the articles and writes the csv/pkl files
eng = en_webscraper(
    search_term="vernacular school malaysia",
    start_date="1/1/2015",
    end_date="12/31/2015",
)
eng.search_google()

'NoneType' object is not iterable


In [3]:
class bm_webscraper(en_webscraper):
    """
    Extracts Malay article data from Google

    Attributes
    ----------
    search_term : str
        Term to search in Google
    start_date : str
        Date of earliest article in format MM/DD/YYYY
    end_date : str
        Date of latest article in format MM/DD/YYYY (default today)
    PATH : str
        Location of the chromedriver executable
    URL : str
        Google news search URL built from the search term and date range
    driver : selenium WebDriver
        Browser session opened on creation so pop-ups can be dismissed by hand
    """

    def __init__(self, search_term, start_date, end_date=None):
        # Resolve the documented "today" default at call time
        if end_date is None:
            end_date = datetime.today().strftime('%m/%d/%Y')
        super().__init__(search_term, start_date, end_date)

        # Set up selenium webdriver
        # Raw string: the backslashes in a Windows path must not be read as escapes
        self.PATH = r"C:\Program Files (x86)\chromedriver.exe" # Insert path to chromedriver
        self.options = Options()
        self.options.add_argument("--disable-notifications")

        # Percent-encode every component so spaces, slashes and characters
        # such as "&" cannot break the query string
        search = quote_plus(search_term)
        start = quote_plus(start_date)
        end = quote_plus(end_date)
        self.URL = f'https://google.com/search?q={search}&tbm=nws&start=0&num=200&tbs=cdr%3A1%2Ccd_min%3A{start}%2Ccd_max%3A{end}'

        self.lang = 'bm'
        self.open_google()

    def _new_driver(self):
        """Starts a Chrome session using the configured driver path and options"""

        # NOTE(review): newer Selenium releases may expect the driver path to be
        # wrapped in a Service object; this is the single place to adapt - confirm
        # against the installed version
        return webdriver.Chrome(self.PATH, options = self.options)

    def open_google(self):
        """Open the Google browser first to click out of pop-ups"""

        self.driver = self._new_driver()
        self.driver.get(self.URL)

    def _numbered_page_cells(self):
        """Returns the cells of the page navigation bar whose text is a page number"""

        bars = self.driver.find_elements(By.CLASS_NAME, "AaVjTc")
        if not bars:
            return []  # No navigation bar: only one page of results
        return [cell for cell in bars[0].find_elements(By.CSS_SELECTOR, "td")
                if cell.text.isnumeric()]

    def _parse_results(self, page):
        """Builds a data frame of the search results found in one parsed results page"""

        # Extract all the media article links on the page
        link = [item.get('href') for item in page.find_all("a", class_="WlydOe")]

        # Extract each media article's data
        google_date = [item.getText() for item in page.find_all("p", class_="S1FAPd")]

        return pd.DataFrame({
            "title": [item.getText() for item in page.find_all('div', class_ = "mCBkyc JQe2Ld nDgy9d")],
            "media": [item.getText() for item in page.find_all("div", class_="CEMjEf")],
            "google_date": google_date,
            "converted_date": list(google_date),  # Kept exactly as displayed by Google
            "link": link,
            "article": "",
            "lang": "BM"
        })

    def search_google(self):
        """Go through all pages of search results and store relevant info"""

        snapshots = []
        try:
            # The numbered cells include the current page, so their count is
            # the number of result pages (at least one)
            page_count = max(len(self._numbered_page_cells()), 1)

            for page_no in range(1, page_count + 1):
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "main")))
                snapshots.append(BeautifulSoup(self.driver.page_source, "lxml"))

                if page_no < page_count:
                    # Look the bar up again on every page: elements found on
                    # the previous page are no longer valid after navigating
                    target = str(page_no + 1)
                    next_cell = next(
                        (cell for cell in self._numbered_page_cells() if cell.text == target),
                        None)
                    if next_cell is None:
                        break
                    next_cell.click()
        finally:
            # Always release the browser, even if paging failed part-way
            self.driver.quit()

        frames = [self._parse_results(page) for page in snapshots]
        self.df = pd.concat([self.df, *frames], ignore_index = True)

        self.get_articles()

    def _scrape_article(self, url):
        """Returns the text of all p tags at url, or an empty string if the page cannot be read"""

        self.driver = self._new_driver()
        try:
            self.driver.set_page_load_timeout(20) # Terminate if page takes too long to load
            try:
                self.driver.get(url)
            except TimeoutException:
                # Stop loading and work with whatever has rendered so far
                self.driver.execute_script("window.stop();")

            # Find article text by p tags
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "p"))
            )
            paragraphs = self.driver.find_elements(By.CSS_SELECTOR, 'p')
            return " ".join(item.text for item in paragraphs)
        except WebDriverException:
            # Best effort: one unreadable page must not discard the whole scrape
            return ""
        finally:
            self.driver.quit()

    def get_articles(self):
        """Extract article text from each URL"""

        # Assign the whole column at once rather than cell by cell
        self.df["article"] = [self._scrape_article(url) for url in self.df["link"]]

        self.export_df() # Save results as csv and pkl file

In [4]:
# Run this cell first: it opens a Chrome window on the Google results page.
# Click out of any pop ups from Google before running the next cell.
bm = bm_webscraper(
    search_term="sekolah vernakular malaysia",
    start_date="1/1/2017",
    end_date="12/31/2017",
)

In [5]:
# Then run this: it goes through every page of search results, extracts the
# text of each article and saves the results as csv and pkl files
bm.search_google()

In [None]:
def combine_csv(folder, output="scraped_articles.csv"):
    """
    Combine csv files into one file

    Input:
    folder (str): Directory holding the csv files (with or without a trailing separator)
    output (str): Path of the combined file (default "scraped_articles.csv")

    Raises ValueError when folder contains no csv files
    """
    output_path = os.path.abspath(output)

    # Sorted for a reproducible row order. A previous combined file living
    # in the same folder is skipped so re-running does not duplicate rows.
    csv_paths = [
        os.path.join(folder, name)
        for name in sorted(os.listdir(folder))
        if name.endswith(".csv")
        and os.path.abspath(os.path.join(folder, name)) != output_path
    ]
    if not csv_paths:
        raise ValueError(f"No csv files found in {folder!r}")

    combined_csv = pd.concat([pd.read_csv(path) for path in csv_paths])
    combined_csv.to_csv(output, index=False, encoding='utf-8-sig')