# Scraping headlines - final

I used Selenium to scrape the headlines, which requires chromedriver to be installed. The scraped headlines are stored in MongoDB collections.

The archive articles are available only to subscribed users, which is why I am not supposed to share the headlines, the urls, or the articles. However, this notebook contains all the code necessary to go through the process.

## 1. Preparations

### Import libraries and create useful instances

In [8]:
import json
import random
import time

import numpy as np
import requests
from selenium import webdriver

In [9]:
# Pool of random pause lengths (in seconds) used between page requests:
# 100 draws from a normal distribution (mean 15, sd 5), rounded to three
# decimals; draws of 5 seconds or less are discarded.
sleep_time = np.round(np.random.normal(15, 5, 100), 3)
sleep_time = sleep_time[np.greater(sleep_time, 5)]

### Creating a MongoDB database to store the articles

In [17]:
import pymongo

# connect to the MongoDB server on localhost and select the "news" database
myclient = pymongo.MongoClient("mongodb://127.0.0.1:27017/")
news = myclient['news']

# one collection handle per company:
(tesla_news,
 ge_news,
 ibm_news,
 goldman_news,
 ford_news) = (news[name] for name in ('tesla_news', 'ge_news', 'ibm_news',
                                        'goldman_news', 'ford_news'))

In [16]:
# create_collection() requires the collection name; the recorded output of
# this cell shows it was run for 'ford_news'.
# NOTE(review): PyMongo is documented to raise CollectionInvalid when the
# collection already exists, so this cell is meant to be run once.
news.create_collection('ford_news')

Collection(Database(MongoClient(host=['127.0.0.1:27017'], document_class=dict, tz_aware=False, connect=True), 'news'), 'ford_news')

### OOP - Class and methods for retrieving headlines and summary texts

In [11]:
class WebNavigator:
    '''
    This class initiates a chrome webdriver and uses it to log in to the
    targeted website and to save search-result headlines to MongoDB.
    '''

    def __init__(self, driver_path='/Users/flatironschool/chromedriver'):
        '''
        Start a Chrome session.
        -------------------------------------------------------
        Inputs:
            driver_path: path of the chromedriver executable (defaults to the
                         location used when this notebook was written)
        '''
        self.driver = webdriver.Chrome(driver_path)
        # url of the search page currently being scraped; set by get_headlines
        # and initialised here so error handlers can always read it
        self.url = None
        # if there is a modification, the version num helps to track it down
        print('Version2')

    def login(self, username, password, login_url):
        '''This method allows you to login to the targeted website.'''

        self.driver.get(login_url)

        username_field = self.driver.find_element_by_id("username")
        password_field = self.driver.find_element_by_id("password")

        time.sleep(2.2)
        username_field.send_keys(username)
        password_field.send_keys(password)

        time.sleep(1.8)
        self.driver.find_element_by_class_name("sign-in").submit()

        time.sleep(np.random.choice(sleep_time, 1).item() / 2)  # being polite

        # we must imitate a click on the first link labelled as sign-in
        for link in self.driver.find_elements_by_tag_name('a'):
            if link.text in ('SIGN IN', 'Sign In'):
                link.click()
                break

    def navigate(self, url):
        '''This method loads the given url in the browser.'''
        self.driver.get(url)

    def get_headlines(self, first_page, collection, num_of_pages=1):
        '''
        This method retrieves the headlines and summaries and saves them to a
        MongoDB collection.
        -------------------------------------------------------
        Inputs:
            first_page: url of the first page of the search result
            collection: the Mongodb collection object (anything with an
                        insert_one method) where the scraped headlines are saved
            num_of_pages: the number of pages of the search results
        -------------------------------------------------------
        Returns:
            there is no return, everything is saved directly to Mongodb collection
        '''

        # page 1 is the bare search url, later pages append "&page=n";
        # the range is empty when num_of_pages <= 1
        search_urls = [first_page] + [f'{first_page}&page={n}'
                                      for n in range(2, num_of_pages + 1)]

        for page_url in search_urls:
            # remember the page in progress so a caller can report where a
            # failure happened
            self.url = page_url
            self.driver.get(page_url)
            # being polite avg 15 sec
            time.sleep(np.random.choice(sleep_time, 1).item())
            headline_containers = self.driver.find_elements_by_class_name(
                'headline-container')

            for hc in headline_containers:
                document = self._parse_container(hc)
                if document is not None:
                    collection.insert_one(document)

    @staticmethod
    def _parse_container(hc):
        '''
        Build the document for one headline container.

        Returns None when the headline or the date is missing (such containers
        are skipped); a missing topic, summary or url is stored as 'nan'.
        '''
        # Element lookups are best-effort. `except Exception` keeps that
        # behaviour without also swallowing KeyboardInterrupt / SystemExit
        # the way a bare `except:` does.

        # ------------headline and date (mandatory)-------------
        try:
            anchor = hc.find_element_by_class_name(
                'headline').find_element_by_tag_name('a')
            headline = anchor.text
            date = hc.find_element_by_tag_name('time').text
        except Exception:
            return None
        # -------------topic---------------
        try:
            topic = hc.find_element_by_class_name(
                'category').find_element_by_tag_name('a').text
        except Exception:
            topic = 'nan'
        # --------------summary-------------
        try:
            summary = hc.find_element_by_class_name(
                'summary-container').find_element_by_tag_name('p').text
        except Exception:
            summary = 'nan'
        # ---------------url-----------------
        # the article link is the href of the headline anchor found above
        try:
            article_url = anchor.get_attribute('href')
        except Exception:
            article_url = 'nan'

        return {'headline': headline,
                'summary': summary,
                'topic': topic,
                'date': date,
                'url': article_url}

    def close(self):
        '''
        This method ends the browser session.

        quit() is used instead of close() so that the chromedriver process is
        shut down together with the window.
        '''
        self.driver.quit()

## 2. Getting the headline/summary text from the search results



The advanced search was generated manually on the website, where it was possible to identify the first result page and the total number of result pages. Each search result page contains 20 headlines, each with its topic category, date, short summary and url.

The "search_result" variable contains the first page of the search result. 

#### Tesla:

In [42]:
# load the url of the first Tesla search-result page
with open("assets/webpage_user.json", 'r') as f:
    search_result = json.load(f)['tesla_search_results']

webpage = WebNavigator()
try:
    webpage.login(my_username, my_password, login_url)
    try:
        webpage.get_headlines(search_result, tesla_news, num_of_pages=144)
    except Exception as exc:
        # report the page that failed together with the cause
        print('error: ', getattr(webpage, 'url', None), repr(exc))
finally:
    # always release the browser, even if login fails or the run is interrupted
    webpage.close()

Version2


#### GE:

In [50]:
# load the url of the first GE search-result page
with open("assets/webpage_user.json", 'r') as f:
    search_result = json.load(f)['ge_search_results']

webpage = WebNavigator()
try:
    webpage.login(my_username, my_password, login_url)
    try:
        webpage.get_headlines(search_result, ge_news, num_of_pages=146)
    except Exception as exc:
        # report the page that failed together with the cause
        print('error: ', getattr(webpage, 'url', None), repr(exc))
finally:
    # always release the browser, even if login fails or the run is interrupted
    webpage.close()

Version2


Comment: where the date is given as "x hours ago", it should be replaced with the date of the scrape.

#### IBM:

In [12]:
# load the url of the first IBM search-result page
with open("assets/webpage_user.json", 'r') as f:
    search_result = json.load(f)['ibm_search_results']

webpage = WebNavigator()
try:
    webpage.login(my_username, my_password, login_url)
    try:
        webpage.get_headlines(search_result, ibm_news, num_of_pages=101)
    except Exception as exc:
        # report the page that failed together with the cause
        print('error: ', getattr(webpage, 'url', None), repr(exc))
finally:
    # always release the browser, even if login fails or the run is interrupted
    webpage.close()

Version2


#### Ford:

In [18]:
# load the url of the first Ford search-result page
with open("assets/webpage_user.json", 'r') as f:
    search_result = json.load(f)['ford_search_results']

webpage = WebNavigator()
try:
    webpage.login(my_username, my_password, login_url)
    try:
        webpage.get_headlines(search_result, ford_news, num_of_pages=515)
    except Exception as exc:
        # report the page that failed together with the cause
        print('error: ', getattr(webpage, 'url', None), repr(exc))
finally:
    # always release the browser, even if login fails or the run is interrupted
    webpage.close()

Version2


## 3. Check the number of documents in Mongodb

In [44]:
# estimated number of documents stored in the tesla_news collection
tesla_news.estimated_document_count()

2735

In [51]:
# estimated number of documents stored in the ge_news collection
ge_news.estimated_document_count()

2838

In [52]:
# estimated number of documents stored in the goldman_news collection
goldman_news.estimated_document_count()

1285

In [14]:
# estimated number of documents stored in the ibm_news collection
ibm_news.estimated_document_count()

1983