## Scrape news tables by ticker from finviz

In [1]:
# Import libraries 
import pandas as pd
import numpy as np
import os
import time

from datetime import datetime
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
from urllib.parse import urlparse

from concurrent.futures import ThreadPoolExecutor
from loky import get_reusable_executor

In [2]:
# Wikipedia page listing the S&P 500 constituents; the first table parsed
# from the page is the one used as the constituents dataframe.
SP500_WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'

table = pd.read_html(SP500_WIKI_URL)
sandp_df = table[0]

In [3]:
# function to read news table from finviz (use for process pool executor)
def finviz_news_table_process(ticker):
    """Fetch the finviz quote page for ``ticker`` and extract its news table.

    Written to be mapped over a list of tickers by a process pool executor.
    Reads the module-level ``finviz_url`` (base quote URL), which must be
    defined before the function is called.

    Parameters
    ----------
    ticker : str
        Symbol appended to ``finviz_url`` to build the request URL.

    Returns
    -------
    list
        ``[ticker, news_table, run_time, pid]`` where ``news_table`` is the
        HTML of the element with id ``news-table`` as a string, or ``None``
        when the request/parsing failed or the page has no such element;
        ``run_time`` is the elapsed time in seconds and ``pid`` is the id of
        the process that handled the request.
    """
    start_time = time.perf_counter()

    pid = os.getpid()

    news_table = None
    try:
        url = finviz_url + ticker
        req = Request(url=url, headers={'user-agent': 'my-app/0.0.1'})
        # Close the HTTP response as soon as the page has been parsed.
        with urlopen(req) as response:
            # Name the parser explicitly, matching the other BeautifulSoup
            # calls in this notebook.
            html = BeautifulSoup(response, 'html.parser')
        table_tag = html.find(id='news-table')
        # str(None) would yield the string 'None', which callers checking
        # for None would mistake for real HTML.
        if table_tag is not None:
            news_table = str(table_tag)
    except Exception:
        # Best effort: a failed ticker is reported as None rather than
        # aborting the whole map. KeyboardInterrupt/SystemExit still propagate.
        news_table = None

    end_time = time.perf_counter()

    # Return [ticker, str_news_table, run_time, pid]
    return [ticker, news_table, end_time - start_time, pid]

In [4]:
# Process Pool Executor: read html from finviz for each ticker and save the news_table of each as a dataframe

start_test1 = time.perf_counter()

if __name__ == '__main__':
    finviz_url = 'https://finviz.com/quote.ashx?t='
    ticker_list = sandp_df['Symbol']

    # initiate executor and map finviz_news_table_process over the ticker list
    executor = get_reusable_executor(max_workers=10, timeout=5)
    process_1 = executor.map(finviz_news_table_process, ticker_list)

    # Each result arrives as [ticker, str_news_table, run_time, pid]; load the
    # results in that order, then reorder the columns for the saved dataframe
    # (includes run time for each request).
    fetched = pd.DataFrame(list(process_1), columns=['ticker', 'str_news_table', 'run_time', 'pid'])
    news_table_df = fetched[['ticker', 'pid', 'run_time', 'str_news_table']]
    print(news_table_df.head(10))


end_test1 = time.perf_counter()

print('Process Pool Executor finished in: ', end_test1 - start_test1, ' seconds')

  ticker    pid  run_time                                     str_news_table
0    MMM  14964  2.288448  <table border="0" cellpadding="1" cellspacing=...
1    ABT  16988  3.084255  <table border="0" cellpadding="1" cellspacing=...
2   ABBV   8264  2.243675  <table border="0" cellpadding="1" cellspacing=...
3   ABMD   7572  1.954463  <table border="0" cellpadding="1" cellspacing=...
4    ACN   4652  2.769137  <table border="0" cellpadding="1" cellspacing=...
5   ATVI   6940  3.098437  <table border="0" cellpadding="1" cellspacing=...
6   ADBE   4472  3.458757  <table border="0" cellpadding="1" cellspacing=...
7    AMD  15928  3.273517  <table border="0" cellpadding="1" cellspacing=...
8    AAP   8324  2.407815  <table border="0" cellpadding="1" cellspacing=...
9    AES   8052  2.696584  <table border="0" cellpadding="1" cellspacing=...
Process Pool Executor finished in:  100.13702599999999  seconds


## List each article with its attached date and time

In [5]:
def date_to_list(html_article_list):
    """Extract a ``[date, time]`` pair for every news-table row.

    The first cell (``.td``) of each row holds either ``"<date> <time>"`` or
    just ``"<time>"``; a time-only row inherits the date of the most recent
    row that carried one.

    Parameters
    ----------
    html_article_list : iterable
        Row objects exposing ``.td`` whose ``.text`` is the timestamp text.

    Returns
    -------
    list of [date, time]
        One entry per input row, in input order. ``date`` is ``None`` for
        time-only rows that appear before any dated row. Rows with no first
        cell, or with an empty one, yield ``[None, None]`` so the result
        stays aligned with the input.
    """
    date_time = []
    # Last date seen so far; stays None until a row actually carries a date,
    # so a leading time-only row no longer raises UnboundLocalError.
    date = None
    for html_article in html_article_list:
        cell = html_article.td
        date_scrape = cell.text.split() if cell is not None else []

        if not date_scrape:
            # No timestamp information at all for this row.
            date_time.append([None, None])
            continue

        if len(date_scrape) == 1:
            time_ = date_scrape[0]
        else:
            date = date_scrape[0]
            time_ = date_scrape[1]
        date_time.append([date, time_])

    return date_time

In [6]:
def unpack_articles(row):
    """Expand one ticker's news table into a dataframe with one row per article.

    Parameters
    ----------
    row : sequence or pandas.Series
        Positional record whose element 0 is the ticker and element 3 is the
        news table HTML as a string.

    Returns
    -------
    pandas.DataFrame
        Columns ``ticker``, ``date``, ``time``, ``raw_article``; one row per
        ``<tr>`` found in the news table, where ``raw_article`` is that
        ``<tr>`` serialized back to a string.
    """
    # Iterating yields the values in positional order for a Series, list or
    # tuple alike. Plain row[0] / row[3] on a string-labelled Series relies on
    # the deprecated positional fallback of Series.__getitem__.
    values = list(row)
    ticker, str_news_table = values[0], values[3]

    html_news_table = BeautifulSoup(str_news_table, 'html.parser')

    # find_all is the current name; findAll is a deprecated alias.
    article_rows = html_news_table.find_all('tr')

    articles_df = pd.DataFrame(date_to_list(article_rows), columns=['date', 'time'])
    articles_df.insert(0, 'ticker', [ticker] * len(article_rows))
    articles_df['raw_article'] = [str(article) for article in article_rows]

    return articles_df
    

In [7]:
# parse through every news table and generate a dataframe of each individual article

# Collect the per-ticker frames and concatenate once: DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop copies all
# previous rows on every iteration.
ticker_frames = []
for news_row in news_table_df.itertuples(index=False, name=None):
    # news_row is a plain tuple in column order:
    # (ticker, pid, run_time, str_news_table)
    if news_row[3] is None:
        # the request for this ticker failed; nothing to unpack
        continue
    ticker_frames.append(unpack_articles(news_row))

# Without ignore_index each ticker keeps its own 0..n-1 index, as before.
# pd.concat raises on an empty list, so fall back to an empty frame.
article_df = pd.concat(ticker_frames) if ticker_frames else pd.DataFrame([])

article_df

Unnamed: 0,ticker,date,time,raw_article
0,MMM,Feb-19-21,08:11AM,"<tr><td align=""right"" style=""white-space:nowra..."
1,MMM,Feb-18-21,11:00AM,"<tr><td align=""right"" style=""white-space:nowra..."
2,MMM,Feb-17-21,09:37AM,"<tr><td align=""right"" style=""white-space:nowra..."
3,MMM,Feb-16-21,04:18PM,"<tr><td align=""right"" style=""white-space:nowra..."
4,MMM,Feb-16-21,03:58PM,"<tr><td align=""right"" width=""130"">03:58PM </t..."
...,...,...,...,...
95,ZTS,Jun-08-20,10:41AM,"<tr><td align=""right"" style=""white-space:nowra..."
96,ZTS,Jun-08-20,08:30AM,"<tr><td align=""right"" width=""130"">08:30AM </t..."
97,ZTS,Jun-05-20,11:32AM,"<tr><td align=""right"" style=""white-space:nowra..."
98,ZTS,Jun-05-20,08:14AM,"<tr><td align=""right"" width=""130"">08:14AM </t..."


## Compile details for each article and request content from related sites

In [8]:
def details(row):
    """Compile the details of one article and, for Yahoo Finance links, fetch its text.

    Parameters
    ----------
    row : sequence
        ``[ticker, date, time, raw_article]`` where ``raw_article`` is the
        HTML string of one news-table ``<tr>``.

    Returns
    -------
    list
        ``[ticker, date, time, headline, news, content, url_root]``.
        ``news`` is the text of the first ``<span>`` and ``headline`` the text
        of the first ``<a>`` in the row (``None`` when the element is
        missing). ``url_root`` is the network location of the link (``''``
        when there is no link). ``content`` is the text of the element with
        class ``caas-body`` on the linked page when the link points to
        finance.yahoo.com and the fetch succeeds; otherwise it is the
        placeholder ``'empty string'``.
    """
    ticker, date, time_, str_article = row[0], row[1], row[2], row[3]

    html_article = BeautifulSoup(str_article, 'html.parser')

    # Produce news source company
    # A row missing an expected tag yields None instead of raising
    # AttributeError, which would abort every other article mapped in the
    # same executor batch.
    source_tag = html_article.span
    news = source_tag.get_text() if source_tag is not None else None

    # Produce headlines
    anchor = html_article.a
    headline = anchor.get_text() if anchor is not None else None

    # Produce news content
    # get link to the full article
    link = anchor.get('href') if anchor is not None else None
    content = 'empty string'
    url_root = urlparse(link).netloc if link else ''
    # check if link leads to yahoo.finance
    if url_root == 'finance.yahoo.com':
        try:
            # request from yahoo.finance
            req_art = Request(url=link, headers={'user-agent':'my-app/0.0.1'})
            # Close the HTTP response as soon as the page has been parsed.
            with urlopen(req_art) as response_art:
                html_art = BeautifulSoup(response_art, 'html.parser')
            # get the article content
            body_tag = html_art.find(class_='caas-body')
            if body_tag is None:
                # page fetched but has no article body: report it like any
                # other failure and keep the placeholder content
                print('Error following article link: ', link)
            else:
                content = str(body_tag.get_text())
        except Exception:
            # Best effort: keep the placeholder content for this article.
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print('Error following article link: ', link)

    return [ticker, date, time_, headline, news, content, url_root]

In [10]:
# scraping articles and saving them as csv in batches of 500

BATCH_SIZE = 500
RAW_CSV_PATH = 'Raw Data.csv'
RAW_COLUMNS = ['ticker', 'date', 'time', 'headline', 'news', 'content', 'site']

if __name__ == '__main__':
    batch_frames = []

    # Derive the batches from the actual number of articles instead of a fixed
    # 100 batches plus a hand-written tail: this covers every row exactly once
    # and never builds an empty batch.
    for i, batch_start in enumerate(range(0, len(article_df), BATCH_SIZE)):
        article_list = article_df.iloc[batch_start:batch_start + BATCH_SIZE].values.tolist()

        start = time.perf_counter()

        executor = get_reusable_executor(max_workers=12, timeout=5)
        raw_list = list(executor.map(details, article_list))

        # Build the frame straight from the list of records; going through
        # np.array would store every string padded to the longest article.
        batch_df = pd.DataFrame(raw_list, columns=RAW_COLUMNS)

        end = time.perf_counter()

        # The first batch (re)creates the file with a header; later batches
        # are appended. This replaces DataFrame.append (removed in pandas 2.0)
        # and avoids re-reading and re-writing the whole csv for every batch.
        is_first_batch = (i == 0)
        batch_df.to_csv(
            RAW_CSV_PATH,
            mode='w' if is_first_batch else 'a',
            index=False,
            header=is_first_batch,
        )
        batch_frames.append(batch_df)

        print('Batch ' + str(i) + ' Complete in ' + str(end - start) + ' seconds' )

    if batch_frames:
        # raw_df holds every scraped article, as it did after the final append
        # (now with a fresh 0..n-1 index).
        raw_df = pd.concat(batch_frames, ignore_index=True)
    else:
        # nothing to scrape: still leave a header-only csv for later cells
        raw_df = pd.DataFrame(columns=RAW_COLUMNS)
        raw_df.to_csv(RAW_CSV_PATH, index=False, header=True)

    print('Scraping Complete')


Batch 0 Complete in 209.48304740000003 seconds
Batch 1 Complete in 216.72395820000008 seconds
Batch 2 Complete in 175.6453194999999 seconds
Batch 3 Complete in 191.37775190000002 seconds
Batch 4 Complete in 162.39360980000015 seconds
Batch 5 Complete in 165.9622233 seconds
Batch 6 Complete in 176.97866610000005 seconds
Batch 7 Complete in 174.39224519999993 seconds
Batch 8 Complete in 195.4282029000001 seconds
Batch 9 Complete in 142.94714840000006 seconds
Batch 10 Complete in 183.8978119999997 seconds
Batch 11 Complete in 196.0864762000001 seconds
Batch 12 Complete in 207.92768509999996 seconds
Batch 13 Complete in 154.0975487999999 seconds
Batch 14 Complete in 179.4179240999997 seconds
Batch 15 Complete in 186.14858189999995 seconds
Batch 16 Complete in 166.94575040000018 seconds
Batch 17 Complete in 186.51190799999995 seconds
Batch 18 Complete in 216.33673729999964 seconds
Batch 19 Complete in 150.06558930000028 seconds
Batch 20 Complete in 208.4596517 seconds
Batch 21 Complete in 1



Batch 37 Complete in 240.5990542999998 seconds
Batch 38 Complete in 218.5275142999999 seconds
Batch 39 Complete in 191.63415859999895 seconds
Batch 40 Complete in 195.23942180000086 seconds
Batch 41 Complete in 173.05244590000075 seconds
Batch 42 Complete in 201.80349509999905 seconds
Batch 43 Complete in 219.28495000000112 seconds
Batch 44 Complete in 242.01632860000063 seconds
Batch 45 Complete in 192.7729115000002 seconds
Batch 46 Complete in 195.91346010000052 seconds
Batch 47 Complete in 210.84015399999953 seconds
Batch 48 Complete in 180.7509159000001 seconds
Batch 49 Complete in 183.90850930000124 seconds
Batch 50 Complete in 183.92545020000034 seconds
Batch 51 Complete in 217.09037669999998 seconds
Batch 52 Complete in 164.74287820000063 seconds
Batch 53 Complete in 205.5943987999999 seconds
Batch 54 Complete in 173.34849589999976 seconds
Batch 55 Complete in 225.87813640000059 seconds
Batch 56 Complete in 214.59522430000106 seconds
Batch 57 Complete in 177.10558669999955 secon

## Split raw data into 3 csv files

In [6]:
import pandas as pd

# Load the full scrape result back from disk.
raw_df = pd.read_csv('Raw Data.csv')

# Each of the first two parts gets a third of the rows (rounded down);
# the third part takes whatever is left.
n_row = len(raw_df) // 3
first_cut, second_cut = n_row, n_row * 2

raw_df1 = raw_df.iloc[:first_cut]
raw_df2 = raw_df.iloc[first_cut:second_cut]
raw_df3 = raw_df.iloc[second_cut:]


In [7]:
# Write each part to its own csv file (header row included, no index column).
for part_df, part_path in (
    (raw_df1, 'Data1.csv'),
    (raw_df2, 'Data2.csv'),
    (raw_df3, 'Data3.csv'),
):
    part_df.to_csv(part_path, index=False, header=True)