## Scrape news tables by ticker from finviz

In [1]:
# Import libraries 
import pandas as pd
import os
import time

from datetime import datetime
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
from urllib.parse import urlparse

from concurrent.futures import ThreadPoolExecutor
from loky import get_reusable_executor

In [2]:
# Read the tables found on the Wikipedia S&P 500 constituents page.
table = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
# Keep the first table; a later cell reads its 'Symbol' column as the ticker list.
sandp_df = table[0]

In [3]:
# function to read news table from finviz (use for process pool executor)
def finviz_news_table_process(ticker):
    """Fetch the finviz quote page for one ticker and extract its news table.

    Reads the module-level ``finviz_url`` prefix, so that global must be
    defined before this function is called.

    Args:
        ticker: Ticker symbol appended to ``finviz_url`` to build the URL.

    Returns:
        list: ``[ticker, news_table, run_time, pid]``. ``news_table`` is the
        ``str()`` of the element with id ``news-table`` (the string ``'None'``
        when the page has no such element), or ``None`` when building the
        request, fetching or parsing raised. ``run_time`` is the elapsed time
        in seconds measured with ``time.perf_counter`` and ``pid`` is the id
        of the process that executed the call.
    """
    start_time = time.perf_counter()
    pid = os.getpid()

    try:
        url = finviz_url + ticker
        req = Request(url=url, headers={'user-agent': 'my-app/0.0.1'})
        # Close the HTTP response deterministically instead of leaking it.
        with urlopen(req) as response:
            # Name the parser explicitly, as the other parsing code in this
            # file does, so the result does not depend on installed parsers.
            html = BeautifulSoup(response, 'html.parser')
        news_table = str(html.find(id='news-table'))
    except Exception:
        # Best effort: one failing ticker must not abort the whole map.
        # Unlike a bare ``except:``, KeyboardInterrupt/SystemExit still propagate.
        news_table = None

    end_time = time.perf_counter()

    # Return [ticker, str_news_table, run_time, pid]
    return [ticker, news_table, end_time - start_time, pid]

In [4]:
# Process Pool Executor: scrape the finviz page of every ticker and collect the news tables in one dataframe

start_test1 = time.perf_counter()

if __name__ == '__main__':
    # tickers come from the 'Symbol' column of the Wikipedia table
    ticker_list = sandp_df['Symbol']
    # URL prefix read as a global by finviz_news_table_process
    finviz_url = 'https://finviz.com/quote.ashx?t='
    # build the executor and map the scraper over the ticker list
    executor = get_reusable_executor(max_workers=10, timeout=5)
    process_1 = executor.map(finviz_news_table_process, ticker_list)
    # every result is [ticker, str_news_table, run_time, pid]; reorder it into the dataframe's column order
    news_table_df = pd.DataFrame(
        [
            [symbol, worker_pid, elapsed, table_html]
            for symbol, table_html, elapsed, worker_pid in process_1
        ],
        columns=['ticker', 'pid', 'run_time', 'str_news_table'],
    )
    print(news_table_df.head(10))


end_test1 = time.perf_counter()

print('Process Pool Executor finished in: ', end_test1 - start_test1, ' seconds')

  ticker    pid  run_time                                     str_news_table
0    MMM  22208  1.274536  <table border="0" cellpadding="1" cellspacing=...
1    ABT  18692  2.922247  <table border="0" cellpadding="1" cellspacing=...
2   ABBV  19920  3.225352  <table border="0" cellpadding="1" cellspacing=...
3   ABMD  19000  2.609542  <table border="0" cellpadding="1" cellspacing=...
4    ACN  21180  5.201069  <table border="0" cellpadding="1" cellspacing=...
5   ATVI  15716  5.127448  <table border="0" cellpadding="1" cellspacing=...
6   ADBE  18772  7.464577  <table border="0" cellpadding="1" cellspacing=...
7    AMD   4592  8.049512  <table border="0" cellpadding="1" cellspacing=...
8    AAP  22720  7.066407  <table border="0" cellpadding="1" cellspacing=...
9    AES  21468  6.079866  <table border="0" cellpadding="1" cellspacing=...
Process Pool Executor finished in:  200.4873933  seconds


## Scrape article contents from links on finviz

In [5]:
# define function to generate [date, time, headline, news_source, content, article_site]
# input is a single article
def article_details(date_time, str_article):
    """Build the detail record for a single news-table row.

    Args:
        date_time: Two-item sequence ``[date, time]`` for the article.
        str_article: HTML string of the article's table row.

    Returns:
        list: ``[date, time, headline, news_source, content, article_site]``.
        ``content`` stays ``'empty string'`` unless the link's host is
        ``finance.yahoo.com`` and the article body was fetched and parsed
        successfully. ``headline``, ``news_source`` and ``article_site`` are
        empty strings when the row has no ``<a>`` / ``<span>`` element.
    """
    # convert str to html
    html_article = BeautifulSoup(str_article, 'html.parser')

    # Produce date and time
    date = date_time[0]
    time_ = date_time[1]

    # The anchor carries both the headline text and the article link, so look
    # it up once. A row without one no longer raises AttributeError.
    anchor = html_article.find('a')
    if anchor is not None:
        headline = anchor.get_text()
        link = anchor.get('href') or ''
    else:
        headline = ''
        link = ''

    # Produce news source company
    source_tag = html_article.find('span')
    news = source_tag.get_text() if source_tag is not None else ''

    # Produce news content
    content = 'empty string'
    url_root = urlparse(link).netloc
    # only links that lead to yahoo.finance are followed
    if url_root == 'finance.yahoo.com':
        try:
            req_art = Request(url=link, headers={'user-agent': 'my-app/0.0.1'})
            # Close the HTTP response deterministically instead of leaking it.
            with urlopen(req_art) as response_art:
                html_art = BeautifulSoup(response_art, 'html.parser')
            # get the article content; a missing 'caas-body' element raises
            # AttributeError here and is reported by the handler below
            content = str(html_art.find(class_='caas-body').get_text())
        except Exception:
            # Best effort: keep the placeholder content and report the link.
            # Unlike a bare ``except:``, KeyboardInterrupt still propagates.
            print('Error following article link: ', link)

    # Return [date, time, headline, news_source, content, article_site]
    return [date, time_, headline, news, content, url_root]

In [6]:
def date_to_list(html_article_list):
    """Extract a ``[date, time]`` pair for every article row.

    The text of each row's first ``td`` is split on whitespace. Two or more
    tokens are read as ``date time``; a single token is a time that inherits
    the date of the closest preceding row that had one.

    Args:
        html_article_list: Iterable of row objects exposing ``.td.text``.

    Returns:
        list: One ``[date, time]`` list per input row, in input order, so the
        result stays aligned with the article list. ``date`` is ``None`` until
        a row supplies one, and ``time`` is ``None`` for a row whose cell is
        missing or empty.
    """
    date_time = []
    # Defined up front so a leading time-only row cannot hit an unbound name.
    date = None
    for html_article in html_article_list:
        cell = html_article.td
        date_scrape = cell.text.split() if cell is not None else []
        if len(date_scrape) >= 2:
            date, time_ = date_scrape[0], date_scrape[1]
        elif len(date_scrape) == 1:
            time_ = date_scrape[0]
        else:
            # Empty or missing cell: keep the row so indices stay aligned.
            time_ = None
        date_time.append([date, time_])

    return date_time

In [7]:
# Define function to prepare dataframe with [ticker, date, time, news, headline, content, article_site]
# Thread Pool Executor

# input is a row [ticker, pid, run_time, str_news_table] from news_table_df
# return completed dataframe for 1 ticker (to be combined by the caller)
# also records time to process 1 ticker and saves in list 'time_per_ticker_1'

def ticker_to_dataframe_thr(row):
    """Turn one scraped news-table row into a per-ticker article dataframe.

    Each ``tr`` of the stored news table is handed to ``article_details`` on a
    pool of 10 threads.

    Args:
        row: Sequence ``[ticker, pid, run_time, str_news_table]``; only
            ``row[0]`` and ``row[3]`` are read. A ``None`` news table is
            treated as an empty table.

    Returns:
        pandas.DataFrame: Columns ``ticker, date, time, news, headline,
        content, article_site`` with one row per article.

    Side effects:
        Appends the elapsed seconds to the module-level list
        ``time_per_ticker_1`` (it must exist before the call) and prints a
        completion message.
    """
    ticker_time_start = time.perf_counter()

    ticker = row[0]
    # A failed scrape is stored as None; parse it as an empty document so the
    # ticker yields an empty dataframe instead of crashing the parser.
    str_news_table = row[3] if row[3] is not None else ''

    # convert str_news_table to html format
    html_news_table = BeautifulSoup(str_news_table, 'html.parser')
    # split into list of articles in html format
    article_list = html_news_table.find_all('tr')
    # get date and time
    date_time_list = date_to_list(article_list)
    # convert all html to str
    article_list = [str(x) for x in article_list]

    # No `if __name__ == '__main__'` guard here: inside a function it only
    # risks leaving `ticker_df` undefined. The `with` block shuts the threads
    # down once every article has been processed.
    with ThreadPoolExecutor(max_workers=10) as executor:
        thread_2 = executor.map(article_details, date_time_list, article_list)
        # article_details returns headline before news source; swap them to
        # match the column order below.
        records = [
            [date, time_, news_source, headline, content, site]
            for date, time_, headline, news_source, content, site in thread_2
        ]
    ticker_df = pd.DataFrame(records, columns=['date', 'time', 'news', 'headline', 'content', 'article_site'])

    # a scalar is broadcast to every row
    ticker_df.insert(0, 'ticker', ticker)

    ticker_time_end = time.perf_counter()
    # NOTE(review): when this runs in a process-pool worker the append
    # presumably lands in the worker's copy of the list - confirm.
    time_per_ticker_1.append(ticker_time_end - ticker_time_start)

    print('Ticker complete: ', ticker)

    return ticker_df

In [8]:
# Define function to prepare dataframe with [ticker, date, time, news, headline, content, article_site]
# Thread Pool Executor (variant that collects the raw article records first)

# input is a row [ticker, pid, run_time, str_news_table] from news_table_df
# return completed dataframe for 1 ticker (to be combined by the caller)
# also records time to process 1 ticker and saves in list 'time_per_ticker_1'

def ticker_to_dataframe_thrv2(row):
    """Turn one scraped news-table row into a per-ticker article dataframe.

    Each ``tr`` of the stored news table is handed to ``article_details`` on a
    pool of 10 threads; the raw records are collected, labelled in the order
    ``article_details`` returns them, then reordered.

    Args:
        row: Sequence ``[ticker, pid, run_time, str_news_table]``; only
            ``row[0]`` and ``row[3]`` are read. A ``None`` news table is
            treated as an empty table.

    Returns:
        pandas.DataFrame: Columns ``ticker, date, time, news, headline,
        content, article_site`` with one row per article.

    Side effects:
        Appends the elapsed seconds to the module-level list
        ``time_per_ticker_1`` (it must exist before the call) and prints a
        completion message.
    """
    ticker_time_start = time.perf_counter()

    ticker = row[0]
    # A failed scrape is stored as None; parse it as an empty document so the
    # ticker yields an empty dataframe instead of crashing the parser.
    str_news_table = row[3] if row[3] is not None else ''

    # convert str_news_table to html format
    html_news_table = BeautifulSoup(str_news_table, 'html.parser')
    # split into list of articles in html format
    article_list = html_news_table.find_all('tr')
    # get date and time
    date_time_list = date_to_list(article_list)
    # convert all html to str
    article_list = [str(x) for x in article_list]

    # No `if __name__ == '__main__'` guard here: inside a function it only
    # risks leaving `ticker_df` undefined. The `with` block shuts the threads
    # down once every article has been processed.
    with ThreadPoolExecutor(max_workers=10) as executor:
        records = list(executor.map(article_details, date_time_list, article_list))

    # article_details returns [date, time, headline, news_source, content, site].
    # Label the columns in that order (the old labels swapped 'news' and
    # 'headline'), then reorder to the layout ticker_to_dataframe_thr produces.
    ticker_df = pd.DataFrame(records, columns=['date', 'time', 'headline', 'news', 'content', 'article_site'])
    ticker_df = ticker_df.reindex(columns=['date', 'time', 'news', 'headline', 'content', 'article_site'])

    # a scalar is broadcast to every row
    ticker_df.insert(0, 'ticker', ticker)

    ticker_time_end = time.perf_counter()
    # NOTE(review): when this runs in a process-pool worker the append
    # presumably lands in the worker's copy of the list - confirm.
    time_per_ticker_1.append(ticker_time_end - ticker_time_start)

    print('Ticker complete: ', ticker)

    return ticker_df

## Testing Code

In [None]:
# test run map function (tickers processed one after another with the builtin map)

# define test df
test_news_table_df = news_table_df.copy()
test_news_table_df = test_news_table_df.iloc[0:5]
print('Testing input')
print(test_news_table_df)

# ticker_to_dataframe_thr appends its per-ticker run time to this list
time_per_ticker_1 = []

compile_start = time.perf_counter()

test_news_table_list = test_news_table_df.values.tolist()
check_time_start = time.perf_counter()
mapper = list(map(ticker_to_dataframe_thr, test_news_table_list))
check_time_end = time.perf_counter()
# DataFrame.append was removed in pandas 2.0 and copied the frame on every
# call; concatenate the per-ticker frames in one step instead.
test_art_det_df = pd.concat(mapper) if mapper else pd.DataFrame([])

test_art_det_df = test_art_det_df.reset_index()
print('Time to map: ', check_time_end - check_time_start)

compile_end = time.perf_counter()

print(test_art_det_df.head(10))
print('Time taken to compile 5 tickers is: ', compile_end - compile_start, ' seconds')
# scale the measured time from the sample up to the full ticker list
estimated_total = (compile_end - compile_start) * len(news_table_df) / len(test_news_table_df)
minutes, seconds = divmod(estimated_total, 60)
print('Estimated time to compile all tickers is: ', minutes, 'minutes', seconds, 'seconds')

In [None]:
# test run 3 workers

# define test df
test_news_table_df = news_table_df.copy()
test_news_table_df = test_news_table_df.iloc[0:3]
print('Testing input')
print(test_news_table_df)

# ticker_to_dataframe_thr appends its per-ticker run time to this list
time_per_ticker_1 = []

compile_start = time.perf_counter()

# defined before the guard so the code after it always has a dataframe
test_art_det_df = pd.DataFrame([])

if __name__ == '__main__':
    test_news_table_list = test_news_table_df.values.tolist()
    exe = get_reusable_executor(max_workers=3, timeout=5)
    print('Executor initiated.')
    # Every mapped result is already a per-ticker DataFrame. Unpacking it as a
    # 7-tuple would iterate its column names, so keep the frames whole.
    process_3 = list(exe.map(ticker_to_dataframe_thr, test_news_table_list))
    print('Process 3 setup complete')
    if process_3:
        print('First dataframe converted')
        print(process_3[0])
        # DataFrame.append was removed in pandas 2.0; concatenate once instead
        test_art_det_df = pd.concat(process_3)

test_art_det_df = test_art_det_df.reset_index()

compile_end = time.perf_counter()

print(test_art_det_df.head(10))
# report the real sample size instead of a hard-coded 5
print('Time taken to compile', len(test_news_table_df), 'tickers is: ', compile_end - compile_start, ' seconds')
# scale the measured time from the sample up to the full ticker list
estimated_total = (compile_end - compile_start) * len(news_table_df) / len(test_news_table_df)
minutes, seconds = divmod(estimated_total, 60)
print('Estimated time to compile all tickers is: ', minutes, 'minutes', seconds, 'seconds')

In [None]:
# test run 10 workers

# define test df
test_news_table_df = news_table_df.copy()
test_news_table_df = test_news_table_df.iloc[0:10]
print('Testing input')
print(test_news_table_df)

# ticker_to_dataframe_thr appends its per-ticker run time to this list
time_per_ticker_1 = []

compile_start = time.perf_counter()

test_art_det_df = pd.DataFrame([])

if __name__ == '__main__':
    test_news_table_list = test_news_table_df.values.tolist()
    exe = get_reusable_executor(max_workers=10, timeout=5)
    process_3 = list(exe.map(ticker_to_dataframe_thr, test_news_table_list))
    # Combine this cell's own results: the old loop iterated
    # `list_of_ticker_df`, which this cell never defines.
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    if process_3:
        test_art_det_df = pd.concat(process_3)

compile_end = time.perf_counter()

print(test_art_det_df.head(10))
# report the real sample size instead of a hard-coded 5
print('Time taken to compile', len(test_news_table_df), 'tickers is: ', compile_end - compile_start, ' seconds')
# scale the measured time from the sample up to the full ticker list
estimated_total = (compile_end - compile_start) * len(news_table_df) / len(test_news_table_df)
minutes, seconds = divmod(estimated_total, 60)
print('Estimated time to compile all tickers is: ', minutes, 'minutes', seconds, 'seconds')

In [None]:
# test run 5 workers

from concurrent.futures import ThreadPoolExecutor
import time

# define test df
test_news_table_df = news_table_df.copy()
test_news_table_df = test_news_table_df.iloc[0:10]
print('Testing input')
print(test_news_table_df)

# ticker_to_dataframe_thr appends its per-ticker run time to this list
time_per_ticker_1 = []

compile_start = time.perf_counter()

test_art_det_df = pd.DataFrame([])

if __name__ == '__main__':
    test_news_table_list = test_news_table_df.values.tolist()
    exe = get_reusable_executor(max_workers=5, timeout=5)
    process_3 = exe.map(ticker_to_dataframe_thr, test_news_table_list)
    list_of_ticker_df = list(process_3)
    # DataFrame.append was removed in pandas 2.0 and copied the frame on every
    # call; concatenate the per-ticker frames in one step instead.
    if list_of_ticker_df:
        test_art_det_df = pd.concat(list_of_ticker_df)

compile_end = time.perf_counter()

print(test_art_det_df.head(10))
# report the real sample size instead of a hard-coded 5
print('Time taken to compile', len(test_news_table_df), 'tickers is: ', compile_end - compile_start, ' seconds')
# scale the measured time from the sample up to the full ticker list
estimated_total = (compile_end - compile_start) * len(news_table_df) / len(test_news_table_df)
minutes, seconds = divmod(estimated_total, 60)
print('Estimated time to compile all tickers is: ', minutes, 'minutes', seconds, 'seconds')

In [33]:
def do_something_else(x):
    """Sleep for ``x / 2`` seconds, then print and return ``[x - 1, x, x + 1]``."""
    delay = x / 2
    time.sleep(delay)
    neighbours = [x - 1, x, x + 1]
    print(neighbours)
    return neighbours

def do_something(row):
    """Run ``do_something_else`` over ``row`` sequentially, then on a thread pool.

    Args:
        row: Iterable of numbers, each passed to ``do_something_else``.

    Returns:
        pandas.DataFrame: One row per element of ``row``, built from the lists
        returned by the threaded calls.
    """
    # Sequential pass; its return values are discarded.
    for x in row:
        do_something_else(x)

    # No `if __name__ == '__main__'` guard here: inside a function it only
    # risks leaving `df` undefined. The `with` block shuts the threads down
    # when the work is done.
    with ThreadPoolExecutor() as executor:
        # The map result can be consumed only once, so materialise it here.
        # The old follow-up loop over it never ran and has been dropped.
        df = pd.DataFrame(list(executor.map(do_something_else, row)))

    return df


# five rows of sleep factors fed to do_something through the process pool
sample = [[1, 1, 1, 1, 1], [3, 3, 3, 3, 3], [4, 4, 4, 4, 4], [1, 1, 1, 1, 1], [5, 5, 5, 5, 5]]

if __name__ == '__main__':
    exe = get_reusable_executor(max_workers=3, timeout=2)
    p = exe.map(do_something, sample)
    # materialise every per-row dataframe before reporting progress
    test_list = list(p)
    print('Here')
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    # Each frame keeps its own index, exactly as repeated appends did.
    test_output = pd.concat(test_list) if test_list else pd.DataFrame([])

print(test_output)

Here
   0  1  2
0  0  1  2
1  0  1  2
2  0  1  2
3  0  1  2
4  0  1  2
0  2  3  4
1  2  3  4
2  2  3  4
3  2  3  4
4  2  3  4
0  3  4  5
1  3  4  5
2  3  4  5
3  3  4  5
4  3  4  5
0  0  1  2
1  0  1  2
2  0  1  2
3  0  1  2
4  0  1  2
0  4  5  6
1  4  5  6
2  4  5  6
3  4  5  6
4  4  5  6
