In [2]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
from dateutil import parser as dparser
import urllib.parse
import timeit
import os

import numpy as np
import pandas as pd
import math

from util.corpus import Corpus
from util.config import config
from util.pyRanker import BM25
from util.web_query_v2 import web_query
from util.ticker import Ticker
from util.data_manager import data_manager
import ipywidgets as widgets


# Load the API keys from the project's config file.
cfig = config()

# Point the data manager at the "_data" folder under the current working directory.
datadir = os.path.join(os.getcwd(), "_data")
manager = data_manager(datadir=datadir)

# Create the corpus object that the pipeline cell fills with scraped documents.
corpus = Corpus()

# Creating data
- This notebook is an example of the data creation process used by the model.
- You can define a ticker symbol to look up and pull data for it from whichever of the supported APIs you have set up.
- You can also see and adjust the parameters used for the search.
- At the bottom of the notebook there is an example of the data_manager storing a dataset and loading it back in to graph :D

## Setting Up

The cell below contains adjustable parameters to be used by the example data creation pipeline. 

In [19]:
# Query parameters used by the data creation pipeline.

# First date of the search window.
d_start = "9/1/2021"

# Last date of the search window. Either a date string (e.g. "11/15/2021")
# or "Now".
d_end = 'Now'

# Number of urls to scrape from the results; 'max' scrapes every returned url.
desired_docs = 'max'

# Type of prime query to use. This directly affects how the documents are
# ranked by the pyRanker. Options are [name, ticker, both].
prime_query = 'ticker'

# Page size of the requests, keyed by API.
pagesizes = {
    'usearch': 50,
    'poly': 700,
    'currents': 200,
    'newsapi': 100,
}

# APIs to use when querying for the news.
apis = ['poly', 'currents', 'newsapi']

# Number of pages to try to get from these APIs.
pages = 2

In [13]:
# Build the Ticker object that the rest of the notebook queries against.

# Ticker symbol to create the ticker object on.
ticker_symbol = "CAT"

tick = Ticker(config=cfig, t=ticker_symbol)

# Show the ticker's `name` and `name_adj` attributes, one per line.
print(tick.name, tick.name_adj, sep="\n")

Caterpillar Inc
Caterpillar


## Example: Data Creation Pipeline
Running the cell below will do all of the following in sequence:

- query the selected APIs to return scraped news data
- set up the corpus with the scraped news
- set up the ranker from util.pyRanker
- build the queries to be used in ranking
- create the relevant info by subdividing the documents into sections and pruning them down into a relevant_set
- run the relevant_set through our fine-tuned sequence classifier to get sentiments
- build the datasets that can be stored by the data_manager for graphing

Please note that this can take several minutes, depending on the parameters you have specified and how many urls are returned.

In [21]:

# End-to-end example of the data creation pipeline: query the news APIs, scrape
# the returned urls, rank/prune the documents and their sub-documents, score
# sentiment, and build the dataframes stored later by the data manager.
# Relies on names defined in earlier cells: cfig, corpus, apis, pagesizes,
# pages, d_start, d_end, desired_docs, prime_query and tick.

# Time the whole pipeline so the total runtime can be reported at the end.
start = timeit.default_timer()

# Define the web_query object
print('API requests')
wq = web_query(config=cfig, apis=apis, pagesizes=pagesizes)

# web_query_v2 for ticker object handling and multi-page api requests (when they work)
wq.query_all(tickerobj=tick, d_start=d_start, d_end=d_end, threaded=True, pages=pages)

# Compile results into a single dataframe
wq.compile_results()
print("Returned Urls:", len(wq.results))

# Scrape text from the result urls to form documents
print('Scraping Urls')
wq.scrape_results(threaded=True, max_docs=desired_docs)

# Build the corpus from the web query results.
# NOTE: this reuses the `corpus` object created in the setup cell rather than
# creating a new one here.
# Store the web query dataframe in the corpus for referencing urls and titles
results = wq.get_results()
corpus.set_results(results)
# Assign the web query documents and urls as the corpus documents
corpus.set_corpus(documents=wq.documents, urls=wq.urls)

# Initiating ranker
bm25 = BM25(norm='l2', smooth_idf=True, stopwords=corpus.stopwords, sublinear_tf=True)
bm25.fit(corpus.documents)

# Building queries
corpus.build_queries(ticker=tick, prime=prime_query)
# Ranking and pruning the full documents
corpus.rank_docs(ranker=bm25)
corpus.prune_docs()

# Sub-dividing
print('Sub-Dividing Documents')
corpus.sub_divide(cutoff=2, method='sen')
print('Sub-docs:', len(corpus.sub_list))
print('Source count:', len(corpus.sub_docs))

# Ranking and pruning the sub-documents
corpus.rank_subdocs(ranker=bm25)
corpus.prune_subdocs()

# Report how many sub-documents received a score of exactly zero.
n_subdocs = len(corpus.sub_list)
print("Number of sub_docs:", n_subdocs)
n_zero_ranked = int(np.count_nonzero(np.asarray(corpus.sub_list_scores) == 0.0))
print("Number zero ranked:", n_zero_ranked)
# Guard the ratio: with no sub-documents at all the division would raise
# ZeroDivisionError and abort the cell before any data is built.
pct_zero_ranked = round((n_zero_ranked / n_subdocs) * 100, 2) if n_subdocs else 0.0
print("percent of useless subdocs:", pct_zero_ranked, "%")

# Relevant set
corpus.make_relevant()
corpus.rank_relevant(ranker=bm25)
# Shouldn't be any zeros because of the pruned subdocs, but it can't hurt
corpus.prune_relevant(method='finite', cutoff=0.0)
print('relevant sources:', len(corpus.relevant_set))
print('relevant sub_docs:', len(corpus.rel_list))

# Sentiments
print('Getting Sentiments')
corpus.get_sentiments()

# Building the datasets
print('Storing Data')
corpus.data_preprocess()
corpus.build_fulldf()
corpus.build_pricedf(ticker=tick)

stop = timeit.default_timer()
print('RunTime:', round((stop - start) / 60, 2), 'mins')

API requests
Polygon: 515
NewsAPI: 100 page: 1
error on Newsapi page: 2
Currents: 200 page: 2
Returned Urls: 746
Scraping Urls


IntProgress(value=0, max=746)

Sub-Dividing Documents


IntProgress(value=0, max=685)

Token indices sequence length is longer than the specified maximum sequence length for this model (1039 > 512). Running this sequence through the model will result in indexing errors


Sub-docs: 6005
Source count: 593
Number of sub_docs: 6005
Number zero ranked: 4973
percent of useless subdocs: 82.81 %
relevant sources: 572
relevant sub_docs: 1032
Getting Sentiments


IntProgress(value=0, max=1032)

Storing Data




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



RunTime: 7.19 mins


# Storing and Graphing

In the cells below, the data_manager is called to store the dataframes created by the corpus.

The dataframes are stored locally in the _data directory of the repo and can be loaded back by the manager for graphing.

In [9]:
# Hand the corpus dataframes to the data manager under this ticker's symbol.
# Requires `corpus.full_df` and `corpus.price_df` to exist, so the pipeline
# cell above must have been run in this kernel first (presumably they are
# populated by build_fulldf / build_pricedf -- verify against util.corpus).
manager.store_data(
    ticker=tick.ticker,
    full_df=corpus.full_df,
    price_df=corpus.price_df,
)
manager.get_ticker_list()
# Last expression of the cell, so its return value is what the notebook shows.
manager.get_data_info(ticker=tick.ticker)

NameError: name 'corpus' is not defined

In [14]:
# Read this ticker's price dataframe back from the data manager and order it
# by the 'pub_date' column.
price_df = manager.get_pricedf(ticker=tick.ticker).sort_values(by='pub_date')

In [15]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Three stacked panels sharing one x-axis:
#   row 1: price candlesticks (secondary y-axis) over traded volume bars
#   row 2: sentiment score line with reference lines at 0, +1 and -1
#   row 3: volume of news documents used
# NOTE(review): per the plotly docs `row_width` is applied bottom-to-top, so
# 1.5 would be the relative height of the top (price) panel -- confirm.
fig = make_subplots(
    rows=3, cols=1, shared_xaxes=True, vertical_spacing=0.02,
    subplot_titles=(tick.ticker, 'Sentiment_Score', 'Used News Volume'),
    row_width=[0.4, 0.6, 1.5],
    specs=[[{"secondary_y": True}], [{"secondary_y": True}], [{"secondary_y": True}]],
)

# Row 1: price on the secondary y-axis, traded volume on the primary one.
fig.add_trace(
    go.Candlestick(
        x=price_df['pub_date'],
        open=price_df['open'], high=price_df['high'],
        low=price_df['low'], close=price_df['close'],
        name="Price",
    ),
    secondary_y=True, row=1, col=1,
)
# row/col are spelled out instead of relying on plotly defaulting to the first
# subplot when only `secondary_y` is passed.
fig.add_trace(
    go.Bar(x=price_df['pub_date'], y=price_df['volume'], opacity=0.2, name='Volume'),
    secondary_y=False, row=1, col=1,
)

# Row 2: sentiment score. go.Scatter with mode='lines' replaces the deprecated
# go.Line class, which only worked by falling back to a scatter trace.
fig.add_trace(
    go.Scatter(
        x=price_df['pub_date'], y=price_df['scores'],
        mode='lines', showlegend=False, line_color='purple',
    ),
    row=2, col=1,
)

# Row 3: number of news documents used.
fig.add_trace(
    go.Bar(x=price_df['pub_date'], y=price_df['Doc_Volume'], opacity=0.2, name='Used News Volume'),
    row=3, col=1,
)

# Reference lines on the sentiment panel: zero, plus dashed lines at +1 and -1.
fig.add_hline(y=0, opacity=0.8, line_width=1, row=2, col=1)
fig.add_hline(y=1, opacity=0.5, line_dash='dash', line_color='green', row=2, col=1)
fig.add_hline(y=-1, opacity=0.5, line_dash='dash', line_color='red', row=2, col=1)

fig.update_layout(xaxis_rangeslider_visible=False, width=1000, height=750)
# Turn off the grid on yaxis2 (NOTE(review): presumably the secondary price
# axis of row 1, to avoid a second set of grid lines -- confirm).
fig.layout.yaxis2.showgrid = False
fig.show()