In [9]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
from dateutil import parser as dparser
import urllib.parse
import timeit
import os

import numpy as np
import pandas as pd
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from util.corpus import Corpus
from util.config import config
from util.pyRanker import BM25
from util.web_query_v2 import web_query
from util.ticker import Ticker
from util.data_manager import data_manager
import ipywidgets as widgets

import dash
from dash import html
from dash import dcc
import plotly.graph_objects as go
import plotly.express as px

In [2]:
#pull api keys from the config file
cfig=config()
#setting data manager
datadir = os.path.join(os.getcwd(),"_data")
manager = data_manager(datadir=datadir)
corpus=Corpus()

In [13]:
#initiate ticker
tick = Ticker(config=cfig, t="GE", source='alpha')

In [14]:
d_start="9/1/2021"
desired_docs='max'
prime_query='ticker'

In [15]:

start = timeit.default_timer()


#Define Web_query object
wq=web_query(cfig)

#web_query_v2 for ticker object handling and multi-page api requests (when they work)
wq.query_all(tickerobj=tick, d_start=d_start, threaded=True, pages=1)

#compile results into a singular dataframe
wq.compile_results()
print("Returned Urls:",len(wq.results))

#scrap text from the results urls to form documents
print('Scraping Urls')
wq.scrape_results(threaded=True, max_docs=desired_docs)


#build corpus from web query results
#corpus=Corpus()
#store the web query data frame in the corpus for referencing urls and titles
corpus.set_results(wq.get_results())
#assign corpus documents as the web query documents and urls
corpus.set_corpus(documents = wq.documents, urls = wq.urls)

#Initiating Ranker
bm25 = BM25(norm='l2', smooth_idf=True, stopwords=corpus.stopwords, sublinear_tf=True)
bm25.fit(corpus.documents)

#Building Queries
corpus.build_queries(ticker=tick, prime=prime_query)
#Ranking and Pruning
corpus.rank_docs(ranker=bm25)
corpus.prune_docs()

#Sub-Dividing
print('Sub-Dividing Documents')
#corpus.sub_divide(tokenizer=tokenizer, cutoff=2, method='sen')
corpus.sub_divide(cutoff=2, method='sen')
print('Sub-docs:',len(corpus.sub_list))
print('Source count:', len(corpus.sub_docs))

#Ranking and Pruning
corpus.rank_subdocs(ranker=bm25)
corpus.prune_subdocs()

print("Number of sub_docs:",len(corpus.sub_list))
t = np.array(corpus.sub_list_scores)
z=np.where(t==0.0)
print("Number zero ranked:", len(z[0]))
print("percent of useless subdocs:", round((len(z[0])/len(corpus.sub_list))*100,2),"%")

#Relevant Set
corpus.make_relevant()
corpus.rank_relevant(ranker=bm25)
#shouldn't be any zeros because of the pruned subdocs, but cant hurt
corpus.prune_relevant(method='finite',cutoff=0.0)
print('relevant sources:',len(corpus.relevant_set))
print('relevant sub_docs:',len(corpus.rel_list))

#Sentiments
print('Getting Sentiments')
corpus.get_sentiments()

#Building and Storing Dataset
print('Storing Data')
corpus.data_preprocess()
corpus.build_fulldf()
corpus.build_pricedf(ticker=tick)
manager.store_data(ticker=tick.ticker, full_df=corpus.full_df, price_df=corpus.price_df)

stop = timeit.default_timer()
print('RunTime:',round((stop-start)/60,2),'mins')

Polygon: 375
NewsAPI: 4 page: 1
Usearch: 50 page: 1
Currents: 200 page: 1
Returned Urls: 605
Scraping Urls


IntProgress(value=0, max=605)

Sub-Dividing Documents


IntProgress(value=0, max=482)

Sub-docs: 3059
Source count: 351
Number of sub_docs: 3059
Number zero ranked: 2325
percent of useless subdocs: 76.01 %
relevant sources: 328
relevant sub_docs: 734
Getting Sentiments


IntProgress(value=0, max=734)

Storing Data




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



RunTime: 3.85 mins


In [16]:
manager.get_data_info(ticker=tickerlist[0])

{'full_df': {'mindate': Timestamp('2018-01-06 00:00:00'),
  'maxdate': Timestamp('2021-12-05 00:00:00'),
  'rows': 328},
 'price_df': {'mindate': Timestamp('2021-06-04 00:00:00'),
  'maxdate': Timestamp('2021-11-30 00:00:00'),
  'rows': 92}}

In [17]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

price_df = corpus.price_df

fig = make_subplots(rows=3, cols=1, shared_xaxes=True,vertical_spacing=0.02, subplot_titles=(tick.ticker,'Sentiment_Score', 'Used News Volume'), 
               row_width=[0.4, 0.4, 1.5], specs=[[{"secondary_y": True}],[{"secondary_y": True}],[{"secondary_y": True}]])
fig.add_trace(go.Candlestick(x=price_df['pub_date'],
                open=price_df['open'], high=price_df['high'],
                low=price_df['low'], close=price_df['close'], name="Price"),
               secondary_y=True, row=1,col=1)

fig.add_trace(go.Bar(x=price_df['pub_date'], y=price_df['volume'],opacity=0.2, name='Volume'),secondary_y=False)
fig.add_trace(go.Line(x=price_df['pub_date'], y=price_df['scores'], showlegend=False), row=2, col=1)
#fig.add_trace(go.Line(x=price_df['pub_date'], y=price_df['sentiments'], showlegend=False), row=2, col=1)
#fig.add_trace(go.Line(x=price_df['pub_date'], y=price_df['relevance'], showlegend=False), row=2, col=1)
fig.add_trace(go.Bar(x=price_df['pub_date'],y=price_df['Doc_Volume'],opacity=0.2, name='Used News Volume'),row=3, col=1)
fig.update_layout(xaxis_rangeslider_visible=False, width=1000, height=750)
fig.layout.yaxis2.showgrid=False
fig.show()


plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.


