In [1]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
from dateutil import parser as dparser
import urllib.parse

import numpy as np
import pandas as pd
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.mixture import GaussianMixture
from scipy import sparse

from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
import torch
from transformers import pipeline

from util.config import config
from util.pyBM25 import BM25
import ipywidgets as widgets

In [296]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
classifier = pipeline(task='sentiment-analysis',model=model,tokenizer=tokenizer)
max_tokens = int(tokenizer.model_max_length)

In [232]:
config=config()

In [235]:
class Ticker(object):
    
    def __init__(self,config, t):
        
        #Get general information about company from ticker symbol via api
        #name, ticker, industry, sector, tags
        
        t = t.upper()
        
        #Try to get ticker information from Polygon.io
        url = ('https://api.polygon.io/v1/meta/symbols/'+t+'/company?apikey='+config.polygon)
        response = requests.get(url)
        
        if response.status_code==200:
            response = response.json()
            self.name = response['name']
            self.tags = response['tags']
            self.industry = response['industry']
            self.sector = response['sector']
            self.ticker = response['symbol']
        else:
            #Try to get ticker information from yahoofinance
            print("Bad reponse from Polygon.io: Status_code:"+str(response.status_code))
            print("Trying Yahoo Api")

            yheaders = {'x-api-key':config.yahoo}
            url = ('https://yfapi.net/v11/finance/quoteSummary/'+t+'?lang=en&region=US&modules=assetProfile')
            url2 = ('https://yfapi.net/v6/finance/quote?symbols='+t)
            response1 = requests.request("GET",url,headers=yheaders).json()
            response2 = requests.request("GET",url2,headers=yheaders).json()
            
            self.sector = response1['quoteSummary'].get('result')[0].get('assetProfile').get('sector')
            self.industry = response1['quoteSummary'].get('result')[0].get('assetProfile').get('industry')
            self.name = response2['quoteResponse'].get('result')[0].get('longName')
            self.ticker = response2['quoteResponse'].get('result')[0].get('symbol')
            t1 = re.split('\n|\.|&|,|and',self.sector)
            t2 = re.split('\n|\.|&|,|and',self.industry)
            tags=[]
            for i in t1:
                tags.append(i.strip())
            for i in t2:
                tags.append(i.strip())
            self.tags=tags
        

In [31]:
def query_Usearch(config, query, d_start='Now', d_end='Now', page=1, pageSize=50):
    
    query = urllib.parse.quote_plus(query)
    #Valid format : Date format should be YYYY-MM-ddTHH:mm:ss.ss±hh:mm
    if d_end=='Now':
        d_end=datetime.now()
    else:
        d_end = dparser.parse(d_end)
    d_end_str = d_end.strftime("%Y-%m-%d")
        
    if d_start=='Now':
        d_start=datetime.now()
    else:
        d_start = dparser.parse(d_start)
    d_start_str = d_start.strftime("%Y-%m-%d")
    
    url = "https://contextualwebsearch-websearch-v1.p.rapidapi.com/api/search/NewsSearchAPI"
    querystring = {"q":query,
                   "pageNumber":"1",
                   "pageSize":"150",
                   "autoCorrect":"false",
                   "fromPublishedDate":d_start,
                   "toPublishedDate":d_end}

    headers = {
        'x-rapidapi-host': str(config.usearch_host),
        'x-rapidapi-key': str(config.usearch_key)
        }

    response = requests.request("GET", url, headers=headers, params=querystring)
    
    #Check that reponse is valid
    if response.status_code==200:
        response = response.json()
        
        #Loop through response to create return Dataframe: columns: Titles, Urls, Publication Dates
        datalist=[]
        for x in response['value']:
            row = {'title':str(x['title']), 'url':x['url'], 'pub_date':x['datePublished']}
            datalist.append(row)
        df = pd.DataFrame.from_dict(datalist)
        return df
    else:
        return 0      

In [5]:
def query_currents(config, query, d_start='Now', d_end='Now', page=1, pageSize=200):
    
    #6-Month archive
    
    query = urllib.parse.quote_plus(query)
    
    #Valid format : Date format should be YYYY-MM-ddTHH:mm:ss.ss±hh:mm
    if d_end=='Now':
        d_end=datetime.now()
    else:
        d_end = dparser.parse(d_end)
    d_end_str = d_end.strftime("%Y-%m-%d")
        
    if d_start=='Now':
        d_start=datetime.now()
    else:
        d_start = dparser.parse(d_start)
    d_start_str = d_start.strftime("%Y-%m-%d")

    
    url = ('https://api.currentsapi.services/v1/search?'
           '&start_date='+d_start_str+
           '&end_date='+d_end_str+
           '&keywords='+query+
           '&language=en'
           '&country=us'
           '&page_number='+str(page)+
           '&page_size='+str(pageSize)+
           '&apiKey='+str(config.currents))
    
    #Check that reponse is valid
    response = requests.get(url)
    if response.status_code==200:
        response = response.json()
        
        #Loop through response to create return Dataframe: columns: Titles, Urls, Publication Dates
        datalist=[]
        for x in response['news']:
            row = {'title':str(x['title']), 'url':x['url'], 'pub_date':x['published']}
            datalist.append(row)
        df = pd.DataFrame.from_dict(datalist)
        return df
    else:
        return 0      

In [14]:
def query_polygon(config, ticker, d_start='Now', d_end='Now', pageSize=200):
    
    ticker = ticker.upper()
    
    #Valid format : Date format should be YYYY-MM-ddTHH:mm:ss.ss±hh:mm
    if d_end=='Now':
        d_end=datetime.now()
    else:
        d_end = dparser.parse(d_end)
    d_end_str = d_end.strftime("%Y-%m-%d")
        
    if d_start=='Now':
        d_start=datetime.now()
    else:
        d_start = dparser.parse(d_start)
    d_start_str = d_start.strftime("%Y-%m-%d")
    
    
    url = ('https://api.polygon.io/v2/reference/news?'
      'ticker='+ticker+
      '&published_utc/gte='+d_start_str+
      '&published_utc/lte='+d_end_str+
      '&limit='+str(pageSize)+
      '&sort=published_utc'
      '&apikey='+str(config.polygon))
    
    #Check that reponse is valid
    response = requests.get(url)
    if response.status_code==200:
        response = response.json()
        
        #Loop through response to create return Dataframe: columns: Titles, Urls, Publication Dates
        datalist=[]
        for x in response['results']:
            row = {'title':str(x['title']), 'url':x['article_url'], 'pub_date':x['published_utc']}
            datalist.append(row)
        df = pd.DataFrame.from_dict(datalist)
        return df
    else:
        return 0     
        

In [22]:
def query_newsapi(config, query, d_start='Now', d_end='Now', domains="", exclude="", page=1, pageSize=100):
    
    #1 Month archive
    
    query = urllib.parse.quote_plus(query)
    
    #Valid format : Date format should be YYYY-MM-ddTHH:mm:ss.ss±hh:mm
    if d_end=='Now':
        d_end=datetime.now()
    else:
        d_end = dparser.parse(d_end)
    d_end_str = d_end.strftime("%Y-%m-%d")
        
    if d_start=='Now':
        d_start=datetime.now()
    else:
        d_start = dparser.parse(d_start)
    d_start_str = d_start.strftime("%Y-%m-%d")
    
    url = ('https://newsapi.org/v2/everything?'
      'q='+query+
      '&domains='+domains+
      '&excludeDomains='+exclude+
      '&from='+d_start_str+
      '&to='+d_end_str+
      '&language=en'
      '&sortBy=publishedAt'
      '&pageSize='+str(pageSize)+
      '&page='+str(page)+  
      '&apikey='+str(config.newsapi))
    
    #Check that reponse is valid
    response = requests.get(url)
    if response.status_code==200:
        response = response.json()
        
        #Loop through response to create return Dataframe: columns: Titles, Urls, Publication Dates    
        datalist=[]
        for x in response['articles']:
            row = {'title':str(x['title']), 'url':x['url'], 'pub_date':x['publishedAt']}
            datalist.append(row)
        df = pd.DataFrame.from_dict(datalist)
        return df
    else:
        return 0    
    

In [251]:
tick = Ticker(config,'PLUG')
print(tick.name)

Plug Power Inc.


In [18]:
#Test Query
testq="PLUG"
d_start="11/1/2021"

In [9]:
#Running query through Usearch API
usearch_df = query_Usearch(config=config, query=testq, d_start=d_start)

In [11]:
#Running query through Currents API
currents_df = query_currents(config=config, query=testq, d_start=d_start)

In [16]:
#Running query through Polygon.io
polygon_df = query_polygon(config=config, ticker=testq, d_start=d_start)

In [23]:
#Running query through (google)NewsAPI
newsapi_df = query_newsapi(config=config, query=testq, d_start=d_start)

In [208]:
#combine results into singular dataframe
#Remove duplicate urls and article titles
frames = [usearch_df, currents_df, polygon_df, newsapi_df]
total_df = pd.concat(frames)
total_df = total_df.drop_duplicates(subset=['url'])
total_df = total_df.drop_duplicates(subset=['title'])

In [209]:
total_df

Unnamed: 0,title,url,pub_date
0,Floundering smart meter rollout bids to plug i...,https://www.telegraph.co.uk/business/2021/11/1...,2021-11-13T12:00:00
1,"Rover looks at EV rates for charging, gas tax ...",https://www.cadillacnews.com/news/rover-looks-...,2021-11-13T07:00:00
2,Plug in,https://www.wenatcheeworld.com/news/plug-in/ar...,2021-11-12T18:11:00
3,Amazon Smart Plug review: Everything should be...,https://www.androidcentral.com/amazon-smart-pl...,2021-11-12T16:00:00
4,France: Plug-In Market Share Reaches 22.9% In ...,https://insideevs.com/news/547155/france-plugi...,2021-11-12T13:30:16
...,...,...,...
94,Officials worry China’s men’s hockey team not ...,https://www.denverpost.com/2021/11/12/beijing-...,2021-11-13T00:43:43Z
95,Magazine Luiza SA (MGLUY) CEO Frederico Trajan...,https://seekingalpha.com/article/4469095-magaz...,2021-11-13T00:41:05Z
96,PowerSchool Holdings Inc. (PWSC) CEO Hardeep G...,https://seekingalpha.com/article/4469092-power...,2021-11-13T00:34:01Z
97,CLEC Fashion Festival Reimagines Fashion As An...,https://www.forbes.com/sites/stephanrabimov/20...,2021-11-13T00:18:58Z


In [321]:
#Can take a little while, added loading bar

def build_corpus_from_url(url_list):
    pgres = widgets.IntProgress(value=0,min=0,max=len(url_list), step=1)
    display(pgres)
    corpus = []
    failed=[]
    headers = {"User-Agent":"Mozilla/5.0"}
    for i in range(0,len(url_list)):
        try:
            response = requests.get(url=url_list[i],headers=headers)
            if response.status_code==200:
                soup = BeautifulSoup(response.content, 'html.parser')
                corpus.append(soup.get_text())
            else:
                #print("failed:",i,url_list[i])
                failed.append(i)
        except:
            #print("failed:",i,url_list[i])
            failed.append(i)
            
        finally:
            pgres.value+=1
            pgres.description=str(i+1)+":"+str(len(url_list))
    return [corpus,failed]

In [322]:
test_cutoff = 100

In [323]:
urls=total_df['url'].tolist()

In [324]:
corpus = build_corpus_from_url(urls[0:test_cutoff])

IntProgress(value=0)

In [254]:
#creating test dataframe of from full dataframe
test_df=total_df[0:test_cutoff]

#removing the failed url requests 
#->returned from build_corpus_from_url()
test_df = test_df.take(list(set(range(test_df.shape[0]))-set(corpus[1])))


In [255]:
print(len(corpus[0]))
print(len(test_df))

92
92


In [20]:
stopwords=[]
with open('util/stopwords.txt') as f:
    stopwords.append(f.read().splitlines())
stopwords=stopwords[0]

In [30]:
test_str="This is a Test string for the purpose of removing stopwords"

In [31]:
tmp = set(test_str.split())
print(tmp)

{'stopwords', 'for', 'is', 'purpose', 'This', 'removing', 'the', 'Test', 'string', 'of', 'a'}


In [32]:
t = {x for x in tmp if x.lower() not in stopwords}
print(t)

{'stopwords', 'purpose', 'string', 'removing', 'Test'}


In [257]:
#Fitting BM25 ranker to full Corpus
#Getting preliminary BM25 scores for full web scrape
q = tick.name
bm25 = BM25(stopwords=stopwords[0])
bm25.fit(corpus[0])
result_vec = bm25.transform(q, corpus[0])
print(result_vec)



[0.98547195 0.34162697 0.29910872 1.43847351 0.59897335 1.84991387
 0.51436637 1.22435046 1.59984558 0.3533325  0.53195264 1.32095828
 0.33871382 1.52009905 0.95021817 0.50973101 0.         0.28548806
 0.         1.41492307 0.58077216 2.02190232 0.55133982 1.56727157
 0.59561628 1.16509876 0.42234909 1.4997267  1.50403793 1.79830238
 1.79917616 1.81698373 0.5134271  0.57604022 0.36593254 0.54200275
 1.92171497 1.33591207 0.583196   1.65621447 2.0413675  1.08885524
 2.003347   0.         0.29686605 0.3200048  0.48163607 0.37196921
 1.67166616 0.20524894 0.9029974  1.12995467 0.52258901 0.
 1.80096127 0.         1.92329135 1.14053577 0.40521273 0.
 1.16515589 0.23793685 1.90204236 0.         0.         0.3985973
 0.         0.         1.86684439 0.         2.04855942 0.75992987
 0.         1.90942036 1.73114042 1.01668499 1.97837085 1.15816666
 0.         0.         2.02616796 0.69472286 0.34276685 1.99608785
 0.50059476 0.         1.21981665 1.00901508 1.29109599 1.49460143
 1.18593543 

In [258]:
test_df['url_bm25']=result_vec

In [259]:
#loop through tags (retrieved or created by Ticker class) associated with ticker and add tag-bm25 score to df
#Add results into dataframe

for x in range(0,len(tick.tags)):
    q = tick.tags[x]
    result_vec = bm25.transform(q, corpus[0])
    col_str = "tag"+str(x)+"_bm25"
    test_df[col_str]=result_vec

In [260]:
#Dropping instances where the whole document's bm25 score was 0
test_df = test_df[test_df['url_bm25']>0]
test_df

Unnamed: 0,title,url,pub_date,url_bm25,tag0_bm25,tag1_bm25,tag2_bm25
0,Floundering smart meter rollout bids to plug i...,https://www.telegraph.co.uk/business/2021/11/1...,2021-11-13T12:00:00,0.985472,0.000000,0.00000,0.000000
1,"Rover looks at EV rates for charging, gas tax ...",https://www.cadillacnews.com/news/rover-looks-...,2021-11-13T07:00:00,0.341627,0.000000,0.00000,2.212997
2,Plug in,https://www.wenatcheeworld.com/news/plug-in/ar...,2021-11-12T18:11:00,0.299109,0.000000,0.00000,0.000000
3,Amazon Smart Plug review: Everything should be...,https://www.androidcentral.com/amazon-smart-pl...,2021-11-12T16:00:00,1.438474,0.000000,0.00000,5.220796
4,France: Plug-In Market Share Reaches 22.9% In ...,https://insideevs.com/news/547155/france-plugi...,2021-11-12T13:30:16,0.598973,0.000000,0.00000,0.000000
...,...,...,...,...,...,...,...
57,Save Foods Inc. Announces Expansion of Commerc...,https://www.benzinga.com/news/21/11/23988067/s...,2021-11-09 16:43:49 +0000,1.009015,0.000000,0.00000,0.000000
60,Karcher EWM 2,https://www.trustedreviews.com/reviews/karcher...,2021-11-09 13:25:52 +0000,1.291096,1.521352,0.00000,2.150439
61,SilverStone HELA 2050 Power Supply is Hella Po...,https://www.techpowerup.com/288795/silverstone...,2021-11-09 07:09:02 +0000,1.494601,0.761077,0.00000,5.745432
62,(PR) KIOXIA XD6 EDSFF E1.S Form-Factor Enterpr...,https://www.techpowerup.com/288789/kioxia-xd6-...,2021-11-09 04:09:21 +0000,1.185935,1.638326,2.71468,6.313200


In [305]:
#updated get_pgraphs() with method for cutoff

def get_pgraphs(doc, cutoff=7, method='word'):
    
    #cut off method:
    #sen: number of sentences
    #word: number of words  
    
    pgraphs=[]
    freshsoup = re.split('\n\n',doc)
    for x in range(0,len(freshsoup)):
        if method=='word':
            words = len(str(freshsoup[x]).strip().split(' ',maxsplit=cutoff))
            if words>cutoff:
                pgraphs.append(freshsoup[x])
        elif method=='sen':
            sens = len(re.findall("\.",str(freshsoup[x]).strip()))
            if sens>cutoff:
                pgraphs.append(freshsoup[x])
    
    return pgraphs

In [312]:
#testing get_prgraphs
doc = corpus[0][0]
pgs = get_pgraphs(doc, cutoff=1, method='sen')
print("Number of parsed sections:",len(pgs))
print(pgs[0])

In [264]:
#Updated get_subdocs to ensure sub_docs tokens will not exceed max

def get_subdocs(pgraphs, max_tokens):
    sub_docs=[]

    for x in range(0, len(pgraphs)):
        sen_cnt = len(re.split('\n|\. ',pgraphs[x]))
        tkns = int(len(tokenizer(pgraphs[x])['input_ids']))
        
        if tkns>=max_tokens:
            
            pg = pgraphs[x]
            slices=0
            
            while True:
                #cut in half, count tokens
                slices+=1
                cut_point = pg.rfind(".",0,int(len(pg)/2))+1
                cut_tkns = int(len(tokenizer(pg[0:cut_point])['input_ids']))    
            
                if cut_tkns<max_tokens:
                    break
                else:
                    #trim pg and recut, counting slices
                    pg = pg[0:cut_point]
                    
            #loop through pgraph[x] using multiples of cutpoint to slice
            #append subdoc at each slice
            for i in range(0, (slices*2)):
                pg = pgraphs[x][(cut_point*(i)):(cut_point*(i+1))]
                sub_docs.append(pg)
                
        else:
            sub_docs.append(pgraphs[x])
        

    
    return sub_docs

In [313]:
#testing get_subdocs
subs = get_subdocs(pgs, max_tokens)
len(subs)

In [325]:
#running through sub_docs and outputting sentiment list

def get_sentiments(sub_docs):
    sentiments = []
    for x in range(0,len(sub_docs)):
        #print(len(tokenizer(sub_docs[x])['input_ids']))
        s = classifier(sub_docs[x])
        scr = s[0]['score']
        if s[0]['label']=="NEGATIVE":
            scr=scr*-1
        sentiments.append(scr)
    return sentiments

In [326]:
sents = get_sentiments(subs)
print(sents)

[0.8531283736228943, -0.9415674209594727, -0.9753469228744507, 0.631061851978302]
