In [21]:
from bs4 import BeautifulSoup
from datetime import datetime
from dateutil import parser as dparser
import ipywidgets as widgets

import math
import numpy as np
import pandas as pd
import re
import requests

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.mixture import GaussianMixture
from scipy import sparse
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
import torch
from transformers import pipeline

import urllib.parse
from util.config import config
from util.pyBM25 import BM25
from util.ticker import Ticker
from util.web_query import web_query

In [22]:
# Setting up our classification model and tokenizer
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
classifier = pipeline(task='sentiment-analysis',model=model,tokenizer=tokenizer)
max_tokens = int(tokenizer.model_max_length)

In [23]:
#running through sub_docs and outputting sentiment list

def get_sentiments(docs):
    c = 0
    for x in docs.keys():
        c+=len(docs[x])
    
    pgres = widgets.IntProgress(value=0,min=0,max=c, step=1)
    display(pgres)
    
    sentiments = {}
    
    for x in docs.keys():
        scrs=[]
        for y in range(0, len(docs[x])):
            s = classifier(docs[x][y])
            scr = s[0]['score']
            if s[0]['label']=="NEGATIVE":
                scr=scr*-1
            scrs.append(scr)
            pgres.value+=1
            pgres.description=str(pgres.value)+":"+str(c)
        
        sentiments[x]=scrs
                
    return sentiments

In [24]:
def normalize(input_matrix):
    """
    Normalizes the rows of a 2d input_matrix so they sum to 1
    """

    row_sums = input_matrix.sum(axis=1)
    try:
        assert (np.count_nonzero(row_sums)==np.shape(row_sums)[0]) # no row should sum to zero
    except Exception:
        raise Exception("Error while normalizing. Row(s) sum to zero")
    new_matrix = input_matrix / row_sums[:, np.newaxis]
    return new_matrix

In [25]:
class Corpus(object):
    
    #******************************************************************************
    #----------------------------------Method---------------------------------------
    #******************************************************************************
    
    #Run the web_query to produce a collection of text documents scraped from the web
    
    #Use the set_results() function to store the full results in the corpus for processing
    
    #Use the set_corpus() function to assign the documents scraped from the web to the corpus
    
    #Sub divide the documents into smaller sub_docs
    
    #Rank the documents based on relevance to the original query as well as any tags
    
    #Prune the sub_docs to produce a relevant set
    
    
    #******************************************************************************
    #******************************************************************************
    
    def __init__(self):
        #typical corpus data
        self.documents = []
        self.vocabulary = []
        self.number_of_documents = 0
        self.vocabulary_size = 0
        
        #plsa and liklihoods
        self.likelihoods = []
        self.term_doc_matrix = None 
        self.document_topic_prob = None  # P(z | d)
        self.topic_word_prob = None  # P(w | z)
        self.topic_prob = None  # P(z | d, w)
                
        #instance fields for web results
        self.query_results=None
        self.max_tokens=512
        self.failed = []
        
        #sub dividing documents
        self.tokenizer=None
        self.sub_docs=None
        
        #relevance scores
        self.document_scores=None
        self.document_tag_scores=None
        self.subdoc_scores=None
        self.subdoc_tag_scores=None
        self.title_scores=None
                
        #pruned data
        self.relevant_set=None
        self.relevant_scores=None
    
    
    #******************************************************************************
    #------------------------------Setting Corpus----------------------------------
    #******************************************************************************
    
    def set_results(self, df):
        #dataframe returned from webquery
        self.query_results=df
    
    def set_corpus(self, documents):
        self.documents = documents
        
    def build_corpus_from_url(self, max_docs=50):
        #scrape text from url-list to build corpus
        #(not recommended, use the same method from the web_query object and the set_corpus() method)
        
        url_list = self.query_results['url'].tolist()
        url_list = url_list[0:max_docs]
        
        pgres = widgets.IntProgress(value=0,min=0,max=len(url_list), step=1)
        display(pgres)
        
        failed=[]
        headers = {"User-Agent":"Mozilla/5.0"}
        for i in range(0,len(url_list)):
            try:
                response = requests.get(url=url_list[i],headers=headers)
                if response.status_code==200:
                    soup = BeautifulSoup(response.content, 'html.parser')
                    d = soup.get_text()
                    if len(d)>200:
                        self.documents.append(d)
                else:
                    self.failed.append(i)
            except:
                self.failed.append(i)

            finally:
                pgres.value+=1
                pgres.description=str(i+1)+":"+str(len(url_list))
                
        self.number_of_documents=len(self.documents)
        #remove failed url responses from dataset
        self.query_results = self.query_results.take(list(set(range(self.query_results.shape[0]))-set(self.failed)))
        
    # Creates corpus & corresponding docs from inputted file
    def build_corpus_from_file(self, file_path):
        f = open(file_path, 'r')
        docs = f.readlines()
        for d in docs:
            self.documents.append(d)
        self.number_of_documents = len(docs)

     # Augments the classifiers vocabulary   
    def build_vocabulary(self, stopwords):
        v = set([])
        for x in self.documents:
            tmp = set(x.split())
            tmp = {x for x in tmp if x.lower() not in stopwords}
                        
            v.update(tmp)
        
        v = list(v)
        self.vocabulary = v
        self.vocabulary_size = len(v)
        
             
    
    #******************************************************************************
    #------------------------------Sub Dividing-------------------------------------
    #******************************************************************************
    
    def get_pgraphs(self, doc, cutoff, method):
        #updated get_pgraphs() with method for cutoff
        #cut off method:
        #sen: number of sentences
        #word: number of words  

        pgraphs=[]
        freshsoup = re.split('\n\n',doc)
        for x in range(0,len(freshsoup)):
            if method=='word':
                words = len(str(freshsoup[x]).strip().split(' ',maxsplit=cutoff))
                if words>cutoff:
                    pgraphs.append(freshsoup[x])
                    
            elif method=='sen':
                sens = len(re.findall("\.",str(freshsoup[x]).strip()))
                if sens>cutoff:
                    pgraphs.append(freshsoup[x])
                    
        return pgraphs
    
    def split_doc(self, doc, subs):         
        if len(re.findall(r'\.', doc))>1:
            cut_point = doc.rfind('.', 0, int(len(doc)/2))+1
        else:
            cut_point = int(len(doc)/2)

        d1 = doc[0:cut_point]
        d2 = doc[cut_point+1:]

        tkns1 = int(len(self.tokenizer(d1)['input_ids']))

        if tkns1>self.max_tokens:
            self.split_doc(d1,subs)
        else:
            if len(d1)>0:
                subs.append(d1)

        tkns2 = int(len(self.tokenizer(d2)['input_ids']))

        if tkns2>self.max_tokens:
            self.split_doc(d2, subs)
        else:
            if len(d2)>0:
                subs.append(d2)
            
    
    def get_subdocs(self, pgraphs):
        #Updated get_subdocs with iterative slicing 
        #ensure sub_docs tokens will not exceed max_tokens for sentiment model
        sub_docs=[]

        for x in range(0, len(pgraphs)):
            sen_cnt = len(re.split('\n|\. ',pgraphs[x]))
            tkns = int(len(tokenizer(pgraphs[x])['input_ids']))

            if tkns<self.max_tokens:
                sub_docs.append(pgraphs[x])
            else:
                self.split_doc(pgraphs[x],sub_docs)
        
        return sub_docs
        
    def sub_divide(self, tokenizer, cutoff=1, method='sen'):
        #creates a dictionary of sub_docs divided from each document in the corpus
        #method: using get_pgraphs() followed by get_subdocs()
        #output form: dict{ document_id : [subdoc_1, subdoc_2 ... subdoc_n] }

        subbed_data = {}
        self.tokenizer=tokenizer

        for x in range(0, len(self.documents)):

            pg = self.get_pgraphs(self.documents[x], cutoff, method)
            subs = self.get_subdocs(pg)
            subbed_data[x]=subs

        self.sub_docs = subbed_data

        
    #******************************************************************************
    #----------------------------------Relevance Scoring---------------------------
    #******************************************************************************  
    
    def rank_docs(self, query, ranker):
        self.document_scores = ranker.score(query, self.documents)
        
    def rank_doc_tags(self, tags, ranker):
        tag_scores=[]
        for t in tags:
            scores = ranker.score(t, self.documents)
            tag_scores.append(scores)
            
        self.document_tag_scores = tag_scores
        
    def rank_subdocs(self, query, ranker):
        sub_vecs={}
        for x in self.sub_docs.keys():
            sub_vec = ranker.score(query, self.sub_docs[x])
            sub_vecs[x]=sub_vec
            
        self.subdoc_scores = sub_vecs
    
    def rank_subdocs_tags(self, tags, ranker):
        tag_scores=[]
        for t in tags:
            sub_vecs={}
            for x in self.sub_docs.keys():
                sub_vec = ranker.score(t, self.sub_docs[x])
                sub_vecs[x]=sub_vec
            tag_scores.append(sub_vecs)
        
        self.subdoc_tag_scores = tag_scores
    
    def rank_titles(self, name, ranker):
        name = re.sub('(,|\.|Inc| )',"",str(name))
        titles = self.query_results['title'].tolist()
        self.title_scores = ranker.score(name, titles)
        
    def rank_ticker(self, ticker, ranker):
        #Takes a stock ticker object and runs all of the rankers above
        
        name = ticker.name
        sym = ticker.ticker
        tags = ticker.tags
        
        self.rank_docs(name,ranker)
        self.rank_doc_tags(tags, ranker)
        self.rank_subdocs(name,ranker)
        self.rank_subdocs_tags(tags,ranker)
        self.rank_titles(name,ranker)
        
    #******************************************************************************
    #----------------------------Pruning Relevant Set------------------------------
    #******************************************************************************
    
    def prune_subdocs(self, cutoff=0.4):
        subbed_data = self.sub_docs
        sub_scores = self.subdoc_scores
        for x in self.sub_docs.keys():

            subbed_data[x] = [xv if c else None for c, xv in zip(sub_scores[x]>cutoff, subbed_data[x])]
            subbed_data[x] = [y for y in subbed_data[x] if y!=None]
            sub_scores[x] = [y for y in sub_scores[x] if y>cutoff]
        
        self.relevant_set = {k: v for k, v in subbed_data.items() if len(v) > 0}
        self.relevant_scores={k: v for k, v in sub_scores.items() if len(v) > 0}
    
    #******************************************************************************
    #-------------------------------------PLSA (from MP3)--------------------------
    #******************************************************************************
    
    # Use MP3 as a reference
    def build_term_doc_matrix(self):
        m = []
        line = []
        for x in self.documents:
            doc = list(x.split())
            for itm in self.vocabulary:
                line.append(x.count(itm))
            m.append(line)
            line = []
        self.term_doc_matrix = np.array(m)
        
    def initialize_prob(self, number_of_topics):
        self.document_topic_prob = np.random.random_sample((self.number_of_documents, number_of_topics))
        self.document_topic_prob = normalize(self.document_topic_prob)

        self.topic_word_prob = np.random.random_sample((number_of_topics, len(self.vocabulary)))
        self.topic_word_prob = normalize(self.topic_word_prob)

            
    def E_step(self):
        for x in range(0,self.term_doc_matrix.shape[0]):  #loop through documents
            e = self.document_topic_prob[x].reshape(-1,1)*self.topic_word_prob
            self.topic_prob[x] = normalize(e)
           

    def M_step(self, number_of_topics):
        pz = []
        for x in range(0, self.term_doc_matrix.shape[0]):         
            m = self.topic_prob[x]*self.term_doc_matrix[x].reshape(1,-1)
            self.document_topic_prob[x] = np.sum(m,axis=1)
            pz.append(m)
        
        pz = np.array(pz)
        self.topic_word_prob = np.sum(pz,axis=0)
        
        self.document_topic_prob = normalize(self.document_topic_prob)
        self.topic_word_prob = normalize(self.topic_word_prob)
 

    def calculate_likelihood(self, number_of_topics):
        l = np.log(np.prod(np.power(np.dot(self.document_topic_prob,self.topic_word_prob),self.term_doc_matrix),axis=1))
        l = l[np.argmax(l)]
        self.likelihoods.append(l)
        

    def plsa(self, number_of_topics, max_iter, epsilon):
        self.build_term_doc_matrix()
        self.topic_prob = np.zeros([self.number_of_documents, number_of_topics, self.vocabulary_size], dtype=np.float)
        self.initialize_prob(number_of_topics)
        current_likelihood = 0.0

        for iteration in range(max_iter):
            self.E_step()
            self.M_step(number_of_topics)
            
            l = self.calculate_likelihood(number_of_topics)
            
            if current_likelihood==0 or current_likelihood==None or l>current_likelihood:
                current_likelihood = l
            else:
                break


In [26]:
#pull api keys from the config file
cfig=config()

In [27]:
#create a ticker object 
tick = Ticker(cfig, "LMT",source='yahoo')

In [28]:
wq=web_query(cfig)

In [29]:
testq=tick.name
d_start="11/1/2021"
#query all of the news apis in web_query object
wq.query_all(query=tick.name, ticker=tick.ticker, d_start=d_start)
#compile results into a singular dataframe
wq.compile_results()
#scrap text from the results urls to form documents
wq.scrape_results(threaded=True, max_docs=200)

IntProgress(value=0, max=200)

In [None]:
df = wq.get_results()

In [None]:
#build corpus from web query results
corpus=Corpus()
#store the web query data frame in the corpus for referencing urls and titles
corpus.set_results(df)
#assign corpus documents as the web query documents
corpus.set_corpus(wq.documents)

In [None]:
#pull in stop words and build corpus vocabulary 
stopwords=[]
with open('util/stopwords.txt') as f:
    stopwords.append(f.read().splitlines())
stopwords=stopwords[0]

corpus.build_vocabulary(stopwords)

print("Vocabulary size:" + str(len(corpus.vocabulary)))
print("Number of documents:" + str(len(corpus.documents)))

Vocabulary size:22570
Number of documents:162


In [None]:
#build BM25 ranker fit to the corpus vocabulary
bm25 = BM25(norm='l2', smooth_idf=True, stopwords=stopwords, sublinear_tf=True, vocabulary=corpus.vocabulary)
bm25.fit(corpus.documents)

In [None]:
#create the sub_documents, wrapper to run multiple functions
#passing in the tokenizer to save a little on class dependencies
corpus.sub_divide(tokenizer=tokenizer, cutoff=2, method='sen')
print('Sub-docs:',len(corpus.sub_docs))

Token indices sequence length is longer than the specified maximum sequence length for this model (711 > 512). Running this sequence through the model will result in indexing errors


Sub-docs: 162


In [None]:
#pass in ticker object and use the BM25 ranker to do a collection of ranking
#this is the same as running each commented function below one by one

corpus.rank_ticker(tick,bm25)

# corpus.rank_docs(tick.name, bm25)
# corpus.rank_doc_tags(tick.tags, bm25)
# corpus.rank_subdocs(tick.name, bm25)
# corpus.rank_subdocs_tags(tick.tags, bm25)
# corpus.rank_titles(tick.name,bm25)

In [None]:
#create 'relevant set' by pruning the sub_doc based on a cutoff value for the ranker score
corpus.prune_subdocs(cutoff=0.2)

In [None]:
relevant_set = corpus.relevant_set
relevant_scores = corpus.relevant_scores

In [None]:
print(len(relevant_set))
print(len(relevant_scores))

94
94


In [None]:
print(tick.name)

Lockheed Martin Corporation


In [None]:
print(relevant_scores[5])
print(relevant_set[5][5])

[0.28445012768236494]


IndexError: list index out of range

In [None]:
#run the new relevant set through distilled-BERT and get sentiment classifications
sentiments = get_sentiments(relevant_set)

IntProgress(value=0, max=143)

In [None]:
sentiments[5]

[-0.6789803504943848,
 -0.9776577949523926,
 -0.9890533089637756,
 0.566616952419281,
 0.6002805233001709,
 -0.9922448992729187]

In [None]:
lens=[]
for x in relevant_set.keys():
    for y in range(0, len(relevant_set[x])):
        lens.append(len(relevant_set[x][y]))

In [None]:
avlen=np.mean(lens)

In [None]:
# Accumulate all relevance weights
len_weight={}
for x in relevant_set.keys():
    l=[]
    for y in range(0, len(relevant_set[x])):
        l.append(len(relevant_set[x][y])/avlen)
    len_weight[x]=l

In [None]:
# Adjust relevance weight based on length
adjusted_rel = {}
for x in relevant_scores.keys():
    l=[]
    for y in range(0, len(relevant_scores[x])):
        l.append(relevant_scores[x][y] * len_weight[x][y])
    adjusted_rel[x]=l

In [None]:
# for x in relevant_set.keys():
#     print(x, "relevance:", relevant_scores[x])
#     print("  adjusted r:", adjusted_rel[x])
#     print("  sentiments:", sentiments[x])
    

In [None]:
rw_scores=[]
lrw_scores=[]
for x in relevant_scores.keys():
    for y in range(0, len(relevant_scores[x])):
        rw_scores.append(relevant_scores[x][y] * sentiments[x][y])
        lrw_scores.append(adjusted_rel[x][y] * sentiments[x][y])

In [None]:
print("Average Relevance weighted Sentiment:", np.mean(rw_scores).round(4))
print("Average Length adjusted Relevance weighted sentiment:", np.mean(lrw_scores).round(4))


Average Relevance weighted Sentiment: -0.2161
Average Length adjusted Relevance weighted sentiment: -0.2239
