In [1]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
from dateutil import parser as dparser
import urllib.parse

import numpy as np
import pandas as pd
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse

from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
import torch
from transformers import pipeline

from util.config import config
from util.pyBM25 import BM25
from util.pyRanker2 import BM25 as BM25_QE
from util.web_query import web_query
from util.ticker import Ticker
import ipywidgets as widgets

In [2]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
classifier = pipeline(task='sentiment-analysis',model=model,tokenizer=tokenizer)
max_tokens = int(tokenizer.model_max_length)

In [3]:
#running through sub_docs and outputting sentiment list

def get_sentiments(docs):
    
    c = 0
    for x in docs.keys():
        c+=len(docs[x])
    
    pgres = widgets.IntProgress(value=0,min=0,max=c, step=1)
    display(pgres)
    
    sentiments = {}
    
    for x in docs.keys():
        scrs=[]
        for y in range(0, len(docs[x])):
            
            s = classifier(docs[x][y])
            scr = s[0]['score']
            if s[0]['label']=="NEGATIVE":
                scr=scr*-1
            scrs.append(scr)
            pgres.value+=1
            pgres.description=str(pgres.value)+":"+str(c)
        
        sentiments[x]=scrs
                
    return sentiments

In [4]:
def normalize(input_matrix):
    """
    Normalizes the rows of a 2d input_matrix so they sum to 1
    """

    row_sums = input_matrix.sum(axis=1)
    try:
        assert (np.count_nonzero(row_sums)==np.shape(row_sums)[0]) # no row should sum to zero
    except Exception:
        raise Exception("Error while normalizing. Row(s) sum to zero")
    new_matrix = input_matrix / row_sums[:, np.newaxis]
    return new_matrix

In [5]:
class Corpus(object):
    
    #******************************************************************************
    #----------------------------------Method---------------------------------------
    #******************************************************************************
    
    #Run the web_query to produce a collection of text documents scraped from the web
    
    #Use the set_results() function to store the full results in the corpus for processing
    
    #Use the set_corpus() function to assign the documents scraped from the web to the corpus
    
    #Sub divide the documents into smaller sub_docs
    
    #Rank the documents based on relevance to the original query as well as any tags
    
    #Prune the sub_docs to produce a relevant set
    
    #
    
    #******************************************************************************
    #******************************************************************************
    
    def __init__(self):
        
        #typical corpus data
        self.documents = []
        self.vocabulary = []
        self.number_of_documents = 0
        self.vocabulary_size = 0
        
        #plsa and liklihoods
        self.likelihoods = []
        self.term_doc_matrix = None 
        self.document_topic_prob = None  # P(z | d)
        self.topic_word_prob = None  # P(w | z)
        self.topic_prob = None  # P(z | d, w)
                
        #for web results
        self.query_results=None
        self.max_tokens=512
        self.failed = []
        
        #sub dividing documents
        self.tokenizer=None
        self.sub_docs=None
        
        #relevance scores
        self.document_scores=None
        self.document_tag_scores=None
        self.subdoc_scores=None
        self.subdoc_tag_scores=None
        self.title_scores=None
                
        #pruned data
        self.relevant_set=None
        self.relevant_scores=None
    
    
    #******************************************************************************
    #------------------------------Setting Corpus----------------------------------
    #******************************************************************************
    
    def set_results(self, df):
        #dataframe returned from webquery
        self.query_results=df
    
    def set_corpus(self, documents):
        self.documents = documents
        
    def build_corpus_from_url(self, max_docs=50):
        
        #scrape text from url-list to build corpus
        #(not recommended, use the same method from the web_query object and the set_corpus() method)
        
        url_list = self.query_results['url'].tolist()
        url_list = url_list[0:max_docs]
        
        pgres = widgets.IntProgress(value=0,min=0,max=len(url_list), step=1)
        display(pgres)
        
        failed=[]
        headers = {"User-Agent":"Mozilla/5.0"}
        for i in range(0,len(url_list)):
            try:
                response = requests.get(url=url_list[i],headers=headers)
                if response.status_code==200:
                    soup = BeautifulSoup(response.content, 'html.parser')
                    d = soup.get_text()
                    if len(d)>200:
                        self.documents.append(d)
                else:
                    self.failed.append(i)
            except:
                self.failed.append(i)

            finally:
                pgres.value+=1
                pgres.description=str(i+1)+":"+str(len(url_list))
                
        self.number_of_documents=len(self.documents)
        #remove failed url responses from dataset
        self.query_results = self.query_results.take(list(set(range(self.query_results.shape[0]))-set(self.failed)))
        

    def build_corpus_from_file(self, file_path):

        f = open(file_path, 'r')
        docs = f.readlines()
        for d in docs:
            self.documents.append(d)
        self.number_of_documents = len(docs)

        
    def build_vocabulary(self, stopwords):

        v = set([])
        for x in self.documents:
            tmp = set(x.split())
            tmp = {x for x in tmp if x.lower() not in stopwords}
                        
            v.update(tmp)
        
        v = list(v)
        self.vocabulary = v
        self.vocabulary_size = len(v)
        
             
    
    #******************************************************************************
    #------------------------------Sub Dividing-------------------------------------
    #******************************************************************************
    
    def get_pgraphs(self, doc, cutoff, method):
        #updated get_pgraphs() with method for cutoff
        #cut off method:
        #sen: number of sentences
        #word: number of words  

        pgraphs=[]
        freshsoup = re.split('\n\n',doc)
        for x in range(0,len(freshsoup)):
            if method=='word':
                words = len(str(freshsoup[x]).strip().split(' ',maxsplit=cutoff))
                if words>cutoff:
                    pgraphs.append(freshsoup[x])
                    
            elif method=='sen':
                sens = len(re.findall("\.",str(freshsoup[x]).strip()))
                if sens>cutoff:
                    pgraphs.append(freshsoup[x])
                    
        return pgraphs
    
    def split_doc(self, doc, subs):         
            
        if len(re.findall(r'\.', doc))>1:
            cut_point = doc.rfind('.', 0, int(len(doc)/2))+1
        else:
            cut_point = int(len(doc)/2)

        d1 = doc[0:cut_point]
        d2 = doc[cut_point+1:]

        tkns1 = int(len(self.tokenizer(d1)['input_ids']))

        if tkns1>self.max_tokens:
            self.split_doc(d1,subs)
        else:
            if len(d1)>0:
                subs.append(d1)

        tkns2 = int(len(self.tokenizer(d2)['input_ids']))

        if tkns2>self.max_tokens:
            self.split_doc(d2, subs)
        else:
            if len(d2)>0:
                subs.append(d2)
            
    
    def get_subdocs(self, pgraphs):
        #Updated get_subdocs with iterative slicing 
        #ensure sub_docs tokens will not exceed max_tokens for sentiment model
        sub_docs=[]

        for x in range(0, len(pgraphs)):
            sen_cnt = len(re.split('\n|\. ',pgraphs[x]))
            tkns = int(len(tokenizer(pgraphs[x])['input_ids']))

            if tkns<self.max_tokens:
                sub_docs.append(pgraphs[x])
            else:
                self.split_doc(pgraphs[x],sub_docs)
        
        return sub_docs
        
    def sub_divide(self, tokenizer, cutoff=1, method='sen'):

        #creates a dictionary of sub_docs divided from each document in the corpus
        #method: using get_pgraphs() followed by get_subdocs()
        #output form: dict{ document_id : [subdoc_1, subdoc_2 ... subdoc_n] }

        subbed_data = {}
        self.tokenizer=tokenizer

        for x in range(0, len(self.documents)):

            pg = self.get_pgraphs(self.documents[x], cutoff, method)
            subs = self.get_subdocs(pg)
            subbed_data[x]=subs

        self.sub_docs = subbed_data

        
    #******************************************************************************
    #----------------------------------Relevance Scoring---------------------------
    #******************************************************************************  
    
    def rank_docs(self, query, ranker):
        self.document_scores = ranker.score(query, self.documents)
        
    def rank_doc_tags(self, tags, ranker):
        tag_scores=[]
        for t in tags:
            scores = ranker.score(t, self.documents)
            tag_scores.append(scores)
            
        self.document_tag_scores = tag_scores
        
    def rank_subdocs(self, query, ranker):
        sub_vecs={}
        for x in self.sub_docs.keys():
            sub_vec = ranker.score(query, self.sub_docs[x])
            sub_vecs[x]=sub_vec
            
        self.subdoc_scores = sub_vecs
    
    def rank_subdocs_tags(self, tags, ranker):
        
        tag_scores=[]
        for t in tags:
            sub_vecs={}
            for x in self.sub_docs.keys():
                sub_vec = ranker.score(t, self.sub_docs[x])
                sub_vecs[x]=sub_vec
            tag_scores.append(sub_vecs)
        
        self.subdoc_tag_scores = tag_scores
    
    def rank_titles(self, name, ranker):
        name = re.sub('(,|\.|Inc| )',"",str(name))
        titles = self.query_results['title'].tolist()
        self.title_scores = ranker.score(name, titles)
        
    def rank_ticker(self, ticker, ranker):
        
        #Takes a ticker object and runs all of the rankers above
        
        name = ticker.name
        sym = ticker.ticker
        tags = ticker.tags
        
        self.rank_docs(name,ranker)
        self.rank_doc_tags(tags, ranker)
        self.rank_subdocs(name,ranker)
        self.rank_subdocs_tags(tags,ranker)
        self.rank_titles(name,ranker)
        
    #******************************************************************************
    #----------------------------Pruning Relevant Set------------------------------
    #******************************************************************************
    
    def prune_subdocs(self, cutoff=0.4):
        subbed_data = self.sub_docs
        sub_scores = self.subdoc_scores
        for x in self.sub_docs.keys():

            subbed_data[x] = [xv if c else None for c, xv in zip(sub_scores[x]>cutoff, subbed_data[x])]
            subbed_data[x] = [y for y in subbed_data[x] if y!=None]
            sub_scores[x] = [y for y in sub_scores[x] if y>cutoff]
        
        self.relevant_set = {k: v for k, v in subbed_data.items() if len(v) > 0}
        self.relevant_scores={k: v for k, v in sub_scores.items() if len(v) > 0}
    
    #******************************************************************************
    #-------------------------------------PLSA (from MP3)--------------------------
    #******************************************************************************
    
    def build_term_doc_matrix(self):
        
        m = []
        line = []
        for x in self.documents:
            doc = list(x.split())
            for itm in self.vocabulary:
                line.append(x.count(itm))
            m.append(line)
            line = []
        self.term_doc_matrix = np.array(m)
        
    def initialize_prob(self, number_of_topics):

        self.document_topic_prob = np.random.random_sample((self.number_of_documents, number_of_topics))
        self.document_topic_prob = normalize(self.document_topic_prob)

        self.topic_word_prob = np.random.random_sample((number_of_topics, len(self.vocabulary)))
        self.topic_word_prob = normalize(self.topic_word_prob)

            
    def E_step(self):
        
        for x in range(0,self.term_doc_matrix.shape[0]):  #loop through documents
            e = self.document_topic_prob[x].reshape(-1,1)*self.topic_word_prob
            self.topic_prob[x] = normalize(e)
           

    def M_step(self, number_of_topics):
        
        pz = []
        for x in range(0, self.term_doc_matrix.shape[0]):         
            m = self.topic_prob[x]*self.term_doc_matrix[x].reshape(1,-1)
            self.document_topic_prob[x] = np.sum(m,axis=1)
            pz.append(m)

        #update
        
        pz = np.array(pz)
        self.topic_word_prob = np.sum(pz,axis=0)
        
        self.document_topic_prob = normalize(self.document_topic_prob)
        self.topic_word_prob = normalize(self.topic_word_prob)
 

    def calculate_likelihood(self, number_of_topics):

        l = np.log(np.prod(np.power(np.dot(self.document_topic_prob,self.topic_word_prob),self.term_doc_matrix),axis=1))
        l = l[np.argmax(l)]
        self.likelihoods.append(l)
        

    def plsa(self, number_of_topics, max_iter, epsilon):

        self.build_term_doc_matrix()
        self.topic_prob = np.zeros([self.number_of_documents, number_of_topics, self.vocabulary_size], dtype=np.float)
        self.initialize_prob(number_of_topics)
        current_likelihood = 0.0

        for iteration in range(max_iter):
            self.E_step()
            self.M_step(number_of_topics)
            
            l = self.calculate_likelihood(number_of_topics)
            
            if current_likelihood==0 or current_likelihood==None or l>current_likelihood:
                current_likelihood = l
            else:
                break


In [6]:
#pull api keys from the config file
cfig=config()

In [7]:
#create a ticker object 
tick = Ticker(cfig, "AAPL",source='yahoo')

In [8]:
wq=web_query(cfig)

In [9]:
testq=tick.name
d_start="11/1/2021"
#query all of the news apis in web_query object
wq.query_all(query=tick.name, ticker=tick.ticker, d_start=d_start)
#compile results into a singular dataframe
wq.compile_results()
#scrap text from the results urls to form documents
wq.scrape_results(threaded=True, max_docs=200)

IntProgress(value=0, max=200)

In [10]:
df = wq.get_results()

In [11]:
#build corpus from web query results
corpus=Corpus()
#store the web query data frame in the corpus for referencing urls and titles
corpus.set_results(df)
#assign corpus documents as the web query documents
corpus.set_corpus(wq.documents)

In [12]:
#pull in stop words and build corpus vocabulary 
stopwords=[]
with open('util/stopwords.txt') as f:
    stopwords.append(f.read().splitlines())
stopwords=stopwords[0]

corpus.build_vocabulary(stopwords)

print("Vocabulary size:" + str(len(corpus.vocabulary)))
print("Number of documents:" + str(len(corpus.documents)))

Vocabulary size:27380
Number of documents:189


In [13]:
print(tick.tags)

['Technology', 'Consumer Electronics']


In [14]:
bm25q = BM25_QE(norm='l2', smooth_idf=True, stopwords=stopwords, sublinear_tf=True, vocabulary=corpus.vocabulary)
bm25q.fit(corpus.documents)

In [368]:
#build BM25 ranker fit to the corpus vocabulary
bm25 = BM25(norm='l2', smooth_idf=True, stopwords=stopwords, sublinear_tf=True, vocabulary=corpus.vocabulary)
bm25.fit(corpus.documents)

In [110]:
testdoc = corpus.documents[1]

In [111]:
testq = tick.name

In [307]:
testqc = [[tick.name, 1]]
for q in tick.tags:
    testqc.append([q, 0.05])
print(testqc)

[['Apple Inc.', 1], ['Technology', 0.05], ['Consumer Electronics', 0.05]]


In [116]:
b=0.8
k1=1.25

In [117]:
tvec = TfidfVectorizer(norm='l2', smooth_idf=True, stop_words=stopwords, sublinear_tf=True, vocabulary=corpus.vocabulary)
cvec = CountVectorizer(stop_words=stopwords, vocabulary=corpus.vocabulary)


In [136]:
X = corpus.documents
cX = corpus.documents

In [137]:
tvec.fit(X)
cvec.fit(X)
vX = tvec.transform(X)
cX = cvec.transform(cX)

In [138]:
avdl = cX.sum(1).mean()
print(avdl)

943.3193717277487


In [139]:
dl = cX.sum(1).A1

In [268]:
# apply CountVectorizer
qw=[]
tfc=[]
Eidf=[]
scr=[]

In [308]:
q = testqc[2]
qe, = cvec.transform([q[0]])
print(qe)
assert sparse.isspmatrix_csr(qe)

  (0, 8890)	1
  (0, 17902)	1


In [309]:
tf = cX.tocsc()[:,qe.indices]
wtf = cX.tocsc()[:,qe.indices] * q[1]
print(wtf.shape)

(191, 2)


In [310]:
idf = tvec._tfidf.idf_[None, qe.indices] - 1.
print(idf)
# idf = idf[0]
# print(idf)

[[0.57536414 2.31305639]]


In [311]:
print(wtf.shape)
print(idf.shape)

(191, 2)
(1, 2)


In [312]:
T = tf.multiply(np.broadcast_to(idf, tf.shape))


In [313]:
#t1 = (1/np.sum(wtf))
t1 = (1/wtf.sum())
print(t1.shape)
t2 = wtf.multiply(np.broadcast_to(idf, wtf.shape))
print(t2.shape)
eidf = (t1*t2).sum()
print(eidf)
eidf = ((1/wtf.sum()) * wtf.multiply(np.broadcast_to(idf, wtf.shape))).sum()
print(eidf)

()
(191, 2)
0.7479009638497243
0.7479009638497243


In [370]:
testqc = [[tick.name, 20]]
for q in tick.tags:
    testqc.append([q, 0.01])
testqc.append(['investing',0.01])
testqc.append(['analysis',0.1])
print(testqc)

[['Apple Inc.', 20], ['Technology', 0.01], ['Consumer Electronics', 0.01], ['investing', 0.01], ['analysis', 0.1]]


In [371]:
tfc=[]
Eidf=[]
for q in testqc:
    print(q)
    qe, = cvec.transform([q[0]])
    tf = cX.tocsc()[:,qe.indices]
    wtf = cX.tocsc()[:,qe.indices] * q[1]
    tfc.append(tf)
    idf = tvec._tfidf.idf_[None, qe.indices] - 1.
    eidf = ((1/wtf.sum()) * wtf.multiply(np.broadcast_to(idf, wtf.shape))).sum()
    
    Eidf.append(eidf)
print(Eidf)

['Apple Inc.', 20]
['Technology', 0.01]
['Consumer Electronics', 0.01]
['investing', 0.01]
['analysis', 0.1]
[0.015748356968139143, 0.08701137698962969, 0.7479009638497242, 0.22705745063534594, 1.791759469228055]


In [372]:
scr=[]
for x in range(0, len(Eidf)):
    denom = tfc[x] + (k1 * (1 - b + b * (dl/avdl)))[:,None]
    numer = tfc[x].multiply(np.broadcast_to(Eidf[x], tfc[x].shape))
    scr.append((numer/denom).sum(1).A1)


In [None]:
tfc=[]
Eidf=[]
for q in testqc:
    qe, = cvec.transform([q[0]])
    tf = cX.tocsc()[:,qe.indices]
    wtf = cX.tocsc()[:,qe.indices] * q[1]
    tfc.append(tf)
    idf = tvec._tfidf.idf_[None, qe.indices] - 1.
    eidf = ((1/wtf.sum()) * wtf.multiply(np.broadcast_to(idf, wtf.shape))).sum()
    
    Eidf.append(eidf)

scr=[]
for x in range(0, len(Eidf)):
    denom = tfc[x] + (k1 * (1 - b + b * (dl/avdl)))[:,None]
    numer = tfc[x].multiply(np.broadcast_to(Eidf[x], tfc[x].shape))
    scr.append((numer/denom).sum(1).A1)

output_score = np.sum(scr, axis=0)

In [373]:

testscr = np.sum(scr, axis=0)
print(testscr)
print(np.argmax(testscr))
print(testscr[np.argmax(testscr)])
print(np.where(testscr==0))

[0.21961446 0.22209635 0.0917946  0.06931279 1.98550641 0.06331261
 0.0615192  0.76522471 0.90929145 0.4982985  0.4391558  0.68877352
 0.         0.0140211  0.         0.0882048  0.08564178 0.08793357
 0.0384438  2.10700883 0.0151299  0.66995838 0.71035864 1.45605629
 0.63428402 0.53256288 0.88177186 0.77661089 0.21087984 0.8188277
 1.00738495 0.         0.44369899 0.01249274 0.01248224 0.58390788
 1.12984333 0.25748005 1.50094353 0.23431583 0.76648242 0.48354665
 0.21926998 0.74614116 1.9938945  0.78068717 0.69514201 0.25008704
 0.07141475 0.72852924 0.20616076 0.71781011 0.91766218 0.09307619
 0.19512356 0.70529882 1.69586766 1.49255266 0.07621664 0.22287237
 0.22190744 0.52559617 0.82445124 0.17343774 0.15106811 0.06268624
 0.07692104 0.61741173 0.22153941 0.71276376 2.0522942  0.84984066
 0.70415523 0.43738289 0.23638181 0.05870399 1.02979744 0.98829101
 0.71149943 0.96930008 0.81009357 0.16550989 1.11626577 0.86588088
 0.68065102 1.44684752 1.67830822 0.2253744  0.75031476 0.76266

In [374]:
print(corpus.documents[np.argmax(testscr)])

 
















Profit Margins Make Difference On Earnings From Lowe’s, Target | Investing.com



















 







 
















































Breaking News













Black Friday SALE: Up to 54% off InvestingPro!
Register here














 






Quotes


All Instrument Types



All Instrument TypesIndicesEquitiesETFsFundsCommoditiesCurrenciesCryptoBondsCertificates 







Please try another search



Search website for: 









Popular News
More


 
Risk assets plunge as virus fears cause post-Thanksgiving blues



 
Oil plunges $10/bbl on new coronavirus variant concerns



 
Wall Street Opens Sharply Lower as Covid Concerns Flare Again; Dow Down 800 Pts






Popular Analysis
More


 
Zoom Stock’s 38% Plunge Makes It An Attractive Buy



 
Is The NASDAQ Composite Running On Empty?



 
3 Stocks Poised For New Highs As Fed Rate Hikes Expected Sooner Than Anticipated















More





















 

Sign In/Free Sign Up 






0

Rec

In [375]:
testsc=bm25.score(testq, corpus.documents)
print(testsc)
print(np.argmax(testsc))
print(np.where(testsc==0))
print(np.mean(testsc))
print(testsc[141])


[0.03278833 0.03464162 0.03663881 0.03534797 0.03742775 0.03670947
 0.03681745 0.03865908 0.01791634 0.03733829 0.03406842 0.03842224
 0.         0.03491197 0.         0.03931505 0.03926478 0.03947661
 0.03767367 0.01319947 0.03876723 0.03634822 0.03548198 0.03790575
 0.03766551 0.03524931 0.03945156 0.02165579 0.03858625 0.03844825
 0.03199028 0.         0.02547845 0.02898359 0.02895752 0.03281793
 0.02794258 0.02745618 0.02082897 0.01799648 0.02118542 0.03889935
 0.03369837 0.02327666 0.02137535 0.02184687 0.02038056 0.02679584
 0.03116039 0.02246439 0.03812491 0.02195865 0.03490528 0.03868128
 0.01443033 0.03697392 0.02854032 0.03831119 0.03856004 0.02670609
 0.02253517 0.02239406 0.03772622 0.03157525 0.02674308 0.03757054
 0.03804713 0.01859964 0.0156894  0.03516166 0.03010901 0.02159042
 0.03566706 0.0288699  0.03848151 0.03404253 0.03366276 0.03874201
 0.0216485  0.03557348 0.03374597 0.0390306  0.03345284 0.03566923
 0.02013829 0.03709963 0.02810668 0.03079518 0.02354914 0.0381

In [15]:
testqc = [[tick.name, 20]]
for q in tick.tags:
    testqc.append([q, 0.01])
testqc.append(['investing',0.01])
testqc.append(['analysis',0.1])
print(testqc)

[['Apple Inc.', 20], ['Technology', 0.01], ['Consumer Electronics', 0.01], ['investing', 0.01], ['analysis', 0.1]]


In [16]:
bm25q.score_expanded(testqc, corpus.documents)

AttributeError: 'super' object has no attribute 'transform'

In [102]:
#create the sub_documents, wrapper to run multiple functions
#passing in the tokenizer to save a little on class dependencies
corpus.sub_divide(tokenizer=tokenizer, cutoff=2, method='sen')
print('Sub-docs:',len(corpus.sub_docs))

Sub-docs: 158


In [103]:
#pass in ticker object and use the BM25 ranker to do a collection of ranking
#this is the same as running each commented function below one by one

corpus.rank_ticker(tick,bm25)

# corpus.rank_docs(tick.name, bm25)
# corpus.rank_doc_tags(tick.tags, bm25)
# corpus.rank_subdocs(tick.name, bm25)
# corpus.rank_subdocs_tags(tick.tags, bm25)
# corpus.rank_titles(tick.name,bm25)

In [104]:
#create 'relevant set' by pruning the sub_doc based on a cutoff value for the ranker score
corpus.prune_subdocs(cutoff=0.2)

In [105]:
relevant_set = corpus.relevant_set
relevant_scores = corpus.relevant_scores
print(len(relevant))

In [106]:
#run the new relevant set through distilled-BERT and get sentiment classifications
sentiments = get_sentiments(relevant_set)

IntProgress(value=0, max=143)

In [107]:
lens=[]
for x in relevant_set.keys():
    for y in range(0, len(relevant_set[x])):
        lens.append(len(relevant_set[x][y]))

In [108]:
avlen=np.mean(lens)

In [109]:
len_weight={}
for x in relevant_set.keys():
    l=[]
    for y in range(0, len(relevant_set[x])):
        l.append(len(relevant_set[x][y])/avlen)
    len_weight[x]=l

In [110]:
adjusted_rel = {}
for x in relevant_scores.keys():
    l=[]
    for y in range(0, len(relevant_scores[x])):
        l.append(relevant_scores[x][y] * len_weight[x][y])
    adjusted_rel[x]=l

In [111]:
# for x in relevant_set.keys():
#     print(x, "relevance:", relevant_scores[x])
#     print("  adjusted r:", adjusted_rel[x])
#     print("  sentiments:", sentiments[x])
    

In [112]:
rw_scores=[]
lrw_scores=[]
for x in relevant_scores.keys():
    for y in range(0, len(relevant_scores[x])):
        rw_scores.append(relevant_scores[x][y] * sentiments[x][y])
        lrw_scores.append(adjusted_rel[x][y] * sentiments[x][y])

In [113]:
print("Average Relevance weighted Sentiment:", np.mean(rw_scores).round(4))
print("Average Length adjusted Relevance weighted sentiment:", np.mean(lrw_scores).round(4))


Average Relevance weighted Sentiment: -0.2161
Average Length adjusted Relevance weighted sentiment: -0.2239
