In [None]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
from dateutil import parser as dparser
import urllib.parse
import timeit
import os
import numpy as np
import pandas as pd
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from scipy import sparse
from datasets import load_dataset, DatasetDict, load_metric

import torch
from transformers import AutoModelForSequenceClassification, DistilBertTokenizer
from transformers import pipeline

from util.config import config
from util.pyRanker import BM25
from util.web_query_v2 import web_query
from util.ticker import Ticker
from util.data_manager import data_manager
from util.corpus import Corpus
import ipywidgets as widgets

# Re-Running Sentiment through a different model
The cells below show how to take the data already collected in the `_data` directory, build a list of URLs from it, re-scrape those pages, and re-run the sentiment analysis.

This is useful if you would like to use the framework with a different sentiment model.

The training folder contains an example of how we fine-tuned the model that the corpus uses by default.

In [None]:
# Replacement sentiment model. Change the checkpoint names here to try a
# different model; the rebuild loop further down hands `model` and
# `classifier` to every Corpus it creates.
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased-finetuned-sst-2-english'
)
# Longest input the tokenizer reports it can handle, as a plain int.
max_tokens = int(tokenizer.model_max_length)

model = AutoModelForSequenceClassification.from_pretrained("adp12/cs410finetune1")

# Bundle model and tokenizer into a sentiment-analysis pipeline.
classifier = pipeline(
    task='sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
)

In [3]:
# Project configuration object; it is passed to Ticker and web_query in the
# rebuild loop below.
cfig = config()

In [4]:
# Point the data manager at the "_data" folder under the current working
# directory (so the notebook must be started from the project root —
# NOTE(review): confirm that is the intended launch directory).
datadir = os.path.join(os.getcwd(),"_data")
manager = data_manager(datadir=datadir)

In [12]:
# Ticker symbol -> prime_query selection.
# The value ('ticker', 'name' or 'both') is handed to corpus.build_queries as
# its `prime` argument in the rebuild loop. Insertion order is the processing
# order of that loop.
qs = {
    'AMC': 'ticker',
    'BIDU': 'name',
    'CAT': 'name',
    'CCL': 'name',
    'GE': 'ticker',
    'GM': 'both',
    'IBM': 'ticker',
    'JNJ': 'name',
    'M': 'name',
    'MCD': 'name',
    'MSFT': 'name',
    'NIO': 'name',
    'NVDA': 'name',
    'PFE': 'name',
    'PLUG': 'name',
    'QCOM': 'name',
    'SNAP': 'name',
    'TSLA': 'name',
    'VMW': 'name',
    'XOM': 'name',
}

In [13]:
# Load the previously stored full dataframe for every ticker in `qs`,
# keyed by ticker symbol.
dfs = {symbol: manager.get_fulldf(ticker=symbol) for symbol in qs}

In [14]:
# Rebuild the stored dataset for every ticker in `qs`, re-running the whole
# pipeline (scrape -> rank -> sub-divide -> sentiment -> store) with the
# replacement model defined above.
for symbol, prime_query in qs.items():
    print(symbol)

    # Fresh corpus per ticker, with the replacement model and pipeline swapped in.
    corpus = Corpus()
    corpus.model = model
    corpus.classifier = classifier
    print('New Corpus Initialized')

    tick = Ticker(config=cfig, t=symbol)
    print('Ticker Created')

    # Seed the web query with the previously stored results for this ticker,
    # then scrape from those rows.
    wq = web_query(config=cfig)
    wq.results = dfs[symbol]
    print('Scraping Urls:', len(dfs[symbol]))
    wq.scrape_results(max_docs='max')

    results = wq.get_results()
    corpus.set_results(results)
    # Assign corpus documents as the web query documents and urls
    corpus.set_corpus(documents=wq.documents, urls=wq.urls)

    # Initiating Ranker
    bm25 = BM25(norm='l2', smooth_idf=True, stopwords=corpus.stopwords, sublinear_tf=True)
    bm25.fit(corpus.documents)

    # Building Queries
    corpus.build_queries(ticker=tick, prime=prime_query)
    print('Prime_Query:', corpus.prime_q)

    # Ranking and Pruning
    corpus.rank_docs(ranker=bm25)
    corpus.prune_docs()

    # Sub-Dividing
    print('Sub-Dividing Documents')
    # NOTE(review): an earlier revision passed tokenizer=tokenizer here; the
    # call currently relies on sub_divide's default — confirm which is intended.
    corpus.sub_divide(cutoff=2, method='sen')
    print('Sub-docs:', len(corpus.sub_list))
    print('Source count:', len(corpus.sub_docs))

    # Ranking and Pruning
    corpus.rank_subdocs(ranker=bm25)
    corpus.prune_subdocs()

    # Diagnostics: how many sub-documents received a zero relevance score.
    # A separate name is used for the score array so the loop variable holding
    # the ticker symbol is never overwritten.
    sub_doc_count = len(corpus.sub_list)
    print("Number of sub_docs:", sub_doc_count)
    scores = np.array(corpus.sub_list_scores)
    zero_ranked = int(np.count_nonzero(scores == 0.0))
    print("Number zero ranked:", zero_ranked)
    # Guard the ratio: a ticker with no sub-documents must not abort the
    # whole batch with a ZeroDivisionError.
    zero_pct = round((zero_ranked / sub_doc_count) * 100, 2) if sub_doc_count else 0.0
    print("percent of useless subdocs:", zero_pct, "%")

    # Relevant Set
    corpus.make_relevant()
    corpus.rank_relevant(ranker=bm25)
    # Shouldn't be any zeros because of the pruned subdocs, but can't hurt
    corpus.prune_relevant(method='finite', cutoff=0.0)
    print('relevant sources:', len(corpus.relevant_set))
    print('relevant sub_docs:', len(corpus.rel_list))

    # Sentiments
    print('Getting Sentiments')
    corpus.get_sentiments()

    # Building Dataset
    print('Build Data')
    corpus.data_preprocess()
    corpus.build_fulldf()
    corpus.build_pricedf(ticker=tick)

    # Persist the rebuilt frames, then make the same data_manager calls the
    # original cell made after storing (their return values are not used here).
    manager.store_data(ticker=tick.ticker, full_df=corpus.full_df, price_df=corpus.price_df)
    manager.get_ticker_list()
    manager.get_data_info(ticker=tick.ticker)

AMC
New Corpus Initialized
Ticker Created
Scraping Urls: 652


IntProgress(value=0, max=652)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Prime_Query: AMC
Sub-Dividing Documents


IntProgress(value=0, max=643)

Token indices sequence length is longer than the specified maximum sequence length for this model (611 > 512). Running this sequence through the model will result in indexing errors


Sub-docs: 4365
Source count: 515
Number of sub_docs: 4365
Number zero ranked: 3375
percent of useless subdocs: 77.32 %
relevant sources: 509
relevant sub_docs: 990
Getting Sentiments


IntProgress(value=0, max=990)

Build Data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


BIDU
New Corpus Initialized
Ticker Created
Scraping Urls: 705


IntProgress(value=0, max=705)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Prime_Query: Baidu
Sub-Dividing Documents


IntProgress(value=0, max=703)

Token indices sequence length is longer than the specified maximum sequence length for this model (1489 > 512). Running this sequence through the model will result in indexing errors


Sub-docs: 3532
Source count: 307
Number of sub_docs: 3532
Number zero ranked: 2773
percent of useless subdocs: 78.51 %
relevant sources: 305
relevant sub_docs: 759
Getting Sentiments


IntProgress(value=0, max=759)

Build Data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


CAT
New Corpus Initialized
Ticker Created
Scraping Urls: 810


IntProgress(value=0, max=810)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Prime_Query: Caterpillar
Sub-Dividing Documents


IntProgress(value=0, max=798)

Token indices sequence length is longer than the specified maximum sequence length for this model (1270 > 512). Running this sequence through the model will result in indexing errors


Sub-docs: 4860
Source count: 400
Number of sub_docs: 4860
Number zero ranked: 3730
percent of useless subdocs: 76.75 %
relevant sources: 400
relevant sub_docs: 1130
Getting Sentiments


IntProgress(value=0, max=1130)

Build Data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


CCL
New Corpus Initialized
Ticker Created
Scraping Urls: 878


IntProgress(value=0, max=878)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Prime_Query: Carnival&plc
Sub-Dividing Documents


IntProgress(value=0, max=869)

Token indices sequence length is longer than the specified maximum sequence length for this model (941 > 512). Running this sequence through the model will result in indexing errors


Sub-docs: 3535
Source count: 364
Number of sub_docs: 3535
Number zero ranked: 2926
percent of useless subdocs: 82.77 %
relevant sources: 355
relevant sub_docs: 609
Getting Sentiments


IntProgress(value=0, max=609)

Build Data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


GE
New Corpus Initialized
Ticker Created
Scraping Urls: 864


IntProgress(value=0, max=864)

Prime_Query: GeneralElectric
Sub-Dividing Documents


IntProgress(value=0, max=863)

Token indices sequence length is longer than the specified maximum sequence length for this model (936 > 512). Running this sequence through the model will result in indexing errors


Sub-docs: 21
Source count: 4
Number of sub_docs: 21
Number zero ranked: 19
percent of useless subdocs: 90.48 %
relevant sources: 2
relevant sub_docs: 2
Getting Sentiments


IntProgress(value=0, max=2)

Build Data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


GM
New Corpus Initialized
Ticker Created
Scraping Urls: 1188


IntProgress(value=0, max=1188)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Prime_Query: GeneralMotors GM
Sub-Dividing Documents


IntProgress(value=0, max=1186)

Token indices sequence length is longer than the specified maximum sequence length for this model (1532 > 512). Running this sequence through the model will result in indexing errors


Sub-docs: 6737
Source count: 698
Number of sub_docs: 6737
Number zero ranked: 5525
percent of useless subdocs: 82.01 %
relevant sources: 696
relevant sub_docs: 1212
Getting Sentiments


IntProgress(value=0, max=1212)

Build Data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


IBM
New Corpus Initialized
Ticker Created
Scraping Urls: 1184


IntProgress(value=0, max=1184)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Prime_Query: IBM
Sub-Dividing Documents


IntProgress(value=0, max=1172)

Token indices sequence length is longer than the specified maximum sequence length for this model (522 > 512). Running this sequence through the model will result in indexing errors


Sub-docs: 4213
Source count: 399
Number of sub_docs: 4213
Number zero ranked: 3393
percent of useless subdocs: 80.54 %
relevant sources: 398
relevant sub_docs: 820
Getting Sentiments


IntProgress(value=0, max=820)

Build Data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


JNJ
New Corpus Initialized
Ticker Created
Scraping Urls: 1506


IntProgress(value=0, max=1506)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Prime_Query: Johnson&Johnson
Sub-Dividing Documents


IntProgress(value=0, max=1501)

Token indices sequence length is longer than the specified maximum sequence length for this model (1358 > 512). Running this sequence through the model will result in indexing errors


Sub-docs: 8318
Source count: 735
Number of sub_docs: 8318
Number zero ranked: 7219
percent of useless subdocs: 86.79 %
relevant sources: 729
relevant sub_docs: 1099
Getting Sentiments


IntProgress(value=0, max=1099)

Build Data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


M
New Corpus Initialized
Ticker Created
Scraping Urls: 1234


IntProgress(value=0, max=1234)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Prime_Query: Macy's
Sub-Dividing Documents


IntProgress(value=0, max=1230)

Token indices sequence length is longer than the specified maximum sequence length for this model (761 > 512). Running this sequence through the model will result in indexing errors


Sub-docs: 2106
Source count: 239
Number of sub_docs: 2106
Number zero ranked: 1680
percent of useless subdocs: 79.77 %
relevant sources: 235
relevant sub_docs: 426
Getting Sentiments


IntProgress(value=0, max=426)

Build Data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


MCD
New Corpus Initialized
Ticker Created
Scraping Urls: 1637


IntProgress(value=0, max=1637)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Prime_Query: McDonald's
Sub-Dividing Documents


IntProgress(value=0, max=1631)

Token indices sequence length is longer than the specified maximum sequence length for this model (2839 > 512). Running this sequence through the model will result in indexing errors


Sub-docs: 5593
Source count: 529
Number of sub_docs: 5593
Number zero ranked: 4646
percent of useless subdocs: 83.07 %
relevant sources: 525
relevant sub_docs: 947
Getting Sentiments


IntProgress(value=0, max=947)

Build Data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


MSFT
New Corpus Initialized
Ticker Created
Scraping Urls: 2836


IntProgress(value=0, max=2836)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Prime_Query: Microsoft
Sub-Dividing Documents


IntProgress(value=0, max=2824)

Token indices sequence length is longer than the specified maximum sequence length for this model (1119 > 512). Running this sequence through the model will result in indexing errors


Sub-docs: 19490
Source count: 2082
Number of sub_docs: 19490
Number zero ranked: 16456
percent of useless subdocs: 84.43 %
relevant sources: 2030
relevant sub_docs: 3034
Getting Sentiments


IntProgress(value=0, max=3034)

Build Data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


NIO
New Corpus Initialized
Ticker Created
Scraping Urls: 2354


IntProgress(value=0, max=2354)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHA

Prime_Query: NIO
Sub-Dividing Documents


IntProgress(value=0, max=2350)

Token indices sequence length is longer than the specified maximum sequence length for this model (515 > 512). Running this sequence through the model will result in indexing errors


Sub-docs: 19290
Source count: 1469
Number of sub_docs: 19290
Number zero ranked: 16615
percent of useless subdocs: 86.13 %
relevant sources: 1402
relevant sub_docs: 2675
Getting Sentiments


IntProgress(value=0, max=2675)

Build Data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


NVDA
New Corpus Initialized
Ticker Created
Scraping Urls: 1591


IntProgress(value=0, max=1591)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Prime_Query: NVIDIA
Sub-Dividing Documents


IntProgress(value=0, max=1584)

Token indices sequence length is longer than the specified maximum sequence length for this model (547 > 512). Running this sequence through the model will result in indexing errors


Sub-docs: 6216
Source count: 524
Number of sub_docs: 6216
Number zero ranked: 5609
percent of useless subdocs: 90.23 %
relevant sources: 361
relevant sub_docs: 607
Getting Sentiments


IntProgress(value=0, max=607)

Build Data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


PFE
New Corpus Initialized
Ticker Created
Scraping Urls: 2280


IntProgress(value=0, max=2280)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Prime_Query: Pfizer
Sub-Dividing Documents


IntProgress(value=0, max=2267)

Token indices sequence length is longer than the specified maximum sequence length for this model (2370 > 512). Running this sequence through the model will result in indexing errors


Sub-docs: 12209
Source count: 1254
Number of sub_docs: 12209
Number zero ranked: 9990
percent of useless subdocs: 81.82 %
relevant sources: 1122
relevant sub_docs: 2219
Getting Sentiments


IntProgress(value=0, max=2219)

Build Data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


PLUG
New Corpus Initialized
Ticker Created
Scraping Urls: 1809


IntProgress(value=0, max=1809)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Prime_Query: PlugPower
Sub-Dividing Documents


IntProgress(value=0, max=1806)

Token indices sequence length is longer than the specified maximum sequence length for this model (1138 > 512). Running this sequence through the model will result in indexing errors


Sub-docs: 62
Source count: 13
Number of sub_docs: 62
Number zero ranked: 48
percent of useless subdocs: 77.42 %
relevant sources: 13
relevant sub_docs: 14
Getting Sentiments


IntProgress(value=0, max=14)

Build Data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


QCOM
New Corpus Initialized
Ticker Created
Scraping Urls: 1856


IntProgress(value=0, max=1856)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Prime_Query: QUALMMorporated
Very low relevance scores, attempting requery
Sub-Dividing Documents


IntProgress(value=0, max=1854)

Token indices sequence length is longer than the specified maximum sequence length for this model (944 > 512). Running this sequence through the model will result in indexing errors
IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



relevant sources: 465
relevant sub_docs: 629
Getting Sentiments


IntProgress(value=0, max=629)

Build Data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


SNAP
New Corpus Initialized
Ticker Created
Scraping Urls: 2289


IntProgress(value=0, max=2289)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Prime_Query: Snap
Sub-Dividing Documents


IntProgress(value=0, max=2285)

Token indices sequence length is longer than the specified maximum sequence length for this model (1354 > 512). Running this sequence through the model will result in indexing errors


Sub-docs: 8504
Source count: 816
Number of sub_docs: 8504
Number zero ranked: 7366
percent of useless subdocs: 86.62 %
relevant sources: 673
relevant sub_docs: 1138
Getting Sentiments


IntProgress(value=0, max=1138)

Build Data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


TSLA
New Corpus Initialized
Ticker Created
Scraping Urls: 979


IntProgress(value=0, max=979)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Prime_Query: Tesla
Very low relevance scores, attempting requery
Sub-Dividing Documents


IntProgress(value=0, max=969)

Token indices sequence length is longer than the specified maximum sequence length for this model (799 > 512). Running this sequence through the model will result in indexing errors


Sub-docs: 8684
Source count: 903
Number of sub_docs: 8684
Number zero ranked: 6974
percent of useless subdocs: 80.31 %
relevant sources: 859
relevant sub_docs: 1710
Getting Sentiments


IntProgress(value=0, max=1710)

Build Data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


VMW
New Corpus Initialized
Ticker Created
Scraping Urls: 68


IntProgress(value=0, max=68)

Prime_Query: VMware
Sub-Dividing Documents


IntProgress(value=0, max=67)

Token indices sequence length is longer than the specified maximum sequence length for this model (1683 > 512). Running this sequence through the model will result in indexing errors


Sub-docs: 647
Source count: 66
Number of sub_docs: 647
Number zero ranked: 462
percent of useless subdocs: 71.41 %
relevant sources: 66
relevant sub_docs: 185
Getting Sentiments


IntProgress(value=0, max=185)

Build Data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


XOM
New Corpus Initialized
Ticker Created
Scraping Urls: 260


IntProgress(value=0, max=260)

Prime_Query: ExxonMobil
Sub-Dividing Documents


IntProgress(value=0, max=259)

Token indices sequence length is longer than the specified maximum sequence length for this model (1867 > 512). Running this sequence through the model will result in indexing errors


Sub-docs: 3133
Source count: 248
Number of sub_docs: 3133
Number zero ranked: 2678
percent of useless subdocs: 85.48 %
relevant sources: 248
relevant sub_docs: 455
Getting Sentiments


IntProgress(value=0, max=455)

Build Data


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
