In [272]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pdfplumber
import re
# Some functions built by the user
import myfunction as mf
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import copy
import scipy as sp
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Patrick_Lin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [203]:
def read_pdf(ticker):
    text_all =''
    with pdfplumber.open(ticker) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            text_all += text
        #'text_'+str(ticker)=text_all
    return text_all

In [204]:
pdf_list = ['AHT_Q1.pdf','AZN_Q1.pdf','BARC_Q1.pdf','BP_Q1.pdf','BTA_Q1.pdf','GSK_Q1.pdf','HSBA_Q1.pdf','IAG_Q1.pdf','LLOY_Q1.pdf','NWG_Q1.pdf','RDSA_Q1.pdf','RRS_Q1.pdf','SHP_Q1.pdf','SKY_Q1.pdf','SN_Q1.pdf','TUI_Q1.pdf']

In [205]:
df_list = []

for i in range(len(pdf_list)):
    pdf = read_pdf(pdf_list[i])
    df_list.append(pdf)

df = pd.DataFrame(df_list,columns=['document'])

In [206]:
pdf_name = []
for i in range(len(pdf_list)):
    a = pdf_list[i].replace('_Q1.pdf','')
    pdf_name.append(a)
    
df.index = pdf_name

In [207]:
# Clean text from punctuation and convert to lower case
documents = df['document']
for i in range(len(documents)):
    # convert to lower case
    documents[i] = documents[i].lower()
    # take out non words
    documents[i] = re.sub(r'\W',' ',documents[i])
    # drop multiple spaces
    documents[i] = re.sub(r'\s+',' ',documents[i])
    # drop \n
    documents[i] = re.sub(r'\!|\?|\n',' ',documents[i])

In [208]:
df

Unnamed: 0,document
AHT,company name ashtead market cap 8781 38027408 ...
AZN,company name astrazeneca market cap 66384 3788...
BARC,company name barclays market cap 35913 2007486...
BP,company name bp market cap 109281 133456 bloom...
BTA,company name bt market cap 30783 0234251 bloom...
GSK,company name glaxo market cap 70030 4659254 bl...
HSBA,company name hsbc market cap 140 569 05 bloomb...
IAG,company name iag market cap 13893 0873112 bloo...
LLOY,company name lloyds market cap 46801 5085506 b...
NWG,company name natwest market cap 32276 2546622 ...


In [209]:
# Remove stopwords (words that do not add any value) and empty
stop_words = set(stopwords.words('english')).union([""])
filtered_documents = copy.deepcopy(documents)
for j in range(len(documents)):
    filtered_documents[j] = [w for w in documents[j].split(" ") if not w in stop_words]

In [210]:
filtered_documents

AHT     [company, name, ashtead, market, cap, 8781, 38...
AZN     [company, name, astrazeneca, market, cap, 6638...
BARC    [company, name, barclays, market, cap, 35913, ...
BP      [company, name, bp, market, cap, 109281, 13345...
BTA     [company, name, bt, market, cap, 30783, 023425...
GSK     [company, name, glaxo, market, cap, 70030, 465...
HSBA    [company, name, hsbc, market, cap, 140, 569, 0...
IAG     [company, name, iag, market, cap, 13893, 08731...
LLOY    [company, name, lloyds, market, cap, 46801, 50...
NWG     [company, name, natwest, market, cap, 32276, 2...
RDSA    [company, name, shell, market, cap, 211264, 88...
RRS     [company, name, randgold, resources, ltd, comp...
SHP     [company, name, shire, market, cap, 34960, 685...
SKY     [company, name, sky, ltd, market, cap, 15926, ...
SN      [company, name, smith, nephew, market, cap, 11...
TUI     [company, name, tui, market, cap, 9, 647, 83, ...
Name: document, dtype: object

In [211]:
# Split documents into words-components and put them into a list + create a set with all the words (non duplicates)
list_documents = []
total = []
for j in range(len(filtered_documents)):
    this_document = filtered_documents[j]
    list_documents.append(this_document)
    total = set(total).union(set(this_document))

In [212]:
list_documents

[['company',
  'name',
  'ashtead',
  'market',
  'cap',
  '8781',
  '38027408',
  'bloomberg',
  'estimates',
  'eps',
  'company',
  'ticker',
  'aht',
  'ln',
  'current',
  'px',
  '1759',
  'current',
  'quarter',
  '0',
  '328',
  'date',
  '2017',
  '09',
  '12',
  'ytd',
  'change',
  '162',
  'current',
  'year',
  '1',
  '187',
  'event',
  'description',
  'q1',
  '2018',
  'earnings',
  'call',
  'ytd',
  'change',
  '0',
  '0',
  'bloomberg',
  'estimates',
  'sales',
  'current',
  'quarter',
  '957',
  'page',
  '1',
  'current',
  'year',
  '3569',
  '786',
  'q1',
  '2018',
  'earnings',
  'call',
  'company',
  'participants',
  'cid',
  '127',
  'geoffrey',
  'drabble',
  'ceo',
  'director',
  'cid',
  '127',
  'suzanne',
  'wood',
  'group',
  'finance',
  'director',
  'director',
  'participants',
  'cid',
  '127',
  'andrew',
  'murphy',
  'analyst',
  'cid',
  '127',
  'andrew',
  'richard',
  'farnell',
  'vp',
  'equity',
  'analyst',
  'cid',
  '127',
  'dav

In [213]:
# Count words in each sentence
wordDict = []
# Create a dictionary with all words
Doc_count_word = dict.fromkeys(total, 0)
for j in range(len(filtered_documents)):
    this_wordDict = dict.fromkeys(total, 0) 
    #print(this_wordDict)
    for word in filtered_documents[j]:
        #print(word)
        #print(this_wordDict[word])
        # count how many documents contain the word
        if this_wordDict[word] == 0 : Doc_count_word[word] += 1
        #print(word)
        # plus count how many times the word is contained into the document
        this_wordDict[word] += 1
        #print(this_wordDict[word])
    wordDict.append(this_wordDict) 

In [214]:
wordDict

[{'longevity': 0,
  'decreasing': 0,
  'therapy': 0,
  'complicated': 0,
  'frame': 0,
  'clinically': 0,
  'leaders': 0,
  'hence': 0,
  'practicing': 0,
  'tails': 0,
  'regular': 0,
  'expects': 0,
  'revise': 0,
  '218': 0,
  'disincentive': 0,
  'claimed': 0,
  'vertical': 0,
  'spring': 1,
  'follows': 0,
  'finkelstein': 0,
  'recurring': 0,
  'doses': 0,
  'dice': 0,
  'leases': 0,
  'concerned': 0,
  'vice': 0,
  'window': 0,
  'laser': 0,
  'initiate': 0,
  'said': 12,
  'real': 1,
  'indirect': 1,
  'flaura': 0,
  'failing': 0,
  'hunt': 0,
  'credits': 0,
  'prevailing': 0,
  'regimens': 0,
  'trucks': 1,
  'implies': 0,
  'timothy': 0,
  'binds': 0,
  'persuade': 0,
  'ats': 0,
  '328': 16,
  '199': 0,
  'trx': 0,
  'realizes': 0,
  'raã': 0,
  'projecting': 0,
  'beautiful': 0,
  'adjustments': 0,
  'destroy': 0,
  'metabolism': 0,
  'stop': 2,
  'drag': 5,
  'built': 2,
  'half': 4,
  'doubled': 0,
  'diagnosis': 0,
  'zambia': 0,
  'lifespans': 0,
  'caused': 0,
  'home

In [215]:
# compute the term frequency (n times words / total words)
# https://www.freecodecamp.org/news/how-to-process-textual-data-using-tf-idf-in-python-cd2bbc0a94a3/
tfAll = []
for j in range(len(wordDict)):
    tfAll.append(mf.computeTF(wordDict[j], list_documents[j]))

In [216]:
tfAll

[{'longevity': 0.0,
  'decreasing': 0.0,
  'therapy': 0.0,
  'complicated': 0.0,
  'frame': 0.0,
  'clinically': 0.0,
  'leaders': 0.0,
  'hence': 0.0,
  'practicing': 0.0,
  'tails': 0.0,
  'regular': 0.0,
  'expects': 0.0,
  'revise': 0.0,
  '218': 0.0,
  'disincentive': 0.0,
  'claimed': 0.0,
  'vertical': 0.0,
  'spring': 0.00017406440382941688,
  'follows': 0.0,
  'finkelstein': 0.0,
  'recurring': 0.0,
  'doses': 0.0,
  'dice': 0.0,
  'leases': 0.0,
  'concerned': 0.0,
  'vice': 0.0,
  'window': 0.0,
  'laser': 0.0,
  'initiate': 0.0,
  'said': 0.0020887728459530026,
  'real': 0.00017406440382941688,
  'indirect': 0.00017406440382941688,
  'flaura': 0.0,
  'failing': 0.0,
  'hunt': 0.0,
  'credits': 0.0,
  'prevailing': 0.0,
  'regimens': 0.0,
  'trucks': 0.00017406440382941688,
  'implies': 0.0,
  'timothy': 0.0,
  'binds': 0.0,
  'persuade': 0.0,
  'ats': 0.0,
  '328': 0.00278503046127067,
  '199': 0.0,
  'trx': 0.0,
  'realizes': 0.0,
  'raã': 0.0,
  'projecting': 0.0,
  'beau

In [217]:
# compute Inverse Data Frequency (idf) log10(number of documents on / number of documents containing the word)
idfs = mf.computeIDF(Doc_count_word, len(wordDict))
# Compute TFidf
TFidf = []
for j in range(len(wordDict)):
    TFidf.append(mf.computeTFIDF(tfAll[j], idfs))
df_TFidf = pd.DataFrame(TFidf)

In [218]:
df_TFidf

Unnamed: 0,longevity,decreasing,therapy,complicated,frame,clinically,leaders,hence,practicing,tails,...,1412,dark,slowly,undermining,pile,corrib,keeps,repositioning,fc,units
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.000979,0.0,0.0,0.000243,6.8e-05,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000122,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,6.5e-05,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.8e-05,0.0,0.0
3,0.0,0.0,0.0,0.0,0.001152,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000116,0.0,0.0,0.0,0.0,0.0,0.0,6.8e-05
4,0.0,0.0,0.0,0.0,0.0,0.0,6.4e-05,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000152,0.000114,0.0,0.0,0.0,0.0,0.000161
5,0.0,0.0,0.000583,0.000193,9.7e-05,0.000145,8.1e-05,0.0,0.0,0.0,...,0.003284,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000193,6.8e-05
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000421,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000151
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000145,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,7.7e-05,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [219]:
# Import dictionaries (words)
positives_all = list(pd.read_csv("Positive.csv")["both"].dropna())
negatives_all = list(pd.read_csv("Negative.csv")["both"].dropna())
positives_LM = list(pd.read_csv("Positive.csv")["LM"].dropna())
negatives_LM = list(pd.read_csv("Negative.csv")["LM"].dropna())
positives_GI = list(pd.read_csv("Positive.csv")["GI"].dropna())
negatives_GI = list(pd.read_csv("Negative.csv")["GI"].dropna())

In [220]:
%%time
Doc_word_LM = mf.map_word_to_sent(Doc_count_word, positives_LM, negatives_LM, similarity_check = False)
Doc_word_GI = mf.map_word_to_sent(Doc_count_word, positives_GI, negatives_GI, similarity_check = False)
Doc_word_all = mf.map_word_to_sent(Doc_count_word, positives_all, negatives_all, similarity_check = True)

CPU times: total: 16min 47s
Wall time: 16min 53s


In [226]:
Doc_word_LM

{'longevity': 0,
 'decreasing': 0,
 'therapy': 0,
 'complicated': -1,
 'frame': 0,
 'clinically': 0,
 'leaders': 0,
 'hence': 0,
 'practicing': 0,
 'tails': 0,
 'regular': 0,
 'expects': 0,
 'revise': 0,
 '218': 0,
 'disincentive': 0,
 'claimed': 0,
 'vertical': 0,
 'spring': 0,
 'follows': 0,
 'finkelstein': 0,
 'recurring': 0,
 'doses': 0,
 'dice': 0,
 'leases': 0,
 'concerned': -1,
 'vice': 0,
 'window': 0,
 'laser': 0,
 'initiate': 0,
 'said': 0,
 'real': 0,
 'indirect': 0,
 'flaura': 0,
 'failing': -1,
 'hunt': 0,
 'credits': 0,
 'prevailing': 0,
 'regimens': 0,
 'trucks': 0,
 'implies': 0,
 'timothy': 0,
 'binds': 0,
 'persuade': 0,
 'ats': 0,
 '328': 0,
 '199': 0,
 'trx': 0,
 'realizes': 0,
 'raã': 0,
 'projecting': 0,
 'beautiful': 1,
 'adjustments': 0,
 'destroy': -1,
 'metabolism': 0,
 'stop': 0,
 'drag': -1,
 'built': 0,
 'half': 0,
 'doubled': 0,
 'diagnosis': 0,
 'zambia': 0,
 'lifespans': 0,
 'caused': 0,
 'homes': 0,
 'introductions': 0,
 '177': 0,
 'halifax': 0,
 '69463

In [227]:
Doc_word_GI

{'longevity': 1,
 'decreasing': 0,
 'therapy': 0,
 'complicated': 0,
 'frame': 0,
 'clinically': 0,
 'leaders': 0,
 'hence': 0,
 'practicing': 0,
 'tails': 0,
 'regular': 0,
 'expects': 0,
 'revise': 0,
 '218': 0,
 'disincentive': 0,
 'claimed': 0,
 'vertical': 0,
 'spring': 0,
 'follows': 0,
 'finkelstein': 0,
 'recurring': 0,
 'doses': 0,
 'dice': 0,
 'leases': 0,
 'concerned': 0,
 'vice': 0,
 'window': 0,
 'laser': 0,
 'initiate': 0,
 'said': 0,
 'real': 0,
 'indirect': 0,
 'flaura': 0,
 'failing': 0,
 'hunt': 0,
 'credits': 0,
 'prevailing': 0,
 'regimens': 0,
 'trucks': 0,
 'implies': 0,
 'timothy': 0,
 'binds': 0,
 'persuade': 0,
 'ats': 0,
 '328': 0,
 '199': 0,
 'trx': 0,
 'realizes': 0,
 'raã': 0,
 'projecting': 0,
 'beautiful': 0,
 'adjustments': 0,
 'destroy': -1,
 'metabolism': 0,
 'stop': 0,
 'drag': -1,
 'built': 0,
 'half': 0,
 'doubled': 0,
 'diagnosis': 0,
 'zambia': 0,
 'lifespans': 0,
 'caused': 0,
 'homes': 0,
 'introductions': 0,
 '177': 0,
 'halifax': 0,
 '6946359'

In [228]:
Doc_word_all

{'longevity': 1,
 'decreasing': -0.8235294117647058,
 'therapy': 0.6666666666666666,
 'complicated': -1,
 'frame': 0.8888888888888888,
 'clinically': -0.8,
 'leaders': 0.9230769230769231,
 'hence': -0.7272727272727273,
 'practicing': -0.7368421052631579,
 'tails': -0.8,
 'regular': 1,
 'expects': -0.7142857142857143,
 'revise': 0.8333333333333334,
 '218': 0.2857142857142857,
 'disincentive': -0.96,
 'claimed': 0.875,
 'vertical': 0.8,
 'spring': -0.8333333333333334,
 'follows': -0.8,
 'finkelstein': -0.625,
 'recurring': -0.8,
 'doses': -0.8,
 'dice': -0.75,
 'leases': -0.8333333333333334,
 'concerned': -1,
 'vice': -0.8571428571428571,
 'window': 0.7272727272727273,
 'laser': -0.8,
 'initiate': 1,
 'said': -0.8571428571428571,
 'real': 0.8888888888888888,
 'indirect': -1,
 'flaura': 0.7272727272727273,
 'failing': -1,
 'hunt': -0.8888888888888888,
 'credits': -0.8235294117647058,
 'prevailing': 0.8235294117647058,
 'regimens': 0.875,
 'trucks': -0.7142857142857143,
 'implies': -0.8235

In [229]:
# Compute scores by weighting the words by the weights obtaining using TFidf
df_scores_GI = copy.deepcopy(df_TFidf)
df_scores_LM = copy.deepcopy(df_TFidf)
df_scores_all = copy.deepcopy(df_TFidf)
for this_key in df_scores_all.keys():
    df_scores_GI[this_key] = df_scores_GI[this_key]*Doc_word_GI[this_key]
    df_scores_LM[this_key] = df_scores_LM[this_key]*Doc_word_LM[this_key]
    df_scores_all[this_key] = df_scores_all[this_key]*Doc_word_all[this_key]

In [230]:
df_scores_GI

Unnamed: 0,longevity,decreasing,therapy,complicated,frame,clinically,leaders,hence,practicing,tails,...,1412,dark,slowly,undermining,pile,corrib,keeps,repositioning,fc,units
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [231]:
df_scores_LM

Unnamed: 0,longevity,decreasing,therapy,complicated,frame,clinically,leaders,hence,practicing,tails,...,1412,dark,slowly,undermining,pile,corrib,keeps,repositioning,fc,units
0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.000116,-0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.0,-0.000152,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,-0.000193,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [232]:
df_scores_all

Unnamed: 0,longevity,decreasing,therapy,complicated,frame,clinically,leaders,hence,practicing,tails,...,1412,dark,slowly,undermining,pile,corrib,keeps,repositioning,fc,units
0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,-0.0,-0.0,...,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0
1,0.0,-0.0,0.000653,-0.0,0.0,-0.000195,6.3e-05,-0.0,-0.0,-0.0,...,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,8.8e-05,-0.0,0.0,0.0
2,0.0,-0.0,0.0,-0.0,5.8e-05,-0.0,0.0,-0.0,-0.0,-0.0,...,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-7.6e-05,0.0,0.0
3,0.0,-0.0,0.0,-0.0,0.001024,-0.0,0.0,-0.0,-0.0,-0.0,...,0.0,-0.0,-0.000116,-0.0,-0.0,-0.0,0.0,-0.0,0.0,5.4e-05
4,0.0,-0.0,0.0,-0.0,0.0,-0.0,5.9e-05,-0.0,-0.0,-0.0,...,0.0,-0.0,-0.0,-0.000152,-8.6e-05,-0.0,0.0,-0.0,0.0,0.000129
5,0.0,-0.0,0.000389,-0.000193,8.6e-05,-0.000116,7.5e-05,-0.0,-0.0,-0.0,...,0.001194,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.000129,5.5e-05
6,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,-0.0,-0.0,...,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.000329,0.0,0.0
7,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,-0.0,-0.0,...,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,0.000121
8,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.000106,-0.0,-0.0,...,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0
9,0.0,-0.0,0.0,-0.0,0.0,-0.0,7.1e-05,-0.0,-0.0,-0.0,...,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,0.0


In [233]:
# Put in a unique dataframe
final_df = pd.DataFrame()
final_df["sentiment_index_GI"] = df_scores_GI.sum(axis = 1)
final_df["sentiment_index_LM"] = df_scores_LM.sum(axis = 1)
final_df["sentiment_index_all"] = df_scores_all.sum(axis = 1)
final_df.index = df.index
#final_df["Dates"] = df.index
#final_df = final_df.set_index('Dates')

In [234]:
final_df

Unnamed: 0,sentiment_index_GI,sentiment_index_LM,sentiment_index_all
AHT,0.00172,-0.000587,0.071078
AZN,-0.002639,0.000598,0.031524
BARC,0.004274,-0.003712,0.030383
BP,0.005113,0.000105,0.014848
BTA,0.008141,-0.001853,0.065046
GSK,0.004143,0.002335,0.049581
HSBA,0.003844,-0.001288,0.036329
IAG,0.003047,-0.003165,0.051538
LLOY,0.003195,-0.003104,0.037485
NWG,0.004157,-0.003489,0.038805


# Fundamentals

In [258]:
revenue_df = pd.read_excel('Revenue_all.xlsx')
net_income_df = pd.read_excel('NetIncome_all.xlsx')

In [259]:
revenue_df.set_index(['Company'],inplace= True)
net_income_df.set_index(['Company'],inplace= True)

In [260]:
display(revenue_df,net_income_df)

Unnamed: 0_level_0,Q1 2018,Q2 2018,Q3 2018,Q4 2018,Q1 2019
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AHT,880.1,1019.0,916.1,890.8,1047.4
AZN,3721.9,3791.6,4099.2,4990.1,4218.3
BARC,5358.0,5576.0,5129.0,5073.0,5252.0
BP,49001.8,55486.8,61003.2,58849.0,50949.3
BTA,5837.0,5951.0,5970.0,5967.0,5715.0
GSK,7222.0,7310.0,8092.0,8197.0,7661.0
HSBA,9854.7,9986.1,10592.0,9872.1,11083.9
IAG,4435.7,5419.5,6373.1,5336.2,4639.8
LLOY,4492.0,4886.0,4686.0,4570.0,4489.0
NWG,3530.0,3662.0,3882.0,3260.0,3324.0


Unnamed: 0_level_0,Q1 2018,Q2 2018,Q3 2018,Q4 2018,Q1 2019
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AHT,150.0,190.1,150.5,115.9,209.9
AZN,202.0,405.3,193.1,884.2,644.5
BARC,824.41,1129.1,1135.1,148.0,1087.4
BP,2046.2,2556.4,3042.3,1377.1,2392.2
BTA,579.98,582.3,593.8,821.2,549.0
GSK,1090.0,1131.0,1552.0,1408.0,1339.0
HSBA,2946.0,3021.0,3134.8,1442.7,3274.8
IAG,136.9,568.3,1013.1,437.6,61.1
LLOY,1399.72,1352.6,1664.4,1201.0,1536.2
NWG,1013.13,897.1,1010.1,705.6,903.8


In [261]:
revenue_df['Q2_2018_growth']=(revenue_df['Q2 2018']-revenue_df['Q1 2018'])/revenue_df['Q1 2018']
revenue_df['Q3_2018_growth']=(revenue_df['Q3 2018']-revenue_df['Q1 2018'])/revenue_df['Q1 2018']
revenue_df['Q4_2018_growth']=(revenue_df['Q4 2018']-revenue_df['Q1 2018'])/revenue_df['Q1 2018']
revenue_df['Q1_2019_growth']=(revenue_df['Q1 2019']-revenue_df['Q1 2018'])/revenue_df['Q1 2018']
display(revenue_df)

Unnamed: 0_level_0,Q1 2018,Q2 2018,Q3 2018,Q4 2018,Q1 2019,Q2_2018_growth,Q3_2018_growth,Q4_2018_growth,Q1_2019_growth
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AHT,880.1,1019.0,916.1,890.8,1047.4,0.157823,0.040904,0.012158,0.190092
AZN,3721.9,3791.6,4099.2,4990.1,4218.3,0.018727,0.101373,0.34074,0.133373
BARC,5358.0,5576.0,5129.0,5073.0,5252.0,0.040687,-0.04274,-0.053191,-0.019784
BP,49001.8,55486.8,61003.2,58849.0,50949.3,0.132342,0.244918,0.200956,0.039743
BTA,5837.0,5951.0,5970.0,5967.0,5715.0,0.019531,0.022786,0.022272,-0.020901
GSK,7222.0,7310.0,8092.0,8197.0,7661.0,0.012185,0.120465,0.135004,0.060786
HSBA,9854.7,9986.1,10592.0,9872.1,11083.9,0.013334,0.074817,0.001766,0.124732
IAG,4435.7,5419.5,6373.1,5336.2,4639.8,0.221791,0.436774,0.203012,0.046013
LLOY,4492.0,4886.0,4686.0,4570.0,4489.0,0.087711,0.043188,0.017364,-0.000668
NWG,3530.0,3662.0,3882.0,3260.0,3324.0,0.037394,0.099717,-0.076487,-0.058357


In [262]:
revenue_growth = revenue_df.iloc[:,5:].astype(float)
display(revenue_growth)

Unnamed: 0_level_0,Q2_2018_growth,Q3_2018_growth,Q4_2018_growth,Q1_2019_growth
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AHT,0.157823,0.040904,0.012158,0.190092
AZN,0.018727,0.101373,0.34074,0.133373
BARC,0.040687,-0.04274,-0.053191,-0.019784
BP,0.132342,0.244918,0.200956,0.039743
BTA,0.019531,0.022786,0.022272,-0.020901
GSK,0.012185,0.120465,0.135004,0.060786
HSBA,0.013334,0.074817,0.001766,0.124732
IAG,0.221791,0.436774,0.203012,0.046013
LLOY,0.087711,0.043188,0.017364,-0.000668
NWG,0.037394,0.099717,-0.076487,-0.058357


In [291]:
# For Pearson Test
revenue_lexicon = pd.concat([final_df,revenue_growth],axis=1)
display(revenue_lexicon)

Unnamed: 0,sentiment_index_GI,sentiment_index_LM,sentiment_index_all,Q2_2018_growth,Q3_2018_growth,Q4_2018_growth,Q1_2019_growth
AHT,0.00172,-0.000587,0.071078,0.157823,0.040904,0.012158,0.190092
AZN,-0.002639,0.000598,0.031524,0.018727,0.101373,0.34074,0.133373
BARC,0.004274,-0.003712,0.030383,0.040687,-0.04274,-0.053191,-0.019784
BP,0.005113,0.000105,0.014848,0.132342,0.244918,0.200956,0.039743
BTA,0.008141,-0.001853,0.065046,0.019531,0.022786,0.022272,-0.020901
GSK,0.004143,0.002335,0.049581,0.012185,0.120465,0.135004,0.060786
HSBA,0.003844,-0.001288,0.036329,0.013334,0.074817,0.001766,0.124732
IAG,0.003047,-0.003165,0.051538,0.221791,0.436774,0.203012,0.046013
LLOY,0.003195,-0.003104,0.037485,0.087711,0.043188,0.017364,-0.000668
NWG,0.004157,-0.003489,0.038805,0.037394,0.099717,-0.076487,-0.058357


In [292]:
def correlation(growth,final_df,final_df_lexicon):
    
    growth_lexicon = copy.deepcopy(growth)
    growth_lexicon.index = final_df.index
    growth_lexicon = pd.concat([final_df_lexicon,growth_lexicon], axis=1 ,ignore_index=False)

    return growth_lexicon.corr()


### Revenue GI

In [293]:
revenue_growth_GI = correlation(revenue_growth,final_df,final_df['sentiment_index_GI'])
display(revenue_growth_GI)

Unnamed: 0,sentiment_index_GI,Q2_2018_growth,Q3_2018_growth,Q4_2018_growth,Q1_2019_growth
sentiment_index_GI,1.0,0.023247,-0.389937,-0.254369,-0.507808
Q2_2018_growth,0.023247,1.0,0.209715,-0.498003,0.108311
Q3_2018_growth,-0.389937,0.209715,1.0,0.564795,0.255667
Q4_2018_growth,-0.254369,-0.498003,0.564795,1.0,0.156819
Q1_2019_growth,-0.507808,0.108311,0.255667,0.156819,1.0


In [294]:
sp.stats.pearsonr(revenue_lexicon['sentiment_index_GI'],revenue_lexicon['Q2_2018_growth'])

(0.023246576729714342, 0.9319003120034389)

In [295]:
sp.stats.pearsonr(revenue_lexicon['sentiment_index_GI'],revenue_lexicon['Q3_2018_growth'])

(-0.3899371636974203, 0.13541766208939085)

In [296]:
sp.stats.pearsonr(revenue_lexicon['sentiment_index_GI'],revenue_lexicon['Q4_2018_growth'])

(-0.25436927747505156, 0.34175294398349065)

In [297]:
sp.stats.pearsonr(revenue_lexicon['sentiment_index_GI'],revenue_lexicon['Q1_2019_growth'])

(-0.5078076736864249, 0.04463085106575542)

### Revenue LM 

In [298]:
revenue_growth_LM = correlation(revenue_growth,final_df,final_df['sentiment_index_LM'])
display(revenue_growth_LM)

Unnamed: 0,sentiment_index_LM,Q2_2018_growth,Q3_2018_growth,Q4_2018_growth,Q1_2019_growth
sentiment_index_LM,1.0,-0.032036,0.148076,-0.040453,-0.083194
Q2_2018_growth,-0.032036,1.0,0.209715,-0.498003,0.108311
Q3_2018_growth,0.148076,0.209715,1.0,0.564795,0.255667
Q4_2018_growth,-0.040453,-0.498003,0.564795,1.0,0.156819
Q1_2019_growth,-0.083194,0.108311,0.255667,0.156819,1.0


In [299]:
sp.stats.pearsonr(revenue_lexicon['sentiment_index_LM'],revenue_lexicon['Q2_2018_growth'])

(-0.032035620327150655, 0.9062443271744839)

In [300]:
sp.stats.pearsonr(revenue_lexicon['sentiment_index_LM'],revenue_lexicon['Q3_2018_growth'])

(0.14807603429621952, 0.5841794535573959)

In [301]:
sp.stats.pearsonr(revenue_lexicon['sentiment_index_LM'],revenue_lexicon['Q4_2018_growth'])

(-0.04045250561469671, 0.8817555943242787)

In [302]:
sp.stats.pearsonr(revenue_lexicon['sentiment_index_LM'],revenue_lexicon['Q1_2019_growth'])

(-0.08319396132246978, 0.7593663585224005)

### Revenue ALL

In [266]:
revenue_growth_all = correlation(revenue_growth,final_df,final_df['sentiment_index_all'])
display(revenue_growth_all)

Unnamed: 0,sentiment_index_all,Q2_2018_growth,Q3_2018_growth,Q4_2018_growth,Q1_2019_growth
sentiment_index_all,1.0,0.361212,-0.319287,-0.486059,0.324142
Q2_2018_growth,0.361212,1.0,0.209715,-0.498003,0.108311
Q3_2018_growth,-0.319287,0.209715,1.0,0.564795,0.255667
Q4_2018_growth,-0.486059,-0.498003,0.564795,1.0,0.156819
Q1_2019_growth,0.324142,0.108311,0.255667,0.156819,1.0


In [303]:
sp.stats.pearsonr(revenue_lexicon['sentiment_index_all'],revenue_lexicon['Q2_2018_growth'])

(0.36121207607761713, 0.1692583199927423)

In [304]:
sp.stats.pearsonr(revenue_lexicon['sentiment_index_all'],revenue_lexicon['Q3_2018_growth'])

(-0.31928693229352223, 0.22804181907505777)

In [305]:
sp.stats.pearsonr(revenue_lexicon['sentiment_index_all'],revenue_lexicon['Q4_2018_growth'])

(-0.48605862037978864, 0.05627144301025644)

In [306]:
sp.stats.pearsonr(revenue_lexicon['sentiment_index_all'],revenue_lexicon['Q1_2019_growth'])

(0.3241417879798133, 0.22065029545268275)

# Correlation with Net Income

In [345]:
net_income_df['GrowthQ2 2018']=(net_income_df['Q2 2018']-net_income_df['Q1 2018'])/net_income_df['Q1 2018'].abs()
net_income_df['GrowthQ3 2018']=(net_income_df['Q3 2018']-net_income_df['Q1 2018'])/net_income_df['Q1 2018'].abs()
net_income_df['GrowthQ4 2018']=(net_income_df['Q4 2018']-net_income_df['Q1 2018'])/net_income_df['Q1 2018'].abs()
net_income_df['GrowthQ1 2019']=(net_income_df['Q1 2019']-net_income_df['Q1 2018'])/net_income_df['Q1 2018'].abs()
net_income_df

Unnamed: 0_level_0,Q1 2018,Q2 2018,Q3 2018,Q4 2018,Q1 2019,GrowthQ2 2018,GrowthQ3 2018,GrowthQ4 2018,GrowthQ1 2019
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AHT,150.0,190.1,150.5,115.9,209.9,0.267333,0.003333,-0.227333,0.399333
AZN,202.0,405.3,193.1,884.2,644.5,1.006436,-0.044059,3.377228,2.190594
BARC,824.41,1129.1,1135.1,148.0,1087.4,0.369586,0.376863,-0.820478,0.319004
BP,2046.2,2556.4,3042.3,1377.1,2392.2,0.24934,0.486805,-0.326996,0.169094
BTA,579.98,582.3,593.8,821.2,549.0,0.004,0.023828,0.415911,-0.053416
GSK,1090.0,1131.0,1552.0,1408.0,1339.0,0.037615,0.423853,0.291743,0.22844
HSBA,2946.0,3021.0,3134.8,1442.7,3274.8,0.025458,0.064087,-0.510285,0.111609
IAG,136.9,568.3,1013.1,437.6,61.1,3.151205,6.400292,2.196494,-0.553689
LLOY,1399.72,1352.6,1664.4,1201.0,1536.2,-0.033664,0.189095,-0.141971,0.097505
NWG,1013.13,897.1,1010.1,705.6,903.8,-0.114526,-0.002991,-0.303544,-0.107913


In [346]:
net_income_growth=net_income_df.iloc[:,5:]
net_income_growth = net_income_growth.fillna(0)

In [347]:
net_income_growth

Unnamed: 0_level_0,GrowthQ2 2018,GrowthQ3 2018,GrowthQ4 2018,GrowthQ1 2019
Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AHT,0.267333,0.003333,-0.227333,0.399333
AZN,1.006436,-0.044059,3.377228,2.190594
BARC,0.369586,0.376863,-0.820478,0.319004
BP,0.24934,0.486805,-0.326996,0.169094
BTA,0.004,0.023828,0.415911,-0.053416
GSK,0.037615,0.423853,0.291743,0.22844
HSBA,0.025458,0.064087,-0.510285,0.111609
IAG,3.151205,6.400292,2.196494,-0.553689
LLOY,-0.033664,0.189095,-0.141971,0.097505
NWG,-0.114526,-0.002991,-0.303544,-0.107913


In [348]:
net_income_growth_new = net_income_growth.drop(['SKY','SN'])

In [349]:
final_df_new = final_df.drop(['SKY','SN'])

In [350]:
netincome_lexicon = pd.concat([final_df_new,net_income_growth_new],axis=1)

In [351]:
netincome_lexicon

Unnamed: 0,sentiment_index_GI,sentiment_index_LM,sentiment_index_all,GrowthQ2 2018,GrowthQ3 2018,GrowthQ4 2018,GrowthQ1 2019
AHT,0.00172,-0.000587,0.071078,0.267333,0.003333,-0.227333,0.399333
AZN,-0.002639,0.000598,0.031524,1.006436,-0.044059,3.377228,2.190594
BARC,0.004274,-0.003712,0.030383,0.369586,0.376863,-0.820478,0.319004
BP,0.005113,0.000105,0.014848,0.24934,0.486805,-0.326996,0.169094
BTA,0.008141,-0.001853,0.065046,0.004,0.023828,0.415911,-0.053416
GSK,0.004143,0.002335,0.049581,0.037615,0.423853,0.291743,0.22844
HSBA,0.003844,-0.001288,0.036329,0.025458,0.064087,-0.510285,0.111609
IAG,0.003047,-0.003165,0.051538,3.151205,6.400292,2.196494,-0.553689
LLOY,0.003195,-0.003104,0.037485,-0.033664,0.189095,-0.141971,0.097505
NWG,0.004157,-0.003489,0.038805,-0.114526,-0.002991,-0.303544,-0.107913


### Net Income GI 

In [352]:
net_income_growth_GI = correlation(net_income_growth_new,final_df_new,final_df_new['sentiment_index_GI'])
net_income_growth_GI

Unnamed: 0,sentiment_index_GI,GrowthQ2 2018,GrowthQ3 2018,GrowthQ4 2018,GrowthQ1 2019
sentiment_index_GI,1.0,-0.177541,-0.106431,-0.217477,-0.454398
GrowthQ2 2018,-0.177541,1.0,0.770495,-0.076892,0.039415
GrowthQ3 2018,-0.106431,0.770495,1.0,0.378589,-0.47573
GrowthQ4 2018,-0.217477,-0.076892,0.378589,1.0,-0.169681
GrowthQ1 2019,-0.454398,0.039415,-0.47573,-0.169681,1.0


In [353]:
sp.stats.pearsonr(netincome_lexicon['sentiment_index_GI'],netincome_lexicon['GrowthQ2 2018'])

(-0.17754144382220016, 0.5437047930967208)

In [354]:
sp.stats.pearsonr(netincome_lexicon['sentiment_index_GI'],netincome_lexicon['GrowthQ3 2018'])

(-0.1064305281547591, 0.7172551648608864)

In [355]:
sp.stats.pearsonr(netincome_lexicon['sentiment_index_GI'],netincome_lexicon['GrowthQ4 2018'])

(-0.21747659827307414, 0.4551441727586243)

In [356]:
sp.stats.pearsonr(netincome_lexicon['sentiment_index_GI'],netincome_lexicon['GrowthQ1 2019'])

(-0.45439801793762935, 0.10261871156969997)

### Net Income LM 

In [362]:
net_income_growth_LM = correlation(net_income_growth_new,final_df_new,final_df_new['sentiment_index_LM'])
net_income_growth_LM

Unnamed: 0,sentiment_index_LM,GrowthQ2 2018,GrowthQ3 2018,GrowthQ4 2018,GrowthQ1 2019
sentiment_index_LM,1.0,-0.044605,-0.178728,-0.133188,0.412896
GrowthQ2 2018,-0.044605,1.0,0.770495,-0.076892,0.039415
GrowthQ3 2018,-0.178728,0.770495,1.0,0.378589,-0.47573
GrowthQ4 2018,-0.133188,-0.076892,0.378589,1.0,-0.169681
GrowthQ1 2019,0.412896,0.039415,-0.47573,-0.169681,1.0


In [358]:
sp.stats.pearsonr(netincome_lexicon['sentiment_index_LM'],netincome_lexicon['GrowthQ2 2018'])

(-0.04460527969686698, 0.8796515655345468)

In [359]:
sp.stats.pearsonr(netincome_lexicon['sentiment_index_LM'],netincome_lexicon['GrowthQ3 2018'])

(-0.17872839396729787, 0.5409701330957686)

In [360]:
sp.stats.pearsonr(netincome_lexicon['sentiment_index_LM'],netincome_lexicon['GrowthQ4 2018'])

(-0.13318769424208554, 0.6498921382947058)

In [361]:
sp.stats.pearsonr(netincome_lexicon['sentiment_index_LM'],netincome_lexicon['GrowthQ1 2019'])

(0.41289634261257013, 0.1422951399542949)

### Net Income ALL 

In [363]:
net_income_growth_all = correlation(net_income_growth_new,final_df_new,final_df_new['sentiment_index_all'])
net_income_growth_all

Unnamed: 0,sentiment_index_all,GrowthQ2 2018,GrowthQ3 2018,GrowthQ4 2018,GrowthQ1 2019
sentiment_index_all,1.0,0.280604,0.014357,-0.39701,0.009225
GrowthQ2 2018,0.280604,1.0,0.770495,-0.076892,0.039415
GrowthQ3 2018,0.014357,0.770495,1.0,0.378589,-0.47573
GrowthQ4 2018,-0.39701,-0.076892,0.378589,1.0,-0.169681
GrowthQ1 2019,0.009225,0.039415,-0.47573,-0.169681,1.0


In [364]:
sp.stats.pearsonr(netincome_lexicon['sentiment_index_all'],netincome_lexicon['GrowthQ2 2018'])

(0.2806042167834635, 0.3311743667977127)

In [365]:
sp.stats.pearsonr(netincome_lexicon['sentiment_index_all'],netincome_lexicon['GrowthQ3 2018'])

(0.014356861373753166, 0.961148875514807)

In [366]:
sp.stats.pearsonr(netincome_lexicon['sentiment_index_all'],netincome_lexicon['GrowthQ4 2018'])

(-0.3970097356171988, 0.1598590319974546)

In [367]:
sp.stats.pearsonr(netincome_lexicon['sentiment_index_all'],netincome_lexicon['GrowthQ1 2019'])

(0.009225004478183245, 0.9750311661791804)

# Finbert

In [368]:
each_list = df['document'].values.tolist()

In [369]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
  
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

In [370]:
inputs = tokenizer(each_list, padding = True, truncation = True, return_tensors='pt')
print(inputs)

{'input_ids': tensor([[  101,  2194,  2171,  ..., 14915,  2204,   102],
        [  101,  2194,  2171,  ...,  9006,  9739,   102],
        [  101,  2194,  2171,  ...,  2783,  4284,   102],
        ...,
        [  101,  2194,  2171,  ...,  5387,  1999,   102],
        [  101,  2194,  2171,  ...,  1048,  2078,   102],
        [  101,  2194,  2171,  ...,  2000,  3749,   102]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])}


In [371]:
outputs = model(**inputs)
print(outputs.logits.shape)

torch.Size([16, 3])


In [372]:
import torch

predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

tensor([[0.6145, 0.0187, 0.3667],
        [0.0414, 0.0198, 0.9389],
        [0.7876, 0.1424, 0.0700],
        [0.0234, 0.0948, 0.8819],
        [0.0214, 0.0438, 0.9348],
        [0.0202, 0.0403, 0.9395],
        [0.8213, 0.0162, 0.1625],
        [0.9322, 0.0198, 0.0479],
        [0.5431, 0.0117, 0.4452],
        [0.0769, 0.8195, 0.1036],
        [0.0912, 0.0146, 0.8942],
        [0.1950, 0.0096, 0.7953],
        [0.0229, 0.0329, 0.9442],
        [0.8578, 0.0126, 0.1296],
        [0.0569, 0.1390, 0.8041],
        [0.0708, 0.0171, 0.9121]], grad_fn=<SoftmaxBackward0>)


In [373]:
positive = predictions[:, 0].tolist()
negative = predictions[:, 1].tolist()
neutral = predictions[:, 2].tolist()

table = {'Company':pdf_name,
         'Earning Call':each_list,
         "Positive":positive,
         "Negative":negative, 
         "Neutral":neutral}
      
df_finbert = pd.DataFrame(table, columns = ["Company","Earning Call", "Positive", "Negative", "Neutral"])

In [374]:
df_finbert

Unnamed: 0,Company,Earning Call,Positive,Negative,Neutral
0,AHT,company name ashtead market cap 8781 38027408 ...,0.614546,0.018714,0.36674
1,AZN,company name astrazeneca market cap 66384 3788...,0.041363,0.019752,0.938885
2,BARC,company name barclays market cap 35913 2007486...,0.787617,0.142427,0.069956
3,BP,company name bp market cap 109281 133456 bloom...,0.023357,0.094782,0.881861
4,BTA,company name bt market cap 30783 0234251 bloom...,0.021358,0.043849,0.934793
5,GSK,company name glaxo market cap 70030 4659254 bl...,0.020165,0.040328,0.939508
6,HSBA,company name hsbc market cap 140 569 05 bloomb...,0.82128,0.016178,0.162542
7,IAG,company name iag market cap 13893 0873112 bloo...,0.93224,0.01984,0.04792
8,LLOY,company name lloyds market cap 46801 5085506 b...,0.543061,0.011721,0.445218
9,NWG,company name natwest market cap 32276 2546622 ...,0.07685,0.819531,0.103619
