In [1]:
# import libraries
import pandas as pd
import numpy as np

In [2]:
# rates manually assigned to each report by Rachel
manual_polarity = pd.read_excel('Src/manual_polarity.xlsx')
manual_polarity = manual_polarity[['name', 'rate']]

In [3]:
manual_polarity.head()

Unnamed: 0,name,rate
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.7
1,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,0.6
2,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.55
3,FINAL_Q3_Letter,0.6
4,FINAL-Q1-18-Shareholder-Letter,0.7


### Open AI

In [4]:
# read openai polarity scores
openai_polarity = pd.read_csv('Scores/openai_polarity.csv')
# merge two dataframes on the basis of tweet_id
df = pd.merge(manual_polarity, openai_polarity, left_on='name', right_on="pdf_name", how='inner')
df.head()

Unnamed: 0,name,rate,pdf_name,polarity
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.7,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.75
1,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,0.6,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,Positive
2,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.55,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.4667
3,FINAL_Q3_Letter,0.6,FINAL_Q3_Letter,0.4
4,FINAL-Q1-18-Shareholder-Letter,0.7,FINAL-Q1-18-Shareholder-Letter,0.5


In [5]:
openai_polarity.shape

(50, 2)

In [6]:
df["polarity"] = df["polarity"].apply(lambda x: 0.65 if x == "Positive" else x)
df["polarity"] = df["polarity"].apply(lambda x: 0.60 if x == "Slightly positive" else x)
df["polarity"] = pd.to_numeric(df["polarity"])
df.dropna(inplace=True)
df["polarity"].corr(df["rate"]) # find the correlation between OpenAI polarity and manual polarity

0.07654057092161448

### TextBlob

In [103]:
textblob_polarity = pd.read_csv("Scores/textblob_polarity.csv")

df = pd.merge(manual_polarity, polarity_textblob, left_on='name', right_on="pdf_name", how='inner')
df["polarity"] = pd.to_numeric(df["polarity"])
df.dropna(inplace=True)
df["polarity"].corr(df["rate"]) # find the correlation between TextBlob polarity and manual polarity

0.22257150610438023

In [85]:
df.shape

(50, 5)

In [104]:
import statsmodels.api as sm

x = df[['polarity']]
y = df['rate']

# x = sm.add_constant(x)

model = sm.OLS(y, x).fit()
predictions = model.predict(x) 

print_model = model.summary()
print(print_model)

                                 OLS Regression Results                                
Dep. Variable:                   rate   R-squared (uncentered):                   0.958
Model:                            OLS   Adj. R-squared (uncentered):              0.957
Method:                 Least Squares   F-statistic:                              1105.
Date:                Sat, 19 Aug 2023   Prob (F-statistic):                    2.79e-35
Time:                        11:31:30   Log-Likelihood:                          30.020
No. Observations:                  50   AIC:                                     -58.04
Df Residuals:                      49   BIC:                                     -56.13
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

### Amazon

In [119]:
polarity_amazon = pd.read_csv('Scores/amazon_polarity.csv')

df = pd.merge(manual_polarity, polarity_amazon, left_on='name', right_on="pdf_name", how='inner')
df["polarity"] = pd.to_numeric(df["polarity"])
df.dropna(inplace=True)
df["polarity"].corr(df["rate"])

-0.03792775640146495

In [120]:
df.shape

(42, 4)

In [109]:
df.head()

Unnamed: 0,name,rate,pdf_name,polarity
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.7,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.883941
1,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,0.6,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,0.897385
2,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.55,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.867095
3,FINAL_Q3_Letter,0.6,FINAL_Q3_Letter,0.881112
4,FINAL-Q1-18-Shareholder-Letter,0.7,FINAL-Q1-18-Shareholder-Letter,0.870285


In [91]:
import statsmodels.api as sm

x = df[['polarity']]
y = df['rate']

# x = sm.add_constant(x)

model = sm.OLS(y, x).fit()
predictions = model.predict(x) 

print_model = model.summary()
print(print_model)

                                 OLS Regression Results                                
Dep. Variable:                   rate   R-squared (uncentered):                   0.984
Model:                            OLS   Adj. R-squared (uncentered):              0.983
Method:                 Least Squares   F-statistic:                              2460.
Date:                Sat, 19 Aug 2023   Prob (F-statistic):                    3.14e-38
Time:                        11:24:18   Log-Likelihood:                          45.006
No. Observations:                  42   AIC:                                     -88.01
Df Residuals:                      41   BIC:                                     -86.27
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

### Google 

In [116]:
polarity_google = pd.read_csv('Scores/google_polarity.csv')

df = pd.merge(manual_polarity, polarity_google, left_on='name', right_on="pdf_name", how='inner')
df["polarity"] = pd.to_numeric(df["polarity"])
df.dropna(inplace=True)
df["polarity"].corr(df["rate"])

0.335086967891047

In [118]:
df.shape

(50, 4)

In [117]:
df.head()

Unnamed: 0,name,rate,pdf_name,polarity
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.7,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.2
1,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,0.6,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,0.2
2,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.55,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.1
3,FINAL_Q3_Letter,0.6,FINAL_Q3_Letter,0.2
4,FINAL-Q1-18-Shareholder-Letter,0.7,FINAL-Q1-18-Shareholder-Letter,0.2


In [96]:
import statsmodels.api as sm

x = df[['polarity']]
y = df['rate']

# x = sm.add_constant(x)

model = sm.OLS(y, x).fit()
predictions = model.predict(x) 

print_model = model.summary()
print(print_model)

                                 OLS Regression Results                                
Dep. Variable:                   rate   R-squared (uncentered):                   0.808
Model:                            OLS   Adj. R-squared (uncentered):              0.804
Method:                 Least Squares   F-statistic:                              206.1
Date:                Sat, 19 Aug 2023   Prob (F-statistic):                    3.50e-19
Time:                        11:24:36   Log-Likelihood:                         -7.7225
No. Observations:                  50   AIC:                                      17.44
Df Residuals:                      49   BIC:                                      19.36
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

### BART 

In [113]:
polarity_bart = pd.read_csv('Src/bart_positive_sentiment_probs.csv')

df = pd.merge(manual_polarity, polarity_bart, left_on='name', right_on="pdf_name", how='inner')
df["positive_sentiment_prob"] = df["positive_sentiment_prob"].apply(lambda x: 0.5 if x == "Positive" else x)
df["positive_sentiment_prob"] = pd.to_numeric(df["positive_sentiment_prob"])
df.dropna(inplace=True)
df["positive_sentiment_prob"].corr(df["rate"])

0.2141413668331887

In [114]:
df.head()

Unnamed: 0,name,rate,pdf_name,positive_sentiment_prob
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.7,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.538636
1,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,0.6,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,0.539246
2,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.55,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.539036
3,FINAL_Q3_Letter,0.6,FINAL_Q3_Letter,0.539802
4,FINAL-Q1-18-Shareholder-Letter,0.7,FINAL-Q1-18-Shareholder-Letter,0.539028


In [115]:
df.shape

(50, 4)