In [1]:
# import libraries
import pandas as pd
import numpy as np

In [2]:
# rates manually assigned to each report by Rachel
manual_polarity = pd.read_excel('Src/manual_polarity.xlsx')
manual_polarity = manual_polarity[['name', 'rate']]

In [3]:
manual_polarity.head()

Unnamed: 0,name,rate
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.7
1,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,0.6
2,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.55
3,FINAL_Q3_Letter,0.6
4,FINAL-Q1-18-Shareholder-Letter,0.7


### Open AI

In [4]:
# read openai polarity scores
openai_polarity = pd.read_csv('Scores/openai_polarity.csv')
# merge two dataframes on the basis of tweet_id
df = pd.merge(manual_polarity, openai_polarity, left_on='name', right_on="pdf_name", how='inner')
df.head()

Unnamed: 0,name,rate,pdf_name,polarity
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.7,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.75
1,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,0.6,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,Positive
2,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.55,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.4667
3,FINAL_Q3_Letter,0.6,FINAL_Q3_Letter,0.4
4,FINAL-Q1-18-Shareholder-Letter,0.7,FINAL-Q1-18-Shareholder-Letter,0.5


In [5]:
openai_polarity.shape

(50, 2)

In [6]:
df["polarity"] = df["polarity"].apply(lambda x: 0.65 if x == "Positive" else x)
df["polarity"] = df["polarity"].apply(lambda x: 0.60 if x == "Slightly positive" else x)
df["polarity"] = pd.to_numeric(df["polarity"])
df.dropna(inplace=True)
df["polarity"].corr(df["rate"]) # find the correlation between OpenAI polarity and manual polarity

0.07654057092161448

### TextBlob

In [9]:
textblob_polarity = pd.read_csv("Scores/textblob_polarity.csv")

df = pd.merge(manual_polarity, textblob_polarity, left_on='name', right_on="pdf_name", how='inner')
df["polarity"] = pd.to_numeric(df["polarity"])
df.dropna(inplace=True)
df["polarity"].corr(df["rate"]) # find the correlation between TextBlob polarity and manual polarity

0.1456781225852711

In [10]:
df.shape

(50, 5)

In [11]:
import statsmodels.api as sm

x = df[['polarity']]
y = df['rate']

# x = sm.add_constant(x)

model = sm.OLS(y, x).fit()
predictions = model.predict(x) 

print_model = model.summary()
print(print_model)

                                 OLS Regression Results                                
Dep. Variable:                   rate   R-squared (uncentered):                   0.966
Model:                            OLS   Adj. R-squared (uncentered):              0.965
Method:                 Least Squares   F-statistic:                              1384.
Date:                Mon, 21 Aug 2023   Prob (F-statistic):                    1.38e-37
Time:                        00:56:02   Log-Likelihood:                          35.430
No. Observations:                  50   AIC:                                     -68.86
Df Residuals:                      49   BIC:                                     -66.95
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

### Amazon

In [21]:
polarity_amazon = pd.read_csv('Scores/amazon_polarity.csv')

df = pd.merge(manual_polarity, polarity_amazon, left_on='name', right_on="pdf_name", how='inner')
df["polarity"] = pd.to_numeric(df["polarity"])
df.dropna(inplace=True)
df["polarity"].corr(df["rate"])

0.23833402723200006

In [22]:
df.shape

(49, 4)

In [23]:
df.head()

Unnamed: 0,name,rate,pdf_name,polarity
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.7,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.905803
1,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,0.6,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,0.807176
2,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.55,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.765612
3,FINAL_Q3_Letter,0.6,FINAL_Q3_Letter,0.769323
4,FINAL-Q1-18-Shareholder-Letter,0.7,FINAL-Q1-18-Shareholder-Letter,0.874805


In [24]:
import statsmodels.api as sm

x = df[['polarity']]
y = df['rate']

# x = sm.add_constant(x)

model = sm.OLS(y, x).fit()
predictions = model.predict(x) 

print_model = model.summary()
print(print_model)

                                 OLS Regression Results                                
Dep. Variable:                   rate   R-squared (uncentered):                   0.986
Model:                            OLS   Adj. R-squared (uncentered):              0.986
Method:                 Least Squares   F-statistic:                              3374.
Date:                Mon, 21 Aug 2023   Prob (F-statistic):                    3.87e-46
Time:                        01:21:53   Log-Likelihood:                          56.244
No. Observations:                  49   AIC:                                     -110.5
Df Residuals:                      48   BIC:                                     -108.6
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

### Google 

In [14]:
polarity_google = pd.read_csv('Scores/google_polarity.csv')

df = pd.merge(manual_polarity, polarity_google, left_on='name', right_on="pdf_name", how='inner')
df["polarity"] = pd.to_numeric(df["polarity"])
df.dropna(inplace=True)
df["polarity"].corr(df["rate"])

0.3018331223455835

In [15]:
df.shape

(50, 4)

In [16]:
df.head()

Unnamed: 0,name,rate,pdf_name,polarity
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.7,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.2
1,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,0.6,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,0.2
2,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.55,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.1
3,FINAL_Q3_Letter,0.6,FINAL_Q3_Letter,0.2
4,FINAL-Q1-18-Shareholder-Letter,0.7,FINAL-Q1-18-Shareholder-Letter,0.2


In [17]:
import statsmodels.api as sm

x = df[['polarity']]
y = df['rate']

# x = sm.add_constant(x)

model = sm.OLS(y, x).fit()
predictions = model.predict(x) 

print_model = model.summary()
print(print_model)

                                 OLS Regression Results                                
Dep. Variable:                   rate   R-squared (uncentered):                   0.866
Model:                            OLS   Adj. R-squared (uncentered):              0.864
Method:                 Least Squares   F-statistic:                              317.9
Date:                Mon, 21 Aug 2023   Prob (F-statistic):                    4.59e-23
Time:                        00:56:26   Log-Likelihood:                          1.3654
No. Observations:                  50   AIC:                                    -0.7309
Df Residuals:                      49   BIC:                                      1.181
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

### BART 

In [18]:
polarity_bart = pd.read_csv('Src/bart_positive_sentiment_probs.csv')

df = pd.merge(manual_polarity, polarity_bart, left_on='name', right_on="pdf_name", how='inner')
df["positive_sentiment_prob"] = df["positive_sentiment_prob"].apply(lambda x: 0.5 if x == "Positive" else x)
df["positive_sentiment_prob"] = pd.to_numeric(df["positive_sentiment_prob"])
df.dropna(inplace=True)
df["positive_sentiment_prob"].corr(df["rate"])

0.2141413668331887

In [19]:
df.head()

Unnamed: 0,name,rate,pdf_name,positive_sentiment_prob
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.7,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.538636
1,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,0.6,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,0.539246
2,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.55,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.539036
3,FINAL_Q3_Letter,0.6,FINAL_Q3_Letter,0.539802
4,FINAL-Q1-18-Shareholder-Letter,0.7,FINAL-Q1-18-Shareholder-Letter,0.539028


In [20]:
df.shape

(50, 4)