In [211]:
# import libraries
import pandas as pd
import numpy as np

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [1]:
# rates manually assigned to each shareholder letter
manual_polarity = pd.read_excel('Src/Manual_Scores.xlsx')

NameError: name 'pd' is not defined

In [193]:
manual_polarity.head()

Unnamed: 0,ReportName,MyReportName,AverageManualScore
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.6
1,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,0.333333
2,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.166667
3,FINAL_Q3_Letter,FINAL_Q3_Letter,0.483333
4,FINAL-Q1-18-Shareholder-Letter,FINAL-Q1-18-Shareholder-Letter,0.5


In [209]:
models_dict = {
    'TextBlob': "Scores/textblob_polarity.csv",
    'Amazon': "Scores/amazon_polarity.csv",
    'Google': "Scores/google_polarity.csv",
    'OpenAI': "Scores/OpenAI.csv",
}

for model, path in models_dict.items():

    print(f"{model} Results:")
    
    df = pd.read_csv(path)
    df = pd.merge(manual_polarity, df, left_on='MyReportName', right_on="pdf_name", how='inner')
    df["polarity"] = pd.to_numeric(df["polarity"])
    df.dropna(inplace=True)

    # apply min max scaling to the polarity scores
    scaler = MinMaxScaler(feature_range=(df.AverageManualScore.min(), df.AverageManualScore.max()))
    df['predicted_scaled_scores'] = scaler.fit_transform(df[['polarity']])
    # calculate the error metrics
    predicted = df["predicted_scaled_scores"]
    actual = df["AverageManualScore"]

    mae = mean_absolute_error(actual, predicted)
    mse = mean_squared_error(actual, predicted)
    corr_rate = np.corrcoef(actual, predicted)[0, 1]

    print(f"Correlation between {model} polarity and manual polarity: ", corr_rate)
    print("MAE:", mae)
    print("MSE:", mse)
    print(" ")

TextBlob Results:
Correlation between TextBlob polarity and manual polarity:  0.1938667056272002
MAE: 0.17062031608790346
MSE: 0.04329156851574009
 
Amazon Results:
Correlation between Amazon polarity and manual polarity:  -0.0019645297253857233
MAE: 0.1850452771179042
MSE: 0.06256966925207241
 
Google Results:
Correlation between Google polarity and manual polarity:  0.4822743596641784
MAE: 0.1452222257449557
MSE: 0.03288487713266478
 
OpenAI Results:
Correlation between OpenAI polarity and manual polarity:  0.45084574312574344
MAE: 0.15056689342578236
MSE: 0.045976631392369016
 


### Separate Analysis

### Open AI

In [194]:
# read openai polarity scores
# openai_polarity = pd.read_csv('Scores/openai_polarity.csv')
openai_polarity = pd.read_csv('Scores/OpenAI.csv')
# merge two dataframes on the basis of tweet_id
df = pd.merge(manual_polarity, openai_polarity, left_on='MyReportName', right_on="Report", how='inner')
# df["polarity"] = df["polarity"].apply(lambda x: 0.65 if x == "Positive" else x)
# df["polarity"] = df["polarity"].apply(lambda x: 0.60 if x == "Slightly positive" else x)
df["polarity"] = pd.to_numeric(df["polarity"])
df.dropna(inplace=True)
corr_rate = df["polarity"].corr(df["AverageManualScore"]) # find the correlation between OpenAI polarity and manual polarity
df.shape

(49, 5)

In [195]:
# apply min max scaling to the polarity scores
scaler = MinMaxScaler(feature_range=(df.AverageManualScore.min(), df.AverageManualScore.max()))
df['predicted_scaled_scores'] = scaler.fit_transform(df[['polarity']])

# Calculate the mean of the actual values
predicted = df["predicted_scaled_scores"]
actual = df["AverageManualScore"]

mae = mean_absolute_error(actual, predicted)
mse = mean_squared_error(actual, predicted)

print("MAE:", mae) # 0.2307 (with StandardScaler 0.886)
print("MSE:", mse) # 0.1022 (with MSE 1.14)
print("Correlation between OpenAI polarity and manual polarity: ", corr_rate)

MAE: 0.15056689342578236
MSE: 0.045976631392369016
Correlation between OpenAI polarity and manual polarity:  0.4508457431257436


### TextBlob

In [177]:
textblob_polarity = pd.read_csv("Scores/textblob_polarity.csv")

df = pd.merge(manual_polarity, textblob_polarity, left_on='Report', right_on="pdf_name", how='inner')
df["polarity"] = pd.to_numeric(df["polarity"])
df.dropna(inplace=True)
corr_rate = df["polarity"].corr(df["AverageManualScore"]) # find the correlation between TextBlob polarity and manual polarity

In [178]:
# apply min max scaling to the polarity scores
scaler = MinMaxScaler(feature_range=(df.AverageManualScore.min(), df.AverageManualScore.max()))
df['predicted_scaled_scores'] = scaler.fit_transform(df[['polarity']])

# Calculate the mean of the actual values
predicted = df["predicted_scaled_scores"]
actual = df["AverageManualScore"]

# Compute R^2
r2 = r2_score(actual, predicted)
mae = mean_absolute_error(actual, predicted)
mse = mean_squared_error(actual, predicted)

print("MAE:", mae) # 0.2307 (with StandardScaler 0.886)
print("MSE:", mse) # 0.1022 (with MSE 1.14)
print("Correlation between TextBlob polarity and manual polarity: ", corr_rate)

MAE: 0.16107475526787646
MSE: 0.04095232160014455
Correlation between TextBlob polarity and manual polarity:  0.30989193699837503


### Amazon

In [180]:
polarity_amazon = pd.read_csv('Scores/amazon_polarity.csv')

df = pd.merge(manual_polarity, polarity_amazon, left_on='Report', right_on="pdf_name", how='inner')
df["polarity"] = pd.to_numeric(df["polarity"])
df.dropna(inplace=True)
corr_rate = df["polarity"].corr(df["AverageManualScore"])

In [181]:
# apply min max scaling to the polarity scores
scaler = MinMaxScaler(feature_range=(df.AverageManualScore.min(), df.AverageManualScore.max()))
df['predicted_scaled_scores'] = scaler.fit_transform(df[['polarity']])

# Calculate the mean of the actual values
predicted = df["predicted_scaled_scores"]
actual = df["AverageManualScore"]

# Compute R^2
r2 = r2_score(actual, predicted)
mae = mean_absolute_error(actual, predicted)
mse = mean_squared_error(actual, predicted)

print("MAE:", mae) # 0.2307 (with StandardScaler 0.886)
print("MSE:", mse) # 0.1022 (with MSE 1.14)
print("Correlation between TextBlob polarity and manual polarity: ", corr_rate)

MAE: 0.19964732480591216
MSE: 0.07118832977240773
Correlation between TextBlob polarity and manual polarity:  -0.016412256160416847


In [83]:
df.shape

(44, 4)

In [84]:
df.head()

Unnamed: 0,Report,AverageManualScore,pdf_name,polarity
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.6,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.905803
1,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,0.333333,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,0.807176
2,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.166667,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.765612
3,FINAL_Q3_Letter,0.483333,FINAL_Q3_Letter,0.769323
4,FINAL-Q1-18-Shareholder-Letter,0.5,FINAL-Q1-18-Shareholder-Letter,0.874805


In [85]:
# apply min max scaling to the polarity scores
scaler = MinMaxScaler()
df['polarity_minmaxscale'] = scaler.fit_transform(df[['polarity']])
scaler = MinMaxScaler()
df['averagemanualscore_minmax_scale'] = scaler.fit_transform(df[['AverageManualScore']])
df.head()

Unnamed: 0,Report,AverageManualScore,pdf_name,polarity,polarity_minmaxscale,averagemanualscore_minmax_scale
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.6,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.905803,1.0,0.87234
1,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,0.333333,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,0.807176,0.544701,0.531915
2,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.166667,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.765612,0.352828,0.319149
3,FINAL_Q3_Letter,0.483333,FINAL_Q3_Letter,0.769323,0.369959,0.723404
4,FINAL-Q1-18-Shareholder-Letter,0.5,FINAL-Q1-18-Shareholder-Letter,0.874805,0.856901,0.744681


In [86]:
# Calculate the mean of the actual values
predicted = df["polarity_minmaxscale"]
actual = df["averagemanualscore_minmax_scale"]

# Compute R^2
r2 = r2_score(actual, predicted)
mae = mean_absolute_error(actual, predicted)
mse = mean_squared_error(actual, predicted)

print("MAE:", mae)
print("MSE:", mse)
print("R^2:", r2)

MAE: 0.25486892528522775
MSE: 0.11601538577765905
R^2: -0.9534475240211968


In [None]:
import statsmodels.api as sm

x = df[['polarity']]
y = df['rate']

# x = sm.add_constant(x)

model = sm.OLS(y, x).fit()
predictions = model.predict(x) 

print_model = model.summary()
print(print_model)

### Google 

In [182]:
polarity_google = pd.read_csv('Scores/google_polarity.csv')

df = pd.merge(manual_polarity, polarity_google, left_on='Report', right_on="pdf_name", how='inner')
df["polarity"] = pd.to_numeric(df["polarity"])
df.dropna(inplace=True)
corr_rate = df["polarity"].corr(df["AverageManualScore"])

In [183]:
# apply min max scaling to the polarity scores
scaler = MinMaxScaler(feature_range=(df.AverageManualScore.min(), df.AverageManualScore.max()))
df['predicted_scaled_scores'] = scaler.fit_transform(df[['polarity']])

# Calculate the mean of the actual values
predicted = df["predicted_scaled_scores"]
actual = df["AverageManualScore"]

# Compute R^2
r2 = r2_score(actual, predicted)
mae = mean_absolute_error(actual, predicted)
mse = mean_squared_error(actual, predicted)

print("MAE:", mae) # 0.2307 (with StandardScaler 0.886)
print("MSE:", mse) # 0.1022 (with MSE 1.14)
print("Correlation between TextBlob polarity and manual polarity: ", corr_rate)

MAE: 0.15025252686914561
MSE: 0.03488040190305624
Correlation between TextBlob polarity and manual polarity:  0.49626973654879064


In [94]:
df.shape

(44, 4)

In [95]:
df.head()

Unnamed: 0,Report,AverageManualScore,pdf_name,polarity
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.6,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.2
1,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,0.333333,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,0.2
2,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.166667,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.1
3,FINAL_Q3_Letter,0.483333,FINAL_Q3_Letter,0.2
4,FINAL-Q1-18-Shareholder-Letter,0.5,FINAL-Q1-18-Shareholder-Letter,0.2


In [96]:
# apply min max scaling to the polarity scores

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df['polarity_minmaxscale'] = scaler.fit_transform(df[['polarity']])
scaler = MinMaxScaler()
df['averagemanualscore_minmax_scale'] = scaler.fit_transform(df[['AverageManualScore']])
df.head()

Unnamed: 0,Report,AverageManualScore,pdf_name,polarity,polarity_minmaxscale,averagemanualscore_minmax_scale
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.6,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.2,0.666667,0.87234
1,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,0.333333,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,0.2,0.666667,0.531915
2,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.166667,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.1,0.333333,0.319149
3,FINAL_Q3_Letter,0.483333,FINAL_Q3_Letter,0.2,0.666667,0.723404
4,FINAL-Q1-18-Shareholder-Letter,0.5,FINAL-Q1-18-Shareholder-Letter,0.2,0.666667,0.744681


In [97]:
# Calculate the mean of the actual values
predicted = df["polarity_minmaxscale"]
actual = df["averagemanualscore_minmax_scale"]

# Compute R^2
r2 = r2_score(actual, predicted)
mae = mean_absolute_error(actual, predicted)
mse = mean_squared_error(actual, predicted)

print("MAE:", mae)
print("MSE:", mse)
print("R^2:", r2)

MAE: 0.1918117364295128
MSE: 0.0568444757139299
R^2: 0.04286229284734422


In [49]:
from sklearn.linear_model import LinearRegression

x = df[['polarity_minmaxscale']]
y = df['averagemanualscore_minmax_scale']

# Train a simple linear regression model
model = LinearRegression().fit(x, y)

# Get R^2
r2 = model.score(x, y)

print("R^2:", r2)

R^2: 0.24628365141420605


In [41]:
# calculate R squared value for min max scaled polarity scores

import statsmodels.api as sm

x = df[['polarity_minmaxscale']]

y = df['averagemanualscore_minmax_scale']

# x = sm.add_constant(x)

model = sm.OLS(y, x).fit()

predictions = model.predict(x)

print_model = model.summary()
print(print_model)



                                       OLS Regression Results                                       
Dep. Variable:     averagemanualscore_minmax_scale   R-squared (uncentered):                   0.847
Model:                                         OLS   Adj. R-squared (uncentered):              0.844
Method:                              Least Squares   F-statistic:                              238.2
Date:                             Wed, 23 Aug 2023   Prob (F-statistic):                    3.83e-19
Time:                                     13:50:42   Log-Likelihood:                         0.99475
No. Observations:                               44   AIC:                                    0.01050
Df Residuals:                                   43   BIC:                                      1.795
Df Model:                                        1                                                  
Covariance Type:                         nonrobust                                         

In [33]:
# export the dataframe to xlsx

df.to_excel('Scores/google_api_model.xlsx', index=False)

In [17]:
import statsmodels.api as sm

x = df[['polarity']]
y = df['rate']

# x = sm.add_constant(x)

model = sm.OLS(y, x).fit()
predictions = model.predict(x) 

print_model = model.summary()
print(print_model)

                                 OLS Regression Results                                
Dep. Variable:                   rate   R-squared (uncentered):                   0.866
Model:                            OLS   Adj. R-squared (uncentered):              0.864
Method:                 Least Squares   F-statistic:                              317.9
Date:                Mon, 21 Aug 2023   Prob (F-statistic):                    4.59e-23
Time:                        00:56:26   Log-Likelihood:                          1.3654
No. Observations:                  50   AIC:                                    -0.7309
Df Residuals:                      49   BIC:                                      1.181
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

### BART 

In [103]:
polarity_bart = pd.read_csv('Src/bart_positive_sentiment_probs.csv')

df = pd.merge(manual_polarity, polarity_bart, left_on='Report', right_on="pdf_name", how='inner')
df["positive_sentiment_prob"] = df["positive_sentiment_prob"].apply(lambda x: 0.5 if x == "Positive" else x)
df["positive_sentiment_prob"] = pd.to_numeric(df["positive_sentiment_prob"])
df.dropna(inplace=True)
df["positive_sentiment_prob"].corr(df["AverageManualScore"])

0.18240878308212977

In [104]:
# apply min max scaling to the polarity scores

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df['polarity_minmaxscale'] = scaler.fit_transform(df[['positive_sentiment_prob']])
scaler = MinMaxScaler()
df['averagemanualscore_minmax_scale'] = scaler.fit_transform(df[['AverageManualScore']])
df.head()

Unnamed: 0,Report,AverageManualScore,pdf_name,positive_sentiment_prob,polarity_minmaxscale,averagemanualscore_minmax_scale
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.6,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.538636,0.570633,0.87234
1,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,0.333333,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,0.539246,0.681668,0.531915
2,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.166667,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.539036,0.643519,0.319149
3,FINAL_Q3_Letter,0.483333,FINAL_Q3_Letter,0.539802,0.782959,0.723404
4,FINAL-Q1-18-Shareholder-Letter,0.5,FINAL-Q1-18-Shareholder-Letter,0.539028,0.641955,0.744681


In [105]:
# Calculate the mean of the actual values
predicted = df["polarity_minmaxscale"]
actual = df["averagemanualscore_minmax_scale"]

# Compute R^2
r2 = r2_score(actual, predicted)
mae = mean_absolute_error(actual, predicted)
mse = mean_squared_error(actual, predicted)

print("MAE:", mae)
print("MSE:", mse)
print("R^2:", r2)

MAE: 0.24623830801969884
MSE: 0.09318467852776129
R^2: -0.5690279209658071


In [101]:
df.head()

Unnamed: 0,Report,AverageManualScore,pdf_name,positive_sentiment_prob
0,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.6,COMBINED-Q4-17-Shareholder-Letter-FINAL,0.538636
1,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,0.333333,FINAL_Q2_15_Letter_to_Shareholders_With_Tables_,0.539246
2,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.166667,FINAL_Q3_15_Letter_to_Shareholders_With_Tables_,0.539036
3,FINAL_Q3_Letter,0.483333,FINAL_Q3_Letter,0.539802
4,FINAL-Q1-18-Shareholder-Letter,0.5,FINAL-Q1-18-Shareholder-Letter,0.539028


In [20]:
df.shape

(50, 4)