In [18]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Hi Carl,

## I chose not to use the functions because I wanted to calculate R^2 from scratch to do some sanity checks. My result seems quite different, so could you kindly go over this?

In [4]:
from daily_stock_price import get_stock_prices
# from supervised_data_prep import pre_process_data
# from supervised_modeling import model_comparison

# identify the ticker for the selected stock / index
# list feature groups to be used as training data for the model comparison
TICKER = 'GME'               
FEATURES = [['Core, influential Redditors:Diamond Hands, Trading GME'],                 
            ['Highly Controversial:Diamond Hands, Trading GME'],
            ['Core, influential Redditors:Posts about shorts/short squeeze'],
            ['Highly Controversial:Posts about shorts/short squeeze'],
            ['all']]

# retrive training and test reddit data for model comparison and scoring (note: validation set is not included in data below, data > Sept30th)
df_communites = pd.read_csv('../koigawa_milestone_II/community_output_gme_train.csv')
df_topics = pd.read_csv('../smoilanen_milestone_II/df_train.csv')
#print(df_communites['community_label_str'].unique())
#print(df_topics['topic'].unique())

# retrive stock prices for the same time period as above
MIN_DATE = df_communites['date'].min()[:10]
MAX_DATE = df_communites['date'].max()[:10]
df_prices = get_stock_prices(TICKER, start=MIN_DATE, end=MAX_DATE)



In [5]:

# Just converting into datetime
df_communites['date'] = pd.to_datetime(df_communites['date'])

df_communites['date'] = df_communites['date'].dt.date

## You can see that there are now four groups of communities identified

In [8]:
df_communites.groupby(['community_label_str']).count()

Unnamed: 0_level_0,Unnamed: 0,id,author,date,selftext,community_label
community_label_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Core, influential Redditors",5461,5461,5461,5461,3238,5461
Highly controversial,4255,4255,4255,4255,2558,4255
Scriptophobic,47087,47087,47087,47087,11062,47087
Unpopular,79329,79329,79329,79329,52662,79329


## Below is kind of important. So I want to combine all the features in a way that we can see if community alone, topics alone, or topics plus community work best. In the following section you will see me do feature selection, but it might be worth exploring how much, say, topics alone can perform. FYI, community data alone will result in R2 of around 0.3+

In [9]:
merged_df = pd.merge(df_communites,df_topics,on='id',how='inner')[['date','community_label_str','topic']]

merged_df['combined'] = merged_df['community_label_str'] + ' community talking about ' + merged_df['topic']


merged_df.head(5)

Unnamed: 0,date,community_label_str,topic,combined
0,2021-01-04,Unpopular,Unclear Topic - mix between future of gamestop...,Unpopular community talking about Unclear Topi...
1,2021-01-05,Unpopular,"GME Observations (Price, Volume, Holders, etc.)",Unpopular community talking about GME Observat...
2,2021-01-10,Unpopular,Unclear Topic - lots of Twitter Links,Unpopular community talking about Unclear Topi...
3,2021-01-16,Unpopular,Posts about shorts/short squeeze,Unpopular community talking about Posts about ...
4,2021-01-19,Unpopular,Posts about shorts/short squeeze,Unpopular community talking about Posts about ...


## Below just combines all the files by total count by date

In [10]:
df_communites_count = merged_df.groupby(['date', 'community_label_str']).size().reset_index(name='count_community')

df_topic_count = merged_df.groupby(['date', 'topic']).size().reset_index(name='count_topic')

df_combined_count = merged_df.groupby(['date', 'combined']).size().reset_index(name='count_combined')

## By the way, you are completely right. Log Return does make the result much worse. This just means that we can't use Reddit posts to make day by day change prediction. Instead it can only be used to determine long-term performance. This kind of makes sense.

In [11]:
df_communites_pivot = df_communites_count.pivot(index='date', columns='community_label_str', values='count_community').reset_index()
df_communites_pivot = df_communites_pivot.fillna(0)
df_communites_pivot = df_communites_pivot.set_index('date')

df_topic_pivot = df_topic_count.pivot(index='date', columns='topic', values='count_topic').reset_index()
df_topic_pivot = df_topic_pivot.fillna(0)
df_topic_pivot = df_topic_pivot.set_index('date')

df_combined_pivot = df_combined_count.pivot(index='date', columns='combined', values='count_combined').reset_index()
df_combined_pivot= df_combined_pivot.fillna(0)
df_combined_pivot = df_combined_pivot.set_index('date')

df_prices['log_return'] = np.log(df_prices['GME'] / df_prices['GME'].shift(1))
df_prices = df_prices.fillna(0)
df_prices = df_prices.set_index('date')

## Lots, and lots, and lots of features

In [12]:
df_all = pd.merge(df_prices,df_communites_pivot,left_index=True,right_index=True,how='inner')
df_all = pd.merge(df_all,df_topic_pivot,left_index=True,right_index=True,how='inner')
df_all = pd.merge(df_all,df_combined_pivot,left_index=True,right_index=True,how='inner')
df_all

Unnamed: 0_level_0,GME,log_return,"Core, influential Redditors",Highly controversial,Scriptophobic,Unpopular,"Diamond Hands, Trading GME","GME Observations (Price, Volume, Holders, etc.)","MOD Posts, Many Links",News/Press Releases,...,"Unpopular community talking about Diamond Hands, Trading GME","Unpopular community talking about GME Observations (Price, Volume, Holders, etc.)","Unpopular community talking about MOD Posts, Many Links",Unpopular community talking about News/Press Releases,Unpopular community talking about Platform Terms/Government Regulations,Unpopular community talking about Posts about shorts/short squeeze,Unpopular community talking about Posts of Memes and Photos,Unpopular community talking about Unclear Topic - lots of Twitter Links,"Unpopular community talking about Unclear Topic - mix between future of gamestop, wealth, others",Unpopular community talking about Users Commenting on Other Posts
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-04,4.312500,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2021-01-05,4.342500,0.006932,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-01-06,4.590000,0.055430,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-01-12,4.987500,0.000501,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-01-14,9.977500,0.239819,0.0,0.0,1.0,1.0,2.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-24,45.002499,0.018050,0.0,6.0,11.0,36.0,13.0,2.0,3.0,1.0,...,9.0,2.0,3.0,1.0,1.0,1.0,2.0,4.0,7.0,6.0
2021-05-25,52.357498,0.151377,4.0,9.0,24.0,110.0,33.0,10.0,5.0,7.0,...,28.0,8.0,4.0,5.0,7.0,8.0,8.0,2.0,14.0,26.0
2021-05-26,60.639999,0.146860,4.0,8.0,22.0,83.0,33.0,3.0,1.0,6.0,...,25.0,1.0,1.0,4.0,2.0,6.0,11.0,3.0,9.0,21.0
2021-05-27,63.532501,0.046597,1.0,5.0,22.0,62.0,18.0,6.0,1.0,3.0,...,12.0,4.0,1.0,2.0,4.0,11.0,4.0,0.0,9.0,15.0


## Define X and Y

In [13]:
y = df_all['GME'].values
X = df_all.copy().drop(columns=['GME','log_return'])

## Okay, so below is doing a lot of things to simplify

1. It standardizes the X and Y values
2. Performs normal regression
3. Based on the normal regression, it takes the coefficients and sorts the columns of features based on the correlation that the model has found - in theory this will help us get the most important features
4. Then, we can see how much adding the features will impact R2 and MSE - where we start adding the most important features first. In the end, we can set some cut off at which the features stop adding value.
5. Records MSE and R2 for Lasso, Ridge, and regular regression
6. Records counts of features used by Lasso - remember that it is a feature selection technique

In [20]:
scaler = StandardScaler()

X_transformed = scaler.fit_transform(X)
y_transformed = scaler.fit_transform(y.reshape(-1, 1))

model = LinearRegression()
model.fit(X_transformed, y_transformed)

# Make predictions
y_pred = model.predict(X_transformed)

# Evaluate the model
mse = mean_squared_error(y_transformed, y_pred)
r2 = r2_score(y_transformed, y_pred)

col_order = np.argsort(abs(model.coef_))
col_order_desc = col_order[0][::-1]

r2_scores_standard = []
r2_scores_ridge = []
r2_scores_lasso = []

mse_scores_standard = []
mse_scores_ridge = []
mse_scores_lasso = []

feature_count = []
feature_count_lasso = []

coef_ranked_cols = [X.columns.values[i] for i in col_order_desc]

for i in np.arange(len(coef_ranked_cols)):
    
    X_temp = X[coef_ranked_cols[:i+1]]
        
    X_temp_transformed = scaler.fit_transform(X_temp)
    
    model_standard = LinearRegression()   
    model_ridge = Ridge(alpha=0.9)
    model_lasso = Lasso(alpha=0.01)    
    
    model_standard.fit(X_temp_transformed, y_transformed)
    model_ridge.fit(X_temp_transformed, y_transformed)
    model_lasso.fit(X_temp_transformed, y_transformed)
    
    y_pred_standard = model_standard.predict(X_temp_transformed)
    y_pred_ridge = model_ridge.predict(X_temp_transformed)
    y_pred_lasso = model_lasso.predict(X_temp_transformed)

    #print(y_pred_lasso)
    mse_standard = mean_squared_error(y_transformed, y_pred_standard)
    r2_standard = r2_score(y_transformed, y_pred_standard)
    
    mse_ridge = mean_squared_error(y_transformed, y_pred_ridge)
    r2_ridge = r2_score(y_transformed, y_pred_ridge)
    
    mse_lasso = mean_squared_error(y_transformed, y_pred_lasso)
    r2_lasso = r2_score(y_transformed, y_pred_lasso)
    
    coefficients_lasso = model_lasso.coef_
    retained_features = [X_temp.columns[i] for i in range(len(coefficients_lasso)) if coefficients_lasso[i] != 0]
    
    r2_scores_standard.append(r2_standard)
    r2_scores_ridge.append(r2_ridge)
    r2_scores_lasso.append(r2_lasso)
    
    mse_scores_standard.append(mse_standard)
    mse_scores_ridge.append(mse_ridge)
    mse_scores_lasso.append(mse_lasso)
    
    feature_count.append(i)
    feature_count_lasso.append(len(retained_features))
    
    if i == len(coef_ranked_cols) - 1:
        print(retained_features)
    
df_viz = pd.DataFrame({'r2_scores_standard':r2_scores_standard,
                       'r2_scores_ridge':r2_scores_ridge,
                       'r2_scores_lasso':r2_scores_lasso,
                       'mse_scores_standard':mse_scores_standard,
                       'mse_scores_ridge':mse_scores_ridge,
                       'mse_scores_lasso':mse_scores_lasso,
                       'feature_count': feature_count,
                       'feature_count_lasso':feature_count_lasso
                      })


df_viz

['Scriptophobic community talking about Unclear Topic - mix between future of gamestop, wealth, others', 'Unpopular community talking about Diamond Hands, Trading GME', 'Core, influential Redditors community talking about Users Commenting on Other Posts', 'Unpopular community talking about GME Observations (Price, Volume, Holders, etc.)', 'Highly controversial community talking about Unclear Topic - mix between future of gamestop, wealth, others', 'Scriptophobic community talking about Posts about shorts/short squeeze', 'Scriptophobic community talking about News/Press Releases', 'Core, influential Redditors community talking about GME Observations (Price, Volume, Holders, etc.)', 'Highly controversial community talking about GME Observations (Price, Volume, Holders, etc.)', 'Scriptophobic community talking about MOD Posts, Many Links', 'Core, influential Redditors community talking about Diamond Hands, Trading GME', 'Scriptophobic community talking about Unclear Topic - lots of Twitte

Unnamed: 0,r2_scores_standard,r2_scores_ridge,r2_scores_lasso,mse_scores_standard,mse_scores_ridge,mse_scores_lasso,feature_count,feature_count_lasso
0,0.052993,0.052989,0.052893,0.947007,0.947011,0.947107,0,1
1,0.092215,0.089187,0.082294,0.907785,0.910813,0.917706,1,2
2,0.093191,0.090679,0.081487,0.906809,0.909321,0.918513,2,3
3,0.114421,0.092496,0.081487,0.885579,0.907504,0.918513,3,3
4,0.119428,0.098624,0.087133,0.880572,0.901376,0.912867,4,4
5,0.125758,0.101585,0.08878,0.874242,0.898415,0.91122,5,5
6,0.140952,0.105223,0.089392,0.859048,0.894777,0.910608,6,6
7,0.249376,0.203887,0.198231,0.750624,0.796113,0.801769,7,5
8,0.293164,0.246517,0.241444,0.706836,0.753483,0.758556,8,6
9,0.29585,0.247166,0.242473,0.70415,0.752834,0.757527,9,6


## You can see that we reach around 0.4 on 16 features. Honestly, our conclusion is that there isn't a strong relationship but it's interesting nevertheless

In [21]:

base = alt.Chart(df_viz)

c1 = base.mark_line(color='blue').encode(
    x='feature_count',
    y='r2_scores_standard:Q'
)

c2 = base.mark_line(color='red').encode(
    x='feature_count',
    y='r2_scores_ridge:Q'
)
 

c3 = base.mark_line(color='green').encode(
    x='feature_count',
    y='r2_scores_lasso:Q'
)
 
(c1 + c2 + c3).properties(height=400,width=800)