In [98]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit,learning_curve
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import altair as alt

In [2]:
from daily_stock_price import get_stock_prices
# from supervised_data_prep import pre_process_data
# from supervised_modeling import model_comparison

# Ticker symbol of the selected stock / index
TICKER = 'GME'

# Retrieve the training and test Reddit data for model comparison and scoring
# (original note: validation set is not included in data below, data > Sept30th)
df_topics = pd.read_csv('../smoilanen_milestone_II/df_train.csv')
df_communites = pd.read_csv('../koigawa_milestone_II/community_output_gme_train.csv')

# Retrieve stock prices for the same time period as the community data.
# The first 10 characters of the smallest / largest raw 'date' string are used
# as the bounds (presumably a YYYY-MM-DD prefix -- confirm against the CSV).
community_dates = df_communites['date']
MIN_DATE = community_dates.min()[:10]
MAX_DATE = community_dates.max()[:10]
df_prices = get_stock_prices(TICKER, start=MIN_DATE, end=MAX_DATE)

In [3]:
# Parse the raw 'date' values and keep only the calendar day (datetime.date objects),
# dropping any time-of-day component.
df_communites['date'] = pd.to_datetime(df_communites['date']).dt.date

In [4]:
# Attach each row's topic to its community label by joining on 'id' (rows present
# in both tables only), keep the three columns used downstream, and build a
# combined "<community> community talking about <topic>" label.
posts_with_topics = pd.merge(df_communites, df_topics, on='id', how='inner')
merged_df = posts_with_topics[['date', 'community_label_str', 'topic']].assign(
    combined=lambda frame: frame['community_label_str'] + ' community talking about ' + frame['topic']
)

In [5]:
def _count_per_day(label_column, count_name):
    """Count rows of merged_df per (date, label_column) pair.

    Returns a long-format DataFrame with columns 'date', label_column and
    count_name, where count_name holds the number of rows in each group.
    """
    grouped = merged_df.groupby(['date', label_column])
    return grouped.size().reset_index(name=count_name)


# Daily row counts per community, per topic, and per community/topic combination
df_communites_count = _count_per_day('community_label_str', 'count_community')
df_topic_count = _count_per_day('topic', 'count_topic')
df_combined_count = _count_per_day('combined', 'count_combined')

In [6]:
# Reshape each long daily-count table to wide form: one row per date, one column
# per label. Date/label pairs with no rows come out of pivot() as NaN and are
# replaced with 0. pivot() already leaves 'date' as the index, so no
# reset_index()/set_index() round trip is needed.
df_communites_pivot = df_communites_count.pivot(
    index='date', columns='community_label_str', values='count_community'
).fillna(0)

df_topic_pivot = df_topic_count.pivot(
    index='date', columns='topic', values='count_topic'
).fillna(0)

df_combined_pivot = df_combined_count.pivot(
    index='date', columns='combined', values='count_combined'
).fillna(0)

# Log return of each row's price relative to the previous row. The price column
# is looked up through TICKER instead of a hard-coded 'GME' so that changing the
# ticker in the configuration cell does not break this cell.
# (assumes get_stock_prices names the price column after the ticker -- confirm)
df_prices['log_return'] = np.log(df_prices[TICKER] / df_prices[TICKER].shift(1))

# The first row has no previous price, so its log return is NaN; fillna(0)
# replaces it (and any other missing value in df_prices) with 0.
df_prices = df_prices.fillna(0)

# NOTE: df_prices is rebound with 'date' moved into the index, so this cell
# cannot be re-run without first re-running the cell that loads df_prices.
df_prices = df_prices.set_index('date')

In [7]:
# Build the modelling table: start from the prices and inner-join each wide
# count table on the date index, so only dates present in every table survive.
df_all = df_prices
for daily_counts in (df_communites_pivot, df_topic_pivot, df_combined_pivot):
    df_all = pd.merge(df_all, daily_counts, left_index=True, right_index=True, how='inner')
df_all

Unnamed: 0_level_0,GME,log_return,"Core, influential Redditors",Highly controversial,Scriptophobic,Unpopular,"Diamond Hands, Trading GME","GME Observations (Price, Volume, Holders, etc.)","MOD Posts, Many Links",News/Press Releases,...,"Unpopular community talking about Diamond Hands, Trading GME","Unpopular community talking about GME Observations (Price, Volume, Holders, etc.)","Unpopular community talking about MOD Posts, Many Links",Unpopular community talking about News/Press Releases,Unpopular community talking about Platform Terms/Government Regulations,Unpopular community talking about Posts about shorts/short squeeze,Unpopular community talking about Posts of Memes and Photos,Unpopular community talking about Unclear Topic - lots of Twitter Links,"Unpopular community talking about Unclear Topic - mix between future of gamestop, wealth, others",Unpopular community talking about Users Commenting on Other Posts
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-04,4.312500,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2021-01-05,4.342500,0.006932,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-01-06,4.590000,0.055430,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-01-12,4.987500,0.000501,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021-01-14,9.977500,0.239819,0.0,0.0,1.0,1.0,2.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-24,45.002499,0.018050,0.0,6.0,11.0,36.0,13.0,2.0,3.0,1.0,...,9.0,2.0,3.0,1.0,1.0,1.0,2.0,4.0,7.0,6.0
2021-05-25,52.357498,0.151377,4.0,9.0,24.0,110.0,33.0,10.0,5.0,7.0,...,28.0,8.0,4.0,5.0,7.0,8.0,8.0,2.0,14.0,26.0
2021-05-26,60.639999,0.146860,4.0,8.0,22.0,83.0,33.0,3.0,1.0,6.0,...,25.0,1.0,1.0,4.0,2.0,6.0,11.0,3.0,9.0,21.0
2021-05-27,63.532501,0.046597,1.0,5.0,22.0,62.0,18.0,6.0,1.0,3.0,...,12.0,4.0,1.0,2.0,4.0,11.0,4.0,0.0,9.0,15.0


In [93]:
# Target: the log_return column as a NumPy array.
y = df_all['log_return'].values
# Features: every remaining column once the raw price and the target are dropped.
# The price column is referenced through TICKER rather than a hard-coded 'GME';
# drop() already returns a new frame, so no explicit copy() is needed.
X = df_all.drop(columns=[TICKER, 'log_return'])


datetime.date(2021, 5, 28)

In [125]:
# Expanding-window evaluation: fit on the earliest rows, score on all later
# rows, and chart how the hold-out MSE changes as the training window grows.
scaler = StandardScaler()
model = LinearRegression()

# Scaling lives inside the pipeline so that it is re-fit on each training
# window only and then applied unchanged to the matching hold-out rows.
pipeline = Pipeline([
    ('scaler', scaler),
    ('regressor', model)
])

# 20 training-window sizes, from 10% to 100% of the available rows
train_sizes_proportions = np.linspace(0.1, 1.0, 20)
train_sizes = (train_sizes_proportions * len(X)).astype(int)

date = []
test_scores_mse = []

for train_size in train_sizes:
    # Train on the first (train_size - 1) rows so that even the largest window
    # leaves at least one row to score against.
    split = train_size - 1
    if split < 1:
        # Too few rows to fit anything; skip this window instead of failing.
        continue

    X_train, X_test = X.iloc[:split], X.iloc[split:]
    y_train, y_test = y[:split], y[split:]

    # Fit and predict through the pipeline. The original code called the bare
    # regressor here, which silently skipped the StandardScaler step.
    pipeline.fit(X_train, y_train)
    y_test_pred = pipeline.predict(X_test)

    # Label each point with the last date contained in the training window
    date.append(X_train.index.max())
    test_scores_mse.append(mean_squared_error(y_test, y_test_pred))


viz_df = pd.DataFrame({'date': date, 'test_scores_mse': test_scores_mse})
viz_df["date"] = pd.to_datetime(viz_df["date"])

# Shared encodings for the line and its value labels
base = alt.Chart(viz_df).encode(
    x=alt.X('date:T', title='Last date in training window'),
    y=alt.Y('test_scores_mse:Q', title='Hold-out MSE', axis=alt.Axis(format='.2f')),
    text=alt.Text('test_scores_mse:Q', format='.2f')
)

c1 = base.mark_line().properties(height=400, width=900)

# Value labels drawn 20px above each point
c2 = base.mark_text(dy=-20).properties(height=400, width=900)

c1 + c2