In [1]:
!pip install -r ../koigawa_milestone_II/requirements.txt

import sys
import os
import pandas as pd
from sklearn.svm import SVR
import altair as alt
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# Append the sibling project directory to the module search path so that the
# helper modules imported just below can be resolved from this notebook.
sys.path.append(os.path.abspath('../cdebski_milestone_II'))

from daily_stock_price import get_stock_prices
from supervised_data_prep import pre_process_data, process_reddit_data, feature_selection


Collecting en-core-web-sm@ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889 (from -r ../koigawa_milestone_II/requirements.txt (line 18))
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m58.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m




In [2]:
# Data preparation, taken over from the model-comparison work.

TICKER = 'GME'

# Two per-post tables; a later cell joins them on their shared 'id' column.
df_communites = pd.read_csv('../koigawa_milestone_II/community_output_gme_train.csv')
df_topics = pd.read_csv('../smoilanen_milestone_II/df_train.csv')

# Earliest / latest value of the 'date' column, truncated to its first 10
# characters (presumably the 'YYYY-MM-DD' part of a timestamp string -- TODO confirm).
MIN_DATE, MAX_DATE = (
    bound[:10]
    for bound in (df_communites['date'].min(), df_communites['date'].max())
)

# Price series for the ticker over the same date range.
df_prices = get_stock_prices(TICKER, start=MIN_DATE, end=MAX_DATE)

df_pre_processed = pre_process_data(
    df_prices,
    df_communites,
    df_topics,
    MIN_DATE,
    MAX_DATE,
    TICKER,
    shift=1,
    rolling_avg=0,
    stock_price='log',
)

# Keep only the first ten entries returned by feature_selection.
FEATURES = feature_selection(df_pre_processed, TICKER)[:10]

In [3]:
# Daily post counts, used only as a visual backdrop for the learning curve.
reddit_df = (
    df_communites
    .merge(df_topics, on='id', how='inner')
    # Collapse timestamps to calendar dates so that posts are grouped per day.
    .assign(date=lambda posts: pd.to_datetime(posts['date']).dt.date)
    .groupby(['date'], as_index=False)
    .count()[['date', 'id']]  # 'id' now holds the number of non-null ids per day
    # Back to datetime64 for plotting, plus a constant label used as the legend entry.
    .assign(
        date=lambda daily: pd.to_datetime(daily['date']),
        Encoding='Number of Posts',
    )
)

In [4]:
# Load the saved search results and keep only the row(s) whose best_mse equals
# the overall minimum.
best_params = (
    pd.read_csv('../cdebski_milestone_II/results.csv')
    .loc[lambda results: results['best_mse'] == results['best_mse'].min()]
)

best_params

Unnamed: 0,model,features,best_params,best_mse
1,Support Vector (SVR),Scriptophobic community talking about Meme & P...,"{'Support Vector (SVR)__C': 0.1, 'Support Vect...",0.015988


In [5]:
# Show the full (untruncated) parameter string of the first remaining row.
best_params['best_params'].iloc[0]

"{'Support Vector (SVR)__C': 0.1, 'Support Vector (SVR)__gamma': 0.1, 'Support Vector (SVR)__kernel': 'rbf'}"

In [6]:
# SVR configured with the hyper-parameters from the best_params string shown
# above (C=0.1, gamma=0.1, rbf kernel).
best_model = SVR(
    C=0.1,
    gamma=0.1,
    kernel='rbf',
)

In [7]:
# Use 'date' as the index so the last date of each training window can be read
# straight from the index when the chart data is built.
#
# The step is guarded and not done in place: once 'date' has been moved into the
# index, re-running this cell is a no-op instead of raising KeyError.
if df_pre_processed.index.name != 'date':
    df_pre_processed = df_pre_processed.set_index('date')

In [8]:
# Feature matrix (the selected FEATURES columns) and target (the column named
# after the ticker).
X, y = df_pre_processed[FEATURES], df_pre_processed[TICKER]

In [11]:
# Learning curve over an expanding training window.
# For each of 20 evenly spaced proportions (10% .. 100% of the rows) the model is
# fitted on the leading rows and scored on all remaining rows.
# NOTE(review): this assumes the rows of X / y are in date order -- TODO confirm.
train_sizes_proportions = np.linspace(0.1, 1.0, 20)
train_sizes = (train_sizes_proportions * len(X)).astype(int)

# A window size below 2 would give an empty training set (size 1) or a negative
# split position that silently wraps around (size 0); fail loudly instead.
if train_sizes.min() < 2:
    raise ValueError(
        'Not enough rows to build a learning curve: the smallest training '
        'window would contain no samples.'
    )

date = []             # last index label of each training window
test_scores_mse = []  # MSE on the rows that follow each training window

for train_size in train_sizes:
    # Split one row short of train_size so that the largest window (100%)
    # still leaves a single row to score on.
    split = train_size - 1

    # .iloc makes it explicit that the split is by position, not by label.
    X_train, y_train = X.iloc[:split], y.iloc[:split]
    X_test, y_test = X.iloc[split:], y.iloc[split:]

    best_model.fit(X_train, y_train)

    # Only held-out predictions are scored; no training-set prediction is needed.
    y_test_pred = best_model.predict(X_test)

    date.append(X_train.index.max())
    test_scores_mse.append(mean_squared_error(y_test, y_test_pred))


# Chart data: one row per training window, with a datetime 'date' column and a
# constant label that becomes the legend entry for the line.
viz_df = (
    pd.DataFrame({'date': date, 'test_scores_mse': test_scores_mse})
    .assign(
        date=lambda scores: pd.to_datetime(scores['date']),
        Encoding='MSE',
    )
)

# Encodings shared by the MSE line and its text labels.
base = alt.Chart(viz_df).encode(
    x='date',
    y=alt.Y(
        'test_scores_mse:Q',
        axis=alt.Axis(format='.2f', title='Mean Squared Error (MSE) of Log Returns'),
        scale=alt.Scale(domain=[0, 0.08]),  # fixed range for the MSE axis
    ),
    text=alt.Text('test_scores_mse:Q', format='.2f'),
    color=alt.Color('Encoding'),
)

# One-row frame with the index labels at positions 40 and 85 of X, marking the
# start and end of the shaded span.
# NOTE(review): the two positions are hard-coded -- presumably a period of
# interest; confirm and consider naming them.
rect_data = pd.DataFrame([{"x1": X.index[40], "x2": X.index[85]}])

# Faint red rectangle spanning x1..x2; it contributes no axis title of its own.
rect = (
    alt.Chart(rect_data)
    .mark_rect(opacity=0.1)
    .encode(
        x=alt.X("x1:T", axis=alt.Axis(title=None)),
        x2=alt.X2("x2:T"),
        color=alt.ColorValue("#FF0000"),
    )
)

# MSE drawn as a line, plus a text layer printing each MSE value, offset from
# its point by dx=-5 / dy=-25 pixels.
c1 = base.mark_line(size=2).properties(width=900, height=400)
c2 = base.mark_text(dx=-5, dy=-25).properties(width=900, height=400)

# Daily post counts as bars, on a fixed 0..5000 axis.
bar = (
    alt.Chart(reddit_df)
    .mark_bar()
    .encode(
        x='date:T',
        y=alt.Y(
            'id:Q',
            axis=alt.Axis(title='Number of Posts'),
            scale=alt.Scale(domain=[0, 5000]),
        ),
        color=alt.Color('Encoding'),
    )
)


# Final figure: shaded span and post-count bars underneath, MSE line and labels
# on top. The y scales are resolved independently so that post counts and MSE
# each get their own axis.
(
    alt.layer(rect + bar, c1 + c2)
    .resolve_scale(y='independent')
    .configure_axis(grid=False)
    .configure_legend(title=None, orient='top')
    .properties(
        width=900,
        height=250,
        title=alt.TitleParams(
            text='  Learning Curve',
            fontSize=25,
            subtitle='Showing decline in MSE as the training data point increases',
            anchor='start',
        ),
    )
)