In [25]:
!pip install -r ../koigawa_milestone_II/requirements.txt

import sys
import os
import pandas as pd
from sklearn.svm import SVR
import altair as alt
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import plotly.graph_objects as go

# This will allow scripts from Carl's file to be imported
sys.path.append(os.path.abspath('../cdebski_milestone_II'))

from daily_stock_price import get_stock_prices
from supervised_data_prep import pre_process_data, process_reddit_data, feature_selection

Collecting en-core-web-sm@ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889 (from -r ../koigawa_milestone_II/requirements.txt (line 18))
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m64.1 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m




In [26]:
# Below is just extract from model comparison - for training data

TICKER = 'GME'               

df_communites = pd.read_csv('../koigawa_milestone_II/community_output_gme_train.csv')
df_topics = pd.read_csv('../smoilanen_milestone_II/df_train.csv')

MIN_DATE = df_communites['date'].min()[:10]
MAX_DATE = df_communites['date'].max()[:10]
df_prices = get_stock_prices(TICKER, start=MIN_DATE, end=MAX_DATE)

df_pre_processed_train = pre_process_data(df_prices, df_communites, df_topics, MIN_DATE, MAX_DATE, TICKER, shift=1, rolling_avg=0, stock_price='log')

# Feature selection based on the training data
FEATURES = feature_selection(df_pre_processed_train, TICKER)[:10]


In [27]:
# Below is just extract from model comparison - for testing data

TICKER = 'GME'               

df_communites = pd.read_csv('../koigawa_milestone_II/community_output_gme_test.csv')
df_topics = pd.read_csv('../smoilanen_milestone_II/df_test.csv')

MIN_DATE = df_communites['date'].min()[:10]
MAX_DATE = df_communites['date'].max()[:10]
df_prices = get_stock_prices(TICKER, start=MIN_DATE, end=MAX_DATE)

df_pre_processed_test = pre_process_data(df_prices, df_communites, df_topics, MIN_DATE, MAX_DATE, TICKER, shift=1, rolling_avg=0, stock_price='log')

In [28]:
# Not all columns chosen in training will be present in test, so this creates common features

MISSING_FEATURES = [feature for feature in FEATURES if feature not in df_pre_processed_test.columns]

In [30]:
FEATURES

['Scriptophobic community talking about Meme & Photo Posts',
 'Core, influential Redditors community talking about MOD (moderator) Announcements',
 'Unpopular community talking about Brokerage Accounts_shift1',
 'Scriptophobic community talking about The GME Short Squeeze',
 "Unpopular community talking about Unity Amoung 'Apes'_shift1",
 'Unpopular community talking about Unclear Topic - Some posts about holding, FINRA, and a lot of external links_shift1',
 'Unpopular community talking about MOD (moderator) Announcements',
 'Scriptophobic community talking about Gamestop as a Business/Store_shift1',
 'MOD (moderator) Announcements',
 'Unpopular community talking about Regulatory Matters (Direct and Indirect)']

In [31]:
MISSING_FEATURES

for missing in MISSING_FEATURES:
    df_pre_processed_test[missing] = 0

In [33]:
df_pre_processed_test.set_index('date',inplace=True)

X_test = df_pre_processed_test[FEATURES]
y_test = df_pre_processed_test[TICKER]

In [34]:
df_pre_processed_train.set_index('date',inplace=True)

X_train = df_pre_processed_train[FEATURES]
y_train = df_pre_processed_train[TICKER]

In [35]:
best_params = pd.read_csv('../cdebski_milestone_II/results.csv')

best_params = best_params[best_params.best_mse == best_params['best_mse'].min()]

best_params

Unnamed: 0,model,features,best_params,best_mse
1,Support Vector (SVR),Scriptophobic community talking about Meme & P...,"{'Support Vector (SVR)__C': 0.1, 'Support Vect...",0.015988


In [36]:
best_params.best_params.values[0]

"{'Support Vector (SVR)__C': 0.1, 'Support Vector (SVR)__gamma': 0.1, 'Support Vector (SVR)__kernel': 'rbf'}"

In [37]:
c_values = np.logspace(-2, 0, 10)  
gamma_values = np.logspace(-2, 0, 10)

df_params = pd.DataFrame(columns=['x', 'y'])

for c in c_values:
    for gamma in gamma_values:
        
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('svr', SVR())])

        
        y_predict = pipeline.set_params(svr__C=c,svr__gamma=gamma).fit(X_train, y_train).predict(X_test)
        mse = mean_squared_error(y_predict,y_test)
        
        temp_df = pd.DataFrame({'x': [c], 'y': [gamma],'mse':[mse]})
        df_params = pd.concat([df_params,temp_df])
        
df_params

Unnamed: 0,x,y,mse
0,0.01,0.010000,0.003248
0,0.01,0.016681,0.003254
0,0.01,0.027826,0.003284
0,0.01,0.046416,0.003280
0,0.01,0.077426,0.003370
...,...,...,...
0,1.00,0.129155,0.007201
0,1.00,0.215443,0.008302
0,1.00,0.359381,0.009066
0,1.00,0.599484,0.011369


In [38]:
z_p = []
for i in np.unique(df_params.y):
    z_p.append(df_params[df_params.y == i]["mse"].values)


x_p = c_values
y_p = gamma_values

fig = go.Figure(data=[go.Surface(z=z_p, x=x_p, y=y_p)])

fig.update_layout(
    title="Hyperparameter Tuning Result",
    autosize=True,
    width=1000,
    height=1000,
    scene=dict(
        xaxis=dict(title="C"),
        yaxis=dict(title="Gamma"),
        zaxis=dict(title="Mean Squared Error on Log Return"),
    ),
)

fig.show()