In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import altair as alt

**Model Comparison** 

The cell below takes the selected stock or index and compares the MSE of three models (Linear Regression, Support Vector Regression, and Decision Tree Regression) trained on the features whose r2 with the stock price ranks highest. Each model is tuned with a grid search so that optimized versions are compared. The best parameters, features, MSE, and standard deviation of the cross-validation scores are returned in a dataframe.

In [3]:
from daily_stock_price import get_stock_prices
from supervised_data_prep import pre_process_data, feature_selection
from supervised_modeling import model_comparison

# ticker of the stock / index to model
TICKER = 'GME'

# Reddit training + test data used for model comparison and scoring
# (note: validation set is not included in data below, data > Sept30th)
df_communites = pd.read_csv('../koigawa_milestone_II/community_output_gme_train.csv')
df_topics = pd.read_csv('../smoilanen_milestone_II/df_train.csv')

# the price history must span the same period as the Reddit data;
# the first 10 characters of each timestamp string are its YYYY-MM-DD part
community_dates = df_communites['date']
MIN_DATE = community_dates.min()[:10]
MAX_DATE = community_dates.max()[:10]
df_prices = get_stock_prices(ticker=TICKER, start=MIN_DATE, end=MAX_DATE)

# combine and clean the market and Reddit data;
# feature shifting, rolling average and the stock price transform are chosen here
df_pre_processed = pre_process_data(
    df_prices,
    df_communites,
    df_topics,
    MIN_DATE,
    MAX_DATE,
    TICKER,
    shift=1,
    rolling_avg=0,
    stock_price='log',
)

# keep the 10 features ranked highest by correlation with the target variable
FEATURES = feature_selection(df_pre_processed, TICKER)[:10]

# run the three-model comparison on the selected features
results = model_comparison(df_pre_processed, TICKER, features=FEATURES)
results

Unnamed: 0,model,features,best_params,best_mse,best_std
0,Linear,Scriptophobic community talking about Meme & P...,{'Linear__fit_intercept': True},0.295654,0.4662266167128731
1,Support Vector (SVR),Scriptophobic community talking about Meme & P...,"{'Support Vector (SVR)__C': 0.1, 'Support Vect...",0.017218,0.0161571540644641
2,Decision Tree,Scriptophobic community talking about Meme & P...,"{'Decision Tree__criterion': 'squared_error', ...",0.140646,0.2163596891403598


**Outputs**  

For convenience, the pre-processed features and the results from the table above are saved in csv files labeled 'pre_processed_data.csv' and 'results.csv', respectively. 

**Selecting Features**

Features are determined based on the highest correlation with the target variable, stock price. 

In [7]:
# 'date' is not a feature, so it is left out of the correlation analysis
df_features = df_pre_processed.drop(columns='date')

# The correlation matrix is the same for every feature, so it is computed once
# (not once per column) and only the TICKER column is kept: the correlation
# of every column with the stock price.
price_correlation = df_features.corr()[TICKER]

# r2 is the squared correlation; reset_index gives each feature a 0..n-1 row label
df_r2 = (
    price_correlation.pow(2)
    .rename('r2')
    .rename_axis('feature')
    .reset_index()
)

# remove correlation with self, then list the highest correlations first
df_r2 = (
    df_r2[df_r2['feature'] != TICKER]
    .sort_values(by='r2', ascending=False)
)

# visualize top 10 results
df_r2.head(10)

Unnamed: 0,feature,r2
24,Scriptophobic community talking about Meme & P...,0.030267
4,"Core, influential Redditors community talking ...",0.012274
81,Unpopular community talking about Brokerage Ac...,0.012248
26,Scriptophobic community talking about The GME ...,0.009395
90,Unpopular community talking about Unity Amoung...,0.007291
89,Unpopular community talking about Unclear Topi...,0.006972
32,Unpopular community talking about MOD (moderat...,0.006463
73,Scriptophobic community talking about Gamestop...,0.006331
46,MOD (moderator) Announcements,0.006067
35,Unpopular community talking about Regulatory M...,0.004434


While the initial correlation of communities / topics to stock price was 0.3 - 0.4, we've seen a significant drop after re-running the topic model using only the period prior to May 31 as training data. 

**Reddit Posts** 

Reddit data is provided including labels for the topic / community. 

In [10]:
# load the community-labelled Reddit posts and convert the 'date' column to datetimes
df = pd.read_csv('../koigawa_milestone_II/community_output_gme_train.csv')
df = df.astype({'date': 'datetime64[ns]'})
df.head()

Unnamed: 0.1,Unnamed: 0,id,author,date,selftext,community_label,community_label_str
0,0,kqfajb,TitsDownOnly,2021-01-04 19:02:26,After watching this I took a position RIGHT AW...,0.0,Highly controversial
1,1,kqvp7l,TitsDownOnly,2021-01-05 10:19:59,This guy explained exactly how to take a posit...,0.0,Highly controversial
2,2,kuo3w1,TitsDownOnly,2021-01-10 21:59:17,"After some downwards movement, I think everyb...",0.0,Highly controversial
3,3,ky1jb0,TitsDownOnly,2021-01-15 19:14:21,[removed],0.0,Highly controversial
4,4,ky1q3r,TitsDownOnly,2021-01-15 19:23:33,[removed],0.0,Highly controversial


In [12]:
# load the topic-labelled Reddit posts and preview the first rows
df = pd.read_csv(filepath_or_buffer='../smoilanen_milestone_II/df_train.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,id,datetime,topic,prime_topic,sec_topic,prime_score,sec_score,topic_scores
0,0,kqfajb,2021-01-04 19:02:26,The GME Short Squeeze,2,6,0.44297,0.44272,"[0.014291147039629852, 0.014287828586784674, 0..."
1,1,kqvp7l,2021-01-05 10:19:59,Mix - GME Price Movements and Nonsense Posts,6,0,0.601451,0.33699,"[0.33699002249896887, 0.007693001373190706, 0...."
2,2,krnthg,2021-01-06 13:28:54,Gamestop as a Business/Store,0,5,0.80454,0.129626,"[0.8045399378527857, 0.0029414200420169708, 0...."
3,3,kuo3w1,2021-01-10 21:59:17,General Posts/Knowledge About Understanding St...,8,7,0.423696,0.412991,"[0.005001643663542764, 0.005000870972639498, 0..."
4,4,kv1w9e,2021-01-11 12:42:49,Regulatory Matters (Direct and Indirect),5,0,0.818236,0.102985,"[0.102984889026295, 0.0012196594551367487, 0.0..."


**Stock Price History (daily_stock_price.py)**

Retrieves the stock prices for the ticker and timeline provided.

In [51]:
# tickers that can be selected:
#   VOO:   S&P 500 index
#   FSMDX: Mid cap index fund
#   SFSNX: Mid cap index fund
#   XLK:   Technology sector stock index
#   MEME:  Meme stock index
#   GME:   Gamestop stock

# fetch the price history for the chosen ticker (saves csv and chart png)
from daily_stock_price import get_stock_prices
TICKER = 'GME'
df_stock = get_stock_prices(ticker=TICKER, start=MIN_DATE, end=MAX_DATE)

# parse 'date' to datetimes so it can be merged with other datetime-keyed frames
df_stock = df_stock.assign(date=pd.to_datetime(df_stock['date']))
df_stock


Unnamed: 0,date,GME
0,2021-01-05,4.342500
1,2021-01-06,4.590000
2,2021-01-07,4.520000
3,2021-01-08,4.422500
4,2021-01-11,4.985000
...,...,...
181,2021-09-23,47.810001
182,2021-09-24,46.290001
183,2021-09-27,47.369999
184,2021-09-28,44.650002


**Visualize Community / Topic Activity**

Viewing the community group & topic activity alongside changes in the stock price.

In [55]:
# Sum the activity of all features (communities and topics) per row.
# NOTE(review): the positional slice assumes the first two columns of
# df_pre_processed are the non-feature columns ('date' and the price) - confirm.
feature_cols = df_pre_processed.columns[2:]

# Build the plotting frame in one step instead of assigning into a column
# slice (which raises SettingWithCopyWarning), then attach the actual stock
# price by date.
df_visual = pd.DataFrame({
    'date': pd.to_datetime(df_pre_processed['date']),
    'Total': df_pre_processed[feature_cols].sum(axis=1),
}).merge(df_stock, on='date')


# visualize: both lines share the same x encoding
base = alt.Chart(df_visual).encode(x='date')

# line chart of stock price; the axis title is coloured like its line so the
# reader can tell which axis belongs to which series
line1 = base.mark_line(color='blue').encode(
    y=alt.Y(TICKER, axis=alt.Axis(title='Stock Price', titleColor='blue')))

# line chart of Reddit activity
line2 = base.mark_line(color='red').encode(
    y=alt.Y('Total', axis=alt.Axis(title='Reddit Activity', titleColor='red')))

# combine charts; resolving the y scale independently is what gives each
# layer its own y-axis
dual_axis_chart = alt.layer(
    line1,
    line2
).resolve_scale(
    y='independent'
).properties(
    title=f'{TICKER} Stock Price and Reddit Activity')

dual_axis_chart
