# cs5044 final project experiment 2a: predicting over entire dataset

In [1]:
%matplotlib inline
import math
import pandas as pd
from pandas.plotting import lag_plot
from IPython.display import display
from sklearn import preprocessing
from sklearn import linear_model, neighbors
from sklearn import tree
from sklearn import model_selection
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor


import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# preprocessing

### read in data

In [2]:
# Load the three raw CSV tables (index levels, per-stock quotes, company
# metadata) from the working directory in a single unpacking assignment.
df_index, df_stocks, df_companies = map(
    pd.read_csv,
    ('sp500_index.csv', 'sp500_stocks.csv', 'sp500_companies.csv'),
)

### convert Date columns to date objects

In [3]:
# Parse each Date column as UTC timestamps, then keep only the calendar date
# (plain `datetime.date` objects, no time or timezone component).
# Unparseable stock dates are coerced to NaT instead of raising.
df_stocks['Date'] = pd.to_datetime(df_stocks['Date'], utc=True, errors='coerce').dt.date

# The index file is parsed strictly: a malformed date raises here.
df_index['Date'] = pd.to_datetime(df_index['Date'], utc=True).dt.date

### merge

In [4]:
# Join company metadata to its quotes by ticker symbol, then attach the
# index table's columns for the matching date (both joins are inner joins).
df = (
    df_companies
    .merge(df_stocks, how='inner', on='Symbol')
    .merge(df_index, on='Date')
)
df.head()

Unnamed: 0,Exchange,Symbol,Shortname,Longname,Sector,Industry,Currentprice,Marketcap,Ebitda,Revenuegrowth,...,Longbusinesssummary,Weight,Date,Adj Close,Close,High,Low,Open,Volume,S&P500
0,NMS,AAPL,Apple Inc.,Apple Inc.,Technology,Consumer Electronics,140.42,2256661643264,129557000000.0,0.019,...,"Apple Inc. designs, manufactures, and markets ...",0.070093,2012-10-11,19.372286,22.432142,23.114286,22.432142,23.089287,546081200.0,1432.84
1,NMS,MSFT,Microsoft Corporation,Microsoft Corporation,Technology,Software—Infrastructure,229.25,1721204342784,97983000000.0,0.124,...,"Microsoft Corporation develops, licenses, and ...",0.053462,2012-10-11,23.988226,28.950001,29.25,28.870001,29.219999,41488500.0,1432.84
2,NMS,GOOGL,Alphabet Inc.,Alphabet Inc.,Communication Services,Internet Content & Information,97.86,1281926889472,96887000000.0,0.126,...,Alphabet Inc. provides various products and pl...,0.039818,2012-10-11,18.805805,18.805805,18.981482,18.776026,18.841341,95260644.0,1432.84
3,NMS,GOOG,Alphabet Inc.,Alphabet Inc.,Communication Services,Internet Content & Information,98.71,1281502609408,96887000000.0,0.126,...,Alphabet Inc. provides various products and pl...,0.039804,2012-10-11,18.716845,18.716845,18.891689,18.687206,18.752213,95713418.0,1432.84
4,NMS,AMZN,"Amazon.com, Inc.","Amazon.com, Inc.",Consumer Cyclical,Internet Retail,113.67,1158024396800,52620000000.0,0.072,...,"Amazon.com, Inc. engages in the retail sale of...",0.035969,2012-10-11,12.211,12.211,12.465,12.0945,12.4,68946000.0,1432.84


### drop irrelevant columns

In [5]:
# Free-text / identifying columns and the join key are not used as features.
irrelevant_columns = ['Shortname', 'Longname', 'City', 'Longbusinesssummary', 'Date']
df.drop(columns=irrelevant_columns, inplace=True)

### one-hot encode categorical columns

In [None]:
# One-hot encode the categorical company attributes and swap the resulting
# indicator columns in for the original string columns.
categorical_cols = ['Exchange', 'Sector', 'Industry', 'State', 'Country']

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and later removed the
# old keyword; try the current name first and fall back for older releases.
try:
    ohe = preprocessing.OneHotEncoder(dtype=int, sparse_output=False, handle_unknown="ignore")
except TypeError:
    ohe = preprocessing.OneHotEncoder(dtype=int, sparse=False, handle_unknown="ignore")
data = ohe.fit_transform(df[categorical_cols])

# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`. Passing the column names yields readable labels
# such as "Exchange_NMS" instead of the positional "x0_NMS".
if hasattr(ohe, "get_feature_names_out"):
    feature_names = ohe.get_feature_names_out(categorical_cols)
else:
    feature_names = ohe.get_feature_names(categorical_cols)

# Reuse df's index so the column-wise concat lines rows up by label rather
# than relying on df happening to carry a default 0..n-1 index.
cats = pd.DataFrame(data, columns=feature_names, index=df.index)
snp_df = pd.concat([cats, df], axis=1)
snp_df.drop(columns=categorical_cols, inplace=True)

snp_df

### check % of missing values

In [None]:
# Lift the display limits so the per-column report below is not truncated.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 250)

# Percentage of null entries in each column of the encoded frame.
percent_missing = snp_df.isna().sum().mul(100).div(len(snp_df))
missing_value_df = pd.DataFrame(
    {
        'column_name': snp_df.columns,
        'percent_missing': percent_missing,
    }
)

missing_value_df

### drop missing values

In [None]:
# Remove, in place, every row that has at least one missing value.
# The index is not reset, so row labels are no longer contiguous afterwards.
snp_df.dropna(axis=0, how="any", inplace=True)
snp_df

### separate target and features

In [None]:
# Target is the adjusted close. The features are every remaining column
# except the other price columns and the ticker symbol.
snp_target = snp_df["Adj Close"]
excluded_columns = ["Adj Close", "Close", "High", "Low", "Open", "Symbol"]
snp_xvalues = snp_df.drop(columns=excluded_columns)

# linear regression

In [None]:
# 5-fold cross-validation of an ordinary least-squares model.
# Only the first 100,000 rows are used; shuffle=False keeps the folds in
# their original row order.
reg = linear_model.LinearRegression()

kf = model_selection.KFold(n_splits=5, shuffle=False)

r2scores = []
MSEscores = []  # holds the per-fold *root* MSE, despite the name

for train_index, test_index in kf.split(snp_df[:100000]):
    X_train, X_test = snp_xvalues.iloc[train_index], snp_xvalues.iloc[test_index]
    y_train, y_test = snp_target.iloc[train_index], snp_target.iloc[test_index]
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    # Score the fold once and reuse the values for the log and the summary.
    fold_r2 = metrics.r2_score(y_test, y_pred)
    fold_rmse = math.sqrt(metrics.mean_squared_error(y_test, y_pred))
    r2scores.append(fold_r2)
    MSEscores.append(fold_rmse)
    print("r2=  {}".format(fold_r2))
    print("root MSE= {}".format(fold_rmse))
    print()

print("average r2=  {}".format(np.mean(r2scores)))
print("std=  {}".format(np.std(r2scores)))
print("average root MSE=  {}".format(np.mean(MSEscores)))
print("std=  {}".format(np.std(MSEscores)))
print()

# y_pred holds the last fold's predictions, so the matching ground truth is
# the tail of that fold's targets (y_test), not the tail of the full series.
print("predicted:")
print(y_pred[-25:])
print("ground truth:")
print(y_test[-25:].values)

# kNN regression

### k=1

In [None]:
# 5-fold cross-validation of a 1-nearest-neighbour regressor on the first
# 100,000 rows.
# NOTE(review): this cell uses unshuffled folds while the k=3 and k=5 cells
# shuffle, so the k comparison is not made on identical splits. Confirm
# whether that is intended.
# NOTE(review): features are passed unscaled here, unlike the neural-network
# cells, which min-max scale them. Confirm this is intended for a
# distance-based model.
knn_reg = neighbors.KNeighborsRegressor(n_neighbors=1, weights='uniform')

kf = model_selection.KFold(n_splits=5, shuffle=False)

r2scores = []
MSEscores = []  # holds the per-fold *root* MSE, despite the name

for train_index, test_index in kf.split(snp_df[:100000]):
    X_train, X_test = snp_xvalues.iloc[train_index], snp_xvalues.iloc[test_index]
    y_train, y_test = snp_target.iloc[train_index], snp_target.iloc[test_index]
    knn_reg.fit(X_train, y_train)
    y_pred = knn_reg.predict(X_test)

    # Score the fold once and reuse the values for the log and the summary.
    fold_r2 = metrics.r2_score(y_test, y_pred)
    fold_rmse = math.sqrt(metrics.mean_squared_error(y_test, y_pred))
    r2scores.append(fold_r2)
    MSEscores.append(fold_rmse)
    print("r2=  {}".format(fold_r2))
    print("root MSE= {}".format(fold_rmse))
    print()

print("average r2=  {}".format(np.mean(r2scores)))
print("std=  {}".format(np.std(r2scores)))
print("average root MSE=  {}".format(np.mean(MSEscores)))
print("std=  {}".format(np.std(MSEscores)))
print()

# y_pred holds the last fold's predictions, so the matching ground truth is
# the tail of that fold's targets (y_test), not the tail of the full series.
print("predicted:")
print(y_pred[-25:])
print("ground truth:")
print(y_test[-25:].values)

### k=3

In [None]:
# 5-fold cross-validation of a 3-nearest-neighbour regressor on the first
# 100,000 rows.
# NOTE(review): features are passed unscaled here, unlike the neural-network
# cells, which min-max scale them. Confirm this is intended for a
# distance-based model.
knn_reg = neighbors.KNeighborsRegressor(n_neighbors=3, weights='uniform')

# Seed the shuffle so the folds (and therefore the scores) are reproducible
# on a fresh run; 2 is the seed the neural-network cells already use.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2)

r2scores = []
MSEscores = []  # holds the per-fold *root* MSE, despite the name

for train_index, test_index in kf.split(snp_df[:100000]):
    X_train, X_test = snp_xvalues.iloc[train_index], snp_xvalues.iloc[test_index]
    y_train, y_test = snp_target.iloc[train_index], snp_target.iloc[test_index]
    knn_reg.fit(X_train, y_train)
    y_pred = knn_reg.predict(X_test)

    # Score the fold once and reuse the values for the log and the summary.
    fold_r2 = metrics.r2_score(y_test, y_pred)
    fold_rmse = math.sqrt(metrics.mean_squared_error(y_test, y_pred))
    r2scores.append(fold_r2)
    MSEscores.append(fold_rmse)
    print("r2=  {}".format(fold_r2))
    print("root MSE= {}".format(fold_rmse))
    print()

print("average r2=  {}".format(np.mean(r2scores)))
print("std=  {}".format(np.std(r2scores)))
print("average root MSE=  {}".format(np.mean(MSEscores)))
print("std=  {}".format(np.std(MSEscores)))
print()

# y_pred holds the last fold's predictions, so the matching ground truth is
# the tail of that fold's targets (y_test), not the tail of the full series.
print("predicted:")
print(y_pred[-25:])
print("ground truth:")
print(y_test[-25:].values)

### k=5

In [None]:
# 5-fold cross-validation of a 5-nearest-neighbour regressor on the first
# 100,000 rows.
# NOTE(review): features are passed unscaled here, unlike the neural-network
# cells, which min-max scale them. Confirm this is intended for a
# distance-based model.
knn_reg = neighbors.KNeighborsRegressor(n_neighbors=5, weights='uniform')

# Seed the shuffle so the folds (and therefore the scores) are reproducible
# on a fresh run; 2 is the seed the neural-network cells already use.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2)

r2scores = []
MSEscores = []  # holds the per-fold *root* MSE, despite the name

for train_index, test_index in kf.split(snp_df[:100000]):
    X_train, X_test = snp_xvalues.iloc[train_index], snp_xvalues.iloc[test_index]
    y_train, y_test = snp_target.iloc[train_index], snp_target.iloc[test_index]
    knn_reg.fit(X_train, y_train)
    y_pred = knn_reg.predict(X_test)

    # Score the fold once and reuse the values for the log and the summary.
    fold_r2 = metrics.r2_score(y_test, y_pred)
    fold_rmse = math.sqrt(metrics.mean_squared_error(y_test, y_pred))
    r2scores.append(fold_r2)
    MSEscores.append(fold_rmse)
    print("r2=  {}".format(fold_r2))
    print("root MSE= {}".format(fold_rmse))
    print()

print("average r2=  {}".format(np.mean(r2scores)))
print("std=  {}".format(np.std(r2scores)))
print("average root MSE=  {}".format(np.mean(MSEscores)))
print("std=  {}".format(np.std(MSEscores)))
print()

# y_pred holds the last fold's predictions, so the matching ground truth is
# the tail of that fold's targets (y_test), not the tail of the full series.
print("predicted:")
print(y_pred[-25:])
print("ground truth:")
print(y_test[-25:].values)

# neural network

In [None]:
# Plain NumPy views of the features and target, indexed positionally by the
# cross-validation loops in the cells below.
X, y = snp_xvalues.to_numpy(), snp_target.to_numpy()

## activation='relu'

### layers=1, size=100

In [None]:
# Seeded, shuffled 5-fold CV of a ReLU MLP with one hidden layer of 100
# units, run over the first 100,000 rows.
kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=2)

r2, RMSE = [], []

for train_idx, test_idx in kfold.split(snp_df[:100000]):
    y_train, y_test = y[train_idx], y[test_idx]

    # The scaler is fitted on the training fold only and then reused,
    # unchanged, on the held-out fold.
    X_scaler = preprocessing.MinMaxScaler()
    X_train = X_scaler.fit_transform(X[train_idx])
    X_test = X_scaler.transform(X[test_idx])

    # An integer hidden_layer_sizes means a single hidden layer of that width.
    mlp_regr = MLPRegressor(hidden_layer_sizes=100, activation="relu", max_iter=2000, random_state=2)
    mlp_regr.fit(X_train, y_train)
    y_pred = mlp_regr.predict(X_test)

    r2.append(metrics.r2_score(y_test, y_pred))
    RMSE.append(math.sqrt(mean_squared_error(y_test, y_pred)))

print("r2    = {:.4f} ±{:.4f}".format(np.mean(r2), np.std(r2)))
print("RMSE = {:.4f} ±{:.4f}".format(np.mean(RMSE), np.std(RMSE)))

### layers=2, size=100

In [None]:
# Seeded, shuffled 5-fold CV of a ReLU MLP with two hidden layers of 100
# units each, run over the first 100,000 rows.
kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=2)

r2, RMSE = [], []

for train_idx, test_idx in kfold.split(snp_df[:100000]):
    y_train, y_test = y[train_idx], y[test_idx]

    # The scaler is fitted on the training fold only and then reused,
    # unchanged, on the held-out fold.
    X_scaler = preprocessing.MinMaxScaler()
    X_train = X_scaler.fit_transform(X[train_idx])
    X_test = X_scaler.transform(X[test_idx])

    mlp_regr = MLPRegressor(hidden_layer_sizes=(100, 100), activation="relu", max_iter=2000, random_state=2)
    mlp_regr.fit(X_train, y_train)
    y_pred = mlp_regr.predict(X_test)

    r2.append(metrics.r2_score(y_test, y_pred))
    RMSE.append(math.sqrt(mean_squared_error(y_test, y_pred)))

print("r2    = {:.4f} ±{:.4f}".format(np.mean(r2), np.std(r2)))
print("RMSE = {:.4f} ±{:.4f}".format(np.mean(RMSE), np.std(RMSE)))

## activation='tanh'

### layers=1, size=100

In [None]:
# Seeded, shuffled 5-fold CV of a tanh MLP with one hidden layer of 100
# units, run over the first 100,000 rows.
kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=2)

r2, RMSE = [], []

for train_idx, test_idx in kfold.split(snp_df[:100000]):
    y_train, y_test = y[train_idx], y[test_idx]

    # The scaler is fitted on the training fold only and then reused,
    # unchanged, on the held-out fold.
    X_scaler = preprocessing.MinMaxScaler()
    X_train = X_scaler.fit_transform(X[train_idx])
    X_test = X_scaler.transform(X[test_idx])

    # An integer hidden_layer_sizes means a single hidden layer of that width.
    mlp_regr = MLPRegressor(hidden_layer_sizes=100, activation="tanh", max_iter=2000, random_state=2)
    mlp_regr.fit(X_train, y_train)
    y_pred = mlp_regr.predict(X_test)

    r2.append(metrics.r2_score(y_test, y_pred))
    RMSE.append(math.sqrt(mean_squared_error(y_test, y_pred)))

print("r2    = {:.4f} ±{:.4f}".format(np.mean(r2), np.std(r2)))
print("RMSE = {:.4f} ±{:.4f}".format(np.mean(RMSE), np.std(RMSE)))

### layers=2, size=100

In [None]:
# Seeded, shuffled 5-fold CV of a tanh MLP with two hidden layers of 100
# units each, run over the first 100,000 rows.
kfold = model_selection.KFold(5, shuffle=True, random_state=2)

r2 = []
RMSE = []

for train_idx, test_idx in kfold.split(snp_df[:100000]):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Two hidden layers, as this section's heading states. A bare (100) is
    # just the integer 100, i.e. one layer, which would silently repeat the
    # single-layer tanh experiment.
    mlp_regr = MLPRegressor(hidden_layer_sizes=(100, 100), max_iter=2000, activation="tanh", random_state=2)
    X_scaler = preprocessing.MinMaxScaler()

    # Fit the scaler on the training fold only, then apply it to the held-out
    # fold so no test-fold statistics leak into training.
    X_train = X_scaler.fit_transform(X_train)

    mlp_regr.fit(X_train, y_train)

    X_test = X_scaler.transform(X_test)
    y_pred = mlp_regr.predict(X_test)

    r2 += [metrics.r2_score(y_test, y_pred)]
    RMSE += [math.sqrt(metrics.mean_squared_error(y_test, y_pred))]

print("r2    = {:.4f} ±{:.4f}".format(np.mean(r2), np.std(r2)))
print("RMSE = {:.4f} ±{:.4f}".format(np.mean(RMSE), np.std(RMSE)))