In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Load the prepared dataset; the first CSV column is used as the row index.
csv_path = 'data/eda_data.csv'
df = pd.read_csv(csv_path, index_col=0)

In [3]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,Date,Symbol,Adj Close,Close,High,Low,Open,Volume,Exchange,Shortname,...,sma12,sma26,ema12,ema26,diff_ema12,diff_ema26,macd,macd_sma9,signal_line,macd_diff_signal
0,2010-01-04,MMM,59.318886,83.019997,83.449997,82.669998,83.089996,3043700.0,NYQ,3M Company,...,,,,,,,,,,
1,2010-01-05,MMM,58.947342,82.5,83.230003,81.699997,82.800003,2847000.0,NYQ,3M Company,...,,,,,,,,,,
2,2010-01-06,MMM,59.783295,83.669998,84.599998,83.510002,83.879997,5268500.0,NYQ,3M Company,...,,,,,,,,,,
3,2010-01-07,MMM,59.826176,83.730003,83.760002,82.120003,83.32,4470100.0,NYQ,3M Company,...,,,,,,,,,,
4,2010-01-08,MMM,60.247749,84.32,84.32,83.300003,83.690002,3405800.0,NYQ,3M Company,...,,,,,,,,,,


In [4]:
# List every column name so the feature/target picks below can be checked against it.
df.columns

Index(['Date', 'Symbol', 'Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume',
       'Exchange', 'Shortname', 'Longname', 'Sector', 'Industry',
       'Currentprice', 'Marketcap', 'Ebitda', 'Revenuegrowth', 'City', 'State',
       'Country', 'Fulltimeemployees', 'Longbusinesssummary', 'Weight',
       'cap_grouping', 'tenday_avg', 'next_close', 'prev_close', 'return',
       'diff_day', 'diff_10day', 'up', 'gain_loss', 'sma12', 'sma26', 'ema12',
       'ema26', 'diff_ema12', 'diff_ema26', 'macd', 'macd_sma9', 'signal_line',
       'macd_diff_signal'],
      dtype='object')

In [5]:
# One-hot encode the categorical columns we care about.
# drop_first=True drops the first level of each column, so no dummy column
# is a redundant linear combination of the others.
cat_cols = ['Exchange', 'Sector', 'cap_grouping']
categorical_df = df[cat_cols]
dummy_df = pd.get_dummies(categorical_df, drop_first=True)

In [6]:
# Append the dummy columns to the main dataframe, then remove the original
# categorical columns they were derived from.
# NOTE: run this cell once per kernel session -- a second run fails because
# the dummy columns already exist in df and the originals are gone.
df = df.join(dummy_df).drop(columns=cat_cols)

In [7]:
# 'up' is stored as a boolean flag; cast it to an integer column.
df = df.astype({'up': int})

In [8]:
# Choose the model inputs (X) and the prediction target (y).
# The Exchange_*/Sector_*/cap_grouping_* names are the dummy columns created
# by get_dummies above, so they depend on the categories present in the data.
# NOTE(review): the df.head() preview shows the macd/ema-based columns empty
# for the earliest rows, so X likely contains NaNs -- confirm they are handled
# before fitting a model.
feature_cols = [
    'return', 'gain_loss', 'macd', 'macd_diff_signal',
    'diff_day', 'diff_10day', 'diff_ema12', 'diff_ema26',
    'Exchange_NGM', 'Exchange_NMS', 'Exchange_NYQ',
    'Sector_Communication Services', 'Sector_Consumer Cyclical',
    'Sector_Consumer Defensive', 'Sector_Energy',
    'Sector_Financial Services', 'Sector_Healthcare', 'Sector_Industrials',
    'Sector_Real Estate', 'Sector_Technology', 'Sector_Utilities',
    'cap_grouping_medium', 'cap_grouping_small',
]
target_cols = ['next_close']

X = df[feature_cols]
# y is kept as a single-column DataFrame (not a Series).
y = df[target_cols]

In [9]:
# Hold out 25% of the rows for testing; rows are shuffled with a fixed seed
# so the split is reproducible.
# NOTE(review): the rows are dated per-symbol observations and the target is
# 'next_close'; a shuffled split may leak look-ahead information -- confirm
# this is acceptable or switch to a time-based split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=2,
)

In [10]:
# Standardise the features. The scaler's statistics are learned from the
# training split ONLY and then applied unchanged to both splits, so nothing
# about the test rows influences the scaling.
ss = StandardScaler()
ss.fit(X_train)
X_train_scaled = ss.transform(X_train)
X_test_scaled = ss.transform(X_test)