In [1]:
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from matplotlib import style
import matplotlib.dates as mdates
from pylab import rcParams
import pandas as pd
import numpy as np
import pandas_datareader.data as web
import mplfinance as mpf
import bs4 as bs
import pickle
import requests
import os
from collections import Counter
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# --- Configuration -----------------------------------------------------------
# NOTE(review): none of these constants are used in the cells visible here; the
# descriptions below are inferred from the names — confirm against later cells.

# Presumably the fractions of the data held out for testing and validation.
test_size = 0.2
valid_size = 0.2
# Presumably a look-back window length in trading days — TODO confirm.
N = 21

# Presumably the XGBRegressor hyperparameters (names match its keyword arguments).
n_estimators = 100
max_depth = 3
learning_rate = 0.1
min_child_weight = 1
subsample = 1
colsample_bytree = 1
colsample_bylevel = 1
gamma = 0
model_seed = 100

# matplotlib 3.6 renamed the bundled seaborn styles with a 'seaborn-v0_8-' prefix
# and later dropped the old names; use whichever spelling this install provides.
_darkgrid_style = (
    'seaborn-v0_8-darkgrid'
    if 'seaborn-v0_8-darkgrid' in style.available
    else 'seaborn-darkgrid'
)
style.use(_darkgrid_style)

  from pandas.util.testing import assert_frame_equal


In [2]:
# Scrape the S&P 500 constituent symbols from the first column of the Wikipedia
# table. The result depends on the live page at the time the cell is run.
resp = requests.get(
    'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies',
    timeout=30,
)
resp.raise_for_status()  # fail loudly instead of parsing an HTTP error page

soup = bs.BeautifulSoup(resp.text, 'lxml')
table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    raise ValueError('S&P 500 constituents table not found on the Wikipedia page')

# Wikipedia's dotted class-share symbols are rewritten without the dot.
ticker_aliases = {'BRK.B': 'BRKB', 'BF.B': 'BFB'}

tickers = []
for row in table.find_all('tr')[1:]:  # [1:] skips the header row
    cells = row.find_all('td')
    if not cells:
        continue  # row without data cells: nothing to read
    # strip() removes the trailing newline without ever eating a real character,
    # unlike slicing off the last character unconditionally.
    ticker = cells[0].text.strip()
    tickers.append(ticker_aliases.get(ticker, ticker))

tickers

['MMM',
 'ABT',
 'ABBV',
 'ABMD',
 'ACN',
 'ATVI',
 'ADBE',
 'AMD',
 'AAP',
 'AES',
 'AFL',
 'A',
 'APD',
 'AKAM',
 'ALK',
 'ALB',
 'ARE',
 'ALXN',
 'ALGN',
 'ALLE',
 'LNT',
 'ALL',
 'GOOGL',
 'GOOG',
 'MO',
 'AMZN',
 'AMCR',
 'AEE',
 'AAL',
 'AEP',
 'AXP',
 'AIG',
 'AMT',
 'AWK',
 'AMP',
 'ABC',
 'AME',
 'AMGN',
 'APH',
 'ADI',
 'ANSS',
 'ANTM',
 'AON',
 'AOS',
 'APA',
 'AIV',
 'AAPL',
 'AMAT',
 'APTV',
 'ADM',
 'ANET',
 'AJG',
 'AIZ',
 'T',
 'ATO',
 'ADSK',
 'ADP',
 'AZO',
 'AVB',
 'AVY',
 'BKR',
 'BLL',
 'BAC',
 'BK',
 'BAX',
 'BDX',
 'BRKB',
 'BBY',
 'BIO',
 'BIIB',
 'BLK',
 'BA',
 'BKNG',
 'BWA',
 'BXP',
 'BSX',
 'BMY',
 'AVGO',
 'BR',
 'BFB',
 'CHRW',
 'COG',
 'CDNS',
 'CPB',
 'COF',
 'CAH',
 'KMX',
 'CCL',
 'CARR',
 'CAT',
 'CBOE',
 'CBRE',
 'CDW',
 'CE',
 'CNC',
 'CNP',
 'CTL',
 'CERN',
 'CF',
 'SCHW',
 'CHTR',
 'CVX',
 'CMG',
 'CB',
 'CHD',
 'CI',
 'CINF',
 'CTAS',
 'CSCO',
 'C',
 'CFG',
 'CTXS',
 'CLX',
 'CME',
 'CMS',
 'KO',
 'CTSH',
 'CL',
 'CMCSA',
 'CMA',
 'CAG',
 'CXO',


In [3]:
# Load the joined per-ticker table from disk; the first CSV column is used as the row index.
df = pd.read_csv('sp500_joined_closes.csv', index_col = 0)
df.head()

Unnamed: 0,Date,MMM,MMM_HL_PCT_DIFF,MMM_PCT_CHNG,ABT,ABT_HL_PCT_DIFF,ABT_PCT_CHNG,ABBV,ABBV_HL_PCT_DIFF,ABBV_PCT_CHNG,...,ZBRA_PCT_CHNG,ZBH,ZBH_HL_PCT_DIFF,ZBH_PCT_CHNG,ZION,ZION_HL_PCT_DIFF,ZION_PCT_CHNG,ZTS,ZTS_HL_PCT_DIFF,ZTS_PCT_CHNG
0,2010-01-04,62.327927,0.009435,-0.000842,18.082745,0.011869,0.004982,,,,...,0.007025,55.568928,0.013994,0.011971,11.82796,0.038043,0.030139,,,
1,2010-01-05,61.93755,0.018727,-0.003623,17.936651,0.013395,-0.008261,,,,...,0.002452,57.328018,0.021929,0.009785,12.244997,0.045352,0.039157,,,
2,2010-01-06,62.815929,0.013052,-0.002504,18.036259,0.010028,0.007045,,,,...,0.036118,57.309509,0.010574,-0.002578,13.309779,0.098829,0.097293,,,
3,2010-01-07,62.86095,0.019971,0.004921,18.185675,0.013131,0.00847,,,,...,-0.019129,58.624191,0.024775,0.007478,14.800472,0.137124,0.114228,,,
4,2010-01-08,63.303928,0.012245,0.007528,18.278645,0.010424,0.005296,,,,...,0.000725,57.392849,0.028478,0.002912,14.5609,0.073454,0.013589,,,


In [4]:
# Show the last rows of the loaded table.
df.tail()

Unnamed: 0,Date,MMM,MMM_HL_PCT_DIFF,MMM_PCT_CHNG,ABT,ABT_HL_PCT_DIFF,ABT_PCT_CHNG,ABBV,ABBV_HL_PCT_DIFF,ABBV_PCT_CHNG,...,ZBRA_PCT_CHNG,ZBH,ZBH_HL_PCT_DIFF,ZBH_PCT_CHNG,ZION,ZION_HL_PCT_DIFF,ZION_PCT_CHNG,ZTS,ZTS_HL_PCT_DIFF,ZTS_PCT_CHNG
2677,2020-08-21,161.710007,0.012131,0.004847,102.400002,0.016858,0.007577,94.860001,0.010602,-0.003571,...,-0.007498,136.160004,0.017463,-0.004533,31.59,0.031918,-0.015274,158.369995,0.012914,-0.002457
2678,2020-08-24,163.529999,0.017375,0.012256,101.389999,0.025124,-0.012756,94.510002,0.011255,-0.00411,...,-0.014246,136.809998,0.02432,-0.002261,33.040001,0.048193,0.041943,156.759995,0.022662,-0.018717
2679,2020-08-25,163.100006,0.017424,-0.010676,102.470001,0.013593,0.004805,94.059998,0.009371,-0.001592,...,0.005937,137.600006,0.015128,-0.004846,33.060001,0.039336,-0.011364,158.580002,0.014236,0.003734
2680,2020-08-26,163.899994,0.019049,0.006324,103.190002,0.020047,0.011766,94.349998,0.015817,0.009091,...,0.023471,136.830002,0.013988,-0.001824,32.279999,0.02789,-0.026831,159.759995,0.02029,0.013834
2681,2020-08-27,163.300003,0.012832,-0.007295,110.900398,0.038315,-0.000987,94.5,0.014839,0.001802,...,-0.038993,138.990005,0.019453,0.009002,33.279999,0.038069,0.029384,159.792404,0.012385,-0.001297


In [5]:
# Count missing values per column.
df.isnull().sum()

Date                  0
MMM                   0
MMM_HL_PCT_DIFF       0
MMM_PCT_CHNG          0
ABT                   0
                   ... 
ZION_HL_PCT_DIFF      0
ZION_PCT_CHNG         0
ZTS                 775
ZTS_HL_PCT_DIFF     775
ZTS_PCT_CHNG        775
Length: 1516, dtype: int64

In [6]:
# Replace every missing value with 0, then re-count nulls to confirm none remain.
# NOTE(review): this zero-fills price columns too (e.g. ZTS had 775 missing rows
# before this step) — confirm downstream code treats a 0 price as "no data".
df = df.fillna(0)
df.isnull().sum()

Date                0
MMM                 0
MMM_HL_PCT_DIFF     0
MMM_PCT_CHNG        0
ABT                 0
                   ..
ZION_HL_PCT_DIFF    0
ZION_PCT_CHNG       0
ZTS                 0
ZTS_HL_PCT_DIFF     0
ZTS_PCT_CHNG        0
Length: 1516, dtype: int64

In [7]:
# Check the dtype pandas inferred for each column.
df.dtypes

Date                 object
MMM                 float64
MMM_HL_PCT_DIFF     float64
MMM_PCT_CHNG        float64
ABT                 float64
                     ...   
ZION_HL_PCT_DIFF    float64
ZION_PCT_CHNG       float64
ZTS                 float64
ZTS_HL_PCT_DIFF     float64
ZTS_PCT_CHNG        float64
Length: 1516, dtype: object

In [8]:
# Convert the 'Date' strings to datetimes so the column can be merged on later.
# The unit is spelled out ('datetime64[ns]') because pandas 2.x rejects the
# unit-less 'datetime64' dtype in astype; the resulting dtype is the same.
df = df.astype({'Date': 'datetime64[ns]'})
df.dtypes

Date                datetime64[ns]
MMM                        float64
MMM_HL_PCT_DIFF            float64
MMM_PCT_CHNG               float64
ABT                        float64
                         ...      
ZION_HL_PCT_DIFF           float64
ZION_PCT_CHNG              float64
ZTS                        float64
ZTS_HL_PCT_DIFF            float64
ZTS_PCT_CHNG               float64
Length: 1516, dtype: object

In [9]:
# Download window: 1 Jan 2010 up to the moment this cell is executed.
start = dt.datetime(2010, 1, 1)
end = dt.datetime.now()

# Fetch the ^GSPC quotes and derive, per row:
#   GSPC_HL_PCT_DIFF = (High - Low) / Low
#   GSPC_PCT_CHNG    = (Close - Open) / Open
# 'Adj Close' is renamed to 'GSPC' and only Date plus these three columns are kept.
GSPC = (
    web.DataReader('^GSPC', 'yahoo', start, end)
    .reset_index()
    .assign(
        GSPC_HL_PCT_DIFF=lambda quotes: (quotes['High'] - quotes['Low']) / quotes['Low'],
        GSPC_PCT_CHNG=lambda quotes: (quotes['Close'] - quotes['Open']) / quotes['Open'],
    )
    .rename(columns={'Adj Close': 'GSPC'})
    [['Date', 'GSPC', 'GSPC_HL_PCT_DIFF', 'GSPC_PCT_CHNG']]
)
GSPC

Unnamed: 0,Date,GSPC,GSPC_HL_PCT_DIFF,GSPC_PCT_CHNG
0,2010-01-04,1132.989990,0.015503,0.014715
1,2010-01-05,1136.520020,0.006170,0.003408
2,2010-01-06,1137.140015,0.004621,0.001259
3,2010-01-07,1141.689941,0.009847,0.004770
4,2010-01-08,1144.979980,0.008071,0.003910
...,...,...,...,...
2679,2020-08-25,3443.620117,0.005362,0.002232
2680,2020-08-26,3478.729980,0.010720,0.008336
2681,2020-08-27,3484.550049,0.009523,-0.000169
2682,2020-08-28,3508.010010,0.007149,0.003812


In [10]:
# Display the table after null filling and date conversion.
df

Unnamed: 0,Date,MMM,MMM_HL_PCT_DIFF,MMM_PCT_CHNG,ABT,ABT_HL_PCT_DIFF,ABT_PCT_CHNG,ABBV,ABBV_HL_PCT_DIFF,ABBV_PCT_CHNG,...,ZBRA_PCT_CHNG,ZBH,ZBH_HL_PCT_DIFF,ZBH_PCT_CHNG,ZION,ZION_HL_PCT_DIFF,ZION_PCT_CHNG,ZTS,ZTS_HL_PCT_DIFF,ZTS_PCT_CHNG
0,2010-01-04,62.327927,0.009435,-0.000842,18.082745,0.011869,0.004982,0.000000,0.000000,0.000000,...,0.007025,55.568928,0.013994,0.011971,11.827960,0.038043,0.030139,0.000000,0.000000,0.000000
1,2010-01-05,61.937550,0.018727,-0.003623,17.936651,0.013395,-0.008261,0.000000,0.000000,0.000000,...,0.002452,57.328018,0.021929,0.009785,12.244997,0.045352,0.039157,0.000000,0.000000,0.000000
2,2010-01-06,62.815929,0.013052,-0.002504,18.036259,0.010028,0.007045,0.000000,0.000000,0.000000,...,0.036118,57.309509,0.010574,-0.002578,13.309779,0.098829,0.097293,0.000000,0.000000,0.000000
3,2010-01-07,62.860950,0.019971,0.004921,18.185675,0.013131,0.008470,0.000000,0.000000,0.000000,...,-0.019129,58.624191,0.024775,0.007478,14.800472,0.137124,0.114228,0.000000,0.000000,0.000000
4,2010-01-08,63.303928,0.012245,0.007528,18.278645,0.010424,0.005296,0.000000,0.000000,0.000000,...,0.000725,57.392849,0.028478,0.002912,14.560900,0.073454,0.013589,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2677,2020-08-21,161.710007,0.012131,0.004847,102.400002,0.016858,0.007577,94.860001,0.010602,-0.003571,...,-0.007498,136.160004,0.017463,-0.004533,31.590000,0.031918,-0.015274,158.369995,0.012914,-0.002457
2678,2020-08-24,163.529999,0.017375,0.012256,101.389999,0.025124,-0.012756,94.510002,0.011255,-0.004110,...,-0.014246,136.809998,0.024320,-0.002261,33.040001,0.048193,0.041943,156.759995,0.022662,-0.018717
2679,2020-08-25,163.100006,0.017424,-0.010676,102.470001,0.013593,0.004805,94.059998,0.009371,-0.001592,...,0.005937,137.600006,0.015128,-0.004846,33.060001,0.039336,-0.011364,158.580002,0.014236,0.003734
2680,2020-08-26,163.899994,0.019049,0.006324,103.190002,0.020047,0.011766,94.349998,0.015817,0.009091,...,0.023471,136.830002,0.013988,-0.001824,32.279999,0.027890,-0.026831,159.759995,0.020290,0.013834


In [11]:
# Check the GSPC column dtypes ahead of the merge on 'Date'.
GSPC.dtypes

Date                datetime64[ns]
GSPC                       float64
GSPC_HL_PCT_DIFF           float64
GSPC_PCT_CHNG              float64
dtype: object

In [12]:
# Left-join the index columns onto the per-ticker table by 'Date': every row of
# df is kept, and GSPC rows with no matching date in df are dropped.
df = df.merge(GSPC, on='Date', how='left')
df.head()

Unnamed: 0,Date,MMM,MMM_HL_PCT_DIFF,MMM_PCT_CHNG,ABT,ABT_HL_PCT_DIFF,ABT_PCT_CHNG,ABBV,ABBV_HL_PCT_DIFF,ABBV_PCT_CHNG,...,ZBH_PCT_CHNG,ZION,ZION_HL_PCT_DIFF,ZION_PCT_CHNG,ZTS,ZTS_HL_PCT_DIFF,ZTS_PCT_CHNG,GSPC,GSPC_HL_PCT_DIFF,GSPC_PCT_CHNG
0,2010-01-04,62.327927,0.009435,-0.000842,18.082745,0.011869,0.004982,0.0,0.0,0.0,...,0.011971,11.82796,0.038043,0.030139,0.0,0.0,0.0,1132.98999,0.015503,0.014715
1,2010-01-05,61.93755,0.018727,-0.003623,17.936651,0.013395,-0.008261,0.0,0.0,0.0,...,0.009785,12.244997,0.045352,0.039157,0.0,0.0,0.0,1136.52002,0.00617,0.003408
2,2010-01-06,62.815929,0.013052,-0.002504,18.036259,0.010028,0.007045,0.0,0.0,0.0,...,-0.002578,13.309779,0.098829,0.097293,0.0,0.0,0.0,1137.140015,0.004621,0.001259
3,2010-01-07,62.86095,0.019971,0.004921,18.185675,0.013131,0.00847,0.0,0.0,0.0,...,0.007478,14.800472,0.137124,0.114228,0.0,0.0,0.0,1141.689941,0.009847,0.00477
4,2010-01-08,63.303928,0.012245,0.007528,18.278645,0.010424,0.005296,0.0,0.0,0.0,...,0.002912,14.5609,0.073454,0.013589,0.0,0.0,0.0,1144.97998,0.008071,0.00391


In [13]:
# Show the last rows of the merged table, including the GSPC columns.
df.tail()

Unnamed: 0,Date,MMM,MMM_HL_PCT_DIFF,MMM_PCT_CHNG,ABT,ABT_HL_PCT_DIFF,ABT_PCT_CHNG,ABBV,ABBV_HL_PCT_DIFF,ABBV_PCT_CHNG,...,ZBH_PCT_CHNG,ZION,ZION_HL_PCT_DIFF,ZION_PCT_CHNG,ZTS,ZTS_HL_PCT_DIFF,ZTS_PCT_CHNG,GSPC,GSPC_HL_PCT_DIFF,GSPC_PCT_CHNG
2677,2020-08-21,161.710007,0.012131,0.004847,102.400002,0.016858,0.007577,94.860001,0.010602,-0.003571,...,-0.004533,31.59,0.031918,-0.015274,158.369995,0.012914,-0.002457,3397.159912,0.006111,0.003293
2678,2020-08-24,163.529999,0.017375,0.012256,101.389999,0.025124,-0.012756,94.510002,0.011255,-0.00411,...,-0.002261,33.040001,0.048193,0.041943,156.759995,0.022662,-0.018717,3431.280029,0.005555,0.003859
2679,2020-08-25,163.100006,0.017424,-0.010676,102.470001,0.013593,0.004805,94.059998,0.009371,-0.001592,...,-0.004846,33.060001,0.039336,-0.011364,158.580002,0.014236,0.003734,3443.620117,0.005362,0.002232
2680,2020-08-26,163.899994,0.019049,0.006324,103.190002,0.020047,0.011766,94.349998,0.015817,0.009091,...,-0.001824,32.279999,0.02789,-0.026831,159.759995,0.02029,0.013834,3478.72998,0.01072,0.008336
2681,2020-08-27,163.300003,0.012832,-0.007295,110.900398,0.038315,-0.000987,94.5,0.014839,0.001802,...,0.009002,33.279999,0.038069,0.029384,159.792404,0.012385,-0.001297,3484.550049,0.009523,-0.000169


In [14]:
# List the column labels of the merged table.
df.columns

Index(['Date', 'MMM', 'MMM_HL_PCT_DIFF', 'MMM_PCT_CHNG', 'ABT',
       'ABT_HL_PCT_DIFF', 'ABT_PCT_CHNG', 'ABBV', 'ABBV_HL_PCT_DIFF',
       'ABBV_PCT_CHNG',
       ...
       'ZBH_PCT_CHNG', 'ZION', 'ZION_HL_PCT_DIFF', 'ZION_PCT_CHNG', 'ZTS',
       'ZTS_HL_PCT_DIFF', 'ZTS_PCT_CHNG', 'GSPC', 'GSPC_HL_PCT_DIFF',
       'GSPC_PCT_CHNG'],
      dtype='object', length=1519)

In [15]:
#df.set_index('Date', inplace = True)
#df.plot(subplots = True, sharex = True, sharey = True)
#plt.show()

In [16]:
# Keep only the date, the index level and one column per scraped ticker symbol.
# Selecting by label raises KeyError if a symbol in `tickers` has no column in df.
new_columns = ['Date', 'GSPC', *tickers]
df_Stocks = df[new_columns]
df_Stocks.head()

Unnamed: 0,Date,GSPC,MMM,ABT,ABBV,ABMD,ACN,ATVI,ADBE,AMD,...,WYNN,XEL,XRX,XLNX,XYL,YUM,ZBRA,ZBH,ZION,ZTS
0,2010-01-04,1132.98999,62.327927,18.082745,0.0,8.74,33.595993,10.126624,37.09,9.7,...,41.963718,14.312444,18.972462,19.886999,0.0,20.419832,28.67,55.568928,11.82796,0.0
1,2010-01-05,1136.52002,61.93755,17.936651,0.0,8.53,33.803631,10.144546,37.700001,9.71,...,44.515926,14.142703,18.99445,19.63625,0.0,20.349995,28.620001,57.328018,12.244997,0.0
2,2010-01-06,1137.140015,62.815929,18.036259,0.0,8.4,34.162979,10.090778,37.619999,9.57,...,43.932011,14.169861,18.818563,19.503048,0.0,20.204515,28.4,57.309509,13.309779,0.0
3,2010-01-07,1141.689941,62.86095,18.185675,0.0,8.4,34.131035,9.848815,36.889999,9.47,...,44.870213,14.108753,18.906506,19.307158,0.0,20.198702,27.690001,58.624191,14.800472,0.0
4,2010-01-08,1144.97998,63.303928,18.278645,0.0,8.23,33.995285,9.768159,36.689999,9.43,...,44.548744,14.115545,18.840551,19.589245,0.0,20.204515,27.6,57.392849,14.5609,0.0


In [17]:
# Show the last rows of the price-only table.
df_Stocks.tail()

Unnamed: 0,Date,GSPC,MMM,ABT,ABBV,ABMD,ACN,ATVI,ADBE,AMD,...,WYNN,XEL,XRX,XLNX,XYL,YUM,ZBRA,ZBH,ZION,ZTS
2677,2020-08-21,3397.159912,161.710007,102.400002,94.860001,307.570007,237.490005,83.209999,473.220001,83.809998,...,82.790001,69.610001,18.76,102.0,79.144943,95.63224,275.329987,136.160004,31.59,158.369995
2678,2020-08-24,3431.280029,163.529999,101.389999,94.510002,304.950012,237.75,83.099998,476.299988,83.080002,...,86.790001,70.410004,19.299999,102.470001,81.138519,96.089996,275.390015,136.809998,33.040001,156.759995
2679,2020-08-25,3443.620117,163.100006,102.470001,94.059998,311.119995,238.210007,83.620003,484.429993,86.349998,...,86.209999,68.970001,18.91,103.389999,80.68,96.480003,277.859985,137.600006,33.060001,158.580002
2680,2020-08-26,3478.72998,163.899994,103.190002,94.349998,306.140015,240.449997,84.480003,528.48999,86.019997,...,84.800003,68.129997,18.790001,104.980003,80.699997,95.330002,284.73999,136.830002,32.279999,159.759995
2681,2020-08-27,3484.550049,163.300003,110.900398,94.5,311.529999,240.845001,84.110001,509.410004,84.214996,...,86.089996,68.239998,18.889999,103.360001,81.125,95.809998,276.029999,138.990005,33.279999,159.792404
