# - AWS Machine Learning Nanodegree Capstone Project

## Summary

### In this notebook I will import the data, create a pipeline, explore the data, and train and test one or more models. I will then evaluate the results

In [12]:
#data import
import pandas as pd
import os
import glob

## Data import

### In this step I am importing data into Pandas dataframes, merging the various features into a single frame and resetting the index

In [14]:
data_path = os.path.join(os.getcwd(), 'use_this_data')

In [27]:
wl_path = os.path.join(data_path, 'watchlist_01012022_10312022')
rat_path = os.path.join(data_path, 'ratios_01012022_10312022')
sq_path = os.path.join(data_path, 'stockquotes_01012022_10312022')
ss_path = os.path.join(data_path, 'shortsales_01012022_10312022')

In [36]:
wl = pd.read_parquet(wl_path, columns=['Security Code Clean', 'file_date'])
rat = pd.read_parquet(rat_path, columns=['Security Code Clean', 'file_date', 'P/E ratio', 'P/B ratio'])
sq = pd.read_parquet(sq_path,  columns=['Security Code Clean', 'file_date', 'Trade Volume', 'Closing Price', 'Change'])
ss = pd.read_parquet(ss_path, columns=['Security Code Clean', 'file_date', 'Trading Volume', 'Trading Value'])

In [45]:
sq

Unnamed: 0,Security Code Clean,file_date,Trade Volume,Closing Price,Change
0,0050,2022-01-03,7064552,146.40,0.90
1,0051,2022-01-03,135357,60.90,0.10
2,0052,2022-01-03,882163,135.20,1.05
3,0053,2022-01-03,28349,70.45,0.50
4,0054,2022-01-03,3998,31.80,0.12
...,...,...,...,...,...
1171,9944,2022-10-31,71751,20.40,0.05
1172,9945,2022-10-31,6956898,39.60,0.40
1173,9946,2022-10-31,11298,11.55,0.05
1174,9955,2022-10-31,131905,17.05,0.25


In [57]:
#add indicator column for whether a stock is on the list
wl['on_watchlist'] = 1

In [58]:
wl

Unnamed: 0,Security Code Clean,file_date,on_watchlist
0,1213,2022-01-03,1
1,1418,2022-01-03,1
2,1472,2022-01-03,1
3,1512,2022-01-03,1
4,1538,2022-01-03,1
...,...,...,...
8,3043,2022-10-31,1
9,3536,2022-10-31,1
10,6225,2022-10-31,1
11,8101,2022-10-31,1


In [39]:
rat

Unnamed: 0,Security Code Clean,file_date,P/E ratio,P/B ratio
0,1101,2022-01-03,13.91,1.49
1,1102,2022-01-03,10.09,1.03
2,1103,2022-01-03,7.29,0.51
3,1104,2022-01-03,12.78,0.75
4,1108,2022-01-03,20.52,1.06
...,...,...,...,...
960,9944,2022-10-31,4.11,0.66
961,9945,2022-10-31,5.34,2.96
962,9946,2022-10-31,18.33,0.59
963,9955,2022-10-31,,1.21


In [41]:
ss.rename(columns={'Trading Volume':'ss_trading_vol', 'Trading Value':'ss_trading_value'}, inplace=True)

In [47]:
ss

Unnamed: 0,Security Code Clean,file_date,ss_trading_vol,ss_trading_value
0,0050,2022-01-03,2,292950
1,0051,2022-01-03,0,0
2,0052,2022-01-03,0,0
3,0053,2022-01-03,0,0
4,0054,2022-01-03,0,0
...,...,...,...,...
1118,9944,2022-10-31,0,0
1119,9945,2022-10-31,173,6891050
1120,9946,2022-10-31,0,0
1121,9955,2022-10-31,0,0


In [59]:
all1 = pd.merge(left=sq, right=ss, how='left', on=['Security Code Clean', 'file_date'], suffixes=('_sq','_ss'))

In [60]:
all2 = pd.merge(left=all1, right=rat, how='left', on=['Security Code Clean', 'file_date'], suffixes=('_sq_ss', '_rat'))

In [61]:
all = pd.merge(left=all2, right=wl, how='left', on=['Security Code Clean', 'file_date'], suffixes=('_sq_ss_rat', '_wl'))

In [52]:
all.set_index('file_date', inplace=True)

In [67]:
#showing watchlist rows
all[all['on_watchlist']==1]

Unnamed: 0,Security Code Clean,file_date,Trade Volume,Closing Price,Change,ss_trading_vol,ss_trading_value,P/E ratio,P/B ratio,on_watchlist
171,1213,2022-01-03,2001,8.42,0.00,0,0,,1.04,1.0
221,1418,2022-01-03,71316,12.70,1.05,0,0,,3.80,1.0
257,1472,2022-01-03,2,,0.00,0,0,,3.42,1.0
267,1512,2022-01-03,354293,4.41,0.40,0,0,,12.60,1.0
290,1538,2022-01-03,59844,10.40,0.15,0,0,,2.36,1.0
...,...,...,...,...,...,...,...,...,...,...
234859,3043,2022-10-31,204329,6.42,0.10,0,0,22.93,5.14,1.0
234922,3536,2022-10-31,6000,4.15,0.05,0,0,,7.03,1.0
235111,6225,2022-10-31,39220,6.30,0.12,0,0,,27.39,1.0
235207,8101,2022-10-31,75749,5.90,0.00,0,0,3.88,2.71,1.0


In [68]:
all

Unnamed: 0,Security Code Clean,file_date,Trade Volume,Closing Price,Change,ss_trading_vol,ss_trading_value,P/E ratio,P/B ratio,on_watchlist
0,0050,2022-01-03,7064552,146.40,0.90,2,292950,,,
1,0051,2022-01-03,135357,60.90,0.10,0,0,,,
2,0052,2022-01-03,882163,135.20,1.05,0,0,,,
3,0053,2022-01-03,28349,70.45,0.50,0,0,,,
4,0054,2022-01-03,3998,31.80,0.12,0,0,,,
...,...,...,...,...,...,...,...,...,...,...
235296,9944,2022-10-31,71751,20.40,0.05,0,0,4.11,0.66,
235297,9945,2022-10-31,6956898,39.60,0.40,173,6891050,5.34,2.96,
235298,9946,2022-10-31,11298,11.55,0.05,0,0,18.33,0.59,
235299,9955,2022-10-31,131905,17.05,0.25,0,0,,1.21,


In [70]:
#Investigate Nan - we can see multiple Nans indicating that not every stock has a value for all columns
all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 235301 entries, 0 to 235300
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   Security Code Clean  235301 non-null  object        
 1   file_date            235301 non-null  datetime64[ns]
 2   Trade Volume         235301 non-null  Int64         
 3   Closing Price        232340 non-null  float64       
 4   Change               235301 non-null  float64       
 5   ss_trading_vol       224945 non-null  Int64         
 6   ss_trading_value     224945 non-null  Int64         
 7   P/E ratio            168174 non-null  float64       
 8   P/B ratio            194346 non-null  float64       
 9   on_watchlist         3440 non-null    float64       
dtypes: Int64(3), datetime64[ns](1), float64(5), object(1)
memory usage: 20.4+ MB


In [55]:
#count of nulls per column
print(all.isnull().sum())

Security Code Clean        0
Trade Volume               0
Closing Price           2961
Change                     0
ss_trading_vol         10356
ss_trading_value       10356
P/E ratio              67127
P/B ratio              40955
dtype: int64


## Export data to prepare for Amazon Forecast

In [78]:
#save target timeseries (watchlist)
target = all[['file_date','Security Code Clean','on_watchlist']]

In [79]:
target

Unnamed: 0,file_date,Security Code Clean,on_watchlist
0,2022-01-03,0050,
1,2022-01-03,0051,
2,2022-01-03,0052,
3,2022-01-03,0053,
4,2022-01-03,0054,
...,...,...,...
235296,2022-10-31,9944,
235297,2022-10-31,9945,
235298,2022-10-31,9946,
235299,2022-10-31,9955,


In [80]:
target.to_parquet('forecast_target.parquet')

In [81]:
rel_stockquote = all[['file_date','Security Code Clean','Closing Price']]

In [82]:
rel_stockquote

Unnamed: 0,file_date,Security Code Clean,Closing Price
0,2022-01-03,0050,146.40
1,2022-01-03,0051,60.90
2,2022-01-03,0052,135.20
3,2022-01-03,0053,70.45
4,2022-01-03,0054,31.80
...,...,...,...
235296,2022-10-31,9944,20.40
235297,2022-10-31,9945,39.60
235298,2022-10-31,9946,11.55
235299,2022-10-31,9955,17.05


In [83]:
rel_stockquote.to_parquet('stockquote_rel.parquet')

In [84]:
rel_ratios = all[['file_date','Security Code Clean','P/E ratio','P/B ratio']]

In [87]:
rel_ratios

Unnamed: 0,file_date,Security Code Clean,P/E ratio,P/B ratio
0,2022-01-03,0050,,
1,2022-01-03,0051,,
2,2022-01-03,0052,,
3,2022-01-03,0053,,
4,2022-01-03,0054,,
...,...,...,...,...
235296,2022-10-31,9944,4.11,0.66
235297,2022-10-31,9945,5.34,2.96
235298,2022-10-31,9946,18.33,0.59
235299,2022-10-31,9955,,1.21


In [88]:
rel_ratios.to_parquet('ratios_rel.parquet')

In [89]:
rel_shortsales = all[['file_date','Security Code Clean','ss_trading_vol','ss_trading_value']]

In [90]:
rel_shortsales.to_parquet('shortsales_rel.parquet')