## Data Wrangling

In [1]:
# importing libraries
import pandas as pd
import pandas_ta as ta
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the symbol metadata table (one row per listed symbol).
symbols_meta_path = 'stock_market_data/symbols_valid_meta.csv'
df = pd.read_csv(symbols_meta_path)

In [3]:
# Preview the first five metadata rows.
df.head(n=5)

Unnamed: 0,Nasdaq Traded,Symbol,Security Name,Listing Exchange,Market Category,ETF,Round Lot Size,Test Issue,Financial Status,CQS Symbol,NASDAQ Symbol,NextShares
0,Y,A,"Agilent Technologies, Inc. Common Stock",N,,N,100.0,N,,A,A,N
1,Y,AA,Alcoa Corporation Common Stock,N,,N,100.0,N,,AA,AA,N
2,Y,AAAU,Perth Mint Physical Gold ETF,P,,Y,100.0,N,,AAAU,AAAU,N
3,Y,AACG,ATA Creativity Global - American Depositary Sh...,Q,G,N,100.0,N,N,,AACG,N
4,Y,AADR,AdvisorShares Dorsey Wright ADR ETF,P,,Y,100.0,N,,AADR,AADR,N


In [4]:
# Column dtypes and non-null counts for the symbol metadata.
# In the recorded output, Financial Status and CQS Symbol are the only
# columns with missing values (3383 and 4666 non-null out of 8049 rows).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8049 entries, 0 to 8048
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Nasdaq Traded     8049 non-null   object 
 1   Symbol            8049 non-null   object 
 2   Security Name     8049 non-null   object 
 3   Listing Exchange  8049 non-null   object 
 4   Market Category   8049 non-null   object 
 5   ETF               8049 non-null   object 
 6   Round Lot Size    8049 non-null   float64
 7   Test Issue        8049 non-null   object 
 8   Financial Status  3383 non-null   object 
 9   CQS Symbol        4666 non-null   object 
 10  NASDAQ Symbol     8049 non-null   object 
 11  NextShares        8049 non-null   object 
dtypes: float64(1), object(11)
memory usage: 754.7+ KB


In [5]:
# Sanity-check the metadata before counting: every row must carry a ticker and
# no ticker may appear twice. The two checks are kept separate, each with its
# own message, so a failure says which problem occurred; a bare
# len(df) == nunique() comparison cannot tell them apart because nunique()
# skips missing values.
assert df['Symbol'].notna().all(), 'symbols_valid_meta.csv has rows with a missing Symbol'
assert df['Symbol'].is_unique, 'symbols_valid_meta.csv contains duplicate Symbol values'

# One row per symbol, so the row count is the number of listed symbols.
# NOTE: the printed label says "stocks", but the table also contains ETF rows
# (see the ETF column), so this is the total across both.
num_symbols_in_meta_csv = len(df)
print(f'number of stocks: {num_symbols_in_meta_csv}')

number of stocks: 8049


In [6]:
# File counts taken from the data folders (stocks first, then ETFs).
num_in_stock_folder, num_in_etf_folder = 5884, 2165

# Every symbol in the metadata CSV should be accounted for by a file in one of
# the two folders, so the folder totals must add up to the metadata row count.
num_symbols_in_meta_csv == num_in_stock_folder + num_in_etf_folder

True

## Examining single stock structure

In [7]:
# Load Apple's price history as a sample of the per-stock file layout,
# then preview the five most recent rows.
aapl_csv_path = 'stock_market_data/stocks/AAPL.csv'
apple = pd.read_csv(aapl_csv_path)
apple.tail(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
9904,2020-03-26,246.520004,258.679993,246.360001,258.440002,258.440002,63021800
9905,2020-03-27,252.75,255.869995,247.050003,247.740005,247.740005,51054200
9906,2020-03-30,250.740005,255.520004,249.399994,254.809998,254.809998,41994100
9907,2020-03-31,255.600006,262.48999,252.0,254.289993,254.289993,49250500
9908,2020-04-01,246.5,248.720001,239.130005,240.910004,240.910004,43956200


In [8]:
# Schema check for the single-stock frame. In the recorded output, Date is
# still an object (string) column at this point and no column has nulls.
apple.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9909 entries, 0 to 9908
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       9909 non-null   object 
 1   Open       9909 non-null   float64
 2   High       9909 non-null   float64
 3   Low        9909 non-null   float64
 4   Close      9909 non-null   float64
 5   Adj Close  9909 non-null   float64
 6   Volume     9909 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 542.0+ KB


In [9]:
# Parse the Date strings into real timestamps; the column keeps its position.
apple = apple.assign(Date=pd.to_datetime(apple['Date']))

In [10]:
# Use the Date column as the row index so rows can be selected by date.
# Reassigning (instead of inplace=True) and guarding on the column's presence
# keeps this cell safe to re-run: once 'Date' has become the index it is no
# longer a column, and a second set_index('Date') would raise KeyError.
if 'Date' in apple.columns:
    apple = apple.set_index('Date')

In [15]:
# Preview the five most recent rows.
# NOTE(review): this cell's execution count (15) is higher than that of the
# strategy cell below (12), and its recorded output already contains the
# indicator columns (SMA_50 ... MACD_S). The output therefore comes from an
# out-of-order run; on a fresh top-to-bottom run this cell would presumably
# show only the price/volume columns. The same preview is repeated after the
# strategy cell.
apple.tail(n=5)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,SMA_50,SMA_200,EMA_8,EMA_21,RSI_14,BBL,BBM,BBU,BBB,BBP,MACD,MACD_H,MACD_S
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2020-03-26,246.520004,258.679993,246.360001,258.440002,258.440002,63021800,294.651,249.4761,248.478666,263.147143,46.699409,218.370454,264.5145,310.658547,34.889616,0.434179,-14.668477,2.307545,-16.976021
2020-03-27,252.75,255.869995,247.050003,247.740005,247.740005,51054200,293.379,249.74385,248.314519,261.746494,43.697626,216.72181,263.233501,309.745193,35.338732,0.333445,-13.431974,2.835238,-16.267212
2020-03-30,250.740005,255.520004,249.399994,254.809998,254.809998,41994100,292.1704,250.04715,249.757959,261.115903,46.160211,217.38684,261.033501,304.680163,33.441425,0.428706,-11.357944,3.927414,-15.285358
2020-03-31,255.600006,262.48999,252.0,254.289993,254.289993,49250500,290.8816,250.3549,250.765078,260.495366,46.000842,217.546767,259.282001,301.017235,32.192928,0.440194,-9.730288,4.444056,-14.174344
2020-04-01,246.5,248.720001,239.130005,240.910004,240.910004,43956200,289.3684,250.59,248.575061,258.714878,41.984273,218.862403,256.190501,293.5186,29.140892,0.295322,-10.139817,3.227622,-13.367439


In [12]:
# Bundle the technical indicators into a single pandas_ta strategy so they are
# all computed with one call on the frame.

# The indicator list is meant to be edited later to produce other metrics.
MyStrategy = ta.Strategy(
    name="SMAs, EMAs, rsi, BBs, and MACD",
    ta=[
        # simple moving averages
        dict(kind="sma", length=50),
        dict(kind="sma", length=200),
        # exponential moving averages
        dict(kind="ema", length=8),
        dict(kind="ema", length=21),
        # RSI with no explicit length (shows up as RSI_14 in the output)
        dict(kind="rsi"),
        # Bollinger Bands over 20 periods, with shortened output column names
        dict(kind="bbands", length=20, col_names=("BBL", "BBM", "BBU", "BBB", "BBP")),
        # MACD with fast=8 / slow=21, with shortened output column names
        dict(kind="macd", fast=8, slow=21, col_names=("MACD", "MACD_H", "MACD_S")),
    ],
)

# Run the strategy through the DataFrame's .ta accessor.
apple.ta.strategy(MyStrategy)

In [13]:
# Preview the five most recent rows, now including the indicator columns.
apple.tail(n=5)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,SMA_50,SMA_200,EMA_8,EMA_21,RSI_14,BBL,BBM,BBU,BBB,BBP,MACD,MACD_H,MACD_S
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2020-03-26,246.520004,258.679993,246.360001,258.440002,258.440002,63021800,294.651,249.4761,248.478666,263.147143,46.699409,218.370454,264.5145,310.658547,34.889616,0.434179,-14.668477,2.307545,-16.976021
2020-03-27,252.75,255.869995,247.050003,247.740005,247.740005,51054200,293.379,249.74385,248.314519,261.746494,43.697626,216.72181,263.233501,309.745193,35.338732,0.333445,-13.431974,2.835238,-16.267212
2020-03-30,250.740005,255.520004,249.399994,254.809998,254.809998,41994100,292.1704,250.04715,249.757959,261.115903,46.160211,217.38684,261.033501,304.680163,33.441425,0.428706,-11.357944,3.927414,-15.285358
2020-03-31,255.600006,262.48999,252.0,254.289993,254.289993,49250500,290.8816,250.3549,250.765078,260.495366,46.000842,217.546767,259.282001,301.017235,32.192928,0.440194,-9.730288,4.444056,-14.174344
2020-04-01,246.5,248.720001,239.130005,240.910004,240.910004,43956200,289.3684,250.59,248.575061,258.714878,41.984273,218.862403,256.190501,293.5186,29.140892,0.295322,-10.139817,3.227622,-13.367439


In [16]:
# Restrict to a five-year window with an explicit label-based slice on the
# date index (both bounds are inclusive), then confirm that none of the
# columns has missing values inside that window.
apple_2014_2019 = apple.loc['2014-01-01':'2019-01-01']
apple_2014_2019.isna().sum()

Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
SMA_50       0
SMA_200      0
EMA_8        0
EMA_21       0
RSI_14       0
BBL          0
BBM          0
BBU          0
BBB          0
BBP          0
MACD         0
MACD_H       0
MACD_S       0
dtype: int64

In [17]:
# Summary statistics for every column over the selected window
# (the recorded output reports a count of 1258 rows for each column).
apple_2014_2019.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,SMA_50,SMA_200,EMA_8,EMA_21,RSI_14,BBL,BBM,BBU,BBB,BBP,MACD,MACD_H,MACD_S
count,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0,1258.0
mean,131.230753,132.355248,130.079289,131.241425,124.526211,42935480.0,129.389765,121.304006,131.025825,130.558178,54.982756,124.165809,130.637489,137.10917,9.857158,0.566792,0.467647,-0.031051,0.498698
std,37.435126,37.771772,37.064776,37.414122,38.509239,23583730.0,37.061072,33.065617,37.367623,37.231992,13.462587,35.524672,37.391721,39.600945,4.994,0.337059,2.666235,0.86294,2.42862
min,70.739998,71.647141,70.507141,71.397141,64.096725,11475900.0,75.462485,67.766314,73.31552,75.06619,21.02493,69.386872,74.581356,76.787534,3.001578,-0.326543,-12.523435,-3.226471,-10.625773
25%,102.9,104.374998,102.440002,103.037502,95.870705,26606250.0,101.823,102.51695,102.111971,101.807751,45.68175,96.654371,101.775,109.331241,6.143534,0.293125,-0.637011,-0.513873,-0.496155
50%,118.940002,119.889999,118.215,119.18,111.538467,36983700.0,116.0371,114.912525,118.138948,117.390898,54.491624,112.53248,117.83275,124.058052,8.457768,0.614145,0.576596,0.004516,0.640317
75%,158.522499,159.990002,156.7075,157.830002,153.137512,52347400.0,157.321301,143.226287,158.336928,157.32163,64.57566,150.55898,158.164,164.368835,12.687723,0.832322,2.011765,0.490385,1.964875
max,230.779999,233.470001,229.779999,232.070007,227.300339,266380800.0,221.320001,194.559051,226.758514,223.764704,90.40216,215.615884,223.456001,232.200241,31.054938,1.350058,7.339934,3.3578,6.586112
