<a href="https://colab.research.google.com/github/benchov/Machine_Learning_for_Trading_Knowledge/blob/main/PCA_and_Dimensionality_Reduction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [48]:
!pip install ta
!pip install --upgrade pandas
!pip install --upgrade pandas-datareader
!pip install yfinance

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [49]:
# Remove unwanted warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

# Data Management
import pandas as pd
import numpy as np
from pandas_datareader.data import DataReader
from ta import add_all_ta_features
import yfinance as yf

# Statistics
from statsmodels.tsa.stattools import adfuller

# Unsupervised Machine Learning
from sklearn.decomposition import PCA

# Supervised Machine Learning
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

# Reporting
import matplotlib.pyplot as plt

In [50]:
# Data extraction: download the price history of the configured symbol
# between the two dates below.
# NOTE(review): the column layout returned by yfinance can differ between
# releases; the feature-engineering cell expects flat "Open", "High", "Low",
# "Adj Close" and "Volume" columns - confirm against the installed version.
symbol = "^VIX"
start_date, end_date = '2017-01-01', '2022-06-01'
df = yf.download(
    symbol,
    start=start_date,
    end=end_date,
)
df.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-01-03,14.07,14.07,12.85,12.85,12.85,0
2017-01-04,12.78,12.8,11.63,11.85,11.85,0
2017-01-05,11.96,12.09,11.4,11.67,11.67,0
2017-01-06,11.7,11.74,10.98,11.32,11.32,0
2017-01-09,11.71,12.08,11.46,11.56,11.56,0


In [51]:
# Feature engineering: append every technical indicator offered by `ta`.
# "Adj Close" is passed as the closing-price series, and fillna=True asks the
# library to fill missing indicator values.
ta_column_map = {
    "open": "Open",
    "high": "High",
    "low": "Low",
    "close": "Adj Close",
    "volume": "Volume",
}
df = add_all_ta_features(df, fillna=True, **ta_column_map)
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1362 entries, 2017-01-03 to 2022-05-31
Data columns (total 92 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Open                       1362 non-null   float64
 1   High                       1362 non-null   float64
 2   Low                        1362 non-null   float64
 3   Close                      1362 non-null   float64
 4   Adj Close                  1362 non-null   float64
 5   Volume                     1362 non-null   int64  
 6   volume_adi                 1362 non-null   float64
 7   volume_obv                 1362 non-null   int64  
 8   volume_cmf                 1362 non-null   float64
 9   volume_fi                  1362 non-null   float64
 10  volume_em                  1362 non-null   float64
 11  volume_sma_em              1362 non-null   float64
 12  volume_vpt                 1362 non-null   float64
 13  volume_vwap                136

In [52]:
# Data preprocessing - stationarity screening.
# Every column is run through the Augmented Dickey-Fuller test and is treated
# as stationary only when BOTH conditions hold:
#   * the p-value is at most 0.05, and
#   * the test statistic is below the 1% critical value.
# Anything else is recorded in `non_stationaries`.
#
# Constant columns (for example an all-zero volume series) are flagged
# without running the test: recent statsmodels releases raise
# ValueError("Invalid input, x is constant") for them, while older releases
# return NaN statistics that fail both checks. Flagging them keeps the
# result identical across versions; the later pct_change / drop-NA cells
# then remove such columns.
non_stationaries = []
for col in df.columns:
  series = df[col].values
  if series.max() == series.min():
    # No variation at all: skip adfuller and flag the column directly.
    non_stationaries.append(col)
    continue
  dftest = adfuller(series)
  p_value = dftest[1]
  # dftest[0] is the test statistic, dftest[4] the dict of critical values.
  t_test = dftest[0] < dftest[4]['1%']
  if p_value > 0.05 or not t_test:
    non_stationaries.append(col)
print(f"Non-Stationary Features Found: {len(non_stationaries)}")

Non-Stationary Features Found: 29


In [53]:
# Convert non-stationary features to stationary ones: every flagged column is
# replaced by its row-over-row percentage change, all other columns are kept
# unchanged.
pct_changes = df[non_stationaries].pct_change()
df_stationary = df.copy()
df_stationary[non_stationaries] = pct_changes
# The first row has no predecessor, so its percentage change is undefined.
df_stationary = df_stationary.iloc[1:]
df_stationary.shape

(1361, 92)

In [54]:
# Drop every column that contains at least one missing value.
has_missing = df_stationary.isna().any()
na_list = df_stationary.columns[has_missing.to_numpy()]
df_stationary = df_stationary.drop(columns=na_list)
df_stationary.shape

(1361, 80)

In [55]:
# Handle infinite values: both +inf and -inf are overwritten with 0.
infinite_values = [np.inf, -np.inf]
df_stationary = df_stationary.replace(infinite_values, 0)
df_stationary.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,volume_mfi,volume_nvi,volatility_bbm,volatility_bbh,volatility_bbl,...,momentum_wr,momentum_ao,momentum_roc,momentum_ppo,momentum_ppo_signal,momentum_ppo_hist,momentum_kama,others_dr,others_dlr,others_cr
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-04,12.78,12.8,-0.094942,11.85,11.85,0.0,0.0,12.35,0.038911,-0.116732,...,-90.983594,0.0,0.0,-0.624394,-0.124879,-0.499515,-0.058492,-7.782101,-8.101594,-7.782101
2017-01-05,11.96,12.09,-0.019776,11.67,11.67,0.0,0.0,12.123334,-0.014123,-0.023329,...,-89.887624,0.0,0.0,-1.226732,-0.345249,-0.881483,-0.022532,-1.51899,-1.530645,-9.182881
2017-01-06,11.7,11.74,-0.036842,11.32,11.32,0.0,0.0,11.9225,-0.007763,-0.027018,...,-88.996759,0.0,0.0,-1.916831,-0.659566,-1.257265,-0.027227,-2.999146,-3.045041,-11.90662
2017-01-09,11.71,12.08,0.043716,11.56,11.56,0.0,0.0,11.85,-0.011637,0.000646,...,-81.229746,0.0,0.0,-2.289756,-0.985604,-1.304152,0.002666,2.120148,2.097985,-10.03891
2017-01-10,11.59,11.79,-0.013089,11.49,11.49,0.0,0.0,11.79,-0.00895,-0.000415,...,-83.495139,-0.288667,0.0,-2.607109,-1.309905,-1.297204,-0.002328,-0.605542,-0.607383,-10.583662


In [56]:
# Set target: +1 when the next row's "Adj Close" is higher than the current
# row's, otherwise -1.
next_close = df_stationary["Adj Close"].shift(-1)
df_stationary["Target"] = -1
df_stationary.loc[next_close > df_stationary["Adj Close"], "Target"] = 1
# The final row has no next observation. A comparison against the NaN that
# shift(-1) produces is always False, which would silently label that row -1,
# so rows without a known next close are removed instead of being trained on.
# NOTE: re-running this cell on its own trims one more row each time; re-run
# from the stationarity-conversion cell to rebuild `df_stationary` first.
df_stationary = df_stationary.loc[next_close.notna()].copy()
df_stationary.dropna(inplace=True)

In [57]:
# Split target from feature set: "Target" was appended as the last column, so
# every column before it is a feature and the final column is the label.
n_features = df_stationary.shape[1] - 1
X = df_stationary.iloc[:, :n_features]
y = df_stationary.iloc[:, n_features]

In [59]:
# Feature scaling: standardise the feature matrix with StandardScaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so statistics of the test rows influence the scaled training features -
# consider fitting on the training rows only.
df_sc = df_stationary.copy()
scaler = StandardScaler()
X_fs = scaler.fit_transform(X)

In [60]:
# Create train/test split.
# train_test_split returns, for each input array, its train part immediately
# followed by its test part - i.e. (X_train, X_test, y_train, y_test) for the
# inputs (X_fs, y). The names must be unpacked in exactly that order,
# otherwise features and labels end up in the wrong variables.
# NOTE(review): test_size=0.7 holds out 70% of the rows for testing - confirm
# this is intended rather than a 70% training share.
# NOTE(review): rows are shuffled by default, which mixes earlier and later
# observations of the time series - confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(X_fs, y, test_size=0.7, random_state=883)