In [21]:
# install the xgboost
!pip install xgboost



Read the processed data  (stored in the parquet format)

In [22]:
# importing the core libraries
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [23]:
# Read the parquet file
main_table = pq.read_table("../data/processed/price_features.parquet")

# Convert to pandas dataframe
main_df = main_table.to_pandas()

# display the df
main_df

Unnamed: 0,date,symbol,open,close,low,high,volume,close_lag1,log_return_1d,target_next_day_return,...,rolling_mean_5,rolling_mean_10,rolling_std_5,rolling_std_10,momentum_5,momentum_10,volume_change_1d,volume_zscore_10,high_low_ratio,close_open_ratio
0,2010-01-19,A,21.716737,22.031474,21.709585,22.052933,3563600.0,21.766810,0.012086,-0.005535,...,21.962804,22.014306,0.210961,0.153942,0.003127,0.000780,-0.229758,-0.405386,1.015815,1.014493
1,2010-01-20,A,21.838340,21.909872,21.595137,21.938484,4589000.0,22.031474,-0.005535,-0.003598,...,21.988555,21.990701,0.190062,0.149553,-0.003578,-0.003676,0.287743,0.571001,1.015899,1.003275
2,2010-01-21,A,22.174536,21.831188,21.587982,22.253220,6081400.0,21.909872,-0.003598,-0.045241,...,21.964235,21.967096,0.203115,0.154671,-0.006057,-0.006187,0.325212,1.536435,1.030815,0.984516
3,2010-01-22,A,21.709585,20.865522,20.808298,21.709585,4263000.0,21.831188,-0.045241,0.009553,...,21.680973,21.849786,0.466406,0.378012,-0.037611,-0.045047,-0.299010,-0.134568,1.043314,0.961120
4,2010-01-25,A,21.044350,21.065809,20.908441,21.208870,3608500.0,20.865522,0.009553,-0.005448,...,21.540773,21.753219,0.534536,0.444022,-0.022050,-0.031600,-0.153530,-0.719029,1.014369,1.001020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
845744,2016-12-22,ZTS,52.669998,53.259998,52.669998,53.369999,1515800.0,53.119999,0.002632,0.009716,...,52.865999,52.134999,0.408019,0.928706,0.007453,0.021579,-0.449720,-1.441778,1.013290,1.011202
845745,2016-12-23,ZTS,53.200001,53.779999,53.180000,53.779999,1942400.0,53.259998,0.009716,-0.001116,...,53.129999,52.323999,0.496991,1.056789,0.012234,0.027827,0.281436,-0.983389,1.011282,1.010902
845746,2016-12-27,ZTS,53.779999,53.720001,53.650002,54.150002,1345000.0,53.779999,-0.001116,-0.005226,...,53.395999,52.603999,0.329667,1.013511,0.006068,0.021215,-0.307558,-1.216227,1.009320,0.998884
845747,2016-12-28,ZTS,53.759998,53.439999,53.349998,53.849998,1241900.0,53.720001,-0.005226,0.003363,...,53.463999,52.842999,0.285448,0.879243,-0.000449,0.011298,-0.076654,-1.202686,1.009372,0.994048


Creating the target column for xgboost model prediction: 1 if price goes up tomorrow, 0 otherwise

In [None]:
main_df["target"] = main_df['price_up_tomorrow']

# Drop last row (it has no tomorrow's price)
main_df = main_df.dropna()

main_df

Unnamed: 0,date,symbol,open,close,low,high,volume,close_lag1,log_return_1d,target_next_day_return,...,rolling_mean_10,rolling_std_5,rolling_std_10,momentum_5,momentum_10,volume_change_1d,volume_zscore_10,high_low_ratio,close_open_ratio,target
0,2010-01-19,A,21.716737,22.031474,21.709585,22.052933,3563600.0,21.766810,0.012086,-0.005535,...,22.014306,0.210961,0.153942,0.003127,0.000780,-0.229758,-0.405386,1.015815,1.014493,0
1,2010-01-20,A,21.838340,21.909872,21.595137,21.938484,4589000.0,22.031474,-0.005535,-0.003598,...,21.990701,0.190062,0.149553,-0.003578,-0.003676,0.287743,0.571001,1.015899,1.003275,0
2,2010-01-21,A,22.174536,21.831188,21.587982,22.253220,6081400.0,21.909872,-0.003598,-0.045241,...,21.967096,0.203115,0.154671,-0.006057,-0.006187,0.325212,1.536435,1.030815,0.984516,0
3,2010-01-22,A,21.709585,20.865522,20.808298,21.709585,4263000.0,21.831188,-0.045241,0.009553,...,21.849786,0.466406,0.378012,-0.037611,-0.045047,-0.299010,-0.134568,1.043314,0.961120,1
4,2010-01-25,A,21.044350,21.065809,20.908441,21.208870,3608500.0,20.865522,0.009553,-0.005448,...,21.753219,0.534536,0.444022,-0.022050,-0.031600,-0.153530,-0.719029,1.014369,1.001020,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
845744,2016-12-22,ZTS,52.669998,53.259998,52.669998,53.369999,1515800.0,53.119999,0.002632,0.009716,...,52.134999,0.408019,0.928706,0.007453,0.021579,-0.449720,-1.441778,1.013290,1.011202,1
845745,2016-12-23,ZTS,53.200001,53.779999,53.180000,53.779999,1942400.0,53.259998,0.009716,-0.001116,...,52.323999,0.496991,1.056789,0.012234,0.027827,0.281436,-0.983389,1.011282,1.010902,0
845746,2016-12-27,ZTS,53.779999,53.720001,53.650002,54.150002,1345000.0,53.779999,-0.001116,-0.005226,...,52.603999,0.329667,1.013511,0.006068,0.021215,-0.307558,-1.216227,1.009320,0.998884,0
845747,2016-12-28,ZTS,53.759998,53.439999,53.349998,53.849998,1241900.0,53.720001,-0.005226,0.003363,...,52.842999,0.285448,0.879243,-0.000449,0.011298,-0.076654,-1.202686,1.009372,0.994048,1


In [None]:
main_df.columns

  np.isinf(main_df[np.isinf(X_train).any(axis=1)])


IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).

In [28]:
# Select features and target
X = main_df[['open', 'close', 'low', 'high', 'volume',
       'close_lag1', 'log_return_1d', 'target_next_day_return',
       'log_return_lag1', 'log_return_lag2',
       'log_return_lag3', 'rolling_mean_5', 'rolling_mean_10', 'rolling_std_5',
       'rolling_std_10', 'momentum_5', 'momentum_10', 'volume_change_1d',
       'volume_zscore_10', 'high_low_ratio', 'close_open_ratio']]
y = main_df["target"]

Split into training and test sets

In [29]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False # time based so no random shuffles
    )

In [30]:
# create and train an XG Boost model
model = XGBClassifier(
    n_estimators = 100, # number of trees
    learning_rate = 0.1, # step size
    max_depth = 3, # how deep each tree can grow
    eval_metric = 'logloss' # evaluation metric for training
)

model.fit(X_train, y_train)

XGBoostError: [19:30:32] /workspace/src/data/../common/../data/gradient_index.h:94: Check failed: valid: Input data contains `inf` or a value too large, while `missing` is not set to `inf`
Stack trace:
  [bt] (0) /home/ashwinder/miniconda3/envs/dsi_participant/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x25c1ac) [0x71ea0525c1ac]
  [bt] (1) /home/ashwinder/miniconda3/envs/dsi_participant/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x55ac78) [0x71ea0555ac78]
  [bt] (2) /home/ashwinder/miniconda3/envs/dsi_participant/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x55b7df) [0x71ea0555b7df]
  [bt] (3) /home/ashwinder/miniconda3/envs/dsi_participant/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x54976d) [0x71ea0554976d]
  [bt] (4) /home/ashwinder/miniconda3/envs/dsi_participant/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x54b99c) [0x71ea0554b99c]
  [bt] (5) /home/ashwinder/miniconda3/envs/dsi_participant/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(+0x4fb3ea) [0x71ea054fb3ea]
  [bt] (6) /home/ashwinder/miniconda3/envs/dsi_participant/lib/python3.9/site-packages/xgboost/lib/libxgboost.so(XGQuantileDMatrixCreateFromCallback+0x18c) [0x71ea05173a5c]
  [bt] (7) /home/ashwinder/miniconda3/envs/dsi_participant/lib/python3.9/lib-dynload/../../libffi.so.8(+0xa052) [0x71eaa422d052]
  [bt] (8) /home/ashwinder/miniconda3/envs/dsi_participant/lib/python3.9/lib-dynload/../../libffi.so.8(+0x8925) [0x71eaa422b925]



In [None]:
y_pred = model.predict(X_test)

In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.5225591085669956

Confusion Matrix:
 [[18289 63924]
 [17314 70626]]

Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.22      0.31     82213
           1       0.52      0.80      0.63     87940

    accuracy                           0.52    170153
   macro avg       0.52      0.51      0.47    170153
weighted avg       0.52      0.52      0.48    170153

