In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

SARIMAX Using Feature Engineering

The code loads two CSV files into Pandas DataFrames, parsing the 'Date' column as datetime objects, and then sorts both DataFrames by the 'Date' column. The .head() method displays the first few rows of the sorted 'df' DataFrame.

In [None]:
# Load the training and test sets, parsing the 'Date' column as datetimes.
df = pd.read_csv('train.csv', parse_dates=['Date'])
test = pd.read_csv('test.csv', parse_dates=['Date'])

# Sort chronologically, then rebuild a clean 0..n-1 index. sort_values() keeps
# the original row labels, so without the reset a file that is not already in
# date order would leave a shuffled index, and later index-aligned operations
# (forecast vs. actual arithmetic, column assignment) would pair the wrong rows.
df = df.sort_values(by='Date').reset_index(drop=True)
test = test.sort_values(by='Date').reset_index(drop=True)

# Preview the first rows of the sorted training frame.
df.head()


In [None]:
# Autocorrelation (ACF) plot of the 'Close' column for the first 50 lags.
# Author's note on the resulting graph: "a=1", one spike above the threshold.
# NOTE(review): "a" is presumably a typo for an ARIMA order; an ACF cut-off is
# conventionally read as the MA order q — confirm.
import matplotlib.pyplot as plt
import statsmodels.api as sm

sm.graphics.tsa.plot_acf(x=df['Close'], lags=50)
plt.show()

In [None]:
# Partial autocorrelation (PACF) plot of the 'Close' column for the first 50 lags.
# Author's note on the resulting graph: "q = 1", one spike above the threshold.
# NOTE(review): a PACF cut-off is conventionally read as the AR order p, not q — confirm.
import matplotlib.pyplot as plt
import statsmodels.api as sm

sm.graphics.tsa.plot_pacf(x=df['Close'], lags=50)
plt.show()

Since the ACF and PACF plots above are inconclusive, we use the pmdarima library to help determine the p, d, and q values.

In [None]:
!pip install pmdarima

In [None]:
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pmdarima.arima import auto_arima

Two new features, 'price_change' and 'price_ratio,' are added to the 'df' DataFrame, representing the price change (Close - Open) and price ratio (Close / Open) for each row, respectively, which can provide insights into price movements and trends.

In [None]:
# Intraday movement features derived from the open and close prices.
df['price_change'] = df['Close'].sub(df['Open'])  # absolute move: Close - Open
df['price_ratio'] = df['Close'].div(df['Open'])   # relative move: Close / Open

In [None]:
# Exponential moving averages (span=3, adjust=False) of 'Open' and 'Volume',
# stored as 'Open_EMA' and 'Volume_EMA'.
for source_col in ('Open', 'Volume'):
    df[source_col + '_EMA'] = df[source_col].ewm(span=3, adjust=False).mean()

Lag features for 'Open' and 'Volume' columns are created with two time-step lags, providing historical data on the previous two days for these attributes. Backfill (bfill) is applied to handle missing values in the lag features, ensuring continuity in the dataset.

In [None]:
# Lag features: the value of 'Open' / 'Volume' one and two rows earlier.
#
# shift(k) leaves the first k rows as NaN; bfill() replaces them with the first
# available lagged value. The filled Series is assigned back explicitly:
# `df[col].fillna(method='bfill', inplace=True)` acts on a temporary object
# under pandas Copy-on-Write (the NaNs would stay in df), and the `method=`
# argument of fillna is deprecated in recent pandas releases.

# Lagged 'Open'
df['open_lag_1'] = df['Open'].shift(1).bfill()
df['open_lag_2'] = df['Open'].shift(2).bfill()

# Lagged 'Volume'
df['vol_lag_1'] = df['Volume'].shift(1).bfill()
df['vol_lag_2'] = df['Volume'].shift(2).bfill()

In [None]:
# Preview the first five rows, now including the engineered columns.
df.head(n=5)

A subset of columns from the 'df' DataFrame is selected to create a new DataFrame 'features' that includes attributes like 'Open,' 'Volume,' 'Open_EMA,' 'Volume_EMA,' and lag features.

In [None]:
# Regressor columns and the series to be forecast ('Close').
features = df.loc[:, ['Open', 'Volume', 'Open_EMA', 'Volume_EMA', 'open_lag_1', 'open_lag_2', 'vol_lag_1', 'vol_lag_2']]
target = df.loc[:, 'Close']

In [None]:
# Positional 80/20 split: the first 80% of rows are used for training and the
# remaining rows are held out for evaluation.
train_size = int(0.8 * len(df))
train_data = df.iloc[:train_size]
test_data = df.iloc[train_size:]


This code performs a grid search to find the optimal (p, d, q, P, D, Q, s) parameters for a SARIMA model using AIC values, iterating through various combinations and storing AIC values in a dictionary. The sorted dictionary provides the parameter combinations with the lowest AIC values, helping identify the best-fitting SARIMA model for the given time series data.

In [None]:
# Grid search over SARIMA orders, ranking every candidate by AIC (lower is better).
import math
import warnings
import itertools
import statsmodels.api as sm
import pandas as pd

# Silences every warning from this point on in the session.
warnings.filterwarnings("ignore")

# Candidate ranges: non-seasonal p, d, q in {0, 1, 2}; seasonal P, D, Q in {0, 1};
# seasonal period s fixed at 12.
p = d = q = range(0, 3)
P = D = Q = range(0, 2)
s = 12

pdq = list(itertools.product(p, d, q))
seasonal_pdq = list(itertools.product(P, D, Q, [s]))

# AIC of each usable fit, keyed by (order, seasonal_order).
store = {}
# Combinations that raised or produced a non-finite AIC, with the reason, so
# that failures are visible instead of being silently discarded.
failed = {}

# Same iteration order as two nested loops over pdq and seasonal_pdq.
for param, param_seasonal in itertools.product(pdq, seasonal_pdq):
    try:
        model_sarima = sm.tsa.SARIMAX(
            train_data['Close'],
            order=param,
            seasonal_order=param_seasonal,
            enforce_stationarity=False,
            enforce_invertibility=False,
        )
        # disp=False keeps the optimizer from printing a log for every fit.
        model_sarima_fit = model_sarima.fit(disp=False)
    except Exception as exc:
        # Best-effort search: record the failure and move on. Catching
        # Exception (not a bare except) lets KeyboardInterrupt stop the loop.
        failed[(param, param_seasonal)] = repr(exc)
        continue

    aic = model_sarima_fit.aic
    if math.isfinite(aic):
        store[(param, param_seasonal)] = aic
    else:
        # A NaN/inf AIC cannot be ranked and would corrupt the sort below.
        failed[(param, param_seasonal)] = f"non-finite AIC: {aic}"

# Sort the dictionary by AIC values (best candidate first)
sorted_dict = dict(sorted(store.items(), key=lambda item: item[1]))

# Print the sorted dictionary
print(sorted_dict)
if failed:
    print(f"{len(failed)} parameter combination(s) could not be ranked; inspect `failed` for details.")

This code fits a SARIMAX model with exogenous variables on training data and uses it to make predictions on the testing data. It calculates and prints evaluation metrics like SMAPE, RMSE, and R2 Score to assess the model's performance in forecasting the 'Close' prices.

In [None]:
# Fit a SARIMAX model with exogenous regressors on the training rows, forecast
# the held-out rows, and report SMAPE, RMSE and R2.
import statsmodels.api as sm
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from math import sqrt

# Exogenous regressors, listed once so fitting and forecasting use identical columns.
exog_columns = ['Open', 'Volume', 'Open_EMA', 'Volume_EMA', 'open_lag_1', 'open_lag_2', 'vol_lag_1', 'vol_lag_2']

endog = train_data['Close']
exog = sm.add_constant(train_data[exog_columns])

# Non-seasonal (p, d, q) and seasonal (P, D, Q, S) orders, hard-coded here.
p, d, q = 1, 1, 2
P, D, Q, S = 0, 1, 1, 12

# Create the SARIMAX model (exog already covers exactly the training rows)
mod = sm.tsa.statespace.SARIMAX(endog=endog, exog=exog, order=(p, d, q), seasonal_order=(P, D, Q, S))

# Fit the SARIMAX model on the training data
model_fit = mod.fit(disp=False)

# Forecast the held-out rows that immediately follow the training window
test_exog = sm.add_constant(test_data[exog_columns])
raw_forecast = model_fit.predict(start=len(train_data), end=len(train_data) + len(test_data) - 1, exog=test_exog)

# Re-label the forecasts with the hold-out index so that metrics and plots pair
# each prediction with its own row by position, whatever labels the frame carries.
predictions = pd.Series(np.asarray(raw_forecast), index=test_data.index, name='predicted_mean')

# Calculate evaluation metrics on plain arrays (position-based comparison)
actual_close = test_data['Close'].to_numpy()
predicted_close = predictions.to_numpy()

# SMAPE as used here: mean(|y_pred - y_true| / (|y_pred| + |y_true|)), without a factor of 2.
smape = np.mean(np.abs(predicted_close - actual_close) / (np.abs(predicted_close) + np.abs(actual_close)))
rmse = sqrt(mean_squared_error(actual_close, predicted_close))
r2 = r2_score(actual_close, predicted_close)

# Display evaluation metrics
print("SMAPE:",smape)
print("RMSE:",rmse)
print("R2 Score:", r2)


This code creates a line plot to visually compare the actual 'Close' prices in the test data with the predicted values, allowing for a visual assessment of the model's performance. The plot shows how closely the model's predictions align with the actual data.

In [None]:
# Overlay actual and predicted 'Close' values for the held-out rows. Each line
# is labelled and the axes are titled so the figure can be read on its own.
fig, ax = plt.subplots()
test_data['Close'].plot(ax=ax, label='Actual Close')
predictions.plot(ax=ax, label='Predicted Close')
ax.set_title('SARIMAX forecast vs. actual Close (hold-out rows)')
ax.set_xlabel('Row index')
ax.set_ylabel('Close')
ax.legend()
plt.show()

In [None]:
# Preview the first five rows of the test frame.
test.head(n=5)

In [None]:
# Exponential moving averages (span=3, adjust=False) of the test frame's
# 'Open' and 'Volume', stored as 'Open_EMA' and 'Volume_EMA'.
for base_col in ('Open', 'Volume'):
    test[base_col + '_EMA'] = test[base_col].ewm(span=3, adjust=False).mean()

Similar to the previous code for the training data, this code creates lag features for the 'Open' and 'Volume' columns in the 'test' DataFrame, providing historical data from the previous two days. Backfill (bfill) is used to handle missing values in these lag features, ensuring consistency in the dataset for testing and prediction.

In [None]:
# Lag features for the test frame: the value of 'Open' / 'Volume' one and two
# rows earlier.
#
# shift(k) leaves the first k rows as NaN; bfill() replaces them with the first
# available lagged value. The filled Series is assigned back explicitly:
# `test[col].fillna(method='bfill', inplace=True)` acts on a temporary object
# under pandas Copy-on-Write (the NaNs would stay in test), and the `method=`
# argument of fillna is deprecated in recent pandas releases.

# Lagged 'Open'
test['open_lag_1'] = test['Open'].shift(1).bfill()
test['open_lag_2'] = test['Open'].shift(2).bfill()

# Lagged 'Volume'
test['vol_lag_1'] = test['Volume'].shift(1).bfill()
test['vol_lag_2'] = test['Volume'].shift(2).bfill()


In [None]:
# Preview the first five rows of the test frame with its engineered columns.
test.head(n=5)


This code prepares exogenous variables for the test dataset and uses the previously trained SARIMAX model ('model_fit') to make predictions for the 'Close' prices in the test data, incorporating the exogenous features.

In [None]:
# Exogenous regressors for the test frame, same columns and order as at fit time.
t_exog = sm.add_constant(test[['Open', 'Volume', 'Open_EMA', 'Volume_EMA', 'open_lag_1', 'open_lag_2', 'vol_lag_1', 'vol_lag_2']])

# Forecast one value per test row with the already-fitted model. predict() has
# no `steps` parameter and, without start/end, covers the in-sample range, so
# out-of-sample values are requested through forecast(steps=..., exog=...).
close_forecast = model_fit.forecast(steps=len(test), exog=t_exog)

# The forecast is labelled as a continuation of the training index. Re-label it
# with test's own index so that index-aligned use (e.g. test['Close'] = p)
# pairs each value with its row.
p = pd.Series(np.asarray(close_forecast), index=test.index, name='Close')

# NOTE(review): model_fit was fitted on train_data only, so these forecasts
# start right after that training window — confirm whether the model should be
# refitted on all of df before producing the final test predictions.

In [None]:
# Display the values the SARIMAX model produced for the test frame.
p

In [None]:
# Store the model output as the test frame's 'Close' column. Assigning a
# Series aligns on index labels, so p must carry the same index as test.
test['Close']=p #final predictions are saved here

In [None]:
# Inspect the first 100 rows of the test frame, including the 'Close' column.
test.head(n=100)

This code trains a classifier on the financial data to predict the trading 'Strategy' label, using a soft-voting ensemble (Voting Classifier) that combines Random Forest and XGBoost models. It then applies the trained model to the test dataset, saves the predictions, and creates a submission CSV file for further analysis or competition submission.

In [None]:
# Train a soft-voting ensemble (Random Forest + XGBoost) to predict 'Strategy',
# evaluate it on a random hold-out split, then label the test frame and write
# the submission file.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Columns fed to the classifier, identical for the training and test frames.
classifier_features = ['Open', 'Close', 'Volume', 'Open_Close_diff', 'Volume_squared']

# Feature engineering on the training frame
df['Open_Close_diff'] = df['Open'] - df['Close']  # Difference between Open and Close
# Square in float64: squaring a large integer volume can silently wrap around int64.
df['Volume_squared'] = df['Volume'].astype('float64') ** 2

# Encode the 'Strategy' labels as integers WITHOUT overwriting df['Strategy'],
# so re-running this cell always fits the encoder on the original labels and
# inverse_transform keeps returning them.
label_encoder = LabelEncoder()
y = pd.Series(label_encoder.fit_transform(df['Strategy']), index=df.index, name='Strategy')
X = df[classifier_features]

# Random 80/20 split into fitting and hold-out sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=40)

# Base estimators
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# The objective and the number of classes are left for XGBClassifier to infer
# from y: soft voting needs per-class probabilities, and a hard-coded
# 'multi:softmax' / num_class=3 ties the model to exactly three labels.
xgb_classifier = XGBClassifier(n_estimators=100)

# Soft voting averages the predicted class probabilities of both estimators
voting_classifier = VotingClassifier(estimators=[('rf', rf_classifier), ('xgb', xgb_classifier)], voting='soft')
voting_classifier.fit(X_train, y_train)

# Hold-out evaluation, reported with the original label names
y_pred = label_encoder.inverse_transform(voting_classifier.predict(X_test))
y_true = label_encoder.inverse_transform(y_test)
print(confusion_matrix(y_true, y_pred))
print(classification_report(y_true, y_pred))

# Same feature engineering for the test frame (its 'Close' column must already
# hold the forecast values)
test['Open_Close_diff'] = test['Open'] - test['Close']
test['Volume_squared'] = test['Volume'].astype('float64') ** 2

# Kept under its own name so the hold-out split above is not overwritten
X_submission = test[classifier_features]

# Predict and map the encoded classes back to the original labels
y_pred_test = label_encoder.inverse_transform(voting_classifier.predict(X_submission))
test['Predicted_Strategy'] = y_pred_test

# Assemble the submission: id, Date, forecast Close and predicted Strategy
submission = (
    test[['id', 'Date', 'Close', 'Predicted_Strategy']]
    .rename(columns={'Predicted_Strategy': 'Strategy'})
)

# Save first, then end the cell with the preview so it is actually displayed
submission.to_csv('submission.csv', index=False)
submission.head()