In [1]:
import pandas as pd
from skopt import BayesSearchCV
import numpy as np
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import RandomizedSearchCV

In [2]:
train = pd.read_csv(r'C:\Users\abact\BC-Project\data\train_data.csv')
valid = pd.read_csv(r'C:\Users\abact\BC-Project\data\val_data.csv')
test = pd.read_csv(r'C:\Users\abact\BC-Project\data\test_data.csv')

In [3]:
# Convert variables to numeric in the train dataset
variables_to_convert = train.columns.drop('Date')
train[variables_to_convert] = train[variables_to_convert].apply(pd.to_numeric, errors='coerce')

# Convert variables to numeric in the valid dataset
valid[variables_to_convert] = valid[variables_to_convert].apply(pd.to_numeric, errors='coerce')

In [4]:
train['Direction'] = np.select([
    train['Increase'] > 0,
    train['Decrease'] > 0,
    train['Difference'] == 0
], [
    1,  # Increase
    -1,  # Decrease
    0   # Hold
], default=-1)

valid['Direction'] = np.select([
    valid['Increase'] > 0,
    valid['Decrease'] > 0,
    valid['Difference'] == 0
], [
    1,  # Increase
    -1,  # Decrease
    0   # Hold
], default=-1)

###Multinomial Logistic Regression Model

In [5]:
# Select the independent variables (excluding specified variables)
exclude_vars = ['Increase', 'Decrease', 'Date', 'Difference']
X_train = train.drop(['Direction'] + exclude_vars, axis=1)

# Separate the target variable
y_train = train['Direction']

# Scale the independent variables
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Fit a multinomial logistic regression model
logit_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
logit_model.fit(X_train_scaled, y_train)

# Evaluate the model on the training data
accuracy = logit_model.score(X_train_scaled, y_train)
print("Accuracy:", accuracy)

Accuracy: 0.9319371727748691


In [6]:
# Preprocess the validation set
X_valid = valid.drop(['Direction'] + exclude_vars, axis=1)
y_valid = valid['Direction']

# Scale the independent variables of the validation set using the same scaler object
X_valid_scaled = scaler.transform(X_valid)

# Predict class labels for the validation set
y_valid_pred = logit_model.predict(X_valid_scaled)

# Calculate accuracy
accuracy = logit_model.score(X_valid_scaled, y_valid)

# Calculate cross-entropy loss (log loss)
loss = log_loss(y_valid, logit_model.predict_proba(X_valid_scaled))

# Calculate AICc
nll = -log_loss(y_valid, logit_model.predict_proba(X_valid_scaled), normalize=False)
n = len(X_valid)
p = logit_model.coef_.shape[1]
aic = 2 * p + 2 * nll + (2 * p * (p + 1)) / (n - p - 1)

# Print evaluation metrics
print("Accuracy:", accuracy)
print("Cross-entropy loss:", loss)
print("AICc:", aic)

Accuracy: 0.5
Cross-entropy loss: 1.1448712063854016
AICc: -118.50311368114717


###Random Forest Classifier

In [7]:
param_space = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 5, 10, 20],
    # Add other hyperparameters and their values here
}

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)

# Perform Bayesian Search for hyperparameter tuning
opt_model = BayesSearchCV(rf_model, param_space, n_iter=10, cv=5, random_state=42)
opt_model.fit(X_train, y_train)

# Get the best hyperparameters and model performance
best_params = opt_model.best_params_
best_score = opt_model.best_score_

# Fit the Random Forest model with the best hyperparameters
rf_model_best = RandomForestClassifier(**best_params)
rf_model_best.fit(X_train, y_train)

# Evaluate the model on the training data
accuracy = rf_model_best.score(X_train, y_train)
print("Accuracy:", accuracy)
print("Best Hyperparameters:", best_params)
print("Best Score:", best_score)

Accuracy: 0.9528795811518325
Best Hyperparameters: OrderedDict([('max_depth', 5), ('n_estimators', 100)])
Best Score: 0.5456140350877192


In [8]:
# Preprocess the validation set
X_valid = valid.drop(['Direction'] + exclude_vars, axis=1)
y_valid = valid['Direction']

# Predict class labels for the validation set using the best model
y_valid_pred = rf_model_best.predict(X_valid)

# Calculate accuracy
accuracy = rf_model_best.score(X_valid, y_valid)
print("Accuracy:", accuracy)

# Calculate cross-entropy loss (log loss)
loss = log_loss(y_valid, rf_model_best.predict_proba(X_valid))
print("Cross-entropy loss:", loss)

# Calculate AICc
nll = -log_loss(y_valid, rf_model_best.predict_proba(X_valid), normalize=False)
n = len(X_valid)
p = X_valid.shape[1]  # Number of features in X_valid
aic = 2 * p + 2 * nll + (2 * p * (p + 1)) / (n - p - 1)
print("AICc:", aic)

Accuracy: 0.2916666666666667
Cross-entropy loss: 1.0906148069620647
AICc: -115.89880650882697


In [9]:
# Select the independent variables (excluding specified variables)
exclude_vars = ['Increase', 'Decrease', 'Date', 'Difference']
X_train = train.drop(['Direction'] + exclude_vars, axis=1)

# Separate the target variable
y_train = train['Direction']

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the Random Forest model to the training data
rf_model.fit(X_train, y_train)

# Evaluate the model on the training data
accuracy = rf_model.score(X_train, y_train)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [10]:
# Preprocess the validation set
X_valid = valid.drop(['Direction'] + exclude_vars, axis=1)
y_valid = valid['Direction']

# Predict class labels for the validation set
y_valid_pred = rf_model.predict(X_valid)

# Calculate accuracy
accuracy = rf_model.score(X_valid, y_valid)
print("Accuracy:", accuracy)

# Calculate cross-entropy loss (log loss)
loss = log_loss(y_valid, rf_model.predict_proba(X_valid))
print("Cross-entropy loss:", loss)

# Calculate AICc
nll = -log_loss(y_valid, rf_model.predict_proba(X_valid), normalize=False)
n = len(X_valid)
p = X_valid.shape[1]  # Number of features in X_valid
aic = 2 * p + 2 * nll + (2 * p * (p + 1)) / (n - p - 1)
print("AICc:", aic)

Accuracy: 0.625
Cross-entropy loss: 0.9983295308113065
AICc: -111.46911325359059


###Support Vector Machine

In [11]:
# Select the independent variables (excluding specified variables)
exclude_vars = ['Increase', 'Decrease', 'Date', 'Difference']
X_train = train.drop(['Direction'] + exclude_vars, axis=1)

# Separate the target variable
y_train = train['Direction']

# Scale the independent variables
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Fit a SVM classifier
svm_model = SVC(kernel='linear', probability=True)
svm_model.fit(X_train_scaled, y_train)

# Evaluate the model on the training data
accuracy = svm_model.score(X_train_scaled, y_train)
print("Accuracy:", accuracy)

Accuracy: 0.9424083769633508


In [12]:
# Preprocess the validation set
X_valid = valid.drop(['Direction'] + exclude_vars, axis=1)
y_valid = valid['Direction']

# Scale the independent variables of the validation set using the same scaler object
X_valid_scaled = scaler.transform(X_valid)

# Predict class labels for the validation set
y_valid_pred = svm_model.predict(X_valid_scaled)

# Calculate accuracy
accuracy = svm_model.score(X_valid_scaled, y_valid)
print("Accuracy:", accuracy)

# Calculate cross-entropy loss (log loss)
loss = log_loss(y_valid, svm_model.predict_proba(X_valid_scaled))
print("Cross-entropy loss:", loss)

# Calculate AICc
nll = -log_loss(y_valid, svm_model.predict_proba(X_valid_scaled), normalize=False)
n = len(X_valid)
p = X_valid.shape[1]  # Number of features in X_valid
aic = 2 * p + 2 * nll + (2 * p * (p + 1)) / (n - p - 1)
print("AICc:", aic)

Accuracy: 0.5
Cross-entropy loss: 1.0141586731506265
AICc: -112.22891208587794


###Magnitude Testing

In [13]:
# Create logit_increase subset
train_logit_increase = train[train['Increase'] > 0].copy()
train_logit_increase = train_logit_increase.drop(['Decrease', 'Date', 'Difference', 'Direction'], axis=1)

# Create logit_decrease subset
train_logit_decrease = train[train['Decrease'] > 0].copy()
train_logit_decrease = train_logit_decrease.drop(['Increase', 'Date', 'Difference', 'Direction'], axis=1)

# Convert 'Increase' and 'Decrease' variables to categorical
train_logit_increase['Increase'] = train_logit_increase['Increase'].astype('category')
train_logit_decrease['Decrease'] = train_logit_decrease['Decrease'].astype('category')

# Create logit_increase subset
valid_logit_increase = valid[valid['Increase'] > 0].copy()
valid_logit_increase = valid_logit_increase.drop(['Decrease', 'Date', 'Difference', 'Direction'], axis=1)

# Create logit_decrease subset
valid_logit_decrease = valid[valid['Decrease'] > 0].copy()
valid_logit_decrease = valid_logit_decrease.drop(['Increase', 'Date', 'Difference', 'Direction'], axis=1)

# Convert 'Increase' and 'Decrease' variables to categorical
valid_logit_increase['Increase'] = valid_logit_increase['Increase'].astype('category')
valid_logit_decrease['Decrease'] = valid_logit_decrease['Decrease'].astype('category')

In [14]:
unique_values = train_logit_increase['Increase'].unique()
print(unique_values)
unique_values = train_logit_decrease['Decrease'].unique()
print(unique_values)

[0.25, 0.50, 0.75]
Categories (3, float64): [0.25, 0.50, 0.75]
[0.50, 0.25, 0.75]
Categories (3, float64): [0.25, 0.50, 0.75]


  print(unique_values)
  print(unique_values)


###Multinomial Logistic Regression

In [15]:
# Select the independent variables (excluding specified variables)
X_train = train_logit_increase.drop(['Increase'], axis=1)

# Encode the target variable as ordinal
ordinal_encoder = OrdinalEncoder(categories=[[0.25, 0.50, 0.75, 1.00]], handle_unknown='use_encoded_value', unknown_value=-1)
y_train_encoded = ordinal_encoder.fit_transform(train_logit_increase[['Increase']])

# Reshape the target variable to a 1d array
y_train_encoded = y_train_encoded.ravel()

# Scale the independent variables
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Fit a multinomial logistic regression model
logit_increase = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
logit_increase.fit(X_train_scaled, y_train_encoded)

# Evaluate the model on the training data
accuracy = logit_increase.score(X_train_scaled, y_train_encoded)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [16]:
# Encode the target variable of the validation set as ordinal using the fitted ordinal_encoder from train_logit_increase
ordinal_encoder = OrdinalEncoder(categories=[[0.25, 0.50, 0.75, 1.00]], handle_unknown='use_encoded_value', unknown_value=-1)
y_valid_encoded = ordinal_encoder.fit_transform(valid_logit_increase[['Increase']])

# Scale the independent variables of the validation set using the same scaler object
X_valid_scaled = scaler.transform(valid_logit_increase[X_train.columns])

# Predict class labels for the validation set using the trained logit_increase model
y_valid_pred = logit_increase.predict(X_valid_scaled)

# Calculate accuracy
accuracy_valid = accuracy_score(y_valid_encoded, y_valid_pred)
print("Validation Accuracy:", accuracy_valid)

Validation Accuracy: 1.0


In [17]:
# Select the independent variables (excluding specified variables)
X_train = train_logit_decrease.drop(['Decrease'], axis=1)

# Encode the target variable as ordinal
ordinal_encoder = OrdinalEncoder(categories=[[0.25, 0.50, 0.75, 1.00]], handle_unknown='use_encoded_value', unknown_value=-1)
y_train_encoded = ordinal_encoder.fit_transform(train_logit_decrease[['Decrease']])

# Reshape the target variable to a 1d array
y_train_encoded = y_train_encoded.ravel()

# Scale the independent variables
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Fit a multinomial logistic regression model
logit_decrease = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
logit_decrease.fit(X_train_scaled, y_train_encoded)

# Evaluate the model on the training data
accuracy = logit_decrease.score(X_train_scaled, y_train_encoded)
print("Accuracy:", accuracy)

Accuracy: 0.9583333333333334


In [18]:
# Encode the target variable of the validation set as ordinal using the fitted ordinal_encoder from train_logit_decrease
ordinal_encoder = OrdinalEncoder(categories=[[0.25, 0.50, 0.75, 1.00]], handle_unknown='use_encoded_value', unknown_value=-1)
y_valid_encoded = ordinal_encoder.fit_transform(valid_logit_decrease[['Decrease']])

# Scale the independent variables of the validation set using the same scaler object
X_valid_scaled = scaler.transform(valid_logit_decrease[X_train.columns])

# Predict class labels for the validation set using the trained logit_decrease model
y_valid_pred = logit_decrease.predict(X_valid_scaled)

# Calculate accuracy
accuracy_valid = accuracy_score(y_valid_encoded, y_valid_pred)
print("Validation Accuracy:", accuracy_valid)

Validation Accuracy: 0.0


###Random Forest Classifier

In [19]:
# Select the independent variables (excluding specified variables)
X_train = train_logit_increase.drop(['Increase'], axis=1)

# Encode the target variable as ordinal
ordinal_encoder = OrdinalEncoder(categories=[[0.25, 0.50, 0.75, 1.00]], handle_unknown='use_encoded_value', unknown_value=-1)
y_train_encoded = ordinal_encoder.fit_transform(train_logit_increase[['Increase']])

# Reshape the target variable to a 1d array
y_train_encoded = y_train_encoded.ravel()

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Scale the independent variables
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Fit the Random Forest model
rf_model.fit(X_train_scaled, y_train_encoded)

# Evaluate the model on the training data
accuracy = rf_model.score(X_train_scaled, y_train_encoded)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [20]:
# Preprocess the validation set
X_valid = valid_logit_increase.drop(['Increase'], axis=1)
y_valid_encoded = ordinal_encoder.transform(valid_logit_increase[['Increase']])
y_valid_encoded = y_valid_encoded.ravel()
X_valid_scaled = scaler.transform(X_valid)

# Predict class labels for the validation set
y_valid_pred = rf_model.predict(X_valid_scaled)

# Calculate accuracy
accuracy_valid = accuracy_score(y_valid_encoded, y_valid_pred)
print("Validation Accuracy:", accuracy_valid)

Validation Accuracy: 1.0


In [21]:
# Select the independent variables (excluding specified variables)
X_train = train_logit_decrease.drop(['Decrease'], axis=1)

# Encode the target variable as ordinal
ordinal_encoder = OrdinalEncoder(categories=[[0.25, 0.50, 0.75, 1.00]], handle_unknown='use_encoded_value', unknown_value=-1)
y_train_encoded = ordinal_encoder.fit_transform(train_logit_decrease[['Decrease']])

# Reshape the target variable to a 1d array
y_train_encoded = y_train_encoded.ravel()

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Scale the independent variables
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Fit the Random Forest model
rf_model.fit(X_train_scaled, y_train_encoded)

# Evaluate the model on the training data
accuracy = rf_model.score(X_train_scaled, y_train_encoded)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [22]:
# Preprocess the validation set
X_valid = valid_logit_decrease.drop(['Decrease'], axis=1)
y_valid_encoded = ordinal_encoder.transform(valid_logit_decrease[['Decrease']])
y_valid_encoded = y_valid_encoded.ravel()
X_valid_scaled = scaler.transform(X_valid)

# Predict class labels for the validation set
y_valid_pred = rf_model.predict(X_valid_scaled)

# Calculate accuracy
accuracy_valid = accuracy_score(y_valid_encoded, y_valid_pred)
print("Validation Accuracy:", accuracy_valid)

Validation Accuracy: 0.75
