In [1]:
import pandas as pd
from skopt import BayesSearchCV
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from tabulate import tabulate
from sklearn.model_selection import cross_val_score
import joblib
from sklearn.model_selection import KFold
import requests
from io import StringIO

In [2]:
# The URL of the raw dataset on GitHub
url = "https://raw.githubusercontent.com/abactat/BC-Project/main/data/processed/train_data.csv?token=GHSAT0AAAAAACC4ZCNLOP4TN3MBMEYA6GEUZGNHKKQ"

# Send an HTTP GET request to fetch the content of the raw dataset
response = requests.get(url)

# Check if the request was successful (status code 200 means success)
if response.status_code == 200:
    # Read the content as a pandas DataFrame and assign it to the 'train' variable
    train = pd.read_csv(StringIO(response.text))
    print("Dataset downloaded and loaded into 'train' successfully.")
else:
    print(f"Failed to download the dataset. Status code: {response.status_code}")
    
# The URL of the raw dataset on GitHub
url = "https://raw.githubusercontent.com/abactat/BC-Project/main/data/processed/val_data.csv?token=GHSAT0AAAAAACC4ZCNKDUQVUFIGBMWZ7GNAZGNHKUA"

# Send an HTTP GET request to fetch the content of the raw dataset
response = requests.get(url)

# Check if the request was successful (status code 200 means success)
if response.status_code == 200:
    # Read the content as a pandas DataFrame and assign it to the 'valid' variable
    valid = pd.read_csv(StringIO(response.text))
    print("Dataset downloaded and loaded into 'valid' successfully.")
else:
    print(f"Failed to download the dataset. Status code: {response.status_code}")
    
# The URL of the raw dataset on GitHub
url = "https://raw.githubusercontent.com/abactat/BC-Project/main/data/processed/test_data.csv?token=GHSAT0AAAAAACC4ZCNL3AJKGBYNDLDQQFUMZGNHKAA"

# Send an HTTP GET request to fetch the content of the raw dataset
response = requests.get(url)

# Check if the request was successful (status code 200 means success)
if response.status_code == 200:
    # Read the content as a pandas DataFrame and assign it to the 'train' variable
    test = pd.read_csv(StringIO(response.text))
    print("Dataset downloaded and loaded into 'test' successfully.")
else:
    print(f"Failed to download the dataset. Status code: {response.status_code}")

Dataset downloaded and loaded into 'train' successfully.
Dataset downloaded and loaded into 'valid' successfully.
Dataset downloaded and loaded into 'test' successfully.


In [3]:
# Convert variables to numeric in the train dataset
variables_to_convert = train.columns.drop('Date')
train[variables_to_convert] = train[variables_to_convert].apply(pd.to_numeric, errors='coerce')

# Convert variables to numeric in the valid dataset
valid[variables_to_convert] = valid[variables_to_convert].apply(pd.to_numeric, errors='coerce')

# Prepare the data for the model
X_train = train.drop(columns=['Difference', 'Increase', 'Decrease', 'Date'])
y_train = train['Difference']  # Use the 'Difference' variable as the target

X_valid = valid.drop(columns=['Difference', 'Increase', 'Decrease', 'Date'])
y_valid = valid['Difference']  # Use the 'Difference' variable as the target

def calculate_accuracy(y_true, y_pred):
    correct_predictions = 0
    total_predictions = len(y_true)
    
    for true_val, pred_val in zip(y_true, y_pred):
        if true_val == pred_val:
            correct_predictions += 1
            
    accuracy = correct_predictions / total_predictions
    return accuracy

possible_values = [-1.00, -0.75, -0.50, -0.25, 0.00, 0.25, 0.50, 0.75, 1.00]

def round_to_nearest(value, possible_values):
    return min(possible_values, key=lambda x: abs(x - value))

# Function to select top features based on feature importances
def select_top_features(feature_importances, max_features):
    # Get indices of top features based on feature importances
    top_feature_indices = np.argsort(feature_importances)[::-1][:max_features]
    return top_feature_indices

###Regression Tree Model

In [4]:
# Initialize the regression tree model
regression_tree_model = DecisionTreeRegressor()

# Fit the model to the training data
regression_tree_model.fit(X_train, y_train)

# Predict y_train_pred on the training set
y_train_pred = regression_tree_model.predict(X_train)

# Predict y_valid_pred on the validation set
y_valid_pred = regression_tree_model.predict(X_valid)

In [5]:
# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, y_valid_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, y_valid_pred)

# Calculate adjusted R-squared for training set
n_train = X_train.shape[0]
p_train = X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

# Calculate accuracy for training and validation sets
accuracy_train = calculate_accuracy(y_train, y_train_pred)
accuracy_valid = calculate_accuracy(y_valid, y_valid_pred)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid],
    ["RMSE", rmse_train, rmse_valid],
    ["R^2", r2_train, r2_valid],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid],
    ["Accuracy", accuracy_train, accuracy_valid],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

+--------------+----------------+------------------+
| Metric       |   Training Set |   Validation Set |
| MSE          |              0 |        0.0390625 |
+--------------+----------------+------------------+
| RMSE         |              0 |        0.197642  |
+--------------+----------------+------------------+
| R^2          |              1 |        0.398998  |
+--------------+----------------+------------------+
| Adjusted R^2 |              1 |        1.19469   |
+--------------+----------------+------------------+
| Accuracy     |              1 |        0.5       |
+--------------+----------------+------------------+


###Random Forest Model

###Manual Hyperparameter Selection

In [46]:
# Create a random forest regressor with 100 trees
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model to the training data
rf_model.fit(X_train, y_train)

# Predict y_train_pred on the training set
y_train_pred = rf_model.predict(X_train)

# Predict y_valid_pred on the validation set
y_valid_pred = rf_model.predict(X_valid)

# Round the predicted values to the nearest possible value
y_train_pred = [round_to_nearest(val, possible_values) for val in y_train_pred]
y_valid_pred = [round_to_nearest(val, possible_values) for val in y_valid_pred]

In [47]:
# Get the feature importances from the best model
feature_importances = rf_model.feature_importances_

# Get the number of features used (non-zero feature importances)
num_features_used = np.sum(feature_importances > 0)

num_features_used

88

In [48]:
# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, y_valid_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, y_valid_pred)

# Calculate adjusted R-squared for training set
n_train = X_train.shape[0]
p_train = X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

# Calculate accuracy for training and validation sets
accuracy_train = calculate_accuracy(y_train, y_train_pred)
accuracy_valid = calculate_accuracy(y_valid, y_valid_pred)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid],
    ["RMSE", rmse_train, rmse_valid],
    ["R^2", r2_train, r2_valid],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid],
    ["Accuracy", accuracy_train, accuracy_valid],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

+--------------+----------------+------------------+
| Metric       |   Training Set |   Validation Set |
| MSE          |     0.00260417 |        0.0364583 |
+--------------+----------------+------------------+
| RMSE         |     0.051031   |        0.190941  |
+--------------+----------------+------------------+
| R^2          |     0.934957   |        0.439065  |
+--------------+----------------+------------------+
| Adjusted R^2 |     0.871925   |        1.18171   |
+--------------+----------------+------------------+
| Accuracy     |     0.958333   |        0.666667  |
+--------------+----------------+------------------+


In [9]:
# Save the best_rf_model to the specified location
model_filename = r"C:\Users\abact\BC-Project\models\best_random_forest_model_r2.joblib"
joblib.dump(rf_model, model_filename)

['C:\\Users\\abact\\BC-Project\\models\\best_random_forest_model_r2.joblib']

###Optimal Parameter Selection

In [10]:
# Define hyperparameter grid for GridSearchCV
param_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50],  # Set n_estimators to any number between 1 and number of variables
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}

# Create GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model to the training data and find the best hyperparameters
grid_search.fit(X_train, y_train)

# Get the best model with optimal hyperparameters
best_rf_model_5 = grid_search.best_estimator_

# Predict y_train_pred on the training set
y_train_pred = best_rf_model_5.predict(X_train)

# Predict y_valid_pred on the validation set
y_valid_pred = best_rf_model_5.predict(X_valid)

# Round the predicted values to the nearest possible value
y_train_pred = [round_to_nearest(val, possible_values) for val in y_train_pred]
y_valid_pred = [round_to_nearest(val, possible_values) for val in y_valid_pred]

In [11]:
# Get the feature importances from the best model
feature_importances = best_rf_model_5.feature_importances_

# Get the number of features used (non-zero feature importances)
num_features_used = np.sum(feature_importances > 0)

num_features_used

85

In [12]:
# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, y_valid_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, y_valid_pred)

# Calculate adjusted R-squared for training set
n_train = X_train.shape[0]
p_train = X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

# Calculate accuracy for training and validation sets
accuracy_train = calculate_accuracy(y_train, y_train_pred)
accuracy_valid = calculate_accuracy(y_valid, y_valid_pred)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid],
    ["RMSE", rmse_train, rmse_valid],
    ["R^2", r2_train, r2_valid],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid],
    ["Accuracy", accuracy_train, accuracy_valid],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

+--------------+----------------+------------------+
| Metric       |   Training Set |   Validation Set |
| MSE          |     0.00358073 |        0.0286458 |
+--------------+----------------+------------------+
| RMSE         |     0.0598392  |        0.169251  |
+--------------+----------------+------------------+
| R^2          |     0.910565   |        0.559265  |
+--------------+----------------+------------------+
| Adjusted R^2 |     0.823897   |        1.14277   |
+--------------+----------------+------------------+
| Accuracy     |     0.942708   |        0.666667  |
+--------------+----------------+------------------+


In [13]:
# Set a random seed for reproducibility
np.random.seed(1)

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}

# Create a random forest regressor
rf_model = RandomForestRegressor()

# List to store random seeds used in each iteration
random_seeds = []

# Find the optimal number of cross-validation folds and the best random seed
best_cv_score = -np.inf
optimal_cv = None
best_random_seed = None

for cv in range(2, 11):  # Try cross-validation folds from 2 to 10
    # Generate a random seed and store it
    random_seed = np.random.randint(1, 1000)
    random_seeds.append(random_seed)
    
    # Create a random forest regressor with the current random seed
    rf_model = RandomForestRegressor(random_state=random_seed)
    
    grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=cv, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    mean_cv_score = grid_search.best_score_
    
    if mean_cv_score > best_cv_score:
        best_cv_score = mean_cv_score
        optimal_cv = cv
        best_random_seed = random_seed

# Use the optimal number of folds and the best random seed in GridSearchCV
best_rf_model_r2 = RandomForestRegressor(random_state=best_random_seed)
grid_search = GridSearchCV(estimator=best_rf_model_r2, param_grid=param_grid, cv=optimal_cv, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model with optimal hyperparameters
best_rf_model_r2 = grid_search.best_estimator_

# Print the random seeds used in each iteration
print("Best Random Seed:", best_random_seed)

# Predict y_train_pred on the training set
y_train_pred_r2 = best_rf_model_r2.predict(X_train)

# Predict y_valid_pred on the validation set
y_valid_pred_r2 = best_rf_model_r2.predict(X_valid)

# Round the predicted values to the nearest possible value
y_train_pred_r2 = [round_to_nearest(val, possible_values) for val in y_train_pred_r2]
y_valid_pred_r2 = [round_to_nearest(val, possible_values) for val in y_valid_pred_r2]

Best Random Seed: 73


In [14]:
# Get the feature importances from the best model
feature_importances = best_rf_model_r2.feature_importances_

# Get the number of features used (non-zero feature importances)
num_features_used = np.sum(feature_importances > 0)

num_features_used

60

In [15]:
# Function to calculate accuracy based on a threshold
def calculate_accuracy(y_true, y_pred, threshold):
    num_samples = len(y_true)
    correct_predictions = sum(abs(y_true - y_pred) <= threshold)
    return correct_predictions / num_samples

# Calculate Mean Squared Error (MSE) for training set
mse_train_r2 = mean_squared_error(y_train, y_train_pred_r2)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid_r2 = mean_squared_error(y_valid, y_valid_pred_r2)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train_r2 = np.sqrt(mse_train_r2)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid_r2 = np.sqrt(mse_valid_r2)

# Calculate R-squared (R^2) for training set
r2_train_r2 = r2_score(y_train, y_train_pred_r2)

# Calculate R-squared (R^2) for validation set
r2_valid_r2 = r2_score(y_valid, y_valid_pred_r2)

# Calculate adjusted R-squared for training set
n_train_r2 = X_train.shape[0]
p_train_r2 = X_train.shape[1]
adj_r2_train_r2 = 1 - ((1 - r2_train_r2) * (n_train_r2 - 1) / (n_train_r2 - p_train_r2 - 1))

# Calculate adjusted R-squared for validation set
n_valid_r2 = X_valid.shape[0]
p_valid_r2 = X_valid.shape[1]
adj_r2_valid_r2 = 1 - ((1 - r2_valid_r2) * (n_valid_r2 - 1) / (n_valid_r2 - p_valid_r2 - 1))

# Define the threshold for accuracy calculation
threshold = 0.1

# Calculate accuracy for training and validation sets
accuracy_train_r2 = calculate_accuracy(y_train, y_train_pred_r2, threshold)
accuracy_valid_r2 = calculate_accuracy(y_valid, y_valid_pred_r2, threshold)

# Prepare the data for the table
data_r2 = [
    ["MSE", mse_train_r2, mse_valid_r2],
    ["RMSE", rmse_train_r2, rmse_valid_r2],
    ["R^2", r2_train_r2, r2_valid_r2],
    ["Adjusted R^2", adj_r2_train_r2, adj_r2_valid_r2],
    ["Accuracy", accuracy_train_r2, accuracy_valid_r2],
]

# Prepare the headers for the table
headers_r2 = ["Metric", "Training Set", "Validation Set"]

# Display the table
table_r2 = tabulate(data_r2, headers=headers_r2, tablefmt="grid")
print(table_r2)

+--------------+----------------+------------------+
| Metric       |   Training Set |   Validation Set |
| MSE          |      0.0061849 |        0.0286458 |
+--------------+----------------+------------------+
| RMSE         |      0.0786441 |        0.169251  |
+--------------+----------------+------------------+
| R^2          |      0.845522  |        0.559265  |
+--------------+----------------+------------------+
| Adjusted R^2 |      0.695822  |        1.14277   |
+--------------+----------------+------------------+
| Accuracy     |      0.901042  |        0.666667  |
+--------------+----------------+------------------+


In [17]:
# List to store random seeds used in each iteration
random_seeds = []

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}

# Function to calculate accuracy based on a threshold
def calculate_accuracy(y_true, y_pred, threshold):
    num_samples = len(y_true)
    correct_predictions = sum(abs(y_true - y_pred) <= threshold)
    return correct_predictions / num_samples

best_accuracy = -1.0
optimal_cv = None
best_y_train_pred = None
best_y_valid_pred = None
threshold = 0.1  # Define your desired threshold here

for cv in range(2, 11):  # Try cross-validation folds from 2 to 10
    grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=cv, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Get the best model with optimal hyperparameters
    best_rf_model = grid_search.best_estimator_

    # Predict y_train_pred on the training set
    y_train_pred = best_rf_model.predict(X_train)

    # Predict y_valid_pred on the validation set
    y_valid_pred = best_rf_model.predict(X_valid)

    # Round the predicted values to the nearest possible value
    y_train_pred = [round_to_nearest(val, possible_values) for val in y_train_pred]
    y_valid_pred = [round_to_nearest(val, possible_values) for val in y_valid_pred]

    # Calculate accuracy for training and validation sets after rounding
    accuracy_train = calculate_accuracy(y_train, y_train_pred, threshold)
    accuracy_valid = calculate_accuracy(y_valid, y_valid_pred, threshold)
    
    # Check if the accuracy after rounding is higher than the best accuracy so far
    if accuracy_valid > best_accuracy:
        best_accuracy = accuracy_valid
        optimal_cv = cv
        best_y_train_pred = y_train_pred
        best_y_valid_pred = y_valid_pred

    # Store the random seed used in this iteration
    random_seeds.append(np.random.get_state()[1][0])

# Use the optimal number of folds in GridSearchCV
grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=optimal_cv, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model with optimal hyperparameters
best_rf_model = grid_search.best_estimator_

# Get the feature importances from the best model
feature_importances = best_rf_model.feature_importances_

# Get the number of features used (non-zero feature importances)
num_features_used = np.sum(feature_importances > 0)

# Print the random seeds used in each iteration
print("Random Seeds:", random_seeds)

Random Seeds: [2629073562, 2629073562, 2629073562, 2629073562, 2629073562, 2629073562, 2629073562, 2629073562, 2629073562]


In [18]:
# Get the feature importances from the best model
feature_importances = best_rf_model.feature_importances_

# Get the number of features used (non-zero feature importances)
num_features_used = np.sum(feature_importances > 0)

# Print the number of features used in the last iteration
print("Number of Featurees Used:", num_features_used)

# Extract the random seed from the last element of the random_seeds list
random_seed_used = random_seeds[-1]

# Print the random seed used in the last iteration
print("Random Seed Used:", random_seed_used)

Number of Featurees Used: 68
Random Seed Used: 2629073562


In [19]:
def calculate_accuracy(y_true, y_pred, threshold=None):
    correct_predictions = 0
    total_predictions = len(y_true)
    
    if threshold is not None:
        for true_val, pred_val in zip(y_true, y_pred):
            if abs(true_val - pred_val) <= threshold:
                correct_predictions += 1
    else:
        for true_val, pred_val in zip(y_true, y_pred):
            if true_val == pred_val:
                correct_predictions += 1
            
    accuracy = correct_predictions / total_predictions
    return accuracy

# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, best_y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, best_y_valid_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, best_y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, best_y_valid_pred)

# Calculate adjusted R-squared for training set
n_train = X_train.shape[0]
p_train = X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

# Calculate accuracy for training and validation sets with the threshold
accuracy_train = calculate_accuracy(y_train, best_y_train_pred, threshold)
accuracy_valid = calculate_accuracy(y_valid, best_y_valid_pred, threshold)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid],
    ["RMSE", rmse_train, rmse_valid],
    ["R^2", r2_train, r2_valid],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid],
    ["Accuracy", accuracy_train, accuracy_valid],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

+--------------+----------------+------------------+
| Metric       |   Training Set |   Validation Set |
| MSE          |     0.00488281 |        0.0364583 |
+--------------+----------------+------------------+
| RMSE         |     0.0698771  |        0.190941  |
+--------------+----------------+------------------+
| R^2          |     0.878044   |        0.439065  |
+--------------+----------------+------------------+
| Adjusted R^2 |     0.759859   |        1.18171   |
+--------------+----------------+------------------+
| Accuracy     |     0.921875   |        0.666667  |
+--------------+----------------+------------------+


In [43]:
# Set the specific random seed and number of features
specific_random_seed = 2629073562
num_features = 44
np.random.seed(specific_random_seed)

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}

# Function to calculate accuracy based on a threshold
def calculate_accuracy(y_true, y_pred, threshold):
    num_samples = len(y_true)
    correct_predictions = sum(abs(y_true - y_pred) <= threshold)
    return correct_predictions / num_samples

best_accuracy = -1.0
optimal_cv = None
best_y_train_pred = None
best_y_valid_pred = None
threshold = 0.1  # Define your desired threshold here

for cv in range(2, 11):  # Try cross-validation folds from 2 to 10
    grid_search = GridSearchCV(estimator=RandomForestRegressor(n_estimators=num_features), param_grid=param_grid, cv=cv, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Get the best model with optimal hyperparameters
    best_rf_model = grid_search.best_estimator_

    # Predict y_train_pred on the training set
    y_train_pred = best_rf_model.predict(X_train)

    # Predict y_valid_pred on the validation set
    y_valid_pred = best_rf_model.predict(X_valid)

    # Round the predicted values to the nearest possible value
    y_train_pred = [round_to_nearest(val, possible_values) for val in y_train_pred]
    y_valid_pred = [round_to_nearest(val, possible_values) for val in y_valid_pred]

    # Calculate accuracy for validation set after rounding
    accuracy_valid = calculate_accuracy(y_valid, y_valid_pred, threshold)
    
    # Check if the accuracy after rounding is higher than the best accuracy so far
    if accuracy_valid > best_accuracy:
        best_accuracy = accuracy_valid
        optimal_cv = cv
        best_y_train_pred = y_train_pred
        best_y_valid_pred = y_valid_pred

# Use the optimal number of folds in GridSearchCV
grid_search = GridSearchCV(estimator=RandomForestRegressor(n_estimators=num_features), param_grid=param_grid, cv=optimal_cv, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model with optimal hyperparameters
best_rf_model = grid_search.best_estimator_

# Get the feature importances from the best model
feature_importances = best_rf_model.feature_importances_

# Get the number of features used (non-zero feature importances)
num_features_used = np.sum(feature_importances > 0)

# Print the specific random seed and number of features used
print("Specific Random Seed:", specific_random_seed)
print("Number of Features:", num_features)

Specific Random Seed: 2629073562
Number of Features: 44


In [44]:
# Get the feature importances from the best model
feature_importances = best_rf_model.feature_importances_

# Get the number of features used (non-zero feature importances)
num_features_used = np.sum(feature_importances > 0)

# Print the number of features used in the last iteration
print("Number of Featurees Used:", num_features_used)

# Extract the random seed from the last element of the random_seeds list
random_seed_used = random_seeds[-1]

# Print the random seed used in the last iteration
print("Random Seed Used:", random_seed_used)

Number of Featurees Used: 35
Random Seed Used: 2629073562


In [45]:
def calculate_accuracy(y_true, y_pred, threshold=None):
    correct_predictions = 0
    total_predictions = len(y_true)
    
    if threshold is not None:
        for true_val, pred_val in zip(y_true, y_pred):
            if abs(true_val - pred_val) <= threshold:
                correct_predictions += 1
    else:
        for true_val, pred_val in zip(y_true, y_pred):
            if true_val == pred_val:
                correct_predictions += 1
            
    accuracy = correct_predictions / total_predictions
    return accuracy

# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, best_y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, best_y_valid_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, best_y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, best_y_valid_pred)

# Calculate adjusted R-squared for training set
n_train = X_train.shape[0]
p_train = X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

# Calculate accuracy for training and validation sets with the threshold
accuracy_train = calculate_accuracy(y_train, best_y_train_pred, threshold)
accuracy_valid = calculate_accuracy(y_valid, best_y_valid_pred, threshold)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid],
    ["RMSE", rmse_train, rmse_valid],
    ["R^2", r2_train, r2_valid],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid],
    ["Accuracy", accuracy_train, accuracy_valid],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

+--------------+----------------+------------------+
| Metric       |   Training Set |   Validation Set |
| MSE          |     0.00358073 |        0.0260417 |
+--------------+----------------+------------------+
| RMSE         |     0.0598392  |        0.161374  |
+--------------+----------------+------------------+
| R^2          |     0.910565   |        0.599332  |
+--------------+----------------+------------------+
| Adjusted R^2 |     0.823897   |        1.12979   |
+--------------+----------------+------------------+
| Accuracy     |     0.942708   |        0.708333  |
+--------------+----------------+------------------+


In [20]:
# Save the best_rf_model to the specified location
model_filename = r"C:\Users\abact\BC-Project\models\best_random_forest_model.joblib"
joblib.dump(best_rf_model, model_filename)

['C:\\Users\\abact\\BC-Project\\models\\best_random_forest_model.joblib']

###Support Vector Machine

In [21]:
# Create the SVR model
svr_model = SVR()

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.1, 1.0, 10.0],
}

# Create GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(estimator=svr_model, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model to the training data and find the best hyperparameters
grid_search.fit(X_train, y_train)

# Get the best model with the optimal hyperparameters
best_svr_model = grid_search.best_estimator_

# Predict y_train_pred and y_valid_pred using the best model
y_train_pred = best_svr_model.predict(X_train)
y_valid_pred = best_svr_model.predict(X_valid)

In [22]:
# Convert fitted values to the closest allowed values for the validation set
y_train_pred = [round_to_nearest(pred, possible_values) for pred in y_train_pred]

# Convert fitted values to the closest allowed values for the validation set
y_valid_pred = [round_to_nearest(pred, possible_values) for pred in y_valid_pred]

In [23]:
# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, y_valid_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, y_valid_pred)

# Calculate adjusted R-squared for training set
n_train = X_train.shape[0]
p_train = X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

# Calculate accuracy for training and validation sets
accuracy_train = calculate_accuracy(y_train, y_train_pred)
accuracy_valid = calculate_accuracy(y_valid, y_valid_pred)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid],
    ["RMSE", rmse_train, rmse_valid],
    ["R^2", r2_train, r2_valid],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid],
    ["Accuracy", accuracy_train, accuracy_valid],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

+--------------+----------------+------------------+
| Metric       |   Training Set |   Validation Set |
| MSE          |     0.0403646  |       0.0651042  |
+--------------+----------------+------------------+
| RMSE         |     0.200909   |       0.255155   |
+--------------+----------------+------------------+
| R^2          |    -0.00817277 |      -0.00166945 |
+--------------+----------------+------------------+
| Adjusted R^2 |    -0.985165   |       1.32448    |
+--------------+----------------+------------------+
| Accuracy     |     0.697917   |       0.583333   |
+--------------+----------------+------------------+


In [24]:
# Create the SVR model
svr_model = SVR()

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.1, 1.0, 10.0],
}

# Create GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(estimator=svr_model, param_grid=param_grid, cv=3, n_jobs=-1)

# Fit the model to the training data and find the best hyperparameters
grid_search.fit(X_train, y_train)

# Get the best model with the optimal hyperparameters
best_svr_model = grid_search.best_estimator_

# Predict y_train_pred and y_valid_pred using the best model
y_train_pred = best_svr_model.predict(X_train)
y_valid_pred = best_svr_model.predict(X_valid)

In [25]:
# Convert fitted values to the closest allowed values for the validation set
y_train_pred = [round_to_nearest(pred, possible_values) for pred in y_train_pred]

# Convert fitted values to the closest allowed values for the validation set
y_valid_pred = [round_to_nearest(pred, possible_values) for pred in y_valid_pred]

In [26]:
# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, y_valid_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, y_valid_pred)

# Calculate adjusted R-squared for training set
n_train = X_train.shape[0]
p_train = X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

# Calculate accuracy for training and validation sets
accuracy_train = calculate_accuracy(y_train, y_train_pred)
accuracy_valid = calculate_accuracy(y_valid, y_valid_pred)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid],
    ["RMSE", rmse_train, rmse_valid],
    ["R^2", r2_train, r2_valid],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid],
    ["Accuracy", accuracy_train, accuracy_valid],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

+--------------+----------------+------------------+
| Metric       |   Training Set |   Validation Set |
| MSE          |     0.0403646  |       0.0651042  |
+--------------+----------------+------------------+
| RMSE         |     0.200909   |       0.255155   |
+--------------+----------------+------------------+
| R^2          |    -0.00817277 |      -0.00166945 |
+--------------+----------------+------------------+
| Adjusted R^2 |    -0.985165   |       1.32448    |
+--------------+----------------+------------------+
| Accuracy     |     0.697917   |       0.583333   |
+--------------+----------------+------------------+


In [27]:
# Create the SVR model
svr_model = SVR()

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.1, 1.0, 10.0],
}

# Create GridSearchCV with 7-fold cross-validation
grid_search = GridSearchCV(estimator=svr_model, param_grid=param_grid, cv=7, n_jobs=-1)

# Fit the model to the training data and find the best hyperparameters
grid_search.fit(X_train, y_train)

# Get the best model with the optimal hyperparameters
best_svr_model = grid_search.best_estimator_

# Predict y_train_pred and y_valid_pred using the best model
y_train_pred = best_svr_model.predict(X_train)
y_valid_pred = best_svr_model.predict(X_valid)

# Convert fitted values to the closest allowed values for the validation set
y_train_pred = [round_to_nearest(pred, possible_values) for pred in y_train_pred]

# Convert fitted values to the closest allowed values for the validation set
y_valid_pred = [round_to_nearest(pred, possible_values) for pred in y_valid_pred]

In [28]:
# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, y_valid_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, y_valid_pred)

# Calculate adjusted R-squared for training set
n_train = X_train.shape[0]
p_train = X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

# Calculate accuracy for training and validation sets
accuracy_train = calculate_accuracy(y_train, y_train_pred)
accuracy_valid = calculate_accuracy(y_valid, y_valid_pred)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid],
    ["RMSE", rmse_train, rmse_valid],
    ["R^2", r2_train, r2_valid],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid],
    ["Accuracy", accuracy_train, accuracy_valid],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

+--------------+----------------+------------------+
| Metric       |   Training Set |   Validation Set |
| MSE          |     0.0403646  |       0.0651042  |
+--------------+----------------+------------------+
| RMSE         |     0.200909   |       0.255155   |
+--------------+----------------+------------------+
| R^2          |    -0.00817277 |      -0.00166945 |
+--------------+----------------+------------------+
| Adjusted R^2 |    -0.985165   |       1.32448    |
+--------------+----------------+------------------+
| Accuracy     |     0.697917   |       0.583333   |
+--------------+----------------+------------------+


In [29]:
# Create the SVR model
svr_model = SVR()

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.1, 1.0, 3.0, 5.0, 7.0, 10.0],
}

# Create GridSearchCV with 3-fold cross-validation
grid_search = GridSearchCV(estimator=svr_model, param_grid=param_grid, cv=3, n_jobs=-1)

# Fit the model to the training data and find the best hyperparameters
grid_search.fit(X_train, y_train)

# Get the best model with the optimal hyperparameters
best_svr_model = grid_search.best_estimator_

# Predict y_train_pred and y_valid_pred using the best model
y_train_pred = best_svr_model.predict(X_train)
y_valid_pred = best_svr_model.predict(X_valid)

# Convert fitted values to the closest allowed values for the validation set
y_train_pred = [round_to_nearest(pred, possible_values) for pred in y_train_pred]

# Convert fitted values to the closest allowed values for the validation set
y_valid_pred = [round_to_nearest(pred, possible_values) for pred in y_valid_pred]

In [30]:
# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, y_valid_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, y_valid_pred)

# Calculate adjusted R-squared for training set
n_train = X_train.shape[0]
p_train = X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

# Calculate accuracy for training and validation sets
accuracy_train = calculate_accuracy(y_train, y_train_pred)
accuracy_valid = calculate_accuracy(y_valid, y_valid_pred)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid],
    ["RMSE", rmse_train, rmse_valid],
    ["R^2", r2_train, r2_valid],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid],
    ["Accuracy", accuracy_train, accuracy_valid],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

+--------------+----------------+------------------+
| Metric       |   Training Set |   Validation Set |
| MSE          |     0.0403646  |       0.0651042  |
+--------------+----------------+------------------+
| RMSE         |     0.200909   |       0.255155   |
+--------------+----------------+------------------+
| R^2          |    -0.00817277 |      -0.00166945 |
+--------------+----------------+------------------+
| Adjusted R^2 |    -0.985165   |       1.32448    |
+--------------+----------------+------------------+
| Accuracy     |     0.697917   |       0.583333   |
+--------------+----------------+------------------+
