In [328]:
import pandas as pd
import numpy as np
import requests
from io import StringIO
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
import random
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score
from tabulate import tabulate
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

In [94]:
# The URL of the raw dataset on GitHub
url = "https://raw.githubusercontent.com/abactat/BC-Project/main/data/dataset_adjusted.csv?token=GHSAT0AAAAAACC4ZCNKN5F6XR7HZA75QWTEZGQVYIA"

# Send an HTTP GET request to fetch the content of the raw dataset
response = requests.get(url)

# Check if the request was successful (status code 200 means success)
if response.status_code == 200:
    # Read the content as a pandas DataFrame and assign it to the 'train' variable
    dataset_adjusted = pd.read_csv(StringIO(response.text))
    print("Dataset downloaded and loaded into 'dataset_adjusted' successfully.")
else:
    print(f"Failed to download the dataset. Status code: {response.status_code}")
    
# The URL of the raw dataset on GitHub
url = "https://raw.githubusercontent.com/abactat/BC-Project/main/data/raw/dataset_words.csv?token=GHSAT0AAAAAACC4ZCNKDB2KTQ7KVOHGVHMAZGQVX7A"

# Send an HTTP GET request to fetch the content of the raw dataset
response = requests.get(url)

# Check if the request was successful (status code 200 means success)
if response.status_code == 200:
    # Read the content as a pandas DataFrame and assign it to the 'valid' variable
    words = pd.read_csv(StringIO(response.text))
    print("Dataset downloaded and loaded into 'words' successfully.")
else:
    print(f"Failed to download the dataset. Status code: {response.status_code}")

Dataset downloaded and loaded into 'dataset_adjusted' successfully.
Dataset downloaded and loaded into 'words' successfully.


In [145]:
# Load the sentiment word list from the CSV file into a dictionary
sentiment_word_list = {}
with open(r"C:\Users\abact\BC-Project\data\external\Loughran-McDonald_MasterDictionary_1993-2021.csv", 'r') as file:
    # Skip the header line
    next(file)

    for line in file:
        values = line.strip().split(',')

        # Extract the necessary values
        word = values[0].lower()
        positive = float(values[8])  # Positive column index is 8
        negative = float(values[7])  # Negative column index is 7

        # Assign the word as positive or negative based on the positive or negative values
        if positive == 2009:
            sentiment_word_list[word] = 1
        elif negative == 2009:
            sentiment_word_list[word] = -1

# Convert the sentiment word list keys to lowercase
selected_words = set(sentiment_word_list.keys())

# Filter the 'words' DataFrame to include only columns that are present in both 'selected_words' and 'words'
common_columns = selected_words.intersection(words.columns)
subset_words = words[list(common_columns)].copy()

# Multiply sentiment values to the vectorized text columns in the 'subset_words' DataFrame
for column in subset_words.columns:
    sentiment_value = sentiment_word_list.get(column, 0)
    if sentiment_value == 1:
        subset_words.loc[:, column] = subset_words[column] * 1  # Multiply by 1 for positive sentiment
    elif sentiment_value == -1:
        subset_words.loc[:, column] = subset_words[column] * -1  # Multiply by -1 for negative sentiment
    else:
        subset_words.loc[:, column] = subset_words[column] * 0  # Multiply by 0 for unknown sentiment

In [146]:
# Concatenate the vectorized DataFrame with the original dataset
full_dataset = pd.concat([dataset_adjusted, subset_words], axis=1)

full_dataset['Date'] = pd.to_datetime(full_dataset['Date'])

# Calculate the time difference in days from the first date
full_dataset['Date'] = (full_dataset['Date'] - full_dataset['Date'].min()).dt.days

In [147]:
# Check for the number of missing values in 'train' DataFrame
missing_values_count = full_dataset.isna().sum()

# Print the count of missing values for each column
print(missing_values_count)

Federal_Reserve_Mins    0
Preprocessed Text       0
Date                    0
Difference              0
Increase                0
                       ..
enhance                 0
unanticipated           0
burdensome              0
impressive              0
lose                    0
Length: 837, dtype: int64


In [148]:
# Filter columns with missing values
columns_with_missing_values = missing_values_count[missing_values_count > 0]

# Print columns with missing values and their counts
print("Columns with Missing Values:")
for column, count in columns_with_missing_values.items():
    print(f"{column}: {count}")

Columns with Missing Values:


In [149]:
# Split into training, validation, and test sets
train, valid = train_test_split(full_dataset, test_size=0.2, shuffle=False)

# Split the combined set into validation and test sets
valid, test = train_test_split(valid, test_size=0.5, shuffle=False)

# Verify the sizes of each set
print("Training set size:", len(train))
print("Validation set size:", len(valid))
print("Test set size:", len(test))

Training set size: 192
Validation set size: 24
Test set size: 24


In [150]:
# Convert variables to numeric in the train dataset
variables_to_convert = train.columns.drop('Date')
train[variables_to_convert] = train[variables_to_convert].apply(pd.to_numeric, errors='coerce')

# Convert variables to numeric in the valid dataset
valid[variables_to_convert] = valid[variables_to_convert].apply(pd.to_numeric, errors='coerce')

# Convert variables to numeric in the test dataset
test[variables_to_convert] = test[variables_to_convert].apply(pd.to_numeric, errors='coerce')

# Prepare the data for the model
X_train = train.drop(columns=['Difference', 'Increase', 'Decrease', 'Date'])
y_train = train['Difference']  # Use the 'Difference' variable as the target

X_valid = valid.drop(columns=['Difference', 'Increase', 'Decrease', 'Date'])
y_valid = valid['Difference']  # Use the 'Difference' variable as the target

X_test = test.drop(columns=['Difference', 'Increase', 'Decrease', 'Date'])
y_test = test['Difference']  # Use the 'Difference' variable as the target

In [151]:
def calculate_accuracy(y_true, y_pred):
    correct_predictions = 0
    total_predictions = len(y_true)
    
    for true_val, pred_val in zip(y_true, y_pred):
        if true_val == pred_val:
            correct_predictions += 1
            
    accuracy = correct_predictions / total_predictions
    return accuracy

possible_values = [-1.00, -0.75, -0.50, -0.25, 0.00, 0.25, 0.50, 0.75, 1.00]

def round_to_nearest(value, possible_values):
    return min(possible_values, key=lambda x: abs(x - value))

In [152]:
# Check for missing values in 'train' DataFrame
missing_values_count = train.isna().sum()

# Get the list of columns with missing values
columns_with_missing_values = missing_values_count[missing_values_count > 0].index.tolist()

# Drop columns with missing values from 'train'
train = train.drop(columns=columns_with_missing_values)

# Prepare the data for the model
X_train = train.drop(columns=['Difference', 'Increase', 'Decrease'])
y_train = train['Difference']  # Use the 'Difference' variable as the target

# Check for missing values in 'valid' DataFrame
missing_values_count = valid.isna().sum()

# Get the list of columns with missing values
columns_with_missing_values = missing_values_count[missing_values_count > 0].index.tolist()

# Drop columns with missing values from 'valid'
valid = valid.drop(columns=columns_with_missing_values)

# Prepare the data for the model
X_valid = valid.drop(columns=['Difference', 'Increase', 'Decrease'])
y_valid = valid['Difference']  # Use the 'Difference' variable as the target

# Check for missing values in 'test' DataFrame
missing_values_count = test.isna().sum()

# Get the list of columns with missing values
columns_with_missing_values = missing_values_count[missing_values_count > 0].index.tolist()

# Drop columns with missing values from 'test'
test = test.drop(columns=columns_with_missing_values)

# Prepare the data for the model
X_test = test.drop(columns=['Difference', 'Increase', 'Decrease'])
y_test = test['Difference']  # Use the 'Difference' variable as the target

In [153]:
# Check for the number of missing values in 'train' DataFrame
missing_values_count = train.isna().sum()

# Print the count of missing values for each column
print(missing_values_count)

# Filter columns with missing values
columns_with_missing_values = missing_values_count[missing_values_count > 0]

# Print columns with missing values and their counts
print("Columns with Missing Values:")
for column, count in columns_with_missing_values.items():
    print(f"{column}: {count}")

Date             0
Difference       0
Increase         0
Decrease         0
Level            0
                ..
enhance          0
unanticipated    0
burdensome       0
impressive       0
lose             0
Length: 833, dtype: int64
Columns with Missing Values:


In [165]:
# Set the random seed for reproducibility
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}

# Create GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model to the training data and find the best hyperparameters
grid_search.fit(X_train, y_train)

# Get the best model with optimal hyperparameters
best_rf_model_5 = grid_search.best_estimator_

# Predict y_train_pred on the training set
y_train_pred = best_rf_model_5.predict(X_train)

# Predict y_valid_pred on the validation set
y_valid_pred = best_rf_model_5.predict(X_valid)

# Round the predicted values to the nearest possible value
y_train_pred = [round_to_nearest(val, possible_values) for val in y_train_pred]
y_valid_pred = [round_to_nearest(val, possible_values) for val in y_valid_pred]

# Output the random seed
print("Random seed:", random_seed)

Random seed: 42


In [166]:
# Get the feature importances from the best model
feature_importances = best_rf_model_5.feature_importances_

# Get the number of features used (non-zero feature importances)
num_features_used = np.sum(feature_importances > 0)

num_features_used

99

In [167]:
# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, y_valid_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, y_valid_pred)

# Calculate adjusted R-squared for training set
n_train = X_train.shape[0]
p_train = X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

# Calculate accuracy for training and validation sets
accuracy_train = calculate_accuracy(y_train, y_train_pred)
accuracy_valid = calculate_accuracy(y_valid, y_valid_pred)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid],
    ["RMSE", rmse_train, rmse_valid],
    ["R^2", r2_train, r2_valid],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid],
    ["Accuracy", accuracy_train, accuracy_valid],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

+--------------+----------------+------------------+
| Metric       |   Training Set |   Validation Set |
| MSE          |     0.00358073 |        0.0260417 |
+--------------+----------------+------------------+
| RMSE         |     0.0598392  |        0.161374  |
+--------------+----------------+------------------+
| R^2          |     0.910565   |        0.599332  |
+--------------+----------------+------------------+
| Adjusted R^2 |     1.02673    |        1.01142   |
+--------------+----------------+------------------+
| Accuracy     |     0.942708   |        0.708333  |
+--------------+----------------+------------------+


In [88]:
# List to store random seeds used in each iteration
random_seeds = []

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}

# Function to calculate accuracy based on a threshold
def calculate_accuracy(y_true, y_pred, threshold):
    num_samples = len(y_true)
    correct_predictions = sum(abs(y_true - y_pred) <= threshold)
    return correct_predictions / num_samples

best_accuracy = -1.0
optimal_cv = None
best_y_train_pred = None
best_y_valid_pred = None
threshold = 0.1  # Define your desired threshold here

for cv in range(2, 11):  # Try cross-validation folds from 2 to 10
    # Set the random seed for reproducibility
    random_seed = cv  # Use cv as the random seed
    random_seeds.append(random_seed)
    random.seed(random_seed)
    np.random.seed(random_seed)
    
    grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=cv, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Get the best model with optimal hyperparameters
    best_rf_model = grid_search.best_estimator_

    # Predict y_train_pred on the training set
    y_train_pred = best_rf_model.predict(X_train)

    # Predict y_valid_pred on the validation set
    y_valid_pred = best_rf_model.predict(X_valid)

    # Round the predicted values to the nearest possible value
    y_train_pred = [round_to_nearest(val, possible_values) for val in y_train_pred]
    y_valid_pred = [round_to_nearest(val, possible_values) for val in y_valid_pred]

    # Calculate accuracy for training and validation sets after rounding
    accuracy_train = calculate_accuracy(y_train, y_train_pred, threshold)
    accuracy_valid = calculate_accuracy(y_valid, y_valid_pred, threshold)
    
    # Check if the accuracy after rounding is higher than the best accuracy so far
    if accuracy_valid > best_accuracy:
        best_accuracy = accuracy_valid
        optimal_cv = cv
        best_y_train_pred = y_train_pred
        best_y_valid_pred = y_valid_pred

# Use the optimal number of folds in GridSearchCV
grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=optimal_cv, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model with optimal hyperparameters
best_rf_model = grid_search.best_estimator_

# Get the feature importances from the best model
feature_importances = best_rf_model.feature_importances_

# Get the number of features used (non-zero feature importances)
num_features_used = np.sum(feature_importances > 0)

# Print the random seeds used in each iteration
print("Random Seeds:", random_seeds)

Random Seeds: [2, 3, 4, 5, 6, 7, 8, 9, 10]


In [89]:
# Get the feature importances from the best model
feature_importances = best_rf_model.feature_importances_

# Get the number of features used (non-zero feature importances)
num_features_used = np.sum(feature_importances > 0)

num_features_used

50

In [91]:
# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, y_valid_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, y_valid_pred)

# Calculate adjusted R-squared for training set
n_train = X_train.shape[0]
p_train = X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

threshold = 0.1
accuracy_train = calculate_accuracy(y_train, y_train_pred, threshold)
accuracy_valid = calculate_accuracy(y_valid, y_valid_pred, threshold)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid],
    ["RMSE", rmse_train, rmse_valid],
    ["R^2", r2_train, r2_valid],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid],
    ["Accuracy", accuracy_train, accuracy_valid],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

+--------------+----------------+------------------+
| Metric       |   Training Set |   Validation Set |
| MSE          |     0.00455729 |         0.03125  |
+--------------+----------------+------------------+
| RMSE         |     0.0675077  |         0.176777 |
+--------------+----------------+------------------+
| R^2          |     0.886174   |         0.519199 |
+--------------+----------------+------------------+
| Adjusted R^2 |     1.00323    |         1.0016   |
+--------------+----------------+------------------+
| Accuracy     |     0.927083   |         0.625    |
+--------------+----------------+------------------+


###Data Centric AI

In [157]:
subset_words.describe()

Unnamed: 0,divestiture,unfounded,caution,hampering,unnecessarily,shutdown,ineffective,trouble,conspired,enable,...,disrupt,worsened,downturn,worsen,mistaken,enhance,unanticipated,burdensome,impressive,lose
count,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,...,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0
mean,-0.004167,-0.004167,-3.975,-0.004167,-0.154167,-0.975,-0.020833,-0.025,-0.004167,0.391667,...,-0.025,-0.570833,-3.354167,-0.941667,-0.0125,1.9,-2.7,-0.004167,0.933333,-0.020833
std,0.06455,0.06455,14.327878,0.06455,2.324416,9.95888,0.143125,0.156451,0.06455,2.726887,...,0.156451,3.247655,13.131918,5.056659,0.111335,7.92739,12.590493,0.06455,5.355533,0.213516
min,-1.0,-1.0,-141.0,-1.0,-36.0,-152.0,-1.0,-1.0,-1.0,0.0,...,-1.0,-37.0,-141.0,-53.0,-1.0,0.0,-147.0,-1.0,0.0,-3.0
25%,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.0,...,0.0,0.0,0.0,0.0,0.0,66.0,0.0,0.0,49.0,0.0


In [247]:
# Calculate the mean of each column in the 'subset_words' DataFrame
column_means = subset_words.mean()

# Create a subset of columns with mean >= |3|
selected_columns = column_means[column_means.abs() >= 3].index

# Create a new DataFrame with only the selected columns
subset_words_mean_3 = subset_words[selected_columns]

subset_words_mean_3.describe()

Unnamed: 0,caution,severe,desired,liquidation,favored,improving,stable,disappointing,tightening,unemployment,...,weakness,adverse,slow,diminished,question,imbalance,strengthening,slowing,rebounded,downturn
count,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,...,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0
mean,-3.975,-4.083333,3.816667,-4.791667,6.0125,7.108333,5.933333,-3.854167,-22.875,-23.754167,...,-28.266667,-11.283333,-8.441667,-5.2625,-8.616667,-3.875,13.808333,-15.433333,4.2875,-3.354167
std,14.327878,19.696052,13.212447,19.041859,22.626766,20.65306,14.244566,16.120609,56.657757,39.60424,...,77.153624,27.36234,18.384683,13.457272,26.702822,12.461195,40.217358,34.19333,11.234876,13.131918
min,-141.0,-254.0,0.0,-173.0,0.0,0.0,0.0,-130.0,-358.0,-344.0,...,-634.0,-209.0,-148.0,-106.0,-243.0,-105.0,0.0,-243.0,0.0,-141.0
25%,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,-8.25,-21.0,...,-8.0,-4.0,-5.0,-3.0,-1.0,-2.0,0.0,-6.0,0.0,-1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,-2.0,-12.5,...,-2.0,-2.0,-3.0,0.0,0.0,0.0,1.0,-2.0,1.0,0.0
75%,0.0,0.0,0.25,0.0,1.0,2.0,6.0,0.0,-1.0,-6.0,...,-0.75,0.0,-1.0,0.0,0.0,0.0,3.0,-1.0,2.0,0.0
max,0.0,0.0,116.0,0.0,277.0,143.0,139.0,0.0,0.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,304.0,0.0,119.0,0.0


In [248]:
# Concatenate the vectorized DataFrame with the original dataset
second_dataset = pd.concat([dataset_adjusted, subset_words_mean_3], axis=1)

second_dataset['Date'] = pd.to_datetime(second_dataset['Date'])

# Calculate the time difference in days from the first date
second_dataset['Date'] = (second_dataset['Date'] - second_dataset['Date'].min()).dt.days

In [249]:
# Split into training, validation, and test sets
train, valid = train_test_split(second_dataset, test_size=0.2, shuffle=False)

# Split the combined set into validation and test sets
valid, test = train_test_split(valid, test_size=0.5, shuffle=False)

# Verify the sizes of each set
print("Training set size:", len(train))
print("Validation set size:", len(valid))
print("Test set size:", len(test))

Training set size: 192
Validation set size: 24
Test set size: 24


In [250]:
# Convert variables to numeric in the train dataset
variables_to_convert = train.columns.drop('Date')
train[variables_to_convert] = train[variables_to_convert].apply(pd.to_numeric, errors='coerce')

# Convert variables to numeric in the valid dataset
valid[variables_to_convert] = valid[variables_to_convert].apply(pd.to_numeric, errors='coerce')

# Convert variables to numeric in the test dataset
test[variables_to_convert] = test[variables_to_convert].apply(pd.to_numeric, errors='coerce')

# Prepare the data for the model
X_train = train.drop(columns=['Difference', 'Increase', 'Decrease', 'Date'])
y_train = train['Difference']  # Use the 'Difference' variable as the target

X_valid = valid.drop(columns=['Difference', 'Increase', 'Decrease', 'Date'])
y_valid = valid['Difference']  # Use the 'Difference' variable as the target

X_test = test.drop(columns=['Difference', 'Increase', 'Decrease', 'Date'])
y_test = test['Difference']  # Use the 'Difference' variable as the target

In [251]:
# Check for missing values in 'train' DataFrame
missing_values_count = train.isna().sum()

# Get the list of columns with missing values
columns_with_missing_values = missing_values_count[missing_values_count > 0].index.tolist()

# Drop columns with missing values from 'train'
train = train.drop(columns=columns_with_missing_values)

# Prepare the data for the model
X_train = train.drop(columns=['Difference', 'Increase', 'Decrease'])
y_train = train['Difference']  # Use the 'Difference' variable as the target

# Check for missing values in 'valid' DataFrame
missing_values_count = valid.isna().sum()

# Get the list of columns with missing values
columns_with_missing_values = missing_values_count[missing_values_count > 0].index.tolist()

# Drop columns with missing values from 'valid'
valid = valid.drop(columns=columns_with_missing_values)

# Prepare the data for the model
X_valid = valid.drop(columns=['Difference', 'Increase', 'Decrease'])
y_valid = valid['Difference']  # Use the 'Difference' variable as the target

# Check for missing values in 'test' DataFrame
missing_values_count = test.isna().sum()

# Get the list of columns with missing values
columns_with_missing_values = missing_values_count[missing_values_count > 0].index.tolist()

# Drop columns with missing values from 'test'
test = test.drop(columns=columns_with_missing_values)

# Prepare the data for the model
X_test = test.drop(columns=['Difference', 'Increase', 'Decrease'])
y_test = test['Difference']  # Use the 'Difference' variable as the target

In [252]:
# Set the random seed for reproducibility
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}

# Create GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model to the training data and find the best hyperparameters
grid_search.fit(X_train, y_train)

# Get the best model with optimal hyperparameters
second_rf_model_5 = grid_search.best_estimator_

# Predict y_train_pred on the training set
y_train_pred = second_rf_model_5.predict(X_train)

# Predict y_valid_pred on the validation set
y_valid_pred = second_rf_model_5.predict(X_valid)

# Round the predicted values to the nearest possible value
y_train_pred = [round_to_nearest(val, possible_values) for val in y_train_pred]
y_valid_pred = [round_to_nearest(val, possible_values) for val in y_valid_pred]

# Output the random seed
print("Random seed:", random_seed)

Random seed: 42


In [253]:
# Get the feature importances from the best model
feature_importances = second_rf_model_5.feature_importances_

# Get the number of features used (non-zero feature importances)
num_features_used = np.sum(feature_importances > 0)

num_features_used

96

In [254]:
def calculate_accuracy(y_true, y_pred):
    correct_predictions = 0
    total_predictions = len(y_true)
    
    for true_val, pred_val in zip(y_true, y_pred):
        if true_val == pred_val:
            correct_predictions += 1
            
    accuracy = correct_predictions / total_predictions
    return accuracy

# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, y_valid_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, y_valid_pred)

# Calculate adjusted R-squared for training set
n_train = X_train.shape[0]
p_train = X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

threshold = 0.1
accuracy_train = calculate_accuracy(y_train, y_train_pred)
accuracy_valid = calculate_accuracy(y_valid, y_valid_pred)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid],
    ["RMSE", rmse_train, rmse_valid],
    ["R^2", r2_train, r2_valid],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid],
    ["Accuracy", accuracy_train, accuracy_valid],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

+--------------+----------------+------------------+
| Metric       |   Training Set |   Validation Set |
| MSE          |     0.00292969 |        0.0364583 |
+--------------+----------------+------------------+
| RMSE         |     0.0541266  |        0.190941  |
+--------------+----------------+------------------+
| R^2          |     0.926826   |        0.439065  |
+--------------+----------------+------------------+
| Adjusted R^2 |     0.818491   |        1.14177   |
+--------------+----------------+------------------+
| Accuracy     |     0.953125   |        0.666667  |
+--------------+----------------+------------------+


In [255]:
# Subset numerical columns
numerical_columns = second_dataset.select_dtypes(include='number')

# Remove 'Difference', 'Increase', and 'Decrease' from numerical columns
columns_to_exclude = ['Difference', 'Increase', 'Decrease']
numerical_columns_subset = numerical_columns.drop(columns_to_exclude, axis=1)

# Fill missing values with the mean in numerical columns
numerical_columns_subset.fillna(numerical_columns_subset.mean(), inplace=True)

# Fill missing values with the mean in 'Difference'
difference_mean = second_dataset['Difference'].mean()
second_dataset['Difference'].fillna(difference_mean, inplace=True)

# Define the number of features to select
k = 82

# Perform univariate selection using f_regression
selector = SelectKBest(score_func=f_regression, k=k)
selected_features = selector.fit_transform(numerical_columns_subset, second_dataset['Difference'])

# Get the selected feature indices
selected_indices = selector.get_support(indices=True)

# Get the selected feature names
selected_features_names = numerical_columns_subset.columns[selected_indices]

# Get the feature scores
feature_scores = selector.scores_[selected_indices]

# Standardize the feature scores
scaler = StandardScaler()
standardized_scores = scaler.fit_transform(feature_scores.reshape(-1, 1))

# Combine selected feature names and their standardized scores
selected_features_with_scores = list(zip(selected_features_names, standardized_scores))

# Sort the selected features by the absolute value of standardized scores in descending order
selected_features_with_scores.sort(key=lambda x: abs(x[1]), reverse=True)

# Print the selected features and their standardized relevance scores in descending order
print("Selected Features and Absolute Standardized Relevance Scores (Descending Order):")
for feature, score in selected_features_with_scores:
    print(f"{feature}: {score[0]}")

Selected Features and Absolute Standardized Relevance Scores (Descending Order):
Short-Term Treasury Diff: 8.91927189306211
LAG_RollingMean: 0.35773651450958127
easing: 0.33990792576034856
LAG: 0.3150120542692436
weaker: 0.30616666956121863
weakness: 0.23855274810032798
Nonfarm Payroll: -0.22626563921988915
Long-Term Treasury Bond Rate: -0.2232603689961976
desired: -0.2224070672810142
boost: -0.22109672801206368
slowly: -0.2210073315188488
problem: -0.21877354224871173
Level: -0.2164977239764715
deficit: -0.21628520676205676
Date: -0.21602247263277677
volatility: -0.21239410077078177
weakened: -0.20991963417539647
strengthening: -0.20960934638145778
caution: -0.2030745216928687
good: -0.2021213851791409
severe: -0.20189352532336366
lagged: -0.20121125470590306
LEI: -0.19952348632385147
Short-Term Treasury Bond Rate: -0.19850645022320904
Treasury Deposits: -0.19767775354621783
progress: -0.194114367784898
Unemployment Rate: -0.19304767473666912
absence: -0.19285151437827744
Bank Reserve

In [305]:
# Extract features with importance greater than the absolute value of 0.2
selected_features_greater_than_0_2 = [feature for feature, score in selected_features_with_scores if abs(score[0]) > 0.2]

# Include the 'Difference' variable in the selected features
selected_features_greater_than_0_2.append('Difference')

# Create a subset of 'second_dataset' with the specified variables
subset_second_dataset = second_dataset[selected_features_greater_than_0_2]

In [306]:
subset_second_dataset

Unnamed: 0,Short-Term Treasury Diff,LAG_RollingMean,easing,LAG,weaker,weakness,Nonfarm Payroll,Long-Term Treasury Bond Rate,desired,boost,...,deficit,Date,volatility,weakened,strengthening,caution,good,severe,lagged,Difference
0,-0.02,0.384181,0,0.469678,0,-5,110570.0,6.04,0,0,...,-9,0,0,-1,2,-2,11,0,0,0.00
1,0.02,-0.094047,-3,-0.028876,-2,-4,111060.0,5.81,0,0,...,-5,50,0,0,0,0,9,0,0,-0.50
2,0.06,0.180436,0,0.100506,0,-1,111209.0,5.68,0,1,...,-5,91,0,0,3,-1,5,-1,0,0.00
3,-0.06,0.581703,-1,1.673479,0,-4,111455.0,5.36,0,1,...,-2,126,0,0,2,-1,10,0,0,0.00
4,0.00,1.171385,0,1.740168,-1,-1,111989.0,5.72,0,0,...,-3,182,0,0,6,0,7,0,0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,0.89,-1.181303,-2,-2.092038,-4,0,154006.0,3.98,0,0,...,-1,10760,-12,-1,0,0,3,0,0,0.75
236,0.20,-2.196739,-6,-3.592363,-2,0,154535.0,3.62,0,0,...,-1,10802,-5,0,0,-1,9,-1,-1,0.50
237,0.38,-2.272438,-3,-1.132914,-3,-1,155007.0,3.53,0,2,...,0,10851,-2,0,0,0,5,-1,-1,0.25
238,-0.21,-2.666227,-5,-3.273404,0,0,155472.0,3.66,0,0,...,-2,10900,-4,0,0,-2,7,-1,0,0.25


In [307]:
# Split into training, validation, and test sets
train, valid = train_test_split(subset_second_dataset, test_size=0.2, shuffle=False)

# Split the combined set into validation and test sets
valid, test = train_test_split(valid, test_size=0.5, shuffle=False)

# Verify the sizes of each set
print("Training set size:", len(train))
print("Validation set size:", len(valid))
print("Test set size:", len(test))

Training set size: 192
Validation set size: 24
Test set size: 24


In [308]:
# Prepare the data for the model
X_train = train.drop(columns=['Difference'])
y_train = train['Difference']  # Use the 'Difference' variable as the target

X_valid = valid.drop(columns=['Difference'])
y_valid = valid['Difference']  # Use the 'Difference' variable as the target

X_test = test.drop(columns=['Difference'])
y_test = test['Difference']  # Use the 'Difference' variable as the target

In [309]:
# List to store random seeds used in each iteration
random_seeds = []

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}

# Function to calculate accuracy based on a threshold
def calculate_accuracy(y_true, y_pred, threshold):
    num_samples = len(y_true)
    correct_predictions = sum(abs(y_true - y_pred) <= threshold)
    return correct_predictions / num_samples

best_accuracy = -1.0
optimal_cv = None
best_y_train_pred = None
best_y_valid_pred = None
threshold = 0.1  # Define your desired threshold here

for cv in range(2, 11):  # Try cross-validation folds from 2 to 10
    grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=cv, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Get the best model with optimal hyperparameters
    subset_second_rf_model = grid_search.best_estimator_

    # Predict y_train_pred on the training set
    y_train_pred = subset_second_rf_model.predict(X_train)

    # Predict y_valid_pred on the validation set
    y_valid_pred = subset_second_rf_model.predict(X_valid)

    # Round the predicted values to the nearest possible value
    y_train_pred = [round_to_nearest(val, possible_values) for val in y_train_pred]
    y_valid_pred = [round_to_nearest(val, possible_values) for val in y_valid_pred]

    # Calculate accuracy for training and validation sets after rounding
    accuracy_train = calculate_accuracy(y_train, y_train_pred, threshold)
    accuracy_valid = calculate_accuracy(y_valid, y_valid_pred, threshold)
    
    # Check if the accuracy after rounding is higher than the best accuracy so far
    if accuracy_valid > best_accuracy:
        best_accuracy = accuracy_valid
        optimal_cv = cv
        best_y_train_pred = y_train_pred
        best_y_valid_pred = y_valid_pred

    # Store the random seed used in this iteration
    random_seeds.append(np.random.get_state()[1][0])

# Use the optimal number of folds in GridSearchCV
grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=optimal_cv, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model with optimal hyperparameters
subset_second_rf_model = grid_search.best_estimator_

# Get the feature importances from the best model
feature_importances = subset_second_rf_model.feature_importances_

# Get the number of features used (non-zero feature importances)
num_features_used = np.sum(feature_importances > 0)

# Print the number of features used in the last iteration
print("Number of Featurees Used:", num_features_used)

# Extract the random seed from the last element of the random_seeds list
random_seed_used = random_seeds[-1]

# Print the random seed used in the last iteration
print("Random Seed Used:", random_seed_used)

Number of Featurees Used: 20
Random Seed Used: 2823020463


In [310]:
def calculate_accuracy(y_true, y_pred, threshold=None):
    correct_predictions = 0
    total_predictions = len(y_true)
    
    if threshold is not None:
        for true_val, pred_val in zip(y_true, y_pred):
            if abs(true_val - pred_val) <= threshold:
                correct_predictions += 1
    else:
        for true_val, pred_val in zip(y_true, y_pred):
            if true_val == pred_val:
                correct_predictions += 1
            
    accuracy = correct_predictions / total_predictions
    return accuracy

# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, best_y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, best_y_valid_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, best_y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, best_y_valid_pred)

# Calculate adjusted R-squared for training set
n_train = X_train.shape[0]
p_train = X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

# Calculate accuracy for training and validation sets with the threshold
accuracy_train = calculate_accuracy(y_train, best_y_train_pred, threshold)
accuracy_valid = calculate_accuracy(y_valid, best_y_valid_pred, threshold)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid],
    ["RMSE", rmse_train, rmse_valid],
    ["R^2", r2_train, r2_valid],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid],
    ["Accuracy", accuracy_train, accuracy_valid],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

+--------------+----------------+------------------+
| Metric       |   Training Set |   Validation Set |
| MSE          |     0.00260417 |        0.0260417 |
+--------------+----------------+------------------+
| RMSE         |     0.051031   |        0.161374  |
+--------------+----------------+------------------+
| R^2          |     0.934957   |        0.599332  |
+--------------+----------------+------------------+
| Adjusted R^2 |     0.926489   |       -8.21536   |
+--------------+----------------+------------------+
| Accuracy     |     0.958333   |        0.708333  |
+--------------+----------------+------------------+


In [311]:
# Create a third subset incorporating designated variables from subset_second_dataset
third_subset = subset_second_dataset[['Short-Term Treasury Diff', 'LAG_RollingMean', 'LAG', 'Nonfarm Payroll', 'Long-Term Treasury Bond Rate', 'Level', 'Date', 'Difference']].copy()

# List of vectorized terms (previously sentiment columns)
vectorized_terms = [col for col in subset_second_dataset.columns if col not in ['Short-Term Treasury Diff', 'LAG_RollingMean', 'LAG', 'Nonfarm Payroll', 'Long-Term Treasury Bond Rate', 'Level', 'Date', 'Difference']]

# Introduce new variables by computing the product of vectorized terms with specified columns
for term in vectorized_terms:
    for column in ['Short-Term Treasury Diff', 'LAG_RollingMean', 'LAG', 'Nonfarm Payroll', 'Level']:
        new_column_name = f'{term}_x_{column.replace(" ", "_")}'
        if term in subset_second_dataset.columns:
            third_subset[new_column_name] = subset_second_dataset[term] * subset_second_dataset[column]

# Append the 'Date' variable from subset_second_dataset at this stage
third_subset['Date'] = subset_second_dataset['Date']

# Display initial records of the third subset
print(third_subset.head())

   Short-Term Treasury Diff  LAG_RollingMean       LAG  Nonfarm Payroll  \
0                     -0.02         0.384181  0.469678         110570.0   
1                      0.02        -0.094047 -0.028876         111060.0   
2                      0.06         0.180436  0.100506         111209.0   
3                     -0.06         0.581703  1.673479         111455.0   
4                      0.00         1.171385  1.740168         111989.0   

   Long-Term Treasury Bond Rate  Level  Date  Difference  \
0                          6.04   3.75     0         0.0   
1                          5.81   3.25    50        -0.5   
2                          5.68   3.25    91         0.0   
3                          5.36   3.25   126         0.0   
4                          5.72   3.25   182         0.0   

   easing_x_Short-Term_Treasury_Diff  easing_x_LAG_RollingMean  ...  \
0                              -0.00                  0.000000  ...   
1                              -0.06          

In [312]:
# Split into training, validation, and test sets
train, valid = train_test_split(third_subset, test_size=0.2, shuffle=False)

# Split the combined set into validation and test sets
valid, test = train_test_split(valid, test_size=0.5, shuffle=False)

# Verify the sizes of each set
print("Training set size:", len(train))
print("Validation set size:", len(valid))
print("Test set size:", len(test))

Training set size: 192
Validation set size: 24
Test set size: 24


In [313]:
# Prepare the data for the model
X_train = train.drop(columns=['Difference'])
y_train = train['Difference']  # Use the 'Difference' variable as the target

X_valid = valid.drop(columns=['Difference'])
y_valid = valid['Difference']  # Use the 'Difference' variable as the target

X_test = test.drop(columns=['Difference'])
y_test = test['Difference']  # Use the 'Difference' variable as the target

In [314]:
# List to store random seeds used in each iteration
random_seeds = []

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}

# Function to calculate accuracy based on a threshold
def calculate_accuracy(y_true, y_pred, threshold):
    num_samples = len(y_true)
    correct_predictions = sum(abs(y_true - y_pred) <= threshold)
    return correct_predictions / num_samples

best_accuracy = -1.0
optimal_cv = None
best_y_train_pred = None
best_y_valid_pred = None
threshold = 0.1  # Define your desired threshold here

for cv in range(2, 11):  # Try cross-validation folds from 2 to 10
    grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=cv, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Get the best model with optimal hyperparameters
    subset_third_rf_model = grid_search.best_estimator_

    # Predict y_train_pred on the training set
    y_train_pred = subset_third_rf_model.predict(X_train)

    # Predict y_valid_pred on the validation set
    y_valid_pred = subset_third_rf_model.predict(X_valid)

    # Round the predicted values to the nearest possible value
    y_train_pred = [round_to_nearest(val, possible_values) for val in y_train_pred]
    y_valid_pred = [round_to_nearest(val, possible_values) for val in y_valid_pred]

    # Calculate accuracy for training and validation sets after rounding
    accuracy_train = calculate_accuracy(y_train, y_train_pred, threshold)
    accuracy_valid = calculate_accuracy(y_valid, y_valid_pred, threshold)
    
    # Check if the accuracy after rounding is higher than the best accuracy so far
    if accuracy_valid > best_accuracy:
        best_accuracy = accuracy_valid
        optimal_cv = cv
        best_y_train_pred = y_train_pred
        best_y_valid_pred = y_valid_pred

    # Store the random seed used in this iteration
    random_seeds.append(np.random.get_state()[1][0])

# Use the optimal number of folds in GridSearchCV
grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=optimal_cv, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model with optimal hyperparameters
subset_third_rf_model = grid_search.best_estimator_

# Get the feature importances from the best model
feature_importances = subset_third_rf_model.feature_importances_

# Get the number of features used (non-zero feature importances)
num_features_used = np.sum(feature_importances > 0)

# Print the number of features used in the last iteration
print("Number of Features Used:", num_features_used)

# Extract the random seed from the last element of the random_seeds list
random_seed_used = random_seeds[-1]

# Print the random seed used in the last iteration
print("Random Seed Used:", random_seed_used)

Number of Features Used: 33
Random Seed Used: 2823020463


In [315]:
def calculate_accuracy(y_true, y_pred, threshold=None):
    correct_predictions = 0
    total_predictions = len(y_true)
    
    if threshold is not None:
        for true_val, pred_val in zip(y_true, y_pred):
            if abs(true_val - pred_val) <= threshold:
                correct_predictions += 1
    else:
        for true_val, pred_val in zip(y_true, y_pred):
            if true_val == pred_val:
                correct_predictions += 1
            
    accuracy = correct_predictions / total_predictions
    return accuracy

# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, best_y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, best_y_valid_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, best_y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, best_y_valid_pred)

# Calculate adjusted R-squared for training set
n_train = X_train.shape[0]
p_train = X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

# Calculate accuracy for training and validation sets with the threshold
accuracy_train = calculate_accuracy(y_train, best_y_train_pred, threshold)
accuracy_valid = calculate_accuracy(y_valid, best_y_valid_pred, threshold)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid],
    ["RMSE", rmse_train, rmse_valid],
    ["R^2", r2_train, r2_valid],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid],
    ["Accuracy", accuracy_train, accuracy_valid],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

+--------------+----------------+------------------+
| Metric       |   Training Set |   Validation Set |
| MSE          |     0.00846354 |        0.0390625 |
+--------------+----------------+------------------+
| RMSE         |     0.0919975  |        0.197642  |
+--------------+----------------+------------------+
| R^2          |     0.788609   |        0.398998  |
+--------------+----------------+------------------+
| Adjusted R^2 |     0.629581   |        1.23429   |
+--------------+----------------+------------------+
| Accuracy     |     0.880208   |        0.708333  |
+--------------+----------------+------------------+


In [316]:
# Get the feature importances from the best model
feature_importances = subset_third_rf_model.feature_importances_

# Get the selected feature indices
selected_indices = np.where(feature_importances > 0)[0]

# Get the selected feature names
selected_features_names = X_train.columns[selected_indices]

# Print the selected feature names
print("Selected Features:")
for feature in selected_features_names:
    print(feature)

Selected Features:
Short-Term Treasury Diff
LAG_RollingMean
LAG
Nonfarm Payroll
Long-Term Treasury Bond Rate
Level
Date
easing_x_Short-Term_Treasury_Diff
easing_x_LAG_RollingMean
easing_x_LAG
easing_x_Nonfarm_Payroll
weaker_x_Short-Term_Treasury_Diff
weaker_x_LAG
weakness_x_Level
desired_x_Level
boost_x_Short-Term_Treasury_Diff
boost_x_Level
slowly_x_Short-Term_Treasury_Diff
problem_x_Nonfarm_Payroll
deficit_x_LAG_RollingMean
deficit_x_LAG
deficit_x_Nonfarm_Payroll
volatility_x_Level
weakened_x_LAG
strengthening_x_LAG_RollingMean
good_x_LAG
good_x_Nonfarm_Payroll
severe_x_LAG
lagged_x_Short-Term_Treasury_Diff
lagged_x_LAG_RollingMean
lagged_x_LAG
lagged_x_Nonfarm_Payroll
lagged_x_Level


In [319]:
# Extract features with importance greater than the absolute value of 0.2
selected_features_greater_than_0_2 = [feature for feature, score in selected_features_with_scores if abs(score[0]) > 0.2]

# Include the 'Difference' variable in the selected features
selected_features_greater_than_0_2.append('Difference')

# Create a subset of 'second_dataset' with the specified variables
subset_second_dataset = second_dataset[selected_features_greater_than_0_2]

In [320]:
# Split into training, validation, and test sets
train, valid = train_test_split(subset_second_dataset, test_size=0.2, shuffle=False)

# Split the combined set into validation and test sets
valid, test = train_test_split(valid, test_size=0.5, shuffle=False)

# Verify the sizes of each set
print("Training set size:", len(train))
print("Validation set size:", len(valid))
print("Test set size:", len(test))

Training set size: 192
Validation set size: 24
Test set size: 24


In [321]:
# Prepare the data for the model
X_train = train.drop(columns=['Difference'])
y_train = train['Difference']  # Use the 'Difference' variable as the target

X_valid = valid.drop(columns=['Difference'])
y_valid = valid['Difference']  # Use the 'Difference' variable as the target

X_test = test.drop(columns=['Difference'])
y_test = test['Difference']  # Use the 'Difference' variable as the target

In [322]:
# List to store random seeds used in each iteration
random_seeds = []

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}

# Function to calculate accuracy based on a threshold
def calculate_accuracy(y_true, y_pred, threshold):
    num_samples = len(y_true)
    correct_predictions = sum(abs(y_true - y_pred) <= threshold)
    return correct_predictions / num_samples

best_accuracy = -1.0
optimal_cv = None
best_y_train_pred = None
best_y_test_pred = None  # Changed variable name from best_y_valid_pred
threshold = 0.1  # Define your desired threshold here

for cv in range(2, 11):  # Try cross-validation folds from 2 to 10
    grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=cv, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Get the best model with optimal hyperparameters
    subset_third_rf_model = grid_search.best_estimator_

    # Predict y_train_pred on the training set
    y_train_pred = subset_third_rf_model.predict(X_train)

    # Predict y_test_pred on the test set  # Changed variable name from y_valid_pred
    y_test_pred = subset_third_rf_model.predict(X_test)  # Use 'test' set instead of 'valid'

    # Round the predicted values to the nearest possible value
    y_train_pred = [round_to_nearest(val, possible_values) for val in y_train_pred]
    y_test_pred = [round_to_nearest(val, possible_values) for val in y_test_pred]

    # Calculate accuracy for training and test sets after rounding
    accuracy_train = calculate_accuracy(y_train, y_train_pred, threshold)
    accuracy_test = calculate_accuracy(y_test, y_test_pred, threshold)  # Use 'test' set instead of 'valid'
    
    # Check if the accuracy after rounding is higher than the best accuracy so far
    if accuracy_test > best_accuracy:  # Change from accuracy_valid to accuracy_test
        best_accuracy = accuracy_test  # Change from accuracy_valid to accuracy_test
        optimal_cv = cv
        best_y_train_pred = y_train_pred
        best_y_test_pred = y_test_pred  # Changed variable name from best_y_valid_pred

    # Store the random seed used in this iteration
    random_seeds.append(np.random.get_state()[1][0])

# Use the optimal number of folds in GridSearchCV
grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=optimal_cv, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model with optimal hyperparameters
subset_third_rf_model = grid_search.best_estimator_

# Get the feature importances from the best model
feature_importances = subset_third_rf_model.feature_importances_

# Get the number of features used (non-zero feature importances)
num_features_used = np.sum(feature_importances > 0)

# Print the number of features used in the last iteration
print("Number of Features Used:", num_features_used)

# Extract the random seed from the last element of the random_seeds list
random_seed_used = random_seeds[-1]

# Print the random seed used in the last iteration
print("Random Seed Used:", random_seed_used)

Number of Features Used: 21
Random Seed Used: 2823020463


In [323]:
def calculate_accuracy(y_true, y_pred, threshold=None):
    correct_predictions = 0
    total_predictions = len(y_true)
    
    if threshold is not None:
        for true_val, pred_val in zip(y_true, y_pred):
            if abs(true_val - pred_val) <= threshold:
                correct_predictions += 1
    else:
        for true_val, pred_val in zip(y_true, y_pred):
            if true_val == pred_val:
                correct_predictions += 1
            
    accuracy = correct_predictions / total_predictions
    return accuracy

# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, best_y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, best_y_valid_pred)

# Calculate Mean Squared Error (MSE) for test set
mse_test = mean_squared_error(y_test, best_y_test_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate Root Mean Squared Error (RMSE) for test set
rmse_test = np.sqrt(mse_test)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, best_y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, best_y_valid_pred)

# Calculate R-squared (R^2) for test set
r2_test = r2_score(y_test, best_y_test_pred)

# Calculate adjusted R-squared for training set
n_train = X_train.shape[0]
p_train = X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

# Calculate adjusted R-squared for test set
n_test = X_test.shape[0]
p_test = X_test.shape[1]
adj_r2_test = 1 - ((1 - r2_test) * (n_test - 1) / (n_test - p_test - 1))

# Calculate accuracy for training, validation, and test sets with the threshold
accuracy_train = calculate_accuracy(y_train, best_y_train_pred, threshold)
accuracy_valid = calculate_accuracy(y_valid, best_y_valid_pred, threshold)
accuracy_test = calculate_accuracy(y_test, best_y_test_pred, threshold)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid, mse_test],
    ["RMSE", rmse_train, rmse_valid, rmse_test],
    ["R^2", r2_train, r2_valid, r2_test],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid, adj_r2_test],
    ["Accuracy", accuracy_train, accuracy_valid, accuracy_test],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set", "Test Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

+--------------+----------------+------------------+-------------+
| Metric       |   Training Set |   Validation Set |    Test Set |
| MSE          |     0.00716146 |        0.0390625 |   0.0572917 |
+--------------+----------------+------------------+-------------+
| RMSE         |     0.0846254  |        0.197642  |   0.239357  |
+--------------+----------------+------------------+-------------+
| R^2          |     0.821131   |        0.398998  |   0.297872  |
+--------------+----------------+------------------+-------------+
| Adjusted R^2 |     0.797846   |      -12.823     | -15.1489    |
+--------------+----------------+------------------+-------------+
| Accuracy     |     0.885417   |        0.708333  |   0.708333  |
+--------------+----------------+------------------+-------------+


In [324]:
# Save the best_rf_model to the specified location
model_filename = r"C:\Users\abact\BC-Project\models\best_random_forest_model_data.joblib"
joblib.dump(subset_third_rf_model, model_filename)

['C:\\Users\\abact\\BC-Project\\models\\best_random_forest_model_data.joblib']

In [325]:
# Get the feature importances from the best model
feature_importances = subset_third_rf_model.feature_importances_

# Get the selected feature indices
selected_indices = np.where(feature_importances > 0)[0]

# Get the selected feature names
selected_features_names = X_train.columns[selected_indices]

# Print the selected feature names
print("Selected Features:")
for feature in selected_features_names:
    print(feature)

Selected Features:
Short-Term Treasury Diff
LAG_RollingMean
easing
LAG
weaker
weakness
Nonfarm Payroll
Long-Term Treasury Bond Rate
desired
boost
slowly
problem
Level
deficit
Date
volatility
weakened
strengthening
good
severe
lagged
