In [1]:
import pandas as pd
import numpy as np
import requests
from io import StringIO
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
import random
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score
from tabulate import tabulate
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

In [93]:
# The URL of the raw dataset on GitHub
url = "https://raw.githubusercontent.com/abactat/BC-Project/main/data/dataset_adjusted.csv?token=GHSAT0AAAAAACC4ZCNLXSOPVJD4ZPTZE3OUZGYJQ3A"

# Send an HTTP GET request to fetch the content of the raw dataset
response = requests.get(url)

# Check if the request was successful (status code 200 means success)
if response.status_code == 200:
    # Read the content as a pandas DataFrame and assign it to the 'train' variable
    dataset_adjusted = pd.read_csv(StringIO(response.text))
    print("Dataset downloaded and loaded into 'dataset_adjusted' successfully.")
else:
    print(f"Failed to download the dataset. Status code: {response.status_code}")
    
# The URL of the raw dataset on GitHub
url = "https://raw.githubusercontent.com/abactat/BC-Project/main/data/raw/dataset_words.csv?token=GHSAT0AAAAAACC4ZCNL775NH4NITFXZSX7AZGYJRFQ"

# Send an HTTP GET request to fetch the content of the raw dataset
response = requests.get(url)

# Check if the request was successful (status code 200 means success)
if response.status_code == 200:
    # Read the content as a pandas DataFrame and assign it to the 'valid' variable
    words = pd.read_csv(StringIO(response.text))
    print("Dataset downloaded and loaded into 'words' successfully.")
else:
    print(f"Failed to download the dataset. Status code: {response.status_code}")

Failed to download the dataset. Status code: 404
Failed to download the dataset. Status code: 404


In [94]:
# Load the sentiment word list from the CSV file into a dictionary
sentiment_word_list = {}
with open(r"C:\Users\abact\BC-Project\data\external\Loughran-McDonald_MasterDictionary_1993-2021.csv", 'r') as file:
    # Skip the header line
    next(file)

    for line in file:
        values = line.strip().split(',')

        # Extract the necessary values
        word = values[0].lower()
        positive = float(values[8])  # Positive column index is 8
        negative = float(values[7])  # Negative column index is 7

        # Assign the word as positive or negative based on the positive or negative values
        if positive == 2009:
            sentiment_word_list[word] = 1
        elif negative == 2009:
            sentiment_word_list[word] = -1

# Convert the sentiment word list keys to lowercase
selected_words = set(sentiment_word_list.keys())

# Filter the 'words' DataFrame to include only columns that are present in both 'selected_words' and 'words'
common_columns = selected_words.intersection(words.columns)
subset_words = words[list(common_columns)].copy()

# Multiply sentiment values to the vectorized text columns in the 'subset_words' DataFrame
for column in subset_words.columns:
    sentiment_value = sentiment_word_list.get(column, 0)
    if sentiment_value == 1:
        subset_words.loc[:, column] = subset_words[column] * 1  # Multiply by 1 for positive sentiment
    elif sentiment_value == -1:
        subset_words.loc[:, column] = subset_words[column] * -1  # Multiply by -1 for negative sentiment
    else:
        subset_words.loc[:, column] = subset_words[column] * 0  # Multiply by 0 for unknown sentiment

In [95]:
# Concatenate the vectorized DataFrame with the original dataset
full_dataset = pd.concat([dataset_adjusted, subset_words], axis=1)

full_dataset['Date'] = pd.to_datetime(full_dataset['Date'])

# Calculate the time difference in days from the first date
full_dataset['Date'] = (full_dataset['Date'] - full_dataset['Date'].min()).dt.days

In [61]:
# Check for the number of missing values in 'train' DataFrame
missing_values_count = full_dataset.isna().sum()

# Print the count of missing values for each column
print(missing_values_count)

Federal_Reserve_Mins    0
Preprocessed Text       0
Date                    0
Difference              0
Increase                0
                       ..
enhancing               0
dissent                 0
delay                   0
recessionary            0
enjoy                   0
Length: 837, dtype: int64


In [96]:
# Filter columns with missing values
columns_with_missing_values = missing_values_count[missing_values_count > 0]

# Print columns with missing values and their counts
print("Columns with Missing Values:")
for column, count in columns_with_missing_values.items():
    print(f"{column}: {count}")

Columns with Missing Values:


In [97]:
# Split into training, validation, and test sets
train, valid = train_test_split(full_dataset, test_size=0.2, shuffle=False)

# Split the combined set into validation and test sets
valid, test = train_test_split(valid, test_size=0.5, shuffle=False)

# Verify the sizes of each set
print("Training set size:", len(train))
print("Validation set size:", len(valid))
print("Test set size:", len(test))

Training set size: 192
Validation set size: 24
Test set size: 24


In [98]:
# Convert variables to numeric in the train dataset
variables_to_convert = train.columns.drop('Date')
train[variables_to_convert] = train[variables_to_convert].apply(pd.to_numeric, errors='coerce')

# Convert variables to numeric in the valid dataset
valid[variables_to_convert] = valid[variables_to_convert].apply(pd.to_numeric, errors='coerce')

# Convert variables to numeric in the test dataset
test[variables_to_convert] = test[variables_to_convert].apply(pd.to_numeric, errors='coerce')

# Prepare the data for the model
X_train = train.drop(columns=['Difference', 'Increase', 'Decrease', 'Date'])
y_train = train['Difference']  # Use the 'Difference' variable as the target

X_valid = valid.drop(columns=['Difference', 'Increase', 'Decrease', 'Date'])
y_valid = valid['Difference']  # Use the 'Difference' variable as the target

X_test = test.drop(columns=['Difference', 'Increase', 'Decrease', 'Date'])
y_test = test['Difference']  # Use the 'Difference' variable as the target

In [66]:
def calculate_accuracy(y_true, y_pred):
    correct_predictions = 0
    total_predictions = len(y_true)
    
    for true_val, pred_val in zip(y_true, y_pred):
        if true_val == pred_val:
            correct_predictions += 1
            
    accuracy = correct_predictions / total_predictions
    return accuracy

possible_values = [-1.00, -0.75, -0.50, -0.25, 0.00, 0.25, 0.50, 0.75, 1.00]

def round_to_nearest(value, possible_values):
    return min(possible_values, key=lambda x: abs(x - value))

In [99]:
# Check for missing values in 'train' DataFrame
missing_values_count = train.isna().sum()

# Get the list of columns with missing values
columns_with_missing_values = missing_values_count[missing_values_count > 0].index.tolist()

# Drop columns with missing values from 'train'
train = train.drop(columns=columns_with_missing_values)

# Prepare the data for the model
X_train = train.drop(columns=['Difference', 'Increase', 'Decrease'])
y_train = train['Difference']  # Use the 'Difference' variable as the target

# Check for missing values in 'valid' DataFrame
missing_values_count = valid.isna().sum()

# Get the list of columns with missing values
columns_with_missing_values = missing_values_count[missing_values_count > 0].index.tolist()

# Drop columns with missing values from 'valid'
valid = valid.drop(columns=columns_with_missing_values)

# Prepare the data for the model
X_valid = valid.drop(columns=['Difference', 'Increase', 'Decrease'])
y_valid = valid['Difference']  # Use the 'Difference' variable as the target

# Check for missing values in 'test' DataFrame
missing_values_count = test.isna().sum()

# Get the list of columns with missing values
columns_with_missing_values = missing_values_count[missing_values_count > 0].index.tolist()

# Drop columns with missing values from 'test'
test = test.drop(columns=columns_with_missing_values)

# Prepare the data for the model
X_test = test.drop(columns=['Difference', 'Increase', 'Decrease'])
y_test = test['Difference']  # Use the 'Difference' variable as the target

In [100]:
# Check for the number of missing values in 'train' DataFrame
missing_values_count = train.isna().sum()

# Print the count of missing values for each column
print(missing_values_count)

# Filter columns with missing values
columns_with_missing_values = missing_values_count[missing_values_count > 0]

# Print columns with missing values and their counts
print("Columns with Missing Values:")
for column, count in columns_with_missing_values.items():
    print(f"{column}: {count}")

Date            0
Difference      0
Increase        0
Decrease        0
Level           0
               ..
enhancing       0
dissent         0
delay           0
recessionary    0
enjoy           0
Length: 833, dtype: int64
Columns with Missing Values:


In [13]:
# Set the random seed for reproducibility
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)

# Create a RandomForestRegressor instance (you need to adjust hyperparameters)
rf_model = RandomForestRegressor()

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}

# Create GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model to the training data and find the best hyperparameters
grid_search.fit(X_train, y_train)

# Get the best model with optimal hyperparameters
best_rf_model_5 = grid_search.best_estimator_

# Predict y_train_pred on the training set
y_train_pred = best_rf_model_5.predict(X_train)

# Predict y_valid_pred on the validation set
y_valid_pred = best_rf_model_5.predict(X_valid)

# Round the predicted values to the nearest possible value
y_train_pred = [round_to_nearest(val, possible_values) for val in y_train_pred]
y_valid_pred = [round_to_nearest(val, possible_values) for val in y_valid_pred]

# Output the random seed
print("Random seed:", random_seed)

Random seed: 42


In [14]:
# Get the feature importances from the best model
feature_importances = best_rf_model_5.feature_importances_

# Get the number of features used (non-zero feature importances)
num_features_used = np.sum(feature_importances > 0)

num_features_used

143

In [15]:
# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, y_valid_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, y_valid_pred)

# Calculate adjusted R-squared for training set
n_train = X_train.shape[0]
p_train = X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

# Calculate accuracy for training and validation sets
accuracy_train = calculate_accuracy(y_train, y_train_pred)
accuracy_valid = calculate_accuracy(y_valid, y_valid_pred)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid],
    ["RMSE", rmse_train, rmse_valid],
    ["R^2", r2_train, r2_valid],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid],
    ["Accuracy", accuracy_train, accuracy_valid],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

+--------------+----------------+------------------+
| Metric       |   Training Set |   Validation Set |
| MSE          |     0.00325521 |        0.0364583 |
+--------------+----------------+------------------+
| RMSE         |     0.0570544  |        0.190941  |
+--------------+----------------+------------------+
| R^2          |     0.918696   |        0.439065  |
+--------------+----------------+------------------+
| Adjusted R^2 |     1.0243     |        1.01599   |
+--------------+----------------+------------------+
| Accuracy     |     0.947917   |        0.666667  |
+--------------+----------------+------------------+


In [17]:
# List to store random seeds used in each iteration
random_seeds = []

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}

# Function to calculate accuracy based on a threshold
def calculate_accuracy(y_true, y_pred, threshold):
    num_samples = len(y_true)
    correct_predictions = sum(abs(y_true - y_pred) <= threshold)
    return correct_predictions / num_samples

best_accuracy = -1.0
optimal_cv = None
best_y_train_pred = None
best_y_valid_pred = None
threshold = 0.1  # Define your desired threshold here

for cv in range(2, 11):  # Try cross-validation folds from 2 to 10
    # Set the random seed for reproducibility
    random_seed = cv  # Use cv as the random seed
    random_seeds.append(random_seed)
    random.seed(random_seed)
    np.random.seed(random_seed)
    
    grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=cv, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Get the best model with optimal hyperparameters
    best_rf_model = grid_search.best_estimator_

    # Predict y_train_pred on the training set
    y_train_pred = best_rf_model.predict(X_train)

    # Predict y_valid_pred on the validation set
    y_valid_pred = best_rf_model.predict(X_valid)

    # Round the predicted values to the nearest possible value
    y_train_pred = [round_to_nearest(val, possible_values) for val in y_train_pred]
    y_valid_pred = [round_to_nearest(val, possible_values) for val in y_valid_pred]

    # Calculate accuracy for training and validation sets after rounding
    accuracy_train = calculate_accuracy(y_train, y_train_pred, threshold)
    accuracy_valid = calculate_accuracy(y_valid, y_valid_pred, threshold)
    
    # Check if the accuracy after rounding is higher than the best accuracy so far
    if accuracy_valid > best_accuracy:
        best_accuracy = accuracy_valid
        optimal_cv = cv
        best_y_train_pred = y_train_pred
        best_y_valid_pred = y_valid_pred

# Use the optimal number of folds in GridSearchCV
grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=optimal_cv, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model with optimal hyperparameters
best_rf_model = grid_search.best_estimator_

# Get the feature importances from the best model
feature_importances = best_rf_model.feature_importances_

# Get the number of features used (non-zero feature importances)
num_features_used = np.sum(feature_importances > 0)

# Print the random seeds used in each iteration
print("Random Seeds:", random_seeds)

Random Seeds: [2, 3, 4, 5, 6, 7, 8, 9, 10]


In [18]:
# Get the feature importances from the best model
feature_importances = best_rf_model.feature_importances_

# Get the number of features used (non-zero feature importances)
num_features_used = np.sum(feature_importances > 0)

num_features_used

115

In [19]:
# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, y_valid_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, y_valid_pred)

# Calculate adjusted R-squared for training set
n_train = X_train.shape[0]
p_train = X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

threshold = 0.1
accuracy_train = calculate_accuracy(y_train, y_train_pred, threshold)
accuracy_valid = calculate_accuracy(y_valid, y_valid_pred, threshold)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid],
    ["RMSE", rmse_train, rmse_valid],
    ["R^2", r2_train, r2_valid],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid],
    ["Accuracy", accuracy_train, accuracy_valid],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

+--------------+----------------+------------------+
| Metric       |   Training Set |   Validation Set |
| MSE          |     0.00520833 |        0.0286458 |
+--------------+----------------+------------------+
| RMSE         |     0.0721688  |        0.169251  |
+--------------+----------------+------------------+
| R^2          |     0.869913   |        0.559265  |
+--------------+----------------+------------------+
| Adjusted R^2 |     1.03888    |        1.01256   |
+--------------+----------------+------------------+
| Accuracy     |     0.932292   |        0.666667  |
+--------------+----------------+------------------+


###Data Centric AI

In [101]:
subset_words.describe()

Unnamed: 0,optimistic,encouraged,persists,overstate,hazard,discontinuing,breaching,cutback,purported,forestall,...,depress,idle,omit,questioning,unprofitable,enhancing,dissent,delay,recessionary,enjoy
count,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,...,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0
mean,2.379167,0.4375,-0.004167,-0.0125,-0.029167,-0.016667,-0.004167,-2.05,-0.008333,-1.158333,...,-0.741667,-0.291667,-0.133333,-0.141667,-0.004167,0.545833,-0.016667,-2.054167,-0.008333,0.104167
std,9.054113,3.193261,0.06455,0.111335,0.280435,0.203868,0.06455,11.035773,0.091096,5.888072,...,4.839682,3.252024,1.686438,2.130849,0.06455,3.783633,0.128287,7.109513,0.091096,1.550268
min,0.0,0.0,-1.0,-1.0,-3.0,-3.0,-1.0,-111.0,-1.0,-60.0,...,-53.0,-42.0,-26.0,-33.0,-1.0,0.0,-1.0,-50.0,-1.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,66.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,41.0,0.0,0.0,0.0,24.0


In [102]:
# Calculate the mean of each column in the 'subset_words' DataFrame
column_means = subset_words.mean()

# Create a subset of columns with mean >= |3|
selected_columns = column_means[column_means.abs() >= 3].index

# Create a new DataFrame with only the selected columns
subset_words_mean_3 = subset_words[selected_columns]

subset_words_mean_3.describe()

Unnamed: 0,volatile,rebounded,lagged,stronger,downward,cut,difficult,desirable,imbalance,declining,...,volatility,positive,declined,boost,weakening,contraction,favorable,decline,negative,gain
count,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,...,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0
mean,-4.633333,4.2875,-4.508333,5.333333,-8.958333,-4.858333,-6.954167,6.370833,-3.875,-10.595833,...,-3.808333,14.870833,-24.683333,5.666667,-5.979167,-3.6875,24.025,-54.7375,-9.008333,39.9625
std,13.597584,11.234876,14.828445,11.784561,22.059598,13.525303,17.105199,19.524901,12.461195,26.71904,...,9.109557,35.800762,46.589363,13.691663,18.116418,12.240198,48.97595,108.182676,22.721157,68.129972
min,-94.0,0.0,-143.0,0.0,-185.0,-100.0,-110.0,0.0,-105.0,-248.0,...,-73.0,0.0,-470.0,0.0,-140.0,-88.0,0.0,-891.0,-149.0,0.0
25%,-2.0,0.0,-1.0,0.0,-4.0,-2.0,-2.0,0.0,-2.0,-4.0,...,-3.0,0.0,-19.0,0.0,-1.0,-1.0,0.0,-38.25,-3.0,6.75
50%,0.0,1.0,0.0,1.0,-2.0,0.0,-1.0,0.0,0.0,-1.0,...,-1.0,2.0,-10.0,1.0,0.0,0.0,2.0,-14.0,-1.0,9.0
75%,0.0,2.0,0.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,5.25,-6.0,2.0,0.0,0.0,20.0,-8.0,0.0,30.25
max,0.0,119.0,0.0,75.0,0.0,0.0,0.0,196.0,0.0,0.0,...,0.0,239.0,0.0,89.0,0.0,0.0,323.0,0.0,0.0,353.0


In [103]:
# Concatenate the vectorized DataFrame with the original dataset
second_dataset = pd.concat([dataset_adjusted, subset_words_mean_3], axis=1)

second_dataset['Date'] = pd.to_datetime(second_dataset['Date'])

# Calculate the time difference in days from the first date
second_dataset['Date'] = (second_dataset['Date'] - second_dataset['Date'].min()).dt.days

In [104]:
# Split into training, validation, and test sets
train, valid = train_test_split(second_dataset, test_size=0.2, shuffle=False)

# Split the combined set into validation and test sets
valid, test = train_test_split(valid, test_size=0.5, shuffle=False)

# Verify the sizes of each set
print("Training set size:", len(train))
print("Validation set size:", len(valid))
print("Test set size:", len(test))

Training set size: 192
Validation set size: 24
Test set size: 24


In [105]:
# Convert variables to numeric in the train dataset
variables_to_convert = train.columns.drop('Date')
train[variables_to_convert] = train[variables_to_convert].apply(pd.to_numeric, errors='coerce')

# Convert variables to numeric in the valid dataset
valid[variables_to_convert] = valid[variables_to_convert].apply(pd.to_numeric, errors='coerce')

# Convert variables to numeric in the test dataset
test[variables_to_convert] = test[variables_to_convert].apply(pd.to_numeric, errors='coerce')

# Prepare the data for the model
X_train = train.drop(columns=['Difference', 'Increase', 'Decrease', 'Date'])
y_train = train['Difference']  # Use the 'Difference' variable as the target

X_valid = valid.drop(columns=['Difference', 'Increase', 'Decrease', 'Date'])
y_valid = valid['Difference']  # Use the 'Difference' variable as the target

X_test = test.drop(columns=['Difference', 'Increase', 'Decrease', 'Date'])
y_test = test['Difference']  # Use the 'Difference' variable as the target

In [74]:
# Check for missing values in 'train' DataFrame
missing_values_count = train.isna().sum()

# Get the list of columns with missing values
columns_with_missing_values = missing_values_count[missing_values_count > 0].index.tolist()

# Drop columns with missing values from 'train'
train = train.drop(columns=columns_with_missing_values)

# Prepare the data for the model
X_train = train.drop(columns=['Difference', 'Increase', 'Decrease'])
y_train = train['Difference']  # Use the 'Difference' variable as the target

# Check for missing values in 'valid' DataFrame
missing_values_count = valid.isna().sum()

# Get the list of columns with missing values
columns_with_missing_values = missing_values_count[missing_values_count > 0].index.tolist()

# Drop columns with missing values from 'valid'
valid = valid.drop(columns=columns_with_missing_values)

# Prepare the data for the model
X_valid = valid.drop(columns=['Difference', 'Increase', 'Decrease'])
y_valid = valid['Difference']  # Use the 'Difference' variable as the target

# Check for missing values in 'test' DataFrame
missing_values_count = test.isna().sum()

# Get the list of columns with missing values
columns_with_missing_values = missing_values_count[missing_values_count > 0].index.tolist()

# Drop columns with missing values from 'test'
test = test.drop(columns=columns_with_missing_values)

# Prepare the data for the model
X_test = test.drop(columns=['Difference', 'Increase', 'Decrease'])
y_test = test['Difference']  # Use the 'Difference' variable as the target

In [None]:
# Set the random seed for reproducibility
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}

# Create GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model to the training data and find the best hyperparameters
grid_search.fit(X_train, y_train)

# Get the best model with optimal hyperparameters
second_rf_model_5 = grid_search.best_estimator_

# Predict y_train_pred on the training set
y_train_pred = second_rf_model_5.predict(X_train)

# Predict y_valid_pred on the validation set
y_valid_pred = second_rf_model_5.predict(X_valid)

# Round the predicted values to the nearest possible value
y_train_pred = [round_to_nearest(val, possible_values) for val in y_train_pred]
y_valid_pred = [round_to_nearest(val, possible_values) for val in y_valid_pred]

# Output the random seed
print("Random seed:", random_seed)

In [None]:
# Get the feature importances from the best model
feature_importances = second_rf_model_5.feature_importances_

# Get the number of features used (non-zero feature importances)
num_features_used = np.sum(feature_importances > 0)

num_features_used

In [None]:
def calculate_accuracy(y_true, y_pred):
    correct_predictions = 0
    total_predictions = len(y_true)
    
    for true_val, pred_val in zip(y_true, y_pred):
        if true_val == pred_val:
            correct_predictions += 1
            
    accuracy = correct_predictions / total_predictions
    return accuracy

# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, y_valid_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, y_valid_pred)

# Calculate adjusted R-squared for training set
n_train = X_train.shape[0]
p_train = X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

threshold = 0.1
accuracy_train = calculate_accuracy(y_train, y_train_pred)
accuracy_valid = calculate_accuracy(y_valid, y_valid_pred)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid],
    ["RMSE", rmse_train, rmse_valid],
    ["R^2", r2_train, r2_valid],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid],
    ["Accuracy", accuracy_train, accuracy_valid],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

In [76]:
second_dataset

Unnamed: 0,Federal_Reserve_Mins,Preprocessed Text,Date,Difference,Increase,Decrease,Level,CPI,Unemployment Rate,Consumer Sentiment,...,volatility,positive,declined,boost,weakening,contraction,favorable,decline,negative,gain
0,A meeting of the Federal Open Market Committee...,meeting federal open market committee held off...,0,0.00,0.00,0.5,3.75,144.200,7.1,80.3,...,0,4,-3,0,-3,-1,6,-11,-1,10
1,A meeting of the Federal Open Market Committee...,meeting federal open market committee held off...,50,-0.50,0.00,0.0,3.25,144.500,6.9,77.0,...,0,2,-5,0,0,0,6,-8,-2,6
2,A meeting of the Federal Open Market Committee...,meeting federal open market committee held off...,91,0.00,0.00,0.0,3.25,144.800,6.8,77.3,...,0,1,-8,1,0,0,6,-6,-3,4
3,A meeting of the Federal Open Market Committee...,meeting federal open market committee held off...,126,0.00,0.00,0.0,3.25,145.000,6.7,77.9,...,0,1,-5,1,0,0,6,-13,-1,9
4,A meeting of the Federal Open Market Committee...,meeting federal open market committee held off...,182,0.00,0.00,0.0,3.25,146.000,6.6,81.2,...,0,0,-1,0,0,0,4,-5,-2,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,"The Federal Reserve, the central bank of the U...",federal reserve central bank united state prov...,10760,0.75,0.50,0.0,4.00,297.987,3.7,59.9,...,-12,0,-7,0,-1,0,0,-7,-2,5
236,"The Federal Reserve, the central bank of the U...",federal reserve central bank united state prov...,10802,0.50,0.25,0.0,4.50,298.990,3.5,59.7,...,-5,1,-13,0,-2,0,0,-14,-1,7
237,"The Federal Reserve, the central bank of the U...",federal reserve central bank united state prov...,10851,0.25,0.25,0.0,4.75,300.536,3.4,64.9,...,-2,0,-10,2,0,0,0,-13,-1,5
238,"The Federal Reserve, the central bank of the U...",federal reserve central bank united state prov...,10900,0.25,0.25,0.0,5.00,301.808,3.5,62.0,...,-4,0,-8,0,-2,0,0,-9,0,7


In [78]:
# Subset numerical columns
numerical_columns = second_dataset.select_dtypes(include='number')

# Remove 'Difference', 'Increase', and 'Decrease' from numerical columns
columns_to_exclude = ['Difference', 'Increase', 'Decrease']
numerical_columns_subset = numerical_columns.drop(columns_to_exclude, axis=1)

# Fill missing values with the mean in numerical columns
numerical_columns_subset.fillna(numerical_columns_subset.mean(), inplace=True)

# Fill missing values with the mean in 'Difference'
difference_mean = second_dataset['Difference'].mean()
second_dataset['Difference'].fillna(difference_mean, inplace=True)

# Define the number of features to select
k = 114

# Perform univariate selection using f_regression
selector = SelectKBest(score_func=f_regression, k=k)
selected_features = selector.fit_transform(numerical_columns_subset, second_dataset['Difference'])

# Get the selected feature indices
selected_indices = selector.get_support(indices=True)

# Get the selected feature names
selected_features_names = numerical_columns_subset.columns[selected_indices]

# Get the feature scores
feature_scores = selector.scores_[selected_indices]

# Standardize the feature scores
scaler = StandardScaler()
standardized_scores = scaler.fit_transform(feature_scores.reshape(-1, 1))

# Combine selected feature names and their standardized scores
selected_features_with_scores = list(zip(selected_features_names, standardized_scores))

# Sort the selected features by the absolute value of standardized scores in descending order
selected_features_with_scores.sort(key=lambda x: abs(x[1]), reverse=True)

# Print the selected features and their standardized relevance scores in descending order
print("Selected Features and Absolute Standardized Relevance Scores (Descending Order):")
for feature, score in selected_features_with_scores:
    print(f"{feature}: {score[0]}")

Selected Features and Absolute Standardized Relevance Scores (Descending Order):
Short-Term Treasury Diff: 10.46605321524853
Proportion Negative Words: 0.7607458083403843
LAG_RollingMean: 0.48261304866862675
easing: 0.46182347750381014
LAG: 0.43279288515831177
weaker: 0.42247845380010396
weakness: 0.34363517283631523
contraction: 0.2624072756180427
Consumer Sentiment: -0.2528868630117049
improve: -0.2501082557551958
boosted: -0.2495308888834817
persisting: -0.2494660135029889
Proportion Positive Words: -0.24087144280941072
strengthen: -0.23831170451472158
concerned: -0.23184765784744207
turmoil: -0.23171464634280825
strong: -0.22796609772863247
slowing: -0.22351326589942916
improving: -0.2232410365460709
slower: -0.2230806666261028
question: -0.22161881586384774
imbalance: -0.22104161629379762
diminishing: -0.2188573162603204
stability: -0.21769361000936252
slow: -0.21530161731721756
Housing Sales: -0.21423074892979543
slowed: -0.21235192291064128
better: -0.21158024211084825
gain: -0.

In [79]:
# Extract features with importance greater than the absolute value of 0.15
selected_features_greater_than_0_15 = [feature for feature, score in selected_features_with_scores if abs(score[0]) > 0.15]

# Include the 'Difference' variable in the selected features
selected_features_greater_than_0_15.append('Difference')

# Create a subset of 'second_dataset' with the specified variables
subset_second_dataset = second_dataset[selected_features_greater_than_0_15]

In [80]:
# Split into training, validation, and test sets
train, valid = train_test_split(subset_second_dataset, test_size=0.2, shuffle=False)

# Split the combined set into validation and test sets
valid, test = train_test_split(valid, test_size=0.5, shuffle=False)

# Verify the sizes of each set
print("Training set size:", len(train))
print("Validation set size:", len(valid))
print("Test set size:", len(test))

Training set size: 192
Validation set size: 24
Test set size: 24


In [81]:
# Prepare the data for the model
X_train = train.drop(columns=['Difference'])
y_train = train['Difference']  # Use the 'Difference' variable as the target

X_valid = valid.drop(columns=['Difference'])
y_valid = valid['Difference']  # Use the 'Difference' variable as the target

X_test = test.drop(columns=['Difference'])
y_test = test['Difference']  # Use the 'Difference' variable as the target

In [82]:
# List to store random seeds used in each iteration
random_seeds = []

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}

# Function to calculate accuracy based on a threshold
def calculate_accuracy(y_true, y_pred, threshold):
    num_samples = len(y_true)
    correct_predictions = sum(abs(y_true - y_pred) <= threshold)
    return correct_predictions / num_samples

best_accuracy = -1.0
optimal_cv = None
best_y_train_pred = None
best_y_valid_pred = None
threshold = 0.1  # Define your desired threshold here

for cv in range(2, 11):  # Try cross-validation folds from 2 to 10
    grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=cv, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Get the best model with optimal hyperparameters
    subset_second_rf_model = grid_search.best_estimator_

    # Predict y_train_pred on the training set
    y_train_pred = subset_second_rf_model.predict(X_train)

    # Predict y_valid_pred on the validation set
    y_valid_pred = subset_second_rf_model.predict(X_valid)

    # Round the predicted values to the nearest possible value
    y_train_pred = [round_to_nearest(val, possible_values) for val in y_train_pred]
    y_valid_pred = [round_to_nearest(val, possible_values) for val in y_valid_pred]

    # Calculate accuracy for training and validation sets after rounding
    accuracy_train = calculate_accuracy(y_train, y_train_pred, threshold)
    accuracy_valid = calculate_accuracy(y_valid, y_valid_pred, threshold)
    
    # Check if the accuracy after rounding is higher than the best accuracy so far
    if accuracy_valid > best_accuracy:
        best_accuracy = accuracy_valid
        optimal_cv = cv
        best_y_train_pred = y_train_pred
        best_y_valid_pred = y_valid_pred

    # Store the random seed used in this iteration
    random_seeds.append(np.random.get_state()[1][0])

# Use the optimal number of folds in GridSearchCV
grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=optimal_cv, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model with optimal hyperparameters
subset_second_rf_model = grid_search.best_estimator_

# Get the feature importances from the best model
feature_importances = subset_second_rf_model.feature_importances_

# Get the number of features used (non-zero feature importances)
num_features_used = np.sum(feature_importances > 0)

# Print the number of features used in the last iteration
print("Number of Featurees Used:", num_features_used)

# Extract the random seed from the last element of the random_seeds list
random_seed_used = random_seeds[-1]

# Print the random seed used in the last iteration
print("Random Seed Used:", random_seed_used)

Number of Featurees Used: 68
Random Seed Used: 1649622067


In [83]:
def calculate_accuracy(y_true, y_pred, threshold=None):
    correct_predictions = 0
    total_predictions = len(y_true)
    
    if threshold is not None:
        for true_val, pred_val in zip(y_true, y_pred):
            if abs(true_val - pred_val) <= threshold:
                correct_predictions += 1
    else:
        for true_val, pred_val in zip(y_true, y_pred):
            if true_val == pred_val:
                correct_predictions += 1
            
    accuracy = correct_predictions / total_predictions
    return accuracy

# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, best_y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, best_y_valid_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, best_y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, best_y_valid_pred)

# Calculate adjusted R-squared for training set
n_train = X_train.shape[0]
p_train = X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

# Calculate accuracy for training and validation sets with the threshold
accuracy_train = calculate_accuracy(y_train, best_y_train_pred, threshold)
accuracy_valid = calculate_accuracy(y_valid, best_y_valid_pred, threshold)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid],
    ["RMSE", rmse_train, rmse_valid],
    ["R^2", r2_train, r2_valid],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid],
    ["Accuracy", accuracy_train, accuracy_valid],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

+--------------+----------------+------------------+
| Metric       |   Training Set |   Validation Set |
| MSE          |     0.00585938 |        0.0286458 |
+--------------+----------------+------------------+
| RMSE         |     0.0765466  |        0.169251  |
+--------------+----------------+------------------+
| R^2          |     0.853652   |        0.559265  |
+--------------+----------------+------------------+
| Adjusted R^2 |     0.768988   |        1.21568   |
+--------------+----------------+------------------+
| Accuracy     |     0.90625    |        0.666667  |
+--------------+----------------+------------------+


In [85]:
# Create a third subset incorporating designated variables from subset_second_dataset
third_subset = subset_second_dataset[['Short-Term Treasury Diff', 'LAG_RollingMean', 'LAG', 'Nonfarm Payroll', 'Long-Term Treasury Bond Rate', 'Level', 'Date', 'Difference']].copy()

# List of vectorized terms (previously sentiment columns)
vectorized_terms = [col for col in subset_second_dataset.columns if col not in ['Short-Term Treasury Diff', 'LAG_RollingMean', 'LAG', 'Nonfarm Payroll', 'Long-Term Treasury Bond Rate', 'Level', 'Date', 'Difference']]

# Create a list to store DataFrames of new columns
new_columns_dfs = []

# Introduce new variables by computing the product of vectorized terms with specified columns
for term in vectorized_terms:
    for column in ['Short-Term Treasury Diff', 'LAG_RollingMean', 'LAG', 'Nonfarm Payroll', 'Level']:
        new_column_name = f'{term}_x_{column.replace(" ", "_")}'
        if term in subset_second_dataset.columns:
            new_column = subset_second_dataset[term] * subset_second_dataset[column]
            new_columns_dfs.append(pd.DataFrame({new_column_name: new_column}))

# Concatenate the new columns DataFrames along the columns axis
new_columns_df = pd.concat(new_columns_dfs, axis=1)

# Combine the new columns DataFrame with the existing 'third_subset'
third_subset = pd.concat([third_subset, new_columns_df], axis=1)

# Append the 'Date' variable from subset_second_dataset at this stage
third_subset['Date'] = subset_second_dataset['Date']

# Display initial records of the third subset
print(third_subset.head())

   Short-Term Treasury Diff  LAG_RollingMean       LAG  Nonfarm Payroll  \
0                     -0.02         0.384181  0.469678         110570.0   
1                      0.02        -0.094047 -0.028876         111060.0   
2                      0.06         0.180436  0.100506         111209.0   
3                     -0.06         0.581703  1.673479         111455.0   
4                      0.00         1.171385  1.740168         111989.0   

   Long-Term Treasury Bond Rate  Level  Date  Difference  \
0                          6.04   3.75     0         0.0   
1                          5.81   3.25    50        -0.5   
2                          5.68   3.25    91         0.0   
3                          5.36   3.25   126         0.0   
4                          5.72   3.25   182         0.0   

   Proportion Negative Words_x_Short-Term_Treasury_Diff  \
0                                          -0.000525      
1                                           0.000588      
2          

In [86]:
# Split into training, validation, and test sets
train, valid = train_test_split(third_subset, test_size=0.2, shuffle=False)

# Split the combined set into validation and test sets
valid, test = train_test_split(valid, test_size=0.5, shuffle=False)

# Verify the sizes of each set
print("Training set size:", len(train))
print("Validation set size:", len(valid))
print("Test set size:", len(test))

Training set size: 192
Validation set size: 24
Test set size: 24


In [87]:
# Prepare the data for the model
X_train = train.drop(columns=['Difference'])
y_train = train['Difference']  # Use the 'Difference' variable as the target

X_valid = valid.drop(columns=['Difference'])
y_valid = valid['Difference']  # Use the 'Difference' variable as the target

X_test = test.drop(columns=['Difference'])
y_test = test['Difference']  # Use the 'Difference' variable as the target

In [None]:
# List to store random seeds used in each iteration
random_seeds = []

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}

# Function to calculate accuracy based on a threshold
def calculate_accuracy(y_true, y_pred, threshold):
    num_samples = len(y_true)
    correct_predictions = sum(abs(y_true - y_pred) <= threshold)
    return correct_predictions / num_samples

best_accuracy = -1.0
optimal_cv = None
best_y_train_pred = None
best_y_valid_pred = None
threshold = 0.1  # Define your desired threshold here

for cv in range(2, 11):  # Try cross-validation folds from 2 to 10
    grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=cv, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Get the best model with optimal hyperparameters
    subset_third_rf_model = grid_search.best_estimator_

    # Predict y_train_pred on the training set
    y_train_pred = subset_third_rf_model.predict(X_train)

    # Predict y_valid_pred on the validation set
    y_valid_pred = subset_third_rf_model.predict(X_valid)

    # Round the predicted values to the nearest possible value
    y_train_pred = [round_to_nearest(val, possible_values) for val in y_train_pred]
    y_valid_pred = [round_to_nearest(val, possible_values) for val in y_valid_pred]

    # Calculate accuracy for training and validation sets after rounding
    accuracy_train = calculate_accuracy(y_train, y_train_pred, threshold)
    accuracy_valid = calculate_accuracy(y_valid, y_valid_pred, threshold)
    
    # Check if the accuracy after rounding is higher than the best accuracy so far
    if accuracy_valid > best_accuracy:
        best_accuracy = accuracy_valid
        optimal_cv = cv
        best_y_train_pred = y_train_pred
        best_y_valid_pred = y_valid_pred

    # Store the random seed used in this iteration
    random_seeds.append(np.random.get_state()[1][0])

# Use the optimal number of folds in GridSearchCV
grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=optimal_cv, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model with optimal hyperparameters
subset_third_rf_model = grid_search.best_estimator_

# Get the feature importances from the best model
feature_importances = subset_third_rf_model.feature_importances_

# Get the number of features used (non-zero feature importances)
num_features_used = np.sum(feature_importances > 0)

# Print the number of features used in the last iteration
print("Number of Features Used:", num_features_used)

# Extract the random seed from the last element of the random_seeds list
random_seed_used = random_seeds[-1]

# Print the random seed used in the last iteration
print("Random Seed Used:", random_seed_used)

In [None]:
def calculate_accuracy(y_true, y_pred, threshold=None):
    correct_predictions = 0
    total_predictions = len(y_true)
    
    if threshold is not None:
        for true_val, pred_val in zip(y_true, y_pred):
            if abs(true_val - pred_val) <= threshold:
                correct_predictions += 1
    else:
        for true_val, pred_val in zip(y_true, y_pred):
            if true_val == pred_val:
                correct_predictions += 1
            
    accuracy = correct_predictions / total_predictions
    return accuracy

# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, best_y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, best_y_valid_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, best_y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, best_y_valid_pred)

# Calculate adjusted R-squared for training set
n_train = X_train.shape[0]
p_train = X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

# Calculate accuracy for training and validation sets with the threshold
accuracy_train = calculate_accuracy(y_train, best_y_train_pred, threshold)
accuracy_valid = calculate_accuracy(y_valid, best_y_valid_pred, threshold)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid],
    ["RMSE", rmse_train, rmse_valid],
    ["R^2", r2_train, r2_valid],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid],
    ["Accuracy", accuracy_train, accuracy_valid],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

In [88]:
# Get the feature importances from the best model
feature_importances = subset_third_rf_model.feature_importances_

# Get the selected feature indices
selected_indices = np.where(feature_importances > 0)[0]

# Get the selected feature names
selected_features_names = X_train.columns[selected_indices]

# Print the selected feature names
print("Selected Features:")
for feature in selected_features_names:
    print(feature)

NameError: name 'subset_third_rf_model' is not defined

In [None]:
# Extract features with importance greater than the absolute value of 0.2
selected_features_greater_than_0_2 = [feature for feature, score in selected_features_with_scores if abs(score[0]) > 0.2]

# Include the 'Difference' variable in the selected features
selected_features_greater_than_0_2.append('Difference')

# Create a subset of 'second_dataset' with the specified variables
subset_second_dataset = second_dataset[selected_features_greater_than_0_2]

In [None]:
# Split into training, validation, and test sets
train, valid = train_test_split(subset_second_dataset, test_size=0.2, shuffle=False)

# Split the combined set into validation and test sets
valid, test = train_test_split(valid, test_size=0.5, shuffle=False)

# Verify the sizes of each set
print("Training set size:", len(train))
print("Validation set size:", len(valid))
print("Test set size:", len(test))

In [None]:
# Prepare the data for the model
X_train = train.drop(columns=['Difference'])
y_train = train['Difference']  # Use the 'Difference' variable as the target

X_valid = valid.drop(columns=['Difference'])
y_valid = valid['Difference']  # Use the 'Difference' variable as the target

X_test = test.drop(columns=['Difference'])
y_test = test['Difference']  # Use the 'Difference' variable as the target

In [None]:
# List to store random seeds used in each iteration
random_seeds = []

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}

# Function to calculate accuracy based on a threshold
def calculate_accuracy(y_true, y_pred, threshold):
    num_samples = len(y_true)
    correct_predictions = sum(abs(y_true - y_pred) <= threshold)
    return correct_predictions / num_samples

best_accuracy = -1.0
optimal_cv = None
best_y_train_pred = None
best_y_test_pred = None  # Changed variable name from best_y_valid_pred
threshold = 0.1  # Define your desired threshold here

for cv in range(2, 11):  # Try cross-validation folds from 2 to 10
    grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=cv, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Get the best model with optimal hyperparameters
    subset_third_rf_model = grid_search.best_estimator_

    # Predict y_train_pred on the training set
    y_train_pred = subset_third_rf_model.predict(X_train)

    # Predict y_test_pred on the test set  # Changed variable name from y_valid_pred
    y_test_pred = subset_third_rf_model.predict(X_test)  # Use 'test' set instead of 'valid'

    # Round the predicted values to the nearest possible value
    y_train_pred = [round_to_nearest(val, possible_values) for val in y_train_pred]
    y_test_pred = [round_to_nearest(val, possible_values) for val in y_test_pred]

    # Calculate accuracy for training and test sets after rounding
    accuracy_train = calculate_accuracy(y_train, y_train_pred, threshold)
    accuracy_test = calculate_accuracy(y_test, y_test_pred, threshold)  # Use 'test' set instead of 'valid'
    
    # Check if the accuracy after rounding is higher than the best accuracy so far
    if accuracy_test > best_accuracy:  # Change from accuracy_valid to accuracy_test
        best_accuracy = accuracy_test  # Change from accuracy_valid to accuracy_test
        optimal_cv = cv
        best_y_train_pred = y_train_pred
        best_y_test_pred = y_test_pred  # Changed variable name from best_y_valid_pred

    # Store the random seed used in this iteration
    random_seeds.append(np.random.get_state()[1][0])

# Use the optimal number of folds in GridSearchCV
grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=optimal_cv, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model with optimal hyperparameters
subset_third_rf_model = grid_search.best_estimator_

# Get the feature importances from the best model
feature_importances = subset_third_rf_model.feature_importances_

# Get the number of features used (non-zero feature importances)
num_features_used = np.sum(feature_importances > 0)

# Print the number of features used in the last iteration
print("Number of Features Used:", num_features_used)

# Extract the random seed from the last element of the random_seeds list
random_seed_used = random_seeds[-1]

# Print the random seed used in the last iteration
print("Random Seed Used:", random_seed_used)

In [None]:
def calculate_accuracy(y_true, y_pred, threshold=None):
    correct_predictions = 0
    total_predictions = len(y_true)
    
    if threshold is not None:
        for true_val, pred_val in zip(y_true, y_pred):
            if abs(true_val - pred_val) <= threshold:
                correct_predictions += 1
    else:
        for true_val, pred_val in zip(y_true, y_pred):
            if true_val == pred_val:
                correct_predictions += 1
            
    accuracy = correct_predictions / total_predictions
    return accuracy

# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, best_y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, best_y_valid_pred)

# Calculate Mean Squared Error (MSE) for test set
mse_test = mean_squared_error(y_test, best_y_test_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate Root Mean Squared Error (RMSE) for test set
rmse_test = np.sqrt(mse_test)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, best_y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, best_y_valid_pred)

# Calculate R-squared (R^2) for test set
r2_test = r2_score(y_test, best_y_test_pred)

# Calculate adjusted R-squared for training set
n_train = X_train.shape[0]
p_train = X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

# Calculate adjusted R-squared for test set
n_test = X_test.shape[0]
p_test = X_test.shape[1]
adj_r2_test = 1 - ((1 - r2_test) * (n_test - 1) / (n_test - p_test - 1))

# Calculate accuracy for training, validation, and test sets with the threshold
accuracy_train = calculate_accuracy(y_train, best_y_train_pred, threshold)
accuracy_valid = calculate_accuracy(y_valid, best_y_valid_pred, threshold)
accuracy_test = calculate_accuracy(y_test, best_y_test_pred, threshold)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid, mse_test],
    ["RMSE", rmse_train, rmse_valid, rmse_test],
    ["R^2", r2_train, r2_valid, r2_test],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid, adj_r2_test],
    ["Accuracy", accuracy_train, accuracy_valid, accuracy_test],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set", "Test Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

In [None]:
# Save the best_rf_model to the specified location
model_filename = r"C:\Users\abact\BC-Project\models\best_random_forest_model_data.joblib"
joblib.dump(subset_third_rf_model, model_filename)

In [None]:
# Get the feature importances from the best model
feature_importances = subset_third_rf_model.feature_importances_

# Get the selected feature indices
selected_indices = np.where(feature_importances > 0)[0]

# Get the selected feature names
selected_features_names = X_train.columns[selected_indices]

# Print the selected feature names
print("Selected Features:")
for feature in selected_features_names:
    print(feature)

In [91]:
# Save X_train as a CSV file
X_train.to_csv(r"C:\Users\abact\BC-Project\data\processed\X_train.csv", index=False)

# Save u_train as a CSV file
y_train.to_csv(r"C:\Users\abact\BC-Project\data\processed\y_train.csv", index=False)

In [92]:
# Save X_valid as a CSV file
X_valid.to_csv(r"C:\Users\abact\BC-Project\data\processed\X_valid.csv", index=False)

# Save X_test as a CSV file
X_test.to_csv(r"C:\Users\abact\BC-Project\data\processed\X_test.csv", index=False)

# Save y_valid as a CSV file
y_valid.to_csv(r"C:\Users\abact\BC-Project\data\processed\y_valid.csv", index=False)

# Save y_test as a CSV file
y_test.to_csv(r"C:\Users\abact\BC-Project\data\processed\y_test.csv", index=False)