In [196]:
import pandas as pd
import numpy as np
import requests
from io import StringIO
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
import random
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score
from tabulate import tabulate
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

In [94]:
# The URL of the raw dataset on GitHub
url = "https://raw.githubusercontent.com/abactat/BC-Project/main/data/dataset_adjusted.csv?token=GHSAT0AAAAAACC4ZCNKN5F6XR7HZA75QWTEZGQVYIA"

# Send an HTTP GET request to fetch the content of the raw dataset
response = requests.get(url)

# Check if the request was successful (status code 200 means success)
if response.status_code == 200:
    # Read the content as a pandas DataFrame and assign it to the 'train' variable
    dataset_adjusted = pd.read_csv(StringIO(response.text))
    print("Dataset downloaded and loaded into 'dataset_adjusted' successfully.")
else:
    print(f"Failed to download the dataset. Status code: {response.status_code}")
    
# The URL of the raw dataset on GitHub
url = "https://raw.githubusercontent.com/abactat/BC-Project/main/data/raw/dataset_words.csv?token=GHSAT0AAAAAACC4ZCNKDB2KTQ7KVOHGVHMAZGQVX7A"

# Send an HTTP GET request to fetch the content of the raw dataset
response = requests.get(url)

# Check if the request was successful (status code 200 means success)
if response.status_code == 200:
    # Read the content as a pandas DataFrame and assign it to the 'valid' variable
    words = pd.read_csv(StringIO(response.text))
    print("Dataset downloaded and loaded into 'words' successfully.")
else:
    print(f"Failed to download the dataset. Status code: {response.status_code}")

Dataset downloaded and loaded into 'dataset_adjusted' successfully.
Dataset downloaded and loaded into 'words' successfully.


In [145]:
# Load the sentiment word list from the CSV file into a dictionary
sentiment_word_list = {}
with open(r"C:\Users\abact\BC-Project\data\external\Loughran-McDonald_MasterDictionary_1993-2021.csv", 'r') as file:
    # Skip the header line
    next(file)

    for line in file:
        values = line.strip().split(',')

        # Extract the necessary values
        word = values[0].lower()
        positive = float(values[8])  # Positive column index is 8
        negative = float(values[7])  # Negative column index is 7

        # Assign the word as positive or negative based on the positive or negative values
        if positive == 2009:
            sentiment_word_list[word] = 1
        elif negative == 2009:
            sentiment_word_list[word] = -1

# Convert the sentiment word list keys to lowercase
selected_words = set(sentiment_word_list.keys())

# Filter the 'words' DataFrame to include only columns that are present in both 'selected_words' and 'words'
common_columns = selected_words.intersection(words.columns)
subset_words = words[list(common_columns)].copy()

# Multiply sentiment values to the vectorized text columns in the 'subset_words' DataFrame
for column in subset_words.columns:
    sentiment_value = sentiment_word_list.get(column, 0)
    if sentiment_value == 1:
        subset_words.loc[:, column] = subset_words[column] * 1  # Multiply by 1 for positive sentiment
    elif sentiment_value == -1:
        subset_words.loc[:, column] = subset_words[column] * -1  # Multiply by -1 for negative sentiment
    else:
        subset_words.loc[:, column] = subset_words[column] * 0  # Multiply by 0 for unknown sentiment

In [146]:
# Concatenate the vectorized DataFrame with the original dataset
full_dataset = pd.concat([dataset_adjusted, subset_words], axis=1)

full_dataset['Date'] = pd.to_datetime(full_dataset['Date'])

# Calculate the time difference in days from the first date
full_dataset['Date'] = (full_dataset['Date'] - full_dataset['Date'].min()).dt.days

In [147]:
# Check for the number of missing values in 'train' DataFrame
missing_values_count = full_dataset.isna().sum()

# Print the count of missing values for each column
print(missing_values_count)

Federal_Reserve_Mins    0
Preprocessed Text       0
Date                    0
Difference              0
Increase                0
                       ..
enhance                 0
unanticipated           0
burdensome              0
impressive              0
lose                    0
Length: 837, dtype: int64


In [148]:
# Filter columns with missing values
columns_with_missing_values = missing_values_count[missing_values_count > 0]

# Print columns with missing values and their counts
print("Columns with Missing Values:")
for column, count in columns_with_missing_values.items():
    print(f"{column}: {count}")

Columns with Missing Values:


In [149]:
# Split into training, validation, and test sets
train, valid = train_test_split(full_dataset, test_size=0.2, shuffle=False)

# Split the combined set into validation and test sets
valid, test = train_test_split(valid, test_size=0.5, shuffle=False)

# Verify the sizes of each set
print("Training set size:", len(train))
print("Validation set size:", len(valid))
print("Test set size:", len(test))

Training set size: 192
Validation set size: 24
Test set size: 24


In [150]:
# Convert variables to numeric in the train dataset
variables_to_convert = train.columns.drop('Date')
train[variables_to_convert] = train[variables_to_convert].apply(pd.to_numeric, errors='coerce')

# Convert variables to numeric in the valid dataset
valid[variables_to_convert] = valid[variables_to_convert].apply(pd.to_numeric, errors='coerce')

# Convert variables to numeric in the test dataset
test[variables_to_convert] = test[variables_to_convert].apply(pd.to_numeric, errors='coerce')

# Prepare the data for the model
X_train = train.drop(columns=['Difference', 'Increase', 'Decrease', 'Date'])
y_train = train['Difference']  # Use the 'Difference' variable as the target

X_valid = valid.drop(columns=['Difference', 'Increase', 'Decrease', 'Date'])
y_valid = valid['Difference']  # Use the 'Difference' variable as the target

X_test = test.drop(columns=['Difference', 'Increase', 'Decrease', 'Date'])
y_test = test['Difference']  # Use the 'Difference' variable as the target

In [151]:
def calculate_accuracy(y_true, y_pred):
    correct_predictions = 0
    total_predictions = len(y_true)
    
    for true_val, pred_val in zip(y_true, y_pred):
        if true_val == pred_val:
            correct_predictions += 1
            
    accuracy = correct_predictions / total_predictions
    return accuracy

possible_values = [-1.00, -0.75, -0.50, -0.25, 0.00, 0.25, 0.50, 0.75, 1.00]

def round_to_nearest(value, possible_values):
    return min(possible_values, key=lambda x: abs(x - value))

In [152]:
# Check for missing values in 'train' DataFrame
missing_values_count = train.isna().sum()

# Get the list of columns with missing values
columns_with_missing_values = missing_values_count[missing_values_count > 0].index.tolist()

# Drop columns with missing values from 'train'
train = train.drop(columns=columns_with_missing_values)

# Prepare the data for the model
X_train = train.drop(columns=['Difference', 'Increase', 'Decrease'])
y_train = train['Difference']  # Use the 'Difference' variable as the target

# Check for missing values in 'valid' DataFrame
missing_values_count = valid.isna().sum()

# Get the list of columns with missing values
columns_with_missing_values = missing_values_count[missing_values_count > 0].index.tolist()

# Drop columns with missing values from 'valid'
valid = valid.drop(columns=columns_with_missing_values)

# Prepare the data for the model
X_valid = valid.drop(columns=['Difference', 'Increase', 'Decrease'])
y_valid = valid['Difference']  # Use the 'Difference' variable as the target

# Check for missing values in 'test' DataFrame
missing_values_count = test.isna().sum()

# Get the list of columns with missing values
columns_with_missing_values = missing_values_count[missing_values_count > 0].index.tolist()

# Drop columns with missing values from 'test'
test = test.drop(columns=columns_with_missing_values)

# Prepare the data for the model
X_test = test.drop(columns=['Difference', 'Increase', 'Decrease'])
y_test = test['Difference']  # Use the 'Difference' variable as the target

In [153]:
# Check for the number of missing values in 'train' DataFrame
missing_values_count = train.isna().sum()

# Print the count of missing values for each column
print(missing_values_count)

# Filter columns with missing values
columns_with_missing_values = missing_values_count[missing_values_count > 0]

# Print columns with missing values and their counts
print("Columns with Missing Values:")
for column, count in columns_with_missing_values.items():
    print(f"{column}: {count}")

Date             0
Difference       0
Increase         0
Decrease         0
Level            0
                ..
enhance          0
unanticipated    0
burdensome       0
impressive       0
lose             0
Length: 833, dtype: int64
Columns with Missing Values:


In [165]:
# Set the random seed for reproducibility
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}

# Create GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model to the training data and find the best hyperparameters
grid_search.fit(X_train, y_train)

# Get the best model with optimal hyperparameters
best_rf_model_5 = grid_search.best_estimator_

# Predict y_train_pred on the training set
y_train_pred = best_rf_model_5.predict(X_train)

# Predict y_valid_pred on the validation set
y_valid_pred = best_rf_model_5.predict(X_valid)

# Round the predicted values to the nearest possible value
y_train_pred = [round_to_nearest(val, possible_values) for val in y_train_pred]
y_valid_pred = [round_to_nearest(val, possible_values) for val in y_valid_pred]

# Output the random seed
print("Random seed:", random_seed)

Random seed: 42


In [166]:
# Get the feature importances from the best model
feature_importances = best_rf_model_5.feature_importances_

# Get the number of features used (non-zero feature importances)
num_features_used = np.sum(feature_importances > 0)

num_features_used

99

In [167]:
# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, y_valid_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, y_valid_pred)

# Calculate adjusted R-squared for training set
n_train = X_train.shape[0]
p_train = X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

# Calculate accuracy for training and validation sets
accuracy_train = calculate_accuracy(y_train, y_train_pred)
accuracy_valid = calculate_accuracy(y_valid, y_valid_pred)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid],
    ["RMSE", rmse_train, rmse_valid],
    ["R^2", r2_train, r2_valid],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid],
    ["Accuracy", accuracy_train, accuracy_valid],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

+--------------+----------------+------------------+
| Metric       |   Training Set |   Validation Set |
| MSE          |     0.00358073 |        0.0260417 |
+--------------+----------------+------------------+
| RMSE         |     0.0598392  |        0.161374  |
+--------------+----------------+------------------+
| R^2          |     0.910565   |        0.599332  |
+--------------+----------------+------------------+
| Adjusted R^2 |     1.02673    |        1.01142   |
+--------------+----------------+------------------+
| Accuracy     |     0.942708   |        0.708333  |
+--------------+----------------+------------------+


In [88]:
# List to store random seeds used in each iteration
random_seeds = []

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}

# Function to calculate accuracy based on a threshold
def calculate_accuracy(y_true, y_pred, threshold):
    num_samples = len(y_true)
    correct_predictions = sum(abs(y_true - y_pred) <= threshold)
    return correct_predictions / num_samples

best_accuracy = -1.0
optimal_cv = None
best_y_train_pred = None
best_y_valid_pred = None
threshold = 0.1  # Define your desired threshold here

for cv in range(2, 11):  # Try cross-validation folds from 2 to 10
    # Set the random seed for reproducibility
    random_seed = cv  # Use cv as the random seed
    random_seeds.append(random_seed)
    random.seed(random_seed)
    np.random.seed(random_seed)
    
    grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=cv, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Get the best model with optimal hyperparameters
    best_rf_model = grid_search.best_estimator_

    # Predict y_train_pred on the training set
    y_train_pred = best_rf_model.predict(X_train)

    # Predict y_valid_pred on the validation set
    y_valid_pred = best_rf_model.predict(X_valid)

    # Round the predicted values to the nearest possible value
    y_train_pred = [round_to_nearest(val, possible_values) for val in y_train_pred]
    y_valid_pred = [round_to_nearest(val, possible_values) for val in y_valid_pred]

    # Calculate accuracy for training and validation sets after rounding
    accuracy_train = calculate_accuracy(y_train, y_train_pred, threshold)
    accuracy_valid = calculate_accuracy(y_valid, y_valid_pred, threshold)
    
    # Check if the accuracy after rounding is higher than the best accuracy so far
    if accuracy_valid > best_accuracy:
        best_accuracy = accuracy_valid
        optimal_cv = cv
        best_y_train_pred = y_train_pred
        best_y_valid_pred = y_valid_pred

# Use the optimal number of folds in GridSearchCV
grid_search = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=optimal_cv, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model with optimal hyperparameters
best_rf_model = grid_search.best_estimator_

# Get the feature importances from the best model
feature_importances = best_rf_model.feature_importances_

# Get the number of features used (non-zero feature importances)
num_features_used = np.sum(feature_importances > 0)

# Print the random seeds used in each iteration
print("Random Seeds:", random_seeds)

Random Seeds: [2, 3, 4, 5, 6, 7, 8, 9, 10]


In [89]:
# Get the feature importances from the best model
feature_importances = best_rf_model.feature_importances_

# Get the number of features used (non-zero feature importances)
num_features_used = np.sum(feature_importances > 0)

num_features_used

50

In [91]:
# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, y_valid_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, y_valid_pred)

# Calculate adjusted R-squared for training set
n_train = X_train.shape[0]
p_train = X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

threshold = 0.1
accuracy_train = calculate_accuracy(y_train, y_train_pred, threshold)
accuracy_valid = calculate_accuracy(y_valid, y_valid_pred, threshold)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid],
    ["RMSE", rmse_train, rmse_valid],
    ["R^2", r2_train, r2_valid],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid],
    ["Accuracy", accuracy_train, accuracy_valid],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

+--------------+----------------+------------------+
| Metric       |   Training Set |   Validation Set |
| MSE          |     0.00455729 |         0.03125  |
+--------------+----------------+------------------+
| RMSE         |     0.0675077  |         0.176777 |
+--------------+----------------+------------------+
| R^2          |     0.886174   |         0.519199 |
+--------------+----------------+------------------+
| Adjusted R^2 |     1.00323    |         1.0016   |
+--------------+----------------+------------------+
| Accuracy     |     0.927083   |         0.625    |
+--------------+----------------+------------------+


###Data Centric AI

In [157]:
subset_words.describe()

Unnamed: 0,divestiture,unfounded,caution,hampering,unnecessarily,shutdown,ineffective,trouble,conspired,enable,...,disrupt,worsened,downturn,worsen,mistaken,enhance,unanticipated,burdensome,impressive,lose
count,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,...,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0
mean,-0.004167,-0.004167,-3.975,-0.004167,-0.154167,-0.975,-0.020833,-0.025,-0.004167,0.391667,...,-0.025,-0.570833,-3.354167,-0.941667,-0.0125,1.9,-2.7,-0.004167,0.933333,-0.020833
std,0.06455,0.06455,14.327878,0.06455,2.324416,9.95888,0.143125,0.156451,0.06455,2.726887,...,0.156451,3.247655,13.131918,5.056659,0.111335,7.92739,12.590493,0.06455,5.355533,0.213516
min,-1.0,-1.0,-141.0,-1.0,-36.0,-152.0,-1.0,-1.0,-1.0,0.0,...,-1.0,-37.0,-141.0,-53.0,-1.0,0.0,-147.0,-1.0,0.0,-3.0
25%,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.0,...,0.0,0.0,0.0,0.0,0.0,66.0,0.0,0.0,49.0,0.0


In [180]:
# Calculate the mean of each column in the 'subset_words' DataFrame
column_means = subset_words.mean()

# Create a subset of columns with mean >= 3
selected_columns = column_means[column_means >= 3].index

# Create a new DataFrame with only the selected columns
subset_words_mean_3 = subset_words[selected_columns]

subset_words_mean_3.describe()

Unnamed: 0,desired,favored,improving,stable,attractive,good,desirable,opportunity,gain,boosted,...,improved,strong,better,strength,favorable,positive,improve,rebound,strengthening,rebounded
count,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,...,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0
mean,3.816667,6.0125,7.108333,5.933333,3.4875,41.133333,6.370833,7.495833,39.9625,6.279167,...,7.0875,34.1375,5.741667,27.708333,24.025,14.870833,3.3875,6.154167,13.808333,4.2875
std,13.212447,22.626766,20.65306,14.244566,11.478917,79.312731,19.524901,21.268359,68.129972,14.805051,...,13.886745,64.572507,13.786474,61.95651,48.97595,35.800762,8.784487,16.316598,40.217358,11.234876
min,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,6.75,0.0,...,0.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,2.0,0.0,7.0,0.0,0.0,9.0,1.0,...,3.0,8.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0
75%,0.25,1.0,2.0,6.0,0.0,37.25,1.0,1.0,30.25,3.0,...,6.0,25.25,3.0,8.0,20.0,5.25,2.0,2.0,3.0,2.0
max,116.0,277.0,143.0,139.0,89.0,643.0,196.0,175.0,353.0,96.0,...,84.0,527.0,98.0,396.0,323.0,239.0,54.0,142.0,304.0,119.0


In [181]:
# Concatenate the vectorized DataFrame with the original dataset
second_dataset = pd.concat([dataset_adjusted, subset_words_mean_3], axis=1)

second_dataset['Date'] = pd.to_datetime(second_dataset['Date'])

# Calculate the time difference in days from the first date
second_dataset['Date'] = (second_dataset['Date'] - second_dataset['Date'].min()).dt.days

In [182]:
# Split into training, validation, and test sets
train, valid = train_test_split(second_dataset, test_size=0.2, shuffle=False)

# Split the combined set into validation and test sets
valid, test = train_test_split(valid, test_size=0.5, shuffle=False)

# Verify the sizes of each set
print("Training set size:", len(train))
print("Validation set size:", len(valid))
print("Test set size:", len(test))

Training set size: 192
Validation set size: 24
Test set size: 24


In [183]:
# Convert variables to numeric in the train dataset
variables_to_convert = train.columns.drop('Date')
train[variables_to_convert] = train[variables_to_convert].apply(pd.to_numeric, errors='coerce')

# Convert variables to numeric in the valid dataset
valid[variables_to_convert] = valid[variables_to_convert].apply(pd.to_numeric, errors='coerce')

# Convert variables to numeric in the test dataset
test[variables_to_convert] = test[variables_to_convert].apply(pd.to_numeric, errors='coerce')

# Prepare the data for the model
X_train = train.drop(columns=['Difference', 'Increase', 'Decrease', 'Date'])
y_train = train['Difference']  # Use the 'Difference' variable as the target

X_valid = valid.drop(columns=['Difference', 'Increase', 'Decrease', 'Date'])
y_valid = valid['Difference']  # Use the 'Difference' variable as the target

X_test = test.drop(columns=['Difference', 'Increase', 'Decrease', 'Date'])
y_test = test['Difference']  # Use the 'Difference' variable as the target

In [184]:
# Check for missing values in 'train' DataFrame
missing_values_count = train.isna().sum()

# Get the list of columns with missing values
columns_with_missing_values = missing_values_count[missing_values_count > 0].index.tolist()

# Drop columns with missing values from 'train'
train = train.drop(columns=columns_with_missing_values)

# Prepare the data for the model
X_train = train.drop(columns=['Difference', 'Increase', 'Decrease'])
y_train = train['Difference']  # Use the 'Difference' variable as the target

# Check for missing values in 'valid' DataFrame
missing_values_count = valid.isna().sum()

# Get the list of columns with missing values
columns_with_missing_values = missing_values_count[missing_values_count > 0].index.tolist()

# Drop columns with missing values from 'valid'
valid = valid.drop(columns=columns_with_missing_values)

# Prepare the data for the model
X_valid = valid.drop(columns=['Difference', 'Increase', 'Decrease'])
y_valid = valid['Difference']  # Use the 'Difference' variable as the target

# Check for missing values in 'test' DataFrame
missing_values_count = test.isna().sum()

# Get the list of columns with missing values
columns_with_missing_values = missing_values_count[missing_values_count > 0].index.tolist()

# Drop columns with missing values from 'test'
test = test.drop(columns=columns_with_missing_values)

# Prepare the data for the model
X_test = test.drop(columns=['Difference', 'Increase', 'Decrease'])
y_test = test['Difference']  # Use the 'Difference' variable as the target

In [188]:
# Set the random seed for reproducibility
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'n_estimators': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}

# Create GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the model to the training data and find the best hyperparameters
grid_search.fit(X_train, y_train)

# Get the best model with optimal hyperparameters
second_rf_model_5 = grid_search.best_estimator_

# Predict y_train_pred on the training set
y_train_pred = second_rf_model_5.predict(X_train)

# Predict y_valid_pred on the validation set
y_valid_pred = second_rf_model_5.predict(X_valid)

# Round the predicted values to the nearest possible value
y_train_pred = [round_to_nearest(val, possible_values) for val in y_train_pred]
y_valid_pred = [round_to_nearest(val, possible_values) for val in y_valid_pred]

# Output the random seed
print("Random seed:", random_seed)

Random seed: 42


In [189]:
# Get the feature importances from the best model
feature_importances = second_rf_model_5.feature_importances_

# Get the number of features used (non-zero feature importances)
num_features_used = np.sum(feature_importances > 0)

num_features_used

31

In [190]:
def calculate_accuracy(y_true, y_pred):
    correct_predictions = 0
    total_predictions = len(y_true)
    
    for true_val, pred_val in zip(y_true, y_pred):
        if true_val == pred_val:
            correct_predictions += 1
            
    accuracy = correct_predictions / total_predictions
    return accuracy

# Calculate Mean Squared Error (MSE) for training set
mse_train = mean_squared_error(y_train, y_train_pred)

# Calculate Mean Squared Error (MSE) for validation set
mse_valid = mean_squared_error(y_valid, y_valid_pred)

# Calculate Root Mean Squared Error (RMSE) for training set
rmse_train = np.sqrt(mse_train)

# Calculate Root Mean Squared Error (RMSE) for validation set
rmse_valid = np.sqrt(mse_valid)

# Calculate R-squared (R^2) for training set
r2_train = r2_score(y_train, y_train_pred)

# Calculate R-squared (R^2) for validation set
r2_valid = r2_score(y_valid, y_valid_pred)

# Calculate adjusted R-squared for training set
n_train = X_train.shape[0]
p_train = X_train.shape[1]
adj_r2_train = 1 - ((1 - r2_train) * (n_train - 1) / (n_train - p_train - 1))

# Calculate adjusted R-squared for validation set
n_valid = X_valid.shape[0]
p_valid = X_valid.shape[1]
adj_r2_valid = 1 - ((1 - r2_valid) * (n_valid - 1) / (n_valid - p_valid - 1))

threshold = 0.1
accuracy_train = calculate_accuracy(y_train, y_train_pred)
accuracy_valid = calculate_accuracy(y_valid, y_valid_pred)

# Prepare the data for the table
data = [
    ["MSE", mse_train, mse_valid],
    ["RMSE", rmse_train, rmse_valid],
    ["R^2", r2_train, r2_valid],
    ["Adjusted R^2", adj_r2_train, adj_r2_valid],
    ["Accuracy", accuracy_train, accuracy_valid],
]

# Prepare the headers for the table
headers = ["Metric", "Training Set", "Validation Set"]

# Display the table
table = tabulate(data, headers=headers, tablefmt="grid")
print(table)

+--------------+----------------+------------------+
| Metric       |   Training Set |   Validation Set |
| MSE          |      0.0094401 |        0.0338542 |
+--------------+----------------+------------------+
| RMSE         |      0.0971602 |        0.183995  |
+--------------+----------------+------------------+
| R^2          |      0.764218  |        0.479132  |
+--------------+----------------+------------------+
| Adjusted R^2 |      0.653581  |        1.31526   |
+--------------+----------------+------------------+
| Accuracy     |      0.864583  |        0.708333  |
+--------------+----------------+------------------+


In [201]:
# Subset numerical columns
numerical_columns = second_dataset.select_dtypes(include='number')

# Remove 'Difference', 'Increase', and 'Decrease' from numerical columns
columns_to_exclude = ['Difference', 'Increase', 'Decrease']
numerical_columns_subset = numerical_columns.drop(columns_to_exclude, axis=1)

# Fill missing values with the mean in numerical columns
numerical_columns_subset.fillna(numerical_columns_subset.mean(), inplace=True)

# Fill missing values with the mean in 'Difference'
difference_mean = second_dataset['Difference'].mean()
second_dataset['Difference'].fillna(difference_mean, inplace=True)

# Define the number of features to select
k = 61

# Perform univariate selection using f_regression
selector = SelectKBest(score_func=f_regression, k=k)
selected_features = selector.fit_transform(numerical_columns_subset, second_dataset['Difference'])

# Get the selected feature indices
selected_indices = selector.get_support(indices=True)

# Get the selected feature names
selected_features_names = numerical_columns_subset.columns[selected_indices]

# Get the feature scores
feature_scores = selector.scores_[selected_indices]

# Standardize the feature scores
scaler = StandardScaler()
standardized_scores = scaler.fit_transform(feature_scores.reshape(-1, 1))

# Combine selected feature names and their standardized scores
selected_features_with_scores = list(zip(selected_features_names, standardized_scores))

# Sort the selected features by the absolute value of standardized scores in descending order
selected_features_with_scores.sort(key=lambda x: abs(x[1]), reverse=True)

# Print the selected features and their standardized relevance scores in descending order
print("Selected Features and Absolute Standardized Relevance Scores (Descending Order):")
for feature, score in selected_features_with_scores:
    print(f"{feature}: {score[0]}")

Selected Features and Absolute Standardized Relevance Scores (Descending Order):
Short-Term Treasury Diff: 7.704887620577532
LAG_RollingMean: 0.322017383010566
LAG: 0.2851747920525605
Consumer Sentiment: -0.2218933648952916
improve: -0.21983855247856168
boosted: -0.21941158295480928
Sentiment Label: -0.2174058056107605
Proportion Positive Words: -0.21300782176354285
strengthen: -0.21111486549230846
strong: -0.20346416882301052
Proportion Negative Words: -0.20211728782651403
improving: -0.19996993106595012
stability: -0.19586754455493005
Housing Sales: -0.19330671846947425
better: -0.19134663781775868
gain: -0.19122230508986315
stable: -0.1906684690938487
despite: -0.19055113984333438
favored: -0.18896703183441366
strength: -0.1886219593736572
rebounded: -0.18830492285820247
strengthened: -0.18693044827251917
desirable: -0.18607437575948096
Nonfarm Payroll: -0.1815852920709761
Long-Term Treasury Bond Rate: -0.17899375689541397
desired: -0.17825792908437976
boost: -0.17712798400302385
Le