In [50]:
from train import *
from processing import find_estado_with_value_two
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn import metrics
import numpy as np
import pandas as pd
import xgboost as xgb
import random
import shap
pd.set_option('display.max_columns', None)
random.seed(10)


In [51]:
df = build_dataset_with_features(categorical_features=False)


In [52]:
train, val, test = train_test_val_split(df)
x_test, y_test = split_into_x_y(test)
x_val, y_val = split_into_x_y(val)

In [53]:
def reduce_training_set(train_df, frac):
    df_train_reduced = train_df.sample(frac= frac, random_state=42)
    x_train, y_train = split_into_x_y(df_train_reduced)
    return x_train, y_train

def get_metrics_train_val(model, name, x_train, y_train, print_metrics=True):
    y_pred_train = model.predict(x_train)
    y_pred_val = model.predict(x_val)
    
    train_mse = round(mean_squared_error(y_train, y_pred_train), 2)
    val_mse = round(mean_squared_error(y_val, y_pred_val), 2)
    val_mse_discrete = round(mse_score_discrete(y_val, y_pred_val), 2)

    if print_metrics:
        print('Train MSE {}: {}'.format(name, train_mse))
        print('Validation MSE {}: {}'.format(name, val_mse))
        print('Validation MSE Discrete {}: {}'.format(name, val_mse_discrete))

    return train_mse, val_mse, val_mse_discrete, model

def run_random_forest_model(x_train, y_train,n_trees_grid):
    metrics_dict = {}
    for n_trees in n_trees_grid:
        rf = RandomForestRegressor(n_estimators=n_trees)
        rf.fit(x_train, y_train)
        metrics_dict['Random Forest (n_trees = {n_trees})'] = get_metrics_train_val(rf,f"Random Forest (n_trees = {n_trees})", x_train, y_train)

    return metrics_dict

### Effects of reducing the dataset on the predictive power of the model

In [54]:
n_trees_grid = [5,10,25,50,100]

x_train, y_train = reduce_training_set(train, 0.25)

run_random_forest_model(x_train, y_train, n_trees_grid)


Train MSE Random Forest (n_trees = 5): 0.06
Validation MSE Random Forest (n_trees = 5): 0.32
Validation MSE Discrete Random Forest (n_trees = 5): 0.37
Train MSE Random Forest (n_trees = 10): 0.04
Validation MSE Random Forest (n_trees = 10): 0.28
Validation MSE Discrete Random Forest (n_trees = 10): 0.36
Train MSE Random Forest (n_trees = 25): 0.03
Validation MSE Random Forest (n_trees = 25): 0.25
Validation MSE Discrete Random Forest (n_trees = 25): 0.31
Train MSE Random Forest (n_trees = 50): 0.03
Validation MSE Random Forest (n_trees = 50): 0.25
Validation MSE Discrete Random Forest (n_trees = 50): 0.32
Train MSE Random Forest (n_trees = 100): 0.03
Validation MSE Random Forest (n_trees = 100): 0.25
Validation MSE Discrete Random Forest (n_trees = 100): 0.32


{'Random Forest (n_trees = {n_trees})': (0.03,
  0.25,
  0.32,
  RandomForestRegressor())}

In [55]:
n_trees_grid = [5,10,25,50]

x_train, y_train = reduce_training_set(train, 0.5)

run_random_forest_model(x_train, y_train, n_trees_grid)


Train MSE Random Forest (n_trees = 5): 0.05
Validation MSE Random Forest (n_trees = 5): 0.3
Validation MSE Discrete Random Forest (n_trees = 5): 0.36
Train MSE Random Forest (n_trees = 10): 0.04
Validation MSE Random Forest (n_trees = 10): 0.27
Validation MSE Discrete Random Forest (n_trees = 10): 0.34
Train MSE Random Forest (n_trees = 25): 0.03
Validation MSE Random Forest (n_trees = 25): 0.25
Validation MSE Discrete Random Forest (n_trees = 25): 0.31
Train MSE Random Forest (n_trees = 50): 0.03
Validation MSE Random Forest (n_trees = 50): 0.24
Validation MSE Discrete Random Forest (n_trees = 50): 0.31


{'Random Forest (n_trees = {n_trees})': (0.03,
  0.24,
  0.31,
  RandomForestRegressor(n_estimators=50))}

### Metrics for the best performing model

In [56]:
n_trees_grid = [5,10,25,50]

x_train, y_train = reduce_training_set(train, 0.5)

rf = RandomForestRegressor(n_estimators=50).fit(x_train, y_train)

In [57]:
y_pred = rf.predict(x_val)

In [59]:
x_val["estado_actual"] = x_val[estados].apply(find_estado_with_value_two, axis=1)
train["estado_actual"] = train[estados].apply(find_estado_with_value_two, axis=1)
x_train["estado_actual"] = x_train[estados].apply(find_estado_with_value_two, axis=1)

mode_dict = train.groupby('estado_actual')['next_y'].apply(lambda x: x.mode().iloc[0]).to_dict()
y_pred_baseline = x_val['estado_actual'].map(mode_dict)
y_pred_baseline_train = x_train['estado_actual'].map(mode_dict)

In [60]:
print('Train MSE baseline: {}'.format(metrics.mean_squared_error(y_train, y_pred_baseline_train)))
print('Validation MSE baseline: {}'.format(metrics.mean_squared_error(y_val, y_pred_baseline)))
print('Validation MSE discrete baseline: {}'.format(mse_score_discrete(y_val, y_pred_baseline)))

Train MSE baseline: 0.41027347128884156
Validation MSE baseline: 0.3912575494827483
Validation MSE discrete baseline: 0.3912575494827483


In [61]:
def df_both_metrics(y_pred, y_test):
    y_pred_rounded = []

    for pred in y_pred:
        if pred - int(pred) < 0.5:
            y_pred_rounded.append(max(0, math.floor(pred)))
        else:
            y_pred_rounded.append(max(0, math.ceil(pred)))
    
    return pd.DataFrame({'y_pred': y_pred_rounded, 'y_test': y_test, 'y_val_baseline': y_pred_baseline})
                        
df_metrics_val = df_both_metrics(y_pred, y_test)
df_metrics_val

ValueError: array length 16723 does not match index length 24890

In [None]:
from sklearn.metrics import mean_squared_error, accuracy_score

def get_metrics_y_relative(df):
    metrics_df = pd.DataFrame(columns=['Y(t+1)', 'MSE_Model', 'Accuracy_Model', 'MSE_Baseline', 'Accuracy_Baseline'])

    for class_label in sorted(df['y_test'].unique()):
        df_class = df[df['y_test'] == class_label]

        mse_actual = mean_squared_error(df_class['y_test'], df_class['y_pred'])
        accuracy_actual = accuracy_score(df_class['y_test'], df_class['y_pred'])

        mse_baseline = mean_squared_error(df_class['y_test'], df_class['y_val_baseline'])
        accuracy_baseline = accuracy_score(df_class['y_test'], df_class['y_val_baseline'])

        metrics_df = pd.concat([metrics_df, pd.DataFrame({
            'Y(t+1)': [class_label],
            'MSE_Model': [mse_actual],
            'Accuracy_Model': [accuracy_actual],
            'MSE_Baseline': [mse_baseline],
            'Accuracy_Baseline': [accuracy_baseline]
        })])

    return metrics_df.reset_index(drop=True).round(2)

# Example usage:
metrics_df_relative = get_metrics_y_relative(df_metrics_val)
metrics_df_relative


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.


Unnamed: 0,Y(t+1),MSE_Model,Accuracy_Model,MSE_Baseline,Accuracy_Baseline
0,0.0,0.22,0.82,0.2,0.86
1,1.0,0.36,0.64,0.63,0.37
2,2.0,0.31,0.72,0.31,0.77
3,3.0,1.28,0.03,1.7,0.0
4,4.0,4.62,0.0,6.18,0.0


In [None]:
df_metrics_val['estado_actual'] = x_val['estado_actual']

def get_metrics_by_estado(df, baseline_predictions):
    metrics_df = pd.DataFrame(columns=['Estado', 'MSE_Actual', 'Accuracy_Actual', 'MSE_Baseline', 'Accuracy_Baseline'])

    for class_label in sorted(df['estado_actual'].unique()):
        df_class = df[df['estado_actual'] == class_label]

        mse_actual = mean_squared_error(df_class['y_test'], df_class['y_pred'])
        accuracy_actual = accuracy_score(df_class['y_test'], df_class['y_pred'])

        mse_baseline = mean_squared_error(df_class['y_test'], baseline_predictions[df_class.index])
        accuracy_baseline = accuracy_score(df_class['y_test'], baseline_predictions[df_class.index])

        metrics_df = pd.concat([metrics_df, pd.DataFrame({
            'Estado': [class_label],
            'MSE_Actual': [mse_actual],
            'Accuracy_Actual': [accuracy_actual],
            'MSE_Baseline': [mse_baseline],
            'Accuracy_Baseline': [accuracy_baseline],
        })])

    return metrics_df.reset_index(drop=True).round(2)

# Assuming df_metrics_val is your DataFrame with actual predictions and y_pred_baseline is the baseline prediction
metrics_df_by_estado = get_metrics_by_estado(df_metrics_val, y_pred_baseline)
estado_counts = x_val["estado_actual"].value_counts()
metrics_df_by_estado['Sample_Size'] = metrics_df_by_estado['Estado'].map(estado_counts)

metrics_df_by_estado

The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.


Unnamed: 0,Estado,MSE_Actual,Accuracy_Actual,MSE_Baseline,Accuracy_Baseline,Sample_Size
0,1,0.4,0.61,0.39,0.62,237
1,2,0.58,0.54,0.72,0.5,709
2,3,0.54,0.53,0.7,0.53,763
3,4,0.77,0.45,0.93,0.36,781
4,5,0.79,0.39,0.85,0.4,786
5,6,0.63,0.48,0.75,0.45,842
6,7,0.4,0.62,0.43,0.64,798
7,8,0.05,0.95,0.05,0.95,719
8,9,0.26,0.74,0.38,0.62,2928
9,10,0.12,0.89,0.16,0.87,6941
