In [1]:
from train import *
from processing import find_estado_with_value_two
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn import metrics
import numpy as np
import pandas as pd
import xgboost as xgb
import random
import shap
pd.set_option('display.max_columns', None)
random.seed(10)


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [2]:
df = build_dataset_with_features(categorical_features=False)


In [3]:
train, val, test = train_test_val_split(df)
x_test, y_test = split_into_x_y(test)
x_val, y_val = split_into_x_y(val)

In [4]:
def reduce_training_set(train_df, frac):
    df_train_reduced = train_df.sample(frac= frac, random_state=42)
    x_train, y_train = split_into_x_y(df_train_reduced)
    return x_train, y_train

def get_metrics_train_val(model, name, x_train, y_train, print_metrics=True):
    y_pred_train = model.predict(x_train)
    y_pred_val = model.predict(x_val)
    
    train_mse = round(mean_squared_error(y_train, y_pred_train), 2)
    val_mse = round(mean_squared_error(y_val, y_pred_val), 2)
    val_mse_discrete = round(mse_score_discrete(y_val, y_pred_val), 2)

    if print_metrics:
        print('Train MSE {}: {}'.format(name, train_mse))
        print('Validation MSE {}: {}'.format(name, val_mse))
        print('Validation MSE Discrete {}: {}'.format(name, val_mse_discrete))

    return train_mse, val_mse, val_mse_discrete, model

def run_random_forest_model(x_train, y_train,n_trees_grid):
    metrics_dict = {}
    for n_trees in n_trees_grid:
        rf = RandomForestRegressor(n_estimators=n_trees)
        rf.fit(x_train, y_train)
        metrics_dict['Random Forest (n_trees = {n_trees})'] = get_metrics_train_val(rf,f"Random Forest (n_trees = {n_trees})", x_train, y_train)

    return metrics_dict

In [5]:
n_trees_grid = [5,10,25,50,100]

x_train, y_train = reduce_training_set(train, 0.25)

run_random_forest_model(x_train, y_train, n_trees_grid)


Train MSE Random Forest (n_trees = 5): 0.06
Validation MSE Random Forest (n_trees = 5): 0.3
Validation MSE Discrete Random Forest (n_trees = 5): 0.37
Train MSE Random Forest (n_trees = 10): 0.04
Validation MSE Random Forest (n_trees = 10): 0.29
Validation MSE Discrete Random Forest (n_trees = 10): 0.37
Train MSE Random Forest (n_trees = 25): 0.03
Validation MSE Random Forest (n_trees = 25): 0.26
Validation MSE Discrete Random Forest (n_trees = 25): 0.32
Train MSE Random Forest (n_trees = 50): 0.03
Validation MSE Random Forest (n_trees = 50): 0.25
Validation MSE Discrete Random Forest (n_trees = 50): 0.32
Train MSE Random Forest (n_trees = 100): 0.03
Validation MSE Random Forest (n_trees = 100): 0.25
Validation MSE Discrete Random Forest (n_trees = 100): 0.31


{'Random Forest (n_trees = {n_trees})': (0.03,
  0.25,
  0.31,
  RandomForestRegressor())}

In [6]:
n_trees_grid = [5,10,25,50]

x_train, y_train = reduce_training_set(train, 0.5)

run_random_forest_model(x_train, y_train, n_trees_grid)


Train MSE Random Forest (n_trees = 5): 0.05
Validation MSE Random Forest (n_trees = 5): 0.29
Validation MSE Discrete Random Forest (n_trees = 5): 0.35
Train MSE Random Forest (n_trees = 10): 0.04
Validation MSE Random Forest (n_trees = 10): 0.26
Validation MSE Discrete Random Forest (n_trees = 10): 0.33
Train MSE Random Forest (n_trees = 25): 0.03
Validation MSE Random Forest (n_trees = 25): 0.25
Validation MSE Discrete Random Forest (n_trees = 25): 0.32
Train MSE Random Forest (n_trees = 50): 0.03
Validation MSE Random Forest (n_trees = 50): 0.24
Validation MSE Discrete Random Forest (n_trees = 50): 0.31


{'Random Forest (n_trees = {n_trees})': (0.03,
  0.24,
  0.31,
  RandomForestRegressor(n_estimators=50))}

In [7]:
n_trees_grid = [5,10,25,50]

x_train, y_train = reduce_training_set(train, 0.5)

rf = RandomForestRegressor(n_estimators=50).fit(x_train, y_train)

In [8]:
y_pred = rf.predict(x_val)

def df_both_metrics(y_pred, y_test):
    y_pred_rounded = []

    for pred in y_pred:
        if pred - int(pred) < 0.5:
            y_pred_rounded.append(max(0, math.floor(pred)))
        else:
            y_pred_rounded.append(max(0, math.ceil(pred)))
    
    return pd.DataFrame({'y_pred': y_pred_rounded, 'y_test': y_val})
                        
df_metrics_val = df_both_metrics(y_pred, y_test)
df_metrics_val

Unnamed: 0,y_pred,y_test
45,1,0.0
46,1,1.0
47,1,1.0
48,2,3.0
49,2,4.0
...,...,...
85726,1,1.0
85727,0,0.0
85728,0,0.0
85729,0,0.0


In [9]:
import pandas as pd
from sklearn.metrics import mean_squared_error, accuracy_score

def get_metrics_y_relative(df):

    metrics_df = pd.DataFrame(columns=['Y(t+1)', 'MSE', 'Accuracy'])

    for class_label in sorted(df['y_test'].unique()):
        df_class = df[df['y_test'] == class_label]

        mse = mean_squared_error(df_class['y_test'], df_class['y_pred'])
        accuracy = accuracy_score(df_class['y_test'], df_class['y_pred'])

        metrics_df = pd.concat([metrics_df, pd.DataFrame({
            'Y(t+1)': [class_label],
            'MSE': [mse],
            'Accuracy': [accuracy]
        })])
    return metrics_df.reset_index(drop=True)

# Example usage:  # Replace with your actual DataFrame
metrics_df = get_metrics_y_relative(df_metrics_val)
metrics_df


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.


Unnamed: 0,Y(t+1),MSE,Accuracy
0,0.0,0.228281,0.812615
1,1.0,0.353465,0.646535
2,2.0,0.280911,0.747025
3,3.0,1.387097,0.008798
4,4.0,5.025,0.0


In [10]:
x_val["estado_actual"] = x_val[estados].apply(find_estado_with_value_two, axis=1)
df_metrics_val["estado_actual"] = x_val["estado_actual"]
def get_metrics_by_estado(df):

    metrics_df = pd.DataFrame(columns=['Estado', 'MSE', 'Accuracy'])

    for class_label in sorted(df['estado_actual'].unique()):
        df_class = df[df['estado_actual'] == class_label]

        mse = mean_squared_error(df_class['y_test'], df_class['y_pred'])
        accuracy = accuracy_score(df_class['y_test'], df_class['y_pred'])

        metrics_df = pd.concat([metrics_df, pd.DataFrame({
            'Estado': [class_label],
            'MSE': [mse],
            'Accuracy': [accuracy]
        })])
    return metrics_df.reset_index(drop=True)

metrics_df_by_estado = get_metrics_by_estado(df_metrics_val)
metrics_df_by_estado

The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.


Unnamed: 0,Estado,MSE,Accuracy
0,1,0.434599,0.590717
1,2,0.607898,0.537377
2,3,0.559633,0.538663
3,4,0.796415,0.432778
4,5,0.756997,0.440204
5,6,0.642518,0.492874
6,7,0.394737,0.62782
7,8,0.052851,0.947149
8,9,0.262637,0.737363
9,10,0.116698,0.891514


In [11]:
x_val["estado_actual"].value_counts()

estado_actual
10    6941
9     2928
11    1028
6      842
7      798
5      786
4      781
3      763
8      719
2      709
1      237
12     180
13       7
14       4
Name: count, dtype: int64