In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from src.evaluation.evaluation import calculate_metrics, export_model, save_graph_feature_importance
from sklearn.metrics import mean_squared_error
from sklearn import tree, ensemble
import xgboost
import catboost as cb
from sklearn.model_selection import GridSearchCV, cross_val_predict, GroupKFold
import numpy as np
import pickle
import json
import os
from src.modelization.models_utils import get_pipeline
from src.constants import BASE_PATH_EXPERIMENTS, PATH_EVALUATION_DF_WITH_METRICS_CSV, PATH_EVALUATION_CSV, PATH_TRAIN, PATH_TEST
from datetime import datetime
pd.options.display.float_format = '{:.2f}'.format
import warnings
import zipfile
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

warnings.filterwarnings("ignore")


In [2]:
# Load the training and test sets from the CSV paths declared in src.constants.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (PATH_TRAIN, PATH_TEST))

In [3]:
# Column drop kept for reference; currently disabled.
# columns_to_drop=['geometry', 'barrio_id', 'barrio']
# df_train = df_train.drop(columns=columns_to_drop)
# df_test = df_test.drop(columns=columns_to_drop)

# Baseline predictor for the log-price target: natural log of `precio_mean_barrio`.
# NOTE(review): only df_test receives this column here, so df_test may end up with a
# column that df_train lacks -- confirm this is intended before fitting on X_train.
df_test["precio_logaritmico_mean_barrio"] = np.log(df_test["precio_mean_barrio"])

# Targets for which the baseline is evaluated.
possible_targets = [
    "precio",
    "precio_unitario_m2",
    "precio_logaritmico",
]

In [5]:
# Baseline evaluation: for every target, score the matching `<target>_mean_barrio`
# column of df_test as if it were a model prediction.
#
# The baseline column is derived from the target name instead of a hand-written
# if/elif/else chain, so an unknown target now fails with a clear KeyError rather
# than being silently scored against the log-price columns.
baseline_frames = []   # per-row results, one frame per target
baseline_metrics = []  # one summary frame per target

# NOTE: the loop variable is deliberately still called `target`; it stays bound to
# the last entry of `possible_targets` after the loop, as it did before.
for target in possible_targets:
    baseline_model = f'Precio medio por barrio - {target}'
    baseline_column = f'{target}_mean_barrio'

    metrics_df, df_test_with_metrics = calculate_metrics(
        df_test[target],
        df_test[baseline_column],
        df_test,
        model_name=baseline_model,
        target=target,
    )

    # Tag the per-row results with the baseline they belong to.
    df_test_with_metrics['model_name'] = baseline_model

    # Snapshot the frame: calculate_metrics is opaque from here and may hand back
    # the same object on every call, so a copy keeps each iteration's rows intact.
    baseline_frames.append(df_test_with_metrics.copy())
    baseline_metrics.append(metrics_df)

# Concatenate once at the end (pd.concat raises on an empty list, hence the guards).
evaluation_df_with_metrics = (
    pd.concat(baseline_frames, ignore_index=True) if baseline_frames else pd.DataFrame()
)
evaluation = (
    pd.concat(baseline_metrics, ignore_index=True) if baseline_metrics else pd.DataFrame()
)




precio
precio_unitario_m2
precio_logaritmico


In [7]:
# Columns that are prediction targets and therefore removed from the feature matrices.
target_columns = ['precio', 'precio_unitario_m2', 'precio_logaritmico']

# Pin the modelling target explicitly. Previously this cell relied on `target` being
# left over from the baseline loop (whose last value is 'precio_logaritmico'), which
# breaks when cells are run out of order.
target = 'precio_logaritmico'

# NOTE(review): any other column derived from the price (e.g. `*_mean_barrio`) stays
# in X -- confirm that these are legitimate features and not target leakage.
X_train = df_train.drop(columns=target_columns)
y_train = df_train[target]
X_test = df_test.drop(columns=target_columns)
y_test = df_test[target]

In [55]:
# One unfitted regressor per candidate algorithm.
model_dt = tree.DecisionTreeRegressor()
model_rf = ensemble.RandomForestRegressor()
model_gb = ensemble.GradientBoostingRegressor()
model_xgb = xgboost.XGBRegressor()

# Catalogue of candidates: display name -> estimator + grid of values to search.
# Grid keys carry the `model__` prefix used to address a pipeline step's parameters.
models = {}

models['Decision Tree'] = {
    'model': model_dt,
    'param_grid': {
        'model__max_depth': [5, 10, 20],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4],
    },
}

models['Random Forest'] = {
    'model': model_rf,
    'param_grid': {
        'model__n_estimators': [50, 100, 200],
        'model__max_depth': [5, 10, 20],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4],
    },
}

models['Gradient Boost'] = {
    'model': model_gb,
    'param_grid': {
        'model__n_estimators': [50, 100, 200],
        'model__learning_rate': [0.01, 0.1, 0.2],
        'model__max_depth': [3, 5, 10],
    },
}

models['eXtreme Gradient Boost'] = {
    'model': model_xgb,
    'param_grid': {
        'model__n_estimators': [50, 100, 200],
        'model__learning_rate': [0.01, 0.1, 0.2],
        'model__max_depth': [3, 5, 10],
    },
}

In [44]:
# Select the candidate to tune from the `models` catalogue. Previously `model`,
# `model_name` and `param_grid` were never defined in the notebook and only existed
# as leftover kernel state; the recorded results were produced with 'Random Forest'.
model_name = 'Random Forest'
model = models[model_name]['model']
param_grid = models[model_name]['param_grid']

# Wrap the estimator in the project's preprocessing pipeline.
# Every column of X_train is declared as a numeric feature.
pipeline = get_pipeline(
    base_model=model,
    impute=True,
    scale=True,
    encode=True,
    num_features=X_train.columns.to_list()
)

In [51]:
# Shared settings so both search strategies rank candidates on the same metric and
# the same number of folds (the randomized search previously fell back to the
# estimator's default scorer while the grid search used negative MSE).
search_scoring = 'neg_mean_squared_error'
search_cv = 5

# Sampling distributions for the randomized search. These were only present in
# commented-out code, so the cell raised NameError on a fresh kernel.
# NOTE(review): these are random-forest parameters; adjust them if `pipeline`
# wraps a different estimator.
param_random = {
    'model__n_estimators': randint(50, 200),       # number of trees in the forest
    'model__max_depth': randint(3, 20),            # maximum depth of each tree
    'model__min_samples_split': randint(2, 10),    # min samples to split an internal node
    'model__min_samples_leaf': randint(1, 4),      # min samples required at a leaf
}

# Search strategies to run, keyed by a display name.
hyper_tunning = {
    "Grid_SearchCV": GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring=search_scoring,
        cv=search_cv,
    ),
    "Randomized_SearchCV": RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_random,
        scoring=search_scoring,
        cv=search_cv,
        random_state=42,
        verbose=1,
    ),
    #  "Bayesian_SearchCV": BayesSearchCV(estimator=pipeline,
    #     search_spaces=param_space,
    #     random_state=42
    #     ),
}

In [52]:
# Run every configured search strategy and keep the outcome of each one.
# Previously `best_model` and `y_pred` were overwritten on every iteration, so only
# the last strategy's result survived the loop.
tuning_results = {}

for method_name, search_cv in hyper_tunning.items():
    print(f"Performing {method_name}...")

    # Hyperparameter search on the training set.
    search_cv.fit(X_train, y_train)

    # Pipeline refitted with the best parameters found by this strategy.
    best_model = search_cv.best_estimator_

    # Out-of-fold predictions for the training rows, using the tuned pipeline.
    y_pred = cross_val_predict(best_model, X_train, y_train, cv=5)

    tuning_results[method_name] = {
        'best_model': best_model,
        'best_params': search_cv.best_params_,
        'best_score': search_cv.best_score_,
        'y_pred': y_pred,
    }

# After the loop, `best_model` and `y_pred` still refer to the last strategy that ran
# (as before); use `tuning_results[<method>]` to inspect the others.

Performing Randomized_SearchCV...
Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [56]:
# Score the cross-validated predictions of the tuned model.
# Note that y_train / X_train are passed here, so despite its name
# `df_test_with_metrics` holds training rows with out-of-fold predictions.
metrics_df, df_test_with_metrics = calculate_metrics(y_train, y_pred, X_train, model_name, target)

# Use the same `YYYYMMDD-HHMMSS` stamp as the experiment folder names elsewhere in
# this notebook; the date-only stamp used before could not match any of them.
run_timestamp = datetime.now().strftime('%Y%m%d-%H%M%S')

# Tag the per-row results with the run they belong to.
df_test_with_metrics['model_name'] = model_name
df_test_with_metrics['target'] = target
df_test_with_metrics['model_folder'] = f"experiment_{model_name}_{run_timestamp}"

# Append the per-row results to the accumulated per-row table.
evaluation_df_with_metrics = pd.concat([evaluation_df_with_metrics, df_test_with_metrics], ignore_index=True)

# Append the summary metrics to the accumulated summary table.
evaluation = pd.concat([evaluation, metrics_df], ignore_index=True)

In [62]:
# barrio = df_train['barrio'].values
# group_kfold = GroupKFold(n_splits=5) 
# city_kfold = group_kfold.split(X_train, y_train, barrio)  
# train_indices, test_indices = [list(traintest) for traintest in zip(*city_kfold)]
# city_cv = [*zip(train_indices, test_indices)]
# predictions = cross_val_predict(model, X_train, y_train, cv=city_cv)


In [57]:
# Display the accumulated summary metrics (one row per evaluated model/target).
evaluation

Unnamed: 0,Model,Target,RMSE - Root Mean Squared Error,MAPE - Mean Absolute Percentage Error,R2 - Coefficient of Determination,ME - Mean Error,MED - Median Error,MAE - Mean Absolute Error,MAED - Median Absolute Error,MAPED - Median Absolute Percentage Error,Percentage error lower_5,Percentage error lower_10,Percentage error lower_25,Standard Deviation of Errors,Confidence Interval Lower,Confidence Interval Upper,Model Folder
0,Precio medio por barrio - precio,precio,319575,50.16,0.4,-1172,-30483,171293,84913,33.0,8.38,16.8,39.46,319572.42,-496032.06,745708.33,experiment_Precio medio por barrio - precio_20...
1,Precio medio por barrio - precio_unitario_m2,precio_unitario_m2,955,21.32,0.68,-12,-85,689,505,15.0,17.63,34.12,70.58,955.4,-1787.63,2175.01,experiment_Precio medio por barrio - precio_un...
2,Precio medio por barrio - precio_logaritmico,precio_logaritmico,1,3.2,0.54,0,0,0,0,3.0,79.29,97.66,100.0,0.5,-1.15,0.85,experiment_Precio medio por barrio - precio_lo...
3,Random Forest,precio_logaritmico,0,1.02,0.94,0,0,0,0,1.0,98.86,99.9,100.0,0.18,-0.39,0.35,experiment_Random Forest_20240307-203750


In [None]:
# Write both result tables to their configured CSV locations (no index column).
for results_frame, csv_path in (
    (evaluation, PATH_EVALUATION_CSV),
    (evaluation_df_with_metrics, PATH_EVALUATION_DF_WITH_METRICS_CSV),
):
    results_frame.to_csv(csv_path, index=False)

In [None]:
# models_to_test = {
#                   f'Decision Tree wo CV {target}' : model_dt, 
#                   f'RandomForest wo CV {target}' : model_rf, 
#                   f'Gradient Boosting wo CV {target}': model_gb, 
#                   f'eXtreme Gradient Boost wo CV {target}':model_xgb,
#                   f'CatBoost wo CV {target}': model_cb
#                 }

In [None]:
# from datetime import datetime
# import pandas as pd

# for model_name, model in models_to_test.items():
#     # Create pipeline
#     pipeline = get_pipeline(
#         base_model=model,
#         impute=True,  
#         scale=True,  
#         encode=True,
#         num_features=X_train.columns.to_list()
#     )
    
#     # Fit the pipeline and make predictions
#     pipeline.fit(X_train, y_train)
    
#     # Perform cross-validation predictions
#     y_pred = cross_val_predict(pipeline, X_train, y_train, cv=5)
    
#     # Export Model
#     output_folder = export_model(
#         model=pipeline,
#         X_train=X_train,
#         y_train=y_train,
#         base_path=BASE_PATH_EXPERIMENTS,
#         save_model=True,     
#         save_datasets=True,  
#         zip_files=True      
#     )

#     # Calculate metrics
#     metrics_df, df_test_with_metrics = calculate_metrics(y_train, y_pred, X_train, model_name)

#     # Add model_name column to df_test_with_metrics
#     df_test_with_metrics['model_name'] = model_name
#     df_test_with_metrics['model_folder'] = f"experiment_{model_name}_{datetime.now().strftime('%Y%m%d-%H%M%S')}"

#     # Append df_test_with_metrics to all_df_test_with_metrics
#     evaluation_df_with_metrics = pd.concat([evaluation_df_with_metrics, df_test_with_metrics], ignore_index=True)

#     # Append metrics_df to all_metrics_df
#     evaluation = pd.concat([evaluation, metrics_df], ignore_index=True)
    
# # Save the DataFrames to CSV files
# evaluation.to_csv(PATH_EVALUATION_CSV, index=False)
# evaluation_df_with_metrics.to_csv(PATH_EVALUATION_DF_WITH_METRICS_CSV, index=False)


In [None]:
# Hyperparameter grids, one per estimator family. Keys are bare estimator parameter
# names (no pipeline step prefix). The "typical range" notes are rules of thumb.

# --- Decision Tree Regressor (model_dt) ---
#   max_depth:          typically 1 to 32
#   min_samples_split:  typically 2 to 20
#   min_samples_leaf:   typically 1 to 10
#   max_features:       typically 1 to the number of features
params_dt = dict(
    max_depth=[5, 10, 15],
    min_samples_split=[4, 6, 10],
    max_features=[15, 20],
)

# --- Random Forest Regressor (model_rf) ---
#   n_estimators:       typically 50 to 1000
#   max_depth:          typically 1 to 32
#   min_samples_split:  typically 2 to 20
#   min_samples_leaf:   typically 1 to 10
params_rf = dict(
    n_estimators=[75, 200, 500],
    max_depth=[5, 10],
    min_samples_split=[4, 6, 8],
)

# --- Gradient Boosting Regressor (model_gb) ---
#   n_estimators:       typically 50 to 1000
#   learning_rate:      typically 0.01 to 0.1
#   max_depth:          typically 1 to 10
#   min_samples_split:  typically 2 to 20
params_gb = dict(
    n_estimators=[75, 150, 200, 500],
    learning_rate=[0.05, 0.1, 0.15],
    max_depth=[5, 10, 15],
    min_samples_split=[4, 6, 8],
)

# --- XGBoost Regressor (model_xgb) ---
#   n_estimators:       typically 50 to 1000
#   learning_rate:      typically 0.01 to 0.1
#   max_depth:          typically 1 to 10
#   min_child_weight:   typically 1 to 10
params_xgb = dict(
    n_estimators=[75, 150, 200, 500],
    learning_rate=[0.05, 0.1, 0.15],
    max_depth=[5, 10, 15],
    min_child_weight=[2, 3, 5],
)

# --- CatBoost Regressor (model_cb) --- (no grid defined yet)
#   n_estimators:       typically 50 to 1000
#   learning_rate:      typically 0.01 to 0.1
#   max_depth:          typically 1 to 10
#   l2_leaf_reg:        typically 1 to 10



In [None]:
# Grid-search every previously exported experiment and store the search results
# (plus a feature-importance graph) next to the exported model.


def _json_default(value):
    """Convert NumPy values that `json` cannot encode natively.

    `GridSearchCV.cv_results_` holds NumPy (masked) arrays, which made the plain
    `json.dump` call fail with TypeError.
    """
    if isinstance(value, np.ndarray):
        return value.tolist()  # masked entries become None
    if isinstance(value, np.generic):
        return value.item()
    raise TypeError(f'Object of type {type(value).__name__} is not JSON serializable')


def _align_param_grid(estimator, param_grid):
    """Return `param_grid` with keys the estimator actually accepts.

    A key that is not a valid parameter of `estimator` is re-addressed to the last
    step of a pipeline (`<step>__<param>`) when that prefixed name is valid.
    Keys that are already valid, or cannot be resolved, are left untouched so the
    search still reports them.
    """
    valid_names = estimator.get_params()
    steps = getattr(estimator, 'steps', None)
    aligned = {}
    for name, values in param_grid.items():
        if name not in valid_names and steps:
            prefixed = f'{steps[-1][0]}__{name}'
            if prefixed in valid_names:
                name = prefixed
        aligned[name] = values
    return aligned


# Exported experiment folder -> parameter grid to search for that model.
folder_names_params = {
    'experiment_DecisionTreeRegressor_20240303-151554': params_dt,
    'experiment_RandomForestRegressor_20240303-151806': params_rf,
    'experiment_GradientBoostingRegressor_20240303-151845': params_gb,
    'experiment_XGBRegressor_20240303-151849': params_xgb,
}

for folder, param_grid in folder_names_params.items():
    experiment_dir = f'src/evaluation/{folder}'

    # Path to the zip file and name of the pickled model inside it
    zip_file_path = f'{experiment_dir}/model.zip'
    file_name_within_zip = 'model.pkl'

    # Load the pickled estimator straight from the archive.
    # SECURITY: pickle.load executes arbitrary code; only open archives produced by
    # this project.
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        with zip_ref.open(file_name_within_zip) as file:
            pipeline = pickle.load(file)

    # Grid search for best parameters
    gs = GridSearchCV(
        estimator=pipeline,
        param_grid=_align_param_grid(pipeline, param_grid),
        scoring='neg_root_mean_squared_error',
        cv=5,
    )
    gs.fit(X_train, y_train)

    # Store the grid search results for the current model and folder
    grid_search_info = {
        'best_params': gs.best_params_,
        'best_score': gs.best_score_,
        'cv_results': gs.cv_results_,
    }

    # Save the grid search information to a JSON file within the folder
    grid_search_output_file = os.path.join(experiment_dir, 'grid_search_info.json')
    with open(grid_search_output_file, 'w') as f:
        json.dump(grid_search_info, f, default=_json_default)

    # Plot feature importance for the model tuned in THIS iteration. The previous
    # code passed the unrelated global `model`, so every folder got the same graph
    # (or a NameError on a fresh kernel).
    # NOTE(review): the final pipeline step is passed, assuming the helper expects a
    # bare fitted estimator -- confirm against save_graph_feature_importance.
    best_pipeline = gs.best_estimator_
    fitted_steps = getattr(best_pipeline, 'steps', None)
    fitted_model = fitted_steps[-1][1] if fitted_steps else best_pipeline
    save_graph_feature_importance(model=fitted_model, X_train=X_train, folder=folder)

print("Grid search information saved successfully. Feature Importance graph saved successfully.")



In [None]:
# #To use CatBoost is needed to transform data into a Pool Object
# train_dataset = cb.Pool(X_train, y_train) 
# test_dataset = cb.Pool(X_test, y_test)
# model_cb = cb.CatBoostRegressor()

In [None]:
# param_space = {
#     'model__n_estimators': Integer(50, 200),           # Number of trees in the forest
#     'model__max_depth': Integer(3, 20),                # Maximum depth of the tree
#     'model__min_samples_split': Integer(2, 10),        # Minimum number of samples required to split an internal node
#     'model__min_samples_leaf': Integer(1, 4)           # Minimum number of samples required to be at a leaf node
# }

# param_random = {
#     'model__n_estimators': randint(50, 200),           # Number of trees in the forest
#     'model__max_depth': randint(3, 20),                # Maximum depth of the tree
#     'model__min_samples_split': randint(2, 10),        # Minimum number of samples required to split an internal node
#     'model__min_samples_leaf': randint(1, 4)           # Minimum number of samples required to be at a leaf node
# }



In [None]:
def _append_results_to_csv(new_rows, csv_path):
    """Append `new_rows` to the CSV at `csv_path` and return the combined frame.

    If the file does not exist yet it is created from `new_rows` alone.
    The file is rewritten in full (no index column).
    """
    if os.path.exists(csv_path):
        previous_rows = pd.read_csv(csv_path)
        combined = pd.concat([previous_rows, new_rows], ignore_index=True)
    else:
        combined = new_rows
    combined.to_csv(csv_path, index=False)
    return combined


# Merge this session's results into the stored history.
# Previously the in-memory frames were first replaced by the CSV contents and then
# concatenated with the CSV again, which discarded the new results and duplicated
# the stored rows on every run.
# NOTE: running this cell twice (or after a cell that already wrote the same frames
# to these paths) appends the same rows again.
evaluation_df_with_metrics = _append_results_to_csv(
    evaluation_df_with_metrics, PATH_EVALUATION_DF_WITH_METRICS_CSV
)
evaluation = _append_results_to_csv(evaluation, PATH_EVALUATION_CSV)