In [None]:
# Imports for the whole notebook (data handling, plotting, models, project helpers)
# followed by two session-wide configuration statements.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from src.evaluation.evaluation import calculate_metrics, export_model, save_graph_feature_importance
from sklearn.metrics import mean_squared_error
from sklearn import tree, ensemble
import xgboost
import catboost as cb
from sklearn.model_selection import GridSearchCV
import numpy as np
import pickle
import json
import os
from src.modelization.models_utils import get_pipeline
from src.constants import BASE_PATH_EXPERIMENTS, PATH_EVALUATION_DF_WITH_METRICS_CSV, PATH_EVALUATION_CSV, PATH_TRAIN, PATH_TEST
from datetime import datetime
# Render floats with two decimal places whenever pandas displays them.
pd.options.display.float_format = '{:.2f}'.format
import warnings
import zipfile


# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings("ignore")



In [None]:
# Load the train / test splits (train first, then test).
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (PATH_TRAIN, PATH_TEST))

# Resume from previously saved evaluation results when the files are present.
# When a file is missing, its variable is intentionally left undefined here.
if os.path.exists(PATH_EVALUATION_DF_WITH_METRICS_CSV):
    evaluation_df_with_metrics = pd.read_csv(PATH_EVALUATION_DF_WITH_METRICS_CSV)
if os.path.exists(PATH_EVALUATION_CSV):
    evaluation = pd.read_csv(PATH_EVALUATION_CSV)

In [None]:
# Columns removed from both splits before any features are built.
columns_to_drop = ['geometry', 'barrio_id', 'barrio']
df_train = df_train.drop(columns_to_drop, axis="columns")
df_test = df_test.drop(columns_to_drop, axis="columns")

# Column to predict. Supported alternatives: precio, precio_unitario_m2, precio_logaritmico
target = "precio_unitario_m2"

In [None]:
# Baseline: score the precomputed `<price>_mean_barrio` column against the true price.
baseline_model = 'Precio medio por barrio - precio unitario'

# Only the 'precio' target uses the total-price columns; every other target
# (including the logarithmic one) is scored on the unit-price columns.
baseline_price_column = 'precio' if target == 'precio' else 'precio_unitario_m2'
metrics_df, df_test_with_metrics = calculate_metrics(
    df_test[baseline_price_column],
    df_test[f'{baseline_price_column}_mean_barrio'],
    df_test,
    model_name=baseline_model,
)
df_test_with_metrics['model_name'] = baseline_model

# Each results file is seeded with the baseline only when it does not exist yet.
if not os.path.exists(PATH_EVALUATION_DF_WITH_METRICS_CSV):
    evaluation_df_with_metrics = pd.concat([pd.DataFrame(), df_test_with_metrics], ignore_index=True)
    evaluation_df_with_metrics.to_csv(PATH_EVALUATION_DF_WITH_METRICS_CSV, index=False)

if not os.path.exists(PATH_EVALUATION_CSV):
    evaluation = pd.concat([pd.DataFrame(), metrics_df], ignore_index=True)
    evaluation.to_csv(PATH_EVALUATION_CSV, index=False)

In [None]:
# All three price columns are removed from the features, whichever one is the target.
_price_columns = ['precio', 'precio_unitario_m2', "precio_logaritmico"]

X_train, y_train = df_train.drop(columns=_price_columns), df_train[target]
X_test, y_test = df_test.drop(columns=_price_columns), df_test[target]

In [None]:
# Fixed seed shared by every estimator so that Restart & Run All reproduces the
# same fitted models and therefore the same metrics.
RANDOM_SEED = 42

# Untuned estimators with library-default hyper-parameters.
model_dt = tree.DecisionTreeRegressor(random_state=RANDOM_SEED)
model_rf = ensemble.RandomForestRegressor(random_state=RANDOM_SEED)
model_gb = ensemble.GradientBoostingRegressor(random_state=RANDOM_SEED)
model_xgb = xgboost.XGBRegressor(random_state=RANDOM_SEED)

# CatBoost Pool wrappers around the train / test splits.
# NOTE(review): the visible training loop fits CatBoost on the plain DataFrames,
# so these Pool objects are only needed by code that consumes them directly.
train_dataset = cb.Pool(X_train, y_train)
test_dataset = cb.Pool(X_test, y_test)
model_cb = cb.CatBoostRegressor(random_seed=RANDOM_SEED)


In [None]:
# Display label -> untuned estimator. Every label is suffixed with the current
# target so results for different targets stay distinguishable.
_labelled_models = (
    ('Decision Tree', model_dt),
    ('RandomForest', model_rf),
    ('Gradient Boosting', model_gb),
    ('eXtreme Gradient Boost', model_xgb),
    ('CatBoost', model_cb),
)
models_to_test = {
    f'{label} wo CV {target}': estimator
    for label, estimator in _labelled_models
}

In [None]:
# Fit every candidate model through the shared preprocessing pipeline, export it,
# and collect its evaluation results. The per-model frames are gathered in lists
# and concatenated once after the loop instead of re-copying the growing results
# frames on every iteration.
row_level_frames = []   # one frame per model: test rows with per-row metrics
summary_frames = []     # one frame per model: aggregate metrics

for model_name, model in models_to_test.items():
    # Create pipeline
    pipeline = get_pipeline(
        base_model=model,
        impute=True,
        scale=True,
        encode=True,
        num_features=X_train.columns.to_list()
    )

    # Fit the pipeline and make predictions
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # Export the model together with the datasets.
    # NOTE(review): `output_folder` is never used afterwards, while `model_folder`
    # below is rebuilt from `model_name` and a fresh timestamp — confirm whether the
    # recorded folder is meant to be the one returned by `export_model`.
    output_folder = export_model(
        model=model,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        base_path=BASE_PATH_EXPERIMENTS,
        save_model=True,
        save_datasets=True,
        zip_files=True
    )

    # Calculate metrics
    metrics_df, df_test_with_metrics = calculate_metrics(y_test, y_pred, df_test, model_name)

    # Tag the per-row results with the model they belong to
    df_test_with_metrics['model_name'] = model_name
    df_test_with_metrics['model_folder'] = f"experiment_{model_name}_{datetime.now().strftime('%Y%m%d-%H%M%S')}"

    # Snapshot the frames: should `calculate_metrics` hand back an object it reuses
    # between calls, a later iteration must not overwrite this model's results.
    row_level_frames.append(df_test_with_metrics.copy())
    summary_frames.append(metrics_df.copy())

# Append all new results to the previously loaded ones in a single concatenation each
evaluation_df_with_metrics = pd.concat([evaluation_df_with_metrics, *row_level_frames], ignore_index=True)
evaluation = pd.concat([evaluation, *summary_frames], ignore_index=True)

# Save the DataFrames to CSV files
evaluation.to_csv(PATH_EVALUATION_CSV, index=False)
evaluation_df_with_metrics.to_csv(PATH_EVALUATION_DF_WITH_METRICS_CSV, index=False)


In [None]:
# Hyper-parameter grids for the grid search, one per model. The ranges quoted in
# the comments are the typical ranges of each parameter, kept for reference.

# Decision Tree Regressor (model_dt)
#   max_depth: 1 to 32 | min_samples_split: 2 to 20 | min_samples_leaf: 1 to 10
#   max_features: 1 to the number of features
params_dt = dict(
    max_depth=[5, 10, 15],
    min_samples_split=[4, 6, 10],
    max_features=[15, 20],
)

# Random Forest Regressor (model_rf)
#   n_estimators: 50 to 1000 | max_depth: 1 to 32
#   min_samples_split: 2 to 20 | min_samples_leaf: 1 to 10
params_rf = dict(
    n_estimators=[75, 200, 500],
    max_depth=[5, 10],
    min_samples_split=[4, 6, 8],
)

# Gradient Boosting Regressor (model_gb)
#   n_estimators: 50 to 1000 | learning_rate: 0.01 to 0.1
#   max_depth: 1 to 10 | min_samples_split: 2 to 20
params_gb = dict(
    n_estimators=[75, 150, 200, 500],
    learning_rate=[0.05, 0.1, 0.15],
    max_depth=[5, 10, 15],
    min_samples_split=[4, 6, 8],
)

# XGBoost Regressor (model_xgb)
#   n_estimators: 50 to 1000 | learning_rate: 0.01 to 0.1
#   max_depth: 1 to 10 | min_child_weight: 1 to 10
params_xgb = dict(
    n_estimators=[75, 150, 200, 500],
    learning_rate=[0.05, 0.1, 0.15],
    max_depth=[5, 10, 15],
    min_child_weight=[2, 3, 5],
)

# CatBoost Regressor (model_cb) — reference ranges only, no grid is defined here
#   n_estimators: 50 to 1000 | learning_rate: 0.01 to 0.1
#   max_depth: 1 to 10 | l2_leaf_reg: 1 to 10



In [None]:
def _to_jsonable(value):
    """Fallback serializer handed to ``json.dump``.

    ``GridSearchCV.cv_results_`` stores its columns as NumPy (masked) arrays,
    which the ``json`` module cannot encode. Anything exposing ``tolist()``
    (arrays, masked arrays, NumPy scalars) is converted to plain Python values.

    Raises:
        TypeError: if ``value`` offers no ``tolist()`` conversion.
    """
    if hasattr(value, 'tolist'):
        return value.tolist()
    raise TypeError(f'Object of type {type(value).__name__} is not JSON serializable')


# Exported experiment folder -> hyper-parameter grid to explore for its model.
# NOTE(review): the folder names embed the timestamp of one specific export run
# and must be updated whenever the models are exported again.
folder_names_params = {
    'experiment_DecisionTreeRegressor_20240303-151554': params_dt,
    'experiment_RandomForestRegressor_20240303-151806': params_rf,
    'experiment_GradientBoostingRegressor_20240303-151845': params_gb,
    'experiment_XGBRegressor_20240303-151849': params_xgb,
}

for folder, param_grid in folder_names_params.items():
    experiment_dir = f'src/evaluation/{folder}'

    # Path to the zip file and name of the pickled model inside it
    zip_file_path = f'{experiment_dir}/model.zip'
    file_name_within_zip = 'model.pkl'

    # Load the exported model straight from the archive.
    # SECURITY: unpickling executes arbitrary code — only load archives that were
    # produced by this project.
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        with zip_ref.open(file_name_within_zip) as file:
            pipeline = pickle.load(file)

    # Grid search for best parameters.
    # The grids use un-prefixed parameter names, so the loaded object is presumably
    # the bare estimator rather than a multi-step pipeline — TODO confirm.
    gs = GridSearchCV(estimator=pipeline,
                      param_grid=param_grid,
                      scoring='neg_root_mean_squared_error',
                      cv=5,  # same as the library default, stated explicitly
                      )
    gs.fit(X_train, y_train)

    # Store the grid search results for the current model and folder
    grid_search_info = {
        'best_params': gs.best_params_,
        'best_score': gs.best_score_,
        'cv_results': gs.cv_results_,
    }

    # Save the grid search information to a JSON file within the folder.
    # `default` converts the NumPy arrays in cv_results_, which would otherwise
    # make json.dump raise TypeError.
    grid_search_output_file = os.path.join(experiment_dir, 'grid_search_info.json')
    with open(grid_search_output_file, 'w') as f:
        json.dump(grid_search_info, f, default=_to_jsonable)

    # Save the feature-importance graph of the tuned model in its folder. The
    # refitted best estimator is used: it was trained on exactly X_train, whereas
    # the previously used `model` variable was a leftover from the training loop
    # and always pointed at the last model trained there.
    save_graph_feature_importance(model=gs.best_estimator_, X_train=X_train, folder=folder)

print("Grid search information saved successfully. Feature Importance graph saved successfully.")

