## Model Training
#### 1.1 Import Data and Required Packages
Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [102]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
# from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

Import the CSV Data as Pandas DataFrame

In [103]:
df = pd.read_csv('../data/processed/processed_train_set.csv')
test_df = pd.read_csv('../data/processed/processed_test_set.csv')

In [104]:
df.head(5)

Unnamed: 0,inm_floor,inm_size,inm_price,inm_latitude,his_price,his_quarterly_variation,his_annual_variation,his_monthly_variation,dem_Indice_de_dependencia,dem_TasaDeParo,...,inm_distrito_Retiro,inm_distrito_Salamanca,inm_distrito_San Blas - Canillejas,inm_distrito_Tetuán,inm_distrito_Unknown,inm_distrito_Usera,inm_distrito_Vicálvaro,inm_distrito_Villa de Vallecas,inm_distrito_Villaverde,inm_barrio_freq
0,0.032787,0.037625,0.028964,0.336206,0.082037,0.561837,0.291667,0.481203,0.214612,0.711957,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.165722
1,0.081967,0.052676,0.017285,0.335341,0.127693,0.664311,0.236111,0.466165,0.560637,0.426706,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014873
2,0.065574,0.037625,0.005539,0.270542,0.102154,0.579505,0.21875,0.473684,0.309129,0.621847,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.037535
3,0.098361,0.052258,0.048986,0.574744,0.493651,0.477032,0.211806,0.315789,0.463007,0.250942,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.073654
4,0.016393,0.034699,0.026359,0.508936,0.628763,0.657244,0.357639,0.466165,0.47569,0.184045,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.260623


In [105]:
# Split your features and target variable
X_train = df.drop('inm_price', axis=1)
y_train = df['inm_price']
X_test = test_df.drop('inm_price', axis=1)
y_test = test_df['inm_price']

In [106]:
y_train, y_test

(0        0.028964
 1        0.017285
 2        0.005539
 3        0.048986
 4        0.026359
            ...   
 14280    0.315937
 14281    0.008275
 14282    0.022224
 14283    0.058996
 14284    0.013281
 Name: inm_price, Length: 14285, dtype: float64,
 0       0.029632
 1       0.058329
 2       0.016584
 3       0.074012
 4       0.006841
           ...   
 3567    0.061666
 3568    0.007408
 3569    0.008943
 3570    0.044114
 3571    0.021289
 Name: inm_price, Length: 3572, dtype: float64)

#### Create an Evaluate Function to give all metrics after model Training

In [107]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [108]:
# Dictionary of models
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBRegressor": XGBRegressor(),
    # "CatBoosting Regressor": CatBoostRegressor(verbose=False), # Uncomment if CatBoost is installed and desired
    "AdaBoost Regressor": AdaBoostRegressor()
}


# Lists to store model names and R2 scores
model_list = []
r2_list = []

# Loop through models, fit, predict, and evaluate
for name, model in models.items():
    model.fit(X_train, y_train) # Train model
    
    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(name)
    model_list.append(name)
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))
    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 0.0293
- Mean Absolute Error: 0.0147
- R2 Score: 0.7381
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.0287
- Mean Absolute Error: 0.0146
- R2 Score: 0.7415


Lasso
Model performance for Training set
- Root Mean Squared Error: 0.0572
- Mean Absolute Error: 0.0378
- R2 Score: 0.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.0564
- Mean Absolute Error: 0.0375
- R2 Score: -0.0000


Ridge
Model performance for Training set
- Root Mean Squared Error: 0.0293
- Mean Absolute Error: 0.0148
- R2 Score: 0.7378
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.0286
- Mean Absolute Error: 0.0146
- R2 Score: 0.7435


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 0.0212
- Mean Absolute Error: 0.0097
- R2 Score: 0.8620
----------------------

In [109]:
# import numpy as np
# import pandas as pd
# from sklearn.model_selection import GridSearchCV
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# # Assuming X_train, X_test, y_train, y_test are already defined

# # Define the parameter grid for Random Forest Regressor
# param_grid = {
#     'n_estimators': [100, 200],  # Reduced number of options
#     'max_depth': [None, 20],  # Fewer options
#     'min_samples_split': [2, 10],  # Broader steps
#     'min_samples_leaf': [1, 4],  # Fewer options
#     'max_features': ['sqrt', 'log2']  # Adjusted to remove 'auto'
# }


# # Initialize the Random Forest Regressor
# rf = RandomForestRegressor(random_state=42)

# # Initialize the Grid Search model
# grid_search = GridSearchCV(estimator=rf, 
#                            param_grid=param_grid, 
#                            cv=5, 
#                            scoring='r2', 
#                            n_jobs=-1, 
#                            verbose=2)

# # Fit the grid search to the data
# grid_search.fit(X_train, y_train)

# # Print the best parameters and the best score
# print("Best parameters:", grid_search.best_params_)
# print("Best score:", grid_search.best_score_)

# # Retrieve the best model
# best_model = grid_search.best_estimator_

# # Making predictions with the best model
# y_train_pred = best_model.predict(X_train)
# y_test_pred = best_model.predict(X_test)

# # Define a function to evaluate the model
# def evaluate_model(true, predicted):
#     mae = mean_absolute_error(true, predicted)
#     mse = mean_squared_error(true, predicted)
#     rmse = np.sqrt(mse)
#     r2_square = r2_score(true, predicted)
#     return mae, rmse, r2_square

# # Evaluate the best model
# train_mae, train_rmse, train_r2 = evaluate_model(y_train, y_train_pred)
# test_mae, test_rmse, test_r2 = evaluate_model(y_test, y_test_pred)

# # Print model performance
# print('Model performance for Training set')
# print("- Root Mean Squared Error: {:.4f}".format(train_rmse))
# print("- Mean Absolute Error: {:.4f}".format(train_mae))
# print("- R2 Score: {:.4f}".format(train_r2))

# print('----------------------------------')

# print('Model performance for Test set')
# print("- Root Mean Squared Error: {:.4f}".format(test_rmse))
# print("- Mean Absolute Error: {:.4f}".format(test_mae))
# print("- R2 Score: {:.4f}".format(test_r2))


Hyperparam tuning didn't help. 

In [110]:
from sklearn.ensemble import RandomForestRegressor

# Assuming X_train and y_train are already defined
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Get feature importances
importances = model.feature_importances_

# Summarize feature importances in a DataFrame
features_df = pd.DataFrame({'feature': X_train.columns, 'importance': importances})

# Sort the DataFrame to identify least important features
features_df = features_df.sort_values(by='importance', ascending=True)

# Display the least important features
print(features_df)


                                      feature  importance
44                    inm_distrito_Villaverde    0.000001
42                     inm_distrito_Vicálvaro    0.000002
43             inm_distrito_Villa de Vallecas    0.000002
34                     inm_distrito_Moratalaz    0.000002
41                         inm_distrito_Usera    0.000003
35            inm_distrito_Puente de Vallecas    0.000004
25                   inm_distrito_Carabanchel    0.000010
32                        inm_distrito_Latina    0.000010
23                    inm_distrito_Arganzuela    0.000016
38         inm_distrito_San Blas - Canillejas    0.000017
24                       inm_distrito_Barajas    0.000022
39                        inm_distrito_Tetuán    0.000027
30         inm_distrito_Fuencarral - El Pardo    0.000081
29                 inm_distrito_Ciudad Lineal    0.000105
36                        inm_distrito_Retiro    0.000163
26                        inm_distrito_Centro    0.000199
28            

In [111]:
# Assuming features_df is sorted ascending by importance
threshold = features_df['importance'].quantile(0.01)  # Adjust quantile as needed
low_importance_features = features_df[features_df['importance'] <= threshold]['feature'].tolist()

X_train_filtered = X_train.drop(columns=low_importance_features)
X_test_filtered = X_test.drop(columns=low_importance_features)

# Retrain the model using the filtered datasets
model.fit(X_train_filtered, y_train)

# Evaluate the model
y_pred = model.predict(X_test_filtered)
print("R² on test set:", r2_score(y_test, y_pred))


R² on test set: 0.8674734441033064


In [112]:
df.head()

Unnamed: 0,inm_floor,inm_size,inm_price,inm_latitude,his_price,his_quarterly_variation,his_annual_variation,his_monthly_variation,dem_Indice_de_dependencia,dem_TasaDeParo,...,inm_distrito_Retiro,inm_distrito_Salamanca,inm_distrito_San Blas - Canillejas,inm_distrito_Tetuán,inm_distrito_Unknown,inm_distrito_Usera,inm_distrito_Vicálvaro,inm_distrito_Villa de Vallecas,inm_distrito_Villaverde,inm_barrio_freq
0,0.032787,0.037625,0.028964,0.336206,0.082037,0.561837,0.291667,0.481203,0.214612,0.711957,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.165722
1,0.081967,0.052676,0.017285,0.335341,0.127693,0.664311,0.236111,0.466165,0.560637,0.426706,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014873
2,0.065574,0.037625,0.005539,0.270542,0.102154,0.579505,0.21875,0.473684,0.309129,0.621847,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.037535
3,0.098361,0.052258,0.048986,0.574744,0.493651,0.477032,0.211806,0.315789,0.463007,0.250942,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.073654
4,0.016393,0.034699,0.026359,0.508936,0.628763,0.657244,0.357639,0.466165,0.47569,0.184045,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.260623
