## Model Training
#### 1.1 Import Data and Required Packages
Import pandas, NumPy, Matplotlib, Seaborn and the warnings library, together with the scikit-learn and XGBoost modelling utilities.

In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
# from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Import the CSV Data as Pandas DataFrame

In [2]:
# Load the preprocessed train and test splits (relative paths, in that order).
df, test_df = map(
    pd.read_csv,
    (
        '../data/processed/processed_train_set.csv',
        '../data/processed/processed_test_set.csv',
    ),
)

In [3]:
# Preview the first five rows of the training frame (5 is the default for head()).
df.head()

Unnamed: 0,inm_floor,inm_size,inm_price,inm_longitude,inm_latitude,his_price,his_quarterly_variation,his_annual_variation,his_monthly_variation,dem_Indice_de_reemplazo_de_la_poblacion_activa,...,inm_distrito_Retiro,inm_distrito_Salamanca,inm_distrito_San Blas - Canillejas,inm_distrito_Tetuán,inm_distrito_Unknown,inm_distrito_Usera,inm_distrito_Vicálvaro,inm_distrito_Villa de Vallecas,inm_distrito_Villaverde,inm_barrio_freq
0,0.032787,0.037625,0.028964,0.567127,0.336206,0.082037,0.561837,0.291667,0.481203,0.345298,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.165722
1,0.081967,0.052676,0.017285,0.233771,0.335341,0.127693,0.664311,0.236111,0.466165,0.161933,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014873
2,0.065574,0.037625,0.005539,0.453518,0.270542,0.102154,0.579505,0.21875,0.473684,0.34748,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.037535
3,0.098361,0.052258,0.048986,0.448375,0.574744,0.493651,0.477032,0.211806,0.315789,0.132302,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.073654
4,0.016393,0.034699,0.026359,0.569365,0.508936,0.628763,0.657244,0.357639,0.466165,0.103351,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.260623


In [4]:
# Separate the predictors from the 'inm_price' target column in both splits.
X_train, y_train = df.drop(columns='inm_price'), df['inm_price']
X_test, y_test = test_df.drop(columns='inm_price'), test_df['inm_price']

In [5]:
# Display both target Series side by side as a tuple.
(y_train, y_test)

(0        0.028964
 1        0.017285
 2        0.005539
 3        0.048986
 4        0.026359
            ...   
 14280    0.315937
 14281    0.008275
 14282    0.022224
 14283    0.058996
 14284    0.013281
 Name: inm_price, Length: 14285, dtype: float64,
 0       0.029632
 1       0.058329
 2       0.016584
 3       0.074012
 4       0.006841
           ...   
 3567    0.061666
 3568    0.007408
 3569    0.008943
 3570    0.044114
 3571    0.021289
 Name: inm_price, Length: 3572, dtype: float64)

#### Create an evaluation function that returns the metrics after model training

In [6]:
def evaluate_model(true, predicted):
    """Compute regression metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Predicted values, aligned with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error and R2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # The MSE is only needed to derive the RMSE, so compute it exactly once.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [7]:
# Seed shared by every stochastic estimator so repeated runs give comparable scores.
RANDOM_STATE = 42

# Dictionary of models
models = {
    "Linear Regression": LinearRegression(),
    # NOTE(review): Lasso is left at its default regularization strength; in the
    # recorded run it scored R2 ~ 0 on both splits, so alpha likely needs tuning.
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE),
    "XGBRegressor": XGBRegressor(random_state=RANDOM_STATE),
    # "CatBoosting Regressor": CatBoostRegressor(verbose=False), # Uncomment if CatBoost is installed and desired
    "AdaBoost Regressor": AdaBoostRegressor(random_state=RANDOM_STATE)
}


def _print_metrics(split_name, mae, rmse, r2):
    """Print the RMSE, MAE and R2 score of one data split ('Training' or 'Test')."""
    print('Model performance for {} set'.format(split_name))
    print("- Root Mean Squared Error: {:.4f}".format(rmse))
    print("- Mean Absolute Error: {:.4f}".format(mae))
    print("- R2 Score: {:.4f}".format(r2))


# Lists to store model names and test-set R2 scores (kept index-aligned)
model_list = []
r2_list = []

# Loop through models, fit, predict, and evaluate
for name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Make predictions on both splits
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(name)
    model_list.append(name)

    _print_metrics('Training', model_train_mae, model_train_rmse, model_train_r2)
    print('----------------------------------')

    _print_metrics('Test', model_test_mae, model_test_rmse, model_test_r2)
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 0.0292
- Mean Absolute Error: 0.0147
- R2 Score: 0.7396
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.0286
- Mean Absolute Error: 0.0146
- R2 Score: 0.7439


Lasso
Model performance for Training set
- Root Mean Squared Error: 0.0572
- Mean Absolute Error: 0.0378
- R2 Score: 0.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.0564
- Mean Absolute Error: 0.0375
- R2 Score: -0.0000


Ridge
Model performance for Training set
- Root Mean Squared Error: 0.0292
- Mean Absolute Error: 0.0147
- R2 Score: 0.7394
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.0285
- Mean Absolute Error: 0.0146
- R2 Score: 0.7453






K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 0.0212
- Mean Absolute Error: 0.0096
- R2 Score: 0.8632
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.0239
- Mean Absolute Error: 0.0119
- R2 Score: 0.8210


Decision Tree
Model performance for Training set
- Root Mean Squared Error: 0.0003
- Mean Absolute Error: 0.0000
- R2 Score: 1.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.0292
- Mean Absolute Error: 0.0116
- R2 Score: 0.7326


Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 0.0075
- Mean Absolute Error: 0.0032
- R2 Score: 0.9827
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.0194
- Mean Absolute Error: 0.0086
- R2 Score: 0.8816


XGBRegressor
Model performance for Training set
- Root Mean Squared Error: 0.0109
- Mean Absolute Error: 0.0060
- R2 Score: 0.9636
--

Based on the results above, here are some generalizations that we can make:

- Random Forest Regressor and XGBRegressor show the best overall performance among the models tested, with high R2 scores and low errors on both training and test sets. This indicates a strong predictive capability and generalization to unseen data.

- Decision Tree model shows perfect performance on the training set (R2 Score of 1.0000) but a significant drop in performance on the test set, indicating overfitting to the training data.

- K-Neighbors Regressor also performs well, particularly in reducing the Root Mean Squared Error (RMSE) and improving the R2 Score compared to linear models, suggesting that the relationship between features and target variable may not be linear.

- Linear Regression and Ridge have similar performance metrics, indicating a linear relationship between some of the predictors and the target variable. However, they are outperformed by ensemble and non-linear models.

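- Lasso has the poorest performance with an R2 Score of 0.0000 on the training set and -0.0000 on the test set. An R2 of zero means the model does no better than always predicting the mean of the target, which suggests that the default regularization strength penalized the coefficients too much (it is large relative to the small target values seen above), leading to underfitting.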

- AdaBoost Regressor shows moderate performance, better than Lasso but significantly lower than the other ensemble methods, Random Forest and XGBRegressor.

Model Choice: 

For predicting inm_price, our target variable, the Random Forest Regressor and XGBRegressor are the best based on their strong performance on unseen data, balancing the trade-off between bias and variance effectively.

However, we will choose the Random Forest Regressor, as it shows the most impressive results.


Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 0.0077
- Mean Absolute Error: 0.0033
- R2 Score: 0.9821
-----------------------------
Model performance for Test set
- Root Mean Squared Error: 0.0205
- Mean Absolute Error: 0.0091
- R2 Score: 0.8676

The code below performs hyperparameter tuning of the random forest model. Since I already ran it and saw no improvement, it is commented out.

In [8]:
# NOTE: this whole cell is intentionally disabled (every line is commented out).
# It is a GridSearchCV hyperparameter search for the random forest, kept for reference only.
# If it is re-enabled, most of the imports below and the evaluate_model definition
# further down duplicate ones that already exist earlier in the notebook.

# import numpy as np
# import pandas as pd
# from sklearn.model_selection import GridSearchCV
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# # Assuming X_train, X_test, y_train, y_test are already defined

# # Define the parameter grid for Random Forest Regressor
# param_grid = {
#     'n_estimators': [100, 200],  # Reduced number of options
#     'max_depth': [None, 20],  # Fewer options
#     'min_samples_split': [2, 10],  # Broader steps
#     'min_samples_leaf': [1, 4],  # Fewer options
#     'max_features': ['sqrt', 'log2']  # Adjusted to remove 'auto'
# }


# # Initialize the Random Forest Regressor
# rf = RandomForestRegressor(random_state=42)

# # Initialize the Grid Search model
# grid_search = GridSearchCV(estimator=rf, 
#                            param_grid=param_grid, 
#                            cv=5, 
#                            scoring='r2', 
#                            n_jobs=-1, 
#                            verbose=2)

# # Fit the grid search to the data
# grid_search.fit(X_train, y_train)

# # Print the best parameters and the best score
# print("Best parameters:", grid_search.best_params_)
# print("Best score:", grid_search.best_score_)

# # Retrieve the best model
# best_model = grid_search.best_estimator_

# # Making predictions with the best model
# y_train_pred = best_model.predict(X_train)
# y_test_pred = best_model.predict(X_test)

# # Define a function to evaluate the model
# def evaluate_model(true, predicted):
#     mae = mean_absolute_error(true, predicted)
#     mse = mean_squared_error(true, predicted)
#     rmse = np.sqrt(mse)
#     r2_square = r2_score(true, predicted)
#     return mae, rmse, r2_square

# # Evaluate the best model
# train_mae, train_rmse, train_r2 = evaluate_model(y_train, y_train_pred)
# test_mae, test_rmse, test_r2 = evaluate_model(y_test, y_test_pred)

# # Print model performance
# print('Model performance for Training set')
# print("- Root Mean Squared Error: {:.4f}".format(train_rmse))
# print("- Mean Absolute Error: {:.4f}".format(train_mae))
# print("- R2 Score: {:.4f}".format(train_r2))

# print('----------------------------------')

# print('Model performance for Test set')
# print("- Root Mean Squared Error: {:.4f}".format(test_rmse))
# print("- Mean Absolute Error: {:.4f}".format(test_mae))
# print("- R2 Score: {:.4f}".format(test_r2))


Hyperparameter tuning didn't help. Instead, we can look at the least important features and remove the ones that contribute little to the model to see if that makes a difference.

In [9]:
from sklearn.ensemble import RandomForestRegressor

# Fit a seeded random forest on the full feature set
# (X_train and y_train come from the earlier split cell).
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Per-feature importances reported by the fitted forest, in X_train column order
importances = model.feature_importances_

# Pair every column name with its importance and rank them, least important first
features_df = (
    pd.DataFrame({'feature': X_train.columns, 'importance': importances})
    .sort_values(by='importance', ascending=True)
)

# Show the ranking
print(features_df)


                                            feature    importance
49                          inm_distrito_Villaverde  5.318221e-07
47                           inm_distrito_Vicálvaro  1.129543e-06
39                           inm_distrito_Moratalaz  1.634565e-06
48                   inm_distrito_Villa de Vallecas  2.264776e-06
46                               inm_distrito_Usera  2.474088e-06
40                  inm_distrito_Puente de Vallecas  3.067330e-06
37                              inm_distrito_Latina  4.287278e-06
30                         inm_distrito_Carabanchel  4.960220e-06
29                             inm_distrito_Barajas  7.513773e-06
28                          inm_distrito_Arganzuela  1.005930e-05
43               inm_distrito_San Blas - Canillejas  1.239808e-05
44                              inm_distrito_Tetuán  2.458143e-05
35               inm_distrito_Fuencarral - El Pardo  5.080581e-05
34                       inm_distrito_Ciudad Lineal  7.329639e-05
41        

In [10]:
# Importance cut-off: the 0.01 quantile of all importances (raise the quantile to prune more)
threshold = features_df['importance'].quantile(0.01)

# Names of every feature whose importance is at or below the cut-off
low_importance_features = features_df.loc[
    features_df['importance'] <= threshold, 'feature'
].tolist()

# Remove those columns from both splits
X_train_filtered, X_test_filtered = (
    split.drop(columns=low_importance_features) for split in (X_train, X_test)
)

# Refit on the reduced feature set.
# NOTE: this refits the existing `model` object in place, so its earlier fit on the
# full feature set is replaced.
model.fit(X_train_filtered, y_train)

# Score the refitted model on the held-out split
y_pred = model.predict(X_test_filtered)
print("R² on test set:", r2_score(y_test, y_pred))


R² on test set: 0.88304416460913


As seen above, the R2 value is similar, so we can leave the feature set as it was: there is barely any difference after dropping these low-importance features.

In [11]:
# Show the first five rows of the (unchanged) training frame again.
df.head(5)

Unnamed: 0,inm_floor,inm_size,inm_price,inm_longitude,inm_latitude,his_price,his_quarterly_variation,his_annual_variation,his_monthly_variation,dem_Indice_de_reemplazo_de_la_poblacion_activa,...,inm_distrito_Retiro,inm_distrito_Salamanca,inm_distrito_San Blas - Canillejas,inm_distrito_Tetuán,inm_distrito_Unknown,inm_distrito_Usera,inm_distrito_Vicálvaro,inm_distrito_Villa de Vallecas,inm_distrito_Villaverde,inm_barrio_freq
0,0.032787,0.037625,0.028964,0.567127,0.336206,0.082037,0.561837,0.291667,0.481203,0.345298,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.165722
1,0.081967,0.052676,0.017285,0.233771,0.335341,0.127693,0.664311,0.236111,0.466165,0.161933,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014873
2,0.065574,0.037625,0.005539,0.453518,0.270542,0.102154,0.579505,0.21875,0.473684,0.34748,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.037535
3,0.098361,0.052258,0.048986,0.448375,0.574744,0.493651,0.477032,0.211806,0.315789,0.132302,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.073654
4,0.016393,0.034699,0.026359,0.569365,0.508936,0.628763,0.657244,0.357639,0.466165,0.103351,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.260623
