## Model Training
#### 1.1 Import Data and Required Packages
Importing pandas, NumPy, Matplotlib, Seaborn and the warnings library, together with the scikit-learn and XGBoost estimators used for modelling.

In [30]:
# Basic Import
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Modelling
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
# from catboost import CatBoostRegressor
from xgboost import XGBRegressor

Import the CSV data as a pandas DataFrame

In [31]:
# Load the processed dataset from the project-relative data directory
df = pd.read_csv(filepath_or_buffer='../data/processed/processed_df2.csv')

In [32]:
# Preview the first five rows (five is the default for DataFrame.head)
df.head()

Unnamed: 0,size_sqm,price,latitude,historical_price,demographic_unemployment_rate,demographic_proportion_without_university_education,demographic_proportion_without_education,demographic_proportion_born_outside_of_Spain,demographic_proportion_with_university_education,demographic_total_population,demographic_number_of_homes,demographic_density
0,141.0,990000,40.464347,3250,8.724674,0.488949,0.175632,15.456193,0.43983,40838,16155,161.894356
1,159.0,940000,40.419427,5106,9.006094,0.386598,0.083812,32.10246,0.52959,33418,16913,352.500616
2,98.0,549000,40.435362,4100,7.441379,0.365818,0.070351,18.224365,0.563831,42306,17851,263.952286
3,232.0,750000,40.424164,4773,6.709633,0.343683,0.066403,20.963846,0.589914,24423,10490,322.402577
4,183.0,1550000,40.413214,4739,9.05898,0.43375,0.082242,39.490947,0.484009,7622,3822,171.165183


In [33]:
# Separate the target from the predictors: `price` is what the models learn to
# predict, and every remaining column of `df` is used as a feature.
y = df['price']                    # Target variable
X = df.drop(columns=['price'])     # Features

In [34]:
# Preview the target series; `y` is already bound to df['price'] in the
# feature/target split cell, so it is not re-assigned here.
y

0         990000
1         940000
2         549000
3         750000
4        1550000
          ...   
17852      94000
17853    1250000
17854     137260
17855     304880
17856    2490000
Name: price, Length: 17857, dtype: int64

In [35]:
# Separate the dataset into train and test partitions: 20% of the rows are held
# out for testing, and the fixed random_state keeps the split reproducible.
# `train_test_split` is imported in the notebook's import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train.shape, X_test.shape

((14285, 11), (3572, 11))

#### Create an evaluation function that returns all metrics after model training

In [36]:
def evaluate_model(true, predicted):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Predicted values, aligned element-wise with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: the mean absolute error, the root mean
        squared error, and the R^2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # The MSE is only needed as an intermediate for the RMSE, so it is computed
    # once here instead of being stored in an unused local and recomputed.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [37]:
# Candidate regressors, keyed by the label used in the printed report.
# Estimators that accept a random_state are seeded so that a fresh
# Restart & Run All reproduces the same scores.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBRegressor": XGBRegressor(),
    # "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=42),
}

# Parallel lists: model labels and their test-set R2 scores, in training order
model_list = []
r2_list = []

for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print(f"- Root Mean Squared Error: {model_train_rmse:.4f}")
    print(f"- Mean Absolute Error: {model_train_mae:.4f}")
    print(f"- R2 Score: {model_train_r2:.4f}")

    print('----------------------------------')

    print('Model performance for Test set')
    print(f"- Root Mean Squared Error: {model_test_rmse:.4f}")
    print(f"- Mean Absolute Error: {model_test_mae:.4f}")
    print(f"- R2 Score: {model_test_r2:.4f}")
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 444164.6567
- Mean Absolute Error: 221724.0835
- R2 Score: 0.7302
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 435747.8260
- Mean Absolute Error: 218018.2037
- R2 Score: 0.7391


Lasso
Model performance for Training set
- Root Mean Squared Error: 444164.6640
- Mean Absolute Error: 221724.4414
- R2 Score: 0.7302
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 435747.1016
- Mean Absolute Error: 218021.1232
- R2 Score: 0.7391


Ridge
Model performance for Training set
- Root Mean Squared Error: 444191.7113
- Mean Absolute Error: 221788.0340
- R2 Score: 0.7302
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 435767.6008
- Mean Absolute Error: 218171.8520
- R2 Score: 0.7390


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 277082.5168
- Mean 

In [38]:
# Fit a standalone random forest for the feature-importance analysis.
# It uses the same settings as the "Random Forest Regressor" entry in `models`;
# RandomForestRegressor is already imported in the notebook's import cell, so
# it is not re-imported here. Relies on X_train / y_train from the split cell.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

In [39]:
# Importance scores from the fitted forest, plus the column positions sorted
# from most to least important
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

# Print the feature ranking, one line per feature, numbered from 1
print("Feature ranking:")

for rank, col_pos in enumerate(indices, start=1):
    print(f"{rank}. feature {X_train.columns[col_pos]} ({importances[col_pos]})")

Feature ranking:
1. feature size_sqm (0.7003888173324395)
2. feature historical_price (0.09964999619952077)
3. feature latitude (0.08367578726659793)
4. feature demographic_proportion_without_university_education (0.0732325083083555)
5. feature demographic_unemployment_rate (0.01296478807641115)
6. feature demographic_total_population (0.007084572356994046)
7. feature demographic_proportion_with_university_education (0.006108088677634304)
8. feature demographic_proportion_without_education (0.004967407065072534)
9. feature demographic_density (0.004233240437624429)
10. feature demographic_proportion_born_outside_of_Spain (0.004081403781695454)
11. feature demographic_number_of_homes (0.003613390497654367)


In [40]:
# Create a selector object that uses the random forest regressor to identify
# features whose importance is at least 0.01 (the threshold is inclusive).
# `SelectFromModel` is imported in the notebook's import cell.
sfm = SelectFromModel(rf, threshold=0.01)

# Train the selector; `prefit` is left at its default, so the selector fits its
# own copy of the estimator on the training data here.
sfm.fit(X_train, y_train)

In [41]:
# Apply the fitted selector to both partitions so that each keeps only the
# features that passed the importance threshold.
X_train_transformed, X_test_transformed = (
    sfm.transform(partition) for partition in (X_train, X_test)
)


In [42]:
# Retrain a fresh forest, with the same hyperparameters and seed as `rf`, on
# the reduced feature matrix. Its test-set metrics are computed in the next cell.
rf_selected = RandomForestRegressor(random_state=42, n_estimators=100)
rf_selected.fit(X=X_train_transformed, y=y_train)


In [43]:
# Score the reduced-feature forest on the held-out test set. The metric
# functions are already imported in the notebook's import cell, so they are not
# re-imported here.
# Note: this rebinds `y_test_pred`, which the model-comparison loop also assigns.
y_test_pred = rf_selected.predict(X_test_transformed)

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_test_pred)
print(f"Mean Absolute Error (MAE): {mae}")

# Calculate Mean Squared Error (MSE) and Root Mean Squared Error (RMSE)
mse = mean_squared_error(y_test, y_test_pred)
rmse = mse ** 0.5  # Square root of MSE
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Calculate R-squared score
r2 = r2_score(y_test, y_test_pred)
print(f"R-squared Score: {r2}")


Mean Absolute Error (MAE): 143751.33200786717
Mean Squared Error (MSE): 113628491307.43178
Root Mean Squared Error (RMSE): 337088.25447860354
R-squared Score: 0.843850293744523
