**ML MODELS WE ARE USING**
1. XGBoost
2. K-nearest neighbors
3. Random forest
4. SVR
5. Linear Regression (Lasso and Ridge)

In [39]:
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, make_scorer
import pickle
from scipy.stats import randint, uniform, loguniform
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer # for imputing missing values!
import pandas as pd
import numpy as np


# function for the ML pipeline as outlined above 
def ML_pipe_kfold(train_data, ML_algo, preprocessor, param_dist, impute = False,
                  *, n_iter = 60, n_splits = 4, random_state = None):
    '''
    Tune an ML model with RandomizedSearchCV and shuffled KFold cross-validation.

    The model is wrapped in a pipeline (preprocessor -> optional IterativeImputer
    -> model), the hyperparameters in `param_dist` are sampled `n_iter` times, and
    every candidate is scored by negative RMSE.

    Parameters
    ----------
    train_data : table with a 'price' column (the target); every other column
        is handed to `preprocessor` as a feature.
    ML_algo : unfitted estimator placed at the end of the pipeline.
    preprocessor : transformer placed at the start of the pipeline.
    param_dist : dict mapping hyperparameter names of `ML_algo` (WITHOUT any
        pipeline prefix) to a list of values or a distribution with an `rvs` method.
    impute : if True, an IterativeImputer is inserted between the preprocessor
        and the model.
    n_iter : number of parameter settings sampled by the search.
    n_splits : number of cross-validation folds.
    random_state : seed for the parameter sampling. The default (None) keeps the
        sampling unseeded; pass an int for reproducible searches.

    Returns
    -------
    The best pipeline found (`best_estimator_`), to be pickled. With the default
    `refit=True` of RandomizedSearchCV it has been refit on all of `train_data`.
    '''

    steps = [preprocessor, ML_algo]
    if impute:
        steps.insert(1, IterativeImputer())
    reg = make_pipeline(*steps)

    X = train_data.drop(columns = ['price'])
    y = train_data['price']

    # Read the model's step name from the pipeline itself rather than rebuilding
    # it from the class name: make_pipeline appends a numeric suffix when two
    # steps share a class name, and the prefix must match the real step name.
    model_step = reg.steps[-1][0]
    search_space = {f"{model_step}__{k}": v for k, v in param_dist.items()}

    # The fold assignment is fixed (seed 42) so all models see the same splits.
    folds = KFold(n_splits = n_splits, shuffle=True, random_state=42)
    grid = RandomizedSearchCV(
        reg,
        param_distributions=search_space,
        cv=folds,
        scoring='neg_root_mean_squared_error',
        n_iter=n_iter,
        n_jobs=-1,
        verbose=1,
        random_state=random_state,
    )
    grid.fit(X, y)
    # best_score_ is the mean cross-validated *negative* RMSE of the best
    # candidate, so values closer to zero are better.
    print(f"Best test score: {grid.best_score_}")
    return grid.best_estimator_

In [None]:
# Load the train and test splits from disk.
train_data = pd.read_csv("Datasets/housing_original_train_0.csv")
test_data = pd.read_csv("Datasets/housing_original_test_0.csv")

#---- PREPROCESSOR ---
# Columns routed through OneHotEncoder.
one_hot_fts = [
    'mainroad',
    'guestroom',
    'basement',
    'hotwaterheating',
    'airconditioning',
    'prefarea',
    'furnishingstatus',
]
# Columns routed through StandardScaler.
std_features = [
    'area',
    'bedrooms',
    'bathrooms',
    'stories',
    'parking',
]
preprocessor = ColumnTransformer(
    transformers=[
        ('one_hot', OneHotEncoder(), one_hot_fts),
        ("std", StandardScaler(), std_features),
    ]
)

#---- XGBOOST ----
import xgboost as xgb

# Search space for XGBoost.
# NOTE: scipy's uniform takes (loc, scale) and samples from [loc, loc + scale],
# so a range of 0.5 to 1 is uniform(0.5, 0.5), not uniform(0.5, 0.1).
param_dist = {
    'n_estimators': randint(100, 300),  # Discrete uniform distribution, max exclusive
    'max_depth': randint(2, 6),  # Discrete uniform distribution, max exclusive
    'learning_rate': loguniform(0.001, 0.1),  # Log-uniform distribution from 0.001 to 0.1
    'subsample': uniform(0.5, 0.5),  # Continuous uniform distribution from 0.5 to 1
    'colsample_bytree': uniform(0.5, 0.5)  # Continuous uniform distribution from 0.5 to 1
}

xgb_reg = xgb.XGBRegressor()
# XGBoost is the only model used here that can handle missing values, so we test it with and without imputing missing values
xgb_grid_no_impute = ML_pipe_kfold(train_data, xgb_reg, preprocessor, param_dist, impute = False)
xgb_grid_impute = ML_pipe_kfold(train_data, xgb_reg, preprocessor, param_dist, impute = True)

#---- RANDOM FOREST ----
from sklearn.ensemble import RandomForestRegressor

# Search space for the random forest (randint excludes its upper bound).
rf_param_dist = dict(
    n_estimators=randint(100, 500),
    max_depth=randint(3, 10),
    min_samples_split=randint(2, 11),
    min_samples_leaf=randint(1, 5),
)

rf_reg = RandomForestRegressor()
rf_grid = ML_pipe_kfold(
    train_data,
    rf_reg,
    preprocessor,
    rf_param_dist,
    impute=True,
)

#---- LASSO/RIDGE REGRESSION ----
from sklearn.linear_model import Lasso, Ridge

# Lasso: the regularisation strength is sampled log-uniformly.
lasso_param_dist = dict(alpha=loguniform(1e-4, 1e3))
lasso_reg = Lasso(max_iter=10000)
lasso_grid = ML_pipe_kfold(
    train_data,
    lasso_reg,
    preprocessor,
    lasso_param_dist,
    impute=True,
)

# Ridge: same alpha range, with its own distribution object.
ridge_param_dist = dict(alpha=loguniform(1e-4, 1e3))
ridge_reg = Ridge(max_iter=10000)
ridge_grid = ML_pipe_kfold(
    train_data,
    ridge_reg,
    preprocessor,
    ridge_param_dist,
    impute=True,
)

#---- KNN -----
from sklearn.neighbors import KNeighborsRegressor

# Search space for KNN: neighbour count, weighting scheme and Minkowski power.
knn_param_dist = dict(
    n_neighbors=randint(3, 20),
    weights=['uniform', 'distance'],
    p=[1, 2],
)

knn_reg = KNeighborsRegressor()
knn_grid = ML_pipe_kfold(
    train_data,
    knn_reg,
    preprocessor,
    knn_param_dist,
    impute=True,
)


#---- SVR -----
from sklearn.svm import SVR
from scipy.stats import loguniform

# Search space for SVR: C and gamma are sampled log-uniformly, the kernel is
# picked from a fixed list.
svr_param_dist = dict(
    C=loguniform(1e-4, 1e3),
    gamma=loguniform(1e-4, 1e-1),
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
)

svr_reg = SVR()
svr_grid = ML_pipe_kfold(
    train_data,
    svr_reg,
    preprocessor,
    svr_param_dist,
    impute=True,
)


In [66]:
# Scratch check: materialise a dict's values view as a list.
list({"e": 1, "r": 2}.values())


[1, 2]