# Hyperpararameter Turning


In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import GridSearchCV, ShuffleSplit, StratifiedShuffleSplit, train_test_split
from sklearn.linear_model import Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score, f1_score

from tqdm import tqdm

## Data preparation

In [2]:
train = pd.read_csv('train.csv') #Load data
train.drop('Id', axis=1, inplace=True) #Drop ID column

# Change categorical variables from object type to category type
for column in train.select_dtypes(['object']).columns: 
        train[column] = train[column].astype('category')

    # Change certain numeric variables into categorical variables
to_be_category = ['MSSubClass', 'OverallQual', 'OverallCond', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 
                     'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'MoSold']
for column in to_be_category:
    train[column] = train[column].astype('category')

    # Replace NA's in numeric variables with the mean
train.LotFrontage.fillna(train.LotFrontage.mean(), inplace=True)
train.MasVnrArea.fillna(train.MasVnrArea.mean(), inplace=True)
train.GarageYrBlt.fillna(train.GarageYrBlt.mean(), inplace=True)

    # These NA's indicate that the house just doesn't have it
empty_means_without = ['Alley','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1', 'BsmtFinType2', 'FireplaceQu',
                           'GarageType','GarageFinish','GarageQual','GarageCond','PoolQC','Fence','MiscFeature']
for feature in empty_means_without:
    train[feature].cat.add_categories(['None'], inplace=True)
    train[feature].fillna('None', inplace=True)

train.dropna(inplace=True) #Drop any remaining NA's

train = pd.get_dummies(train) #One-hot encode
train = np.log(train + 1) #Deskew
train = train - train.mean()/(2*train.std()) #Scaling using Gelman's method of 2 SD

target = train['SalePrice']
features = train.drop(['SalePrice'], axis = 1)

## Cross-validation

Cross-validation is a method for getting a reliable estimate of model performance using only the training data. We will be looking at the most common one, the 10-fold cross-validation, which breaks the training data into 10 equal folds, essentially creating 10 miniature train/test splits.

In [3]:
models = {
    'Ridge' : Ridge(),
    'Lasso' : Lasso(),
    'KNN' : KNeighborsRegressor(),
    'Decision Tree' : DecisionTreeRegressor(),
    'SVM' : SVR(),
}

model_params = {
    'Ridge' : {'alpha': range(1,50)},
    'Lasso' : {'alpha' : np.linspace(1,1,20)},
    'KNN' : {'n_neighbors': range(1,20)},
    'Decision Tree' : {'max_depth': range(1,50)},
    'SVM' : {'C': np.arange(0,50,0.1)},
}

In [4]:
# Function to run 10 fold Grid Search CV on each model
def run_grid_search(model_name, x_train, x_test, y_train, y_test):
        
    model = models[model_name]
    params = model_params[model_name]
    gs = GridSearchCV(model, param_grid = params, cv = 10, return_train_score = True)
    gs.fit(x_train, y_train)
    
    return {
        'Model Name' : model_name,
        'Best Parameter' : gs.best_params_,
        'Train Score' : gs.best_score_,
        'Test Score' : gs.score(x_test, y_test)
    }

In [7]:
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size = .3, random_state = 100)

results = []
for model_name in tqdm(models.keys()):
    results.append(run_grid_search(model_name, x_train, x_test, y_train, y_test))

100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [55:10<00:00, 662.17s/it]


In [6]:
results = pd.DataFrame(results)
cols = ['Model Name','Best Parameter', 'Train Score', 'Test Score']
results = results[cols]
results

Unnamed: 0,Model Name,Best Parameter,Train Score,Test Score
0,Ridge,{'alpha': 6},0.887363,0.89474
1,Lasso,{'alpha': 1.0},-0.004885,-0.000534
2,KNN,{'n_neighbors': 3},0.613423,0.640427
3,Decision Tree,{'max_depth': 6},0.727341,0.765176
4,SVM,{'C': 49.900000000000006},0.05625,0.071986
