In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from tqdm import tqdm
from time import time
# Load the training data and drop the 'Id' column.
train = pd.read_csv('train.csv')
train.drop('Id', axis=1, inplace=True)

# Change categorical variables from object type to category type
for column in train.select_dtypes(['object']).columns:
    train[column] = train[column].astype('category')

# Change certain numeric variables into categorical variables
to_be_category = ['MSSubClass', 'OverallQual', 'OverallCond', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
                  'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'MoSold']
for column in to_be_category:
    train[column] = train[column].astype('category')

# Replace NA's in numeric variables with the column mean.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# column pulled out of the frame is a chained assignment, which is not
# guaranteed to update `train` (and never does under pandas Copy-on-Write).
for column in ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']:
    train[column] = train[column].fillna(train[column].mean())

# These NA's indicate that the house just doesn't have it
empty_means_without = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu',
                       'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']
for feature in empty_means_without:
    # 'None' must be registered as a category before it can be used as a fill
    # value. add_categories returns a new Series (its `inplace` argument was
    # removed in pandas 2.0), so the result is written back explicitly.
    train[feature] = train[feature].cat.add_categories(['None']).fillna('None')

train.dropna(inplace=True)  # Drop any rows that still contain NA's

In [None]:
train = pd.get_dummies(train)  # One-hot encode
train = np.log(train + 1)  # Deskew

# Scaling using Gelman's method of 2 SD: centre each column on its mean, then
# divide by two standard deviations. The parentheses matter: without them the
# expression subtracts mean/(2*sd) from the data instead of standardising it.
two_sd = 2 * train.std()
# A column with zero variance (e.g. a dummy that is 0 in every row) would give
# 0/0 = NaN; dividing by 1 instead leaves the centred column at all zeros.
two_sd = two_sd.replace(0, 1)
train = (train - train.mean()) / two_sd

# Split the frame into the regression target and the predictor columns.
target = train['SalePrice']
features = train.drop(['SalePrice'], axis=1)

# Running the Model
def run_model(model, model_name, features, target):
    """Fit *model* on a train split and report its train and test scores.

    Args:
        model: An estimator exposing ``fit(X, y)`` and ``score(X, y)``.
        model_name: Label stored alongside the scores in the result.
        features: Predictor data passed to ``train_test_split``.
        target: Target values aligned with ``features``.

    Returns:
        A dict with keys ``'Model'`` (the fitted estimator),
        ``'Model Name'``, ``'Train Score'`` and ``'Test Score'``.
    """
    # Fixed random_state so every model is evaluated on the same split.
    x_train, x_test, y_train, y_test = train_test_split(features, target, random_state=100)

    # The estimator must be fitted before it can be scored; scoring an
    # unfitted scikit-learn estimator raises NotFittedError.
    model.fit(x_train, y_train)

    return {
        'Model': model,
        'Model Name': model_name,
        'Train Score': model.score(x_train, y_train),
        'Test Score': model.score(x_test, y_test),
    }
# Candidate regressors paired with the label shown in the results table.
candidates = [
    (Ridge(), 'Ridge'),
    (Lasso(alpha=0.1), 'Lasso'),
    (KNeighborsRegressor(), 'KNN'),
    (DecisionTreeRegressor(), 'Decision Tree'),
    (SVR(), 'SVR'),
]

# Run every candidate against the same features/target and collect the
# per-model result records in order.
model_fit = [
    run_model(estimator, label, features, target)
    for estimator, label in candidates
]

# Tabulate the records with a fixed column order; the trailing bare
# expression displays the table when run as a notebook cell.
cols = ['Model Name', 'Model', 'Train Score', 'Test Score']
model_fit = pd.DataFrame(model_fit)[cols]
model_fit