# 05_Kaggle_Submission

## Import Libraries

In [793]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.preprocessing import  StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet


from sklearn.metrics import mean_squared_error

## Load in Test Dataset

In [794]:
# Raw Kaggle hold-out set; the cleaning cells below modify this frame column by column.
kaggle_test = pd.read_csv(filepath_or_buffer='./datasets/test.csv')

## Clean Test Dataset

In [795]:
# Discrete features that are stored as numbers; their missing values are
# filled with 0 further down instead of the string 'missing'.
numerical_categories = ["MS SubClass","Overall Qual","Overall Cond","Bsmt Full Bath","Bsmt Half Bath","Full Bath","Half Bath","Bedroom AbvGr","Kitchen AbvGr","Garage Cars"]

# Raw categorical columns of the data set. Declared in this cell so that it
# runs on a fresh kernel: `nominal_features` was previously only defined in a
# later cell, so Restart & Run All raised a NameError here.
nominal_features = ['MS SubClass','MS Zoning','Street',
                   'Alley','Lot Shape','Land Contour',
                   'Utilities','Lot Config','Land Slope',
                   'Neighborhood','Condition 1','Condition 2',
                   'Bldg Type','House Style','Overall Qual',
                   'Overall Cond',
                   'Roof Style','Roof Matl','Exterior 1st',
                   'Exterior 2nd', 'Mas Vnr Type','Exter Qual',
                   'Exter Cond','Foundation','Bsmt Qual','Bsmt Cond',
                   'Bsmt Exposure','BsmtFin Type 1','BsmtFin Type 2',
                   'Heating','Heating QC','Central Air','Electrical',
                   'Bsmt Full Bath','Bsmt Half Bath','Full Bath',
                   'Half Bath','Bedroom AbvGr','Kitchen AbvGr','Kitchen Qual',
                   'TotRms AbvGrd','Functional','Fireplaces',
                   'Fireplace Qu','Garage Type',
                   'Garage Finish','Garage Cars','Garage Qual',
                   'Garage Cond','Paved Drive','Pool QC','Fence',
                   'Misc Feature','Mo Sold',
                   'Sale Type']

# Nominal features that are not number-coded; their NaNs become 'missing'.
word_categories = [col for col in nominal_features if col not in numerical_categories]


In [796]:
# House age at the time of sale, in years.
kaggle_test['age_since_built'] = kaggle_test['Yr Sold'].sub(kaggle_test['Year Built'])

In [797]:
# Years between the last remodel/addition and the sale.
kaggle_test['age_since_remodel'] = kaggle_test['Yr Sold'].sub(kaggle_test['Year Remod/Add'])

In [798]:
# Garage age at the time of sale. Rows with no recorded garage year (NaN in
# 'Garage Yr Blt') fall back to the remodel year. The previous test
# `type(...) == 'nan'` compared a type object with a string and was never
# true, so the fallback never ran and those rows ended up as NaN.
kaggle_test['garage_age'] = kaggle_test['Yr Sold'] - kaggle_test['Garage Yr Blt'].fillna(kaggle_test['Year Remod/Add'])


In [799]:
# Keep non-negative ages; anything else (negative or NaN) becomes 0.
kaggle_test['garage_age'] = kaggle_test['garage_age'].where(lambda age: age >= 0, 0)

In [800]:
# 1 when the remodel age differs from the build age, otherwise 0.
kaggle_test['has_remod'] = kaggle_test['age_since_built'].ne(kaggle_test['age_since_remodel']).astype('int64')


In [801]:
# Register the engineered flag as a nominal feature. Guarded so that
# re-running this cell does not append duplicate entries.
if 'has_remod' not in nominal_features:
    nominal_features.append('has_remod')

In [802]:
# Missing number-coded categories are filled with 0.
kaggle_test[numerical_categories] = kaggle_test[numerical_categories].fillna(0)


In [803]:
# Missing text categories get the explicit label 'missing'.
kaggle_test[word_categories] = kaggle_test[word_categories].fillna('missing')


In [804]:
# Combined outdoor area: wood deck + open porch + screen porch.
kaggle_test['outdoor_SF'] = (
    kaggle_test['Wood Deck SF']
    .add(kaggle_test['Open Porch SF'])
    .add(kaggle_test['Screen Porch'])
)

In [805]:
# Ordinal scale shared by the quality/condition columns: Ex=5 down to Po=1.
quality_codes = dict(zip(['Ex', 'Gd', 'TA', 'Fa', 'Po'], range(5, 0, -1)))
# Every representation of "not present / unknown" maps to 0.
quality_codes.update({'NA': 0, 'missing': 0, np.nan: 0})

In [806]:
# Columns encoded with `quality_codes`.
ordinal_quality_features = [
    'Exter Qual',
    'Exter Cond',
    'Bsmt Qual',
    'Bsmt Cond',
    'Heating QC',
    'Kitchen Qual',
    'Fireplace Qu',
    'Garage Qual',
    'Garage Cond',
    'Pool QC',
]

In [807]:
# Encode every quality/condition column with the shared ordinal scale;
# labels absent from `quality_codes` become NaN.
for feature in ordinal_quality_features:
    encoded_column = kaggle_test[feature].map(quality_codes)
    kaggle_test[feature] = encoded_column

In [808]:
# Ordinal encoding of garage finish (Fin > Rfn > Unf; NA/missing = 0).
kaggle_test['Garage Finish'] = kaggle_test['Garage Finish'].map(
    dict(Fin=3, Rfn=2, Unf=1, NA=0, missing=0)
)

In [809]:
# Ordinal encoding of lot shape, from regular (4) to most irregular (1).
kaggle_test['Lot Shape'] = kaggle_test['Lot Shape'].map(
    dict(Reg=4, IR1=3, IR2=2, IR3=1, missing=0)
)

In [810]:
# Ordinal encoding of land slope. All codes are integers now: 'Mod' and 'Sev'
# were previously mapped to the strings '2' and '1', which produced a mixed
# int/str column.
kaggle_test['Land Slope'] = kaggle_test['Land Slope'].map({'Gtl':3,'Mod':2,'Sev':1,'missing':0})

In [811]:
# Ordinal encoding of basement exposure (Gd highest; 'missing' = 0).
kaggle_test['Bsmt Exposure'] = kaggle_test['Bsmt Exposure'].map(
    dict(Gd=5, Av=4, Mn=3, No=2, NA=1, missing=0)
)

In [812]:
# Ordinal encoding of the first basement finish type (GLQ highest; 'missing' = 0).
kaggle_test['BsmtFin Type 1'] = kaggle_test['BsmtFin Type 1'].map(
    dict(GLQ=7, ALQ=6, BLQ=5, Rec=4, LWQ=3, Unf=2, NA=1, missing=0)
)

In [813]:
# Ordinal encoding of the second basement finish type (same scale as type 1).
kaggle_test['BsmtFin Type 2'] = kaggle_test['BsmtFin Type 2'].map(
    dict(GLQ=7, ALQ=6, BLQ=5, Rec=4, LWQ=3, Unf=2, NA=1, missing=0)
)

From the earlier EDA visualisations, we see that some categories of certain features contain very few observations.

In the case of House Style, we can re-bin the values into three groups: `1Story`, `2Story` and `Split`.

In [814]:
# Collapse the eight raw house styles into three coarse groups.
style_dict = dict(
    [(raw_style, '1Story') for raw_style in ('1Story', '1.5Fin', '1.5Unf')]
    + [(raw_style, '2Story') for raw_style in ('2Story', '2.5Fin', '2.5Unf')]
    + [(raw_style, 'Split') for raw_style in ('SFoyer', 'SLvl')]
)

# Styles that are not keys of `style_dict` become NaN.
kaggle_test['House Style'] = kaggle_test['House Style'].map(style_dict)

We can combine exterior condition and exterior quality into a single feature, `exter_score`, defined as the sum of the two.

In [815]:
# Overall house rating: overall quality plus overall condition.
kaggle_test['house_score'] = kaggle_test['Overall Qual'].add(kaggle_test['Overall Cond'])

In [816]:
# Exterior rating: encoded exterior quality plus encoded exterior condition.
kaggle_test['exter_score'] = kaggle_test['Exter Qual'].add(kaggle_test['Exter Cond'])

In [817]:
# Fireplace rating: encoded fireplace quality weighted by the number of fireplaces.
kaggle_test['fireplace_score'] = kaggle_test['Fireplace Qu'].mul(kaggle_test['Fireplaces'])

In [818]:
# Garage rating: (quality + condition + finish) per car space, rounded to 2 dp.
# A 'Garage Cars' of 0 yields NaN (0/0) or inf here; NaNs are zeroed in a later cell.
kaggle_test['garage_score'] = (
    kaggle_test['Garage Qual']
    .add(kaggle_test['Garage Cond'])
    .add(kaggle_test['Garage Finish'])
    .div(kaggle_test['Garage Cars'])
    .round(2)
)


In [819]:
# Kitchen rating: number of kitchens times encoded kitchen quality.
kaggle_test['kitchen_score'] = kaggle_test['Kitchen AbvGr'].mul(kaggle_test['Kitchen Qual'])

In [820]:
# Bathroom count above grade: full baths plus half baths (each counted as one).
kaggle_test['total_baths'] = kaggle_test['Full Bath'].add(kaggle_test['Half Bath'])

In [821]:
# Basement rating: (condition + quality) raised to (finish type 1 code / 7), rounded to 2 dp.
kaggle_test['basement_score'] = (
    kaggle_test['Bsmt Cond']
    .add(kaggle_test['Bsmt Qual'])
    .pow(kaggle_test['BsmtFin Type 1'].div(7))
    .round(2)
)

Rebinning rooms and bedrooms. Total rooms are grouped into `<=4` and `>=8` bins at the extremes, and four or more bedrooms are grouped as `>=4`.

In [822]:
# Bin the tails of the room count; values 5-7 are kept as they are.
kaggle_test['TotRms AbvGrd'] = kaggle_test['TotRms AbvGrd'].map(
    lambda n_rooms: '<=4' if n_rooms <= 4 else ('>=8' if n_rooms >= 8 else n_rooms)
)


In [823]:
# Four or more bedrooms share the single label '>=4'; smaller counts are kept.
kaggle_test['Bedroom AbvGr'] = kaggle_test['Bedroom AbvGr'].map(
    lambda n_bedrooms: '>=4' if n_bedrooms >= 4 else n_bedrooms
)


In [824]:
# Rebin total_baths: three or more bathrooms share the single label '>=3'.
kaggle_test['total_baths'] = kaggle_test['total_baths'].map(
    lambda n_baths: '>=3' if n_baths >= 3 else n_baths
)

In [825]:
# rebin months to seasons
# https://www.timeanddate.com/calendar/aboutseasons.html
# Spring runs from March 1 to May 31;
# Summer runs from June 1 to August 31;
# Fall (autumn) runs from September 1 to November 30; and
# Winter runs from December 1 to February 28 (February 29 in a leap year).

def month_to_season(int_month):
    """Return the season name for a calendar month number.

    Months 3-5 map to 'spring', 6-8 to 'summer' and 9-11 to 'fall'. Every
    other value, including 12, 1 and 2, maps to 'winter'.
    """
    season_ranges = (
        ('spring', 3, 5),
        ('summer', 6, 8),
        ('fall', 9, 11),
    )
    for season, first_month, last_month in season_ranges:
        if first_month <= int_month <= last_month:
            return season
    return 'winter'
    
# Replace the month number with its season label.
kaggle_test['Mo Sold'] = kaggle_test['Mo Sold'].apply(month_to_season)

In [826]:
# convert year to categorical

# Store the sale year as a categorical column so it is one-hot encoded later.
kaggle_test['Yr Sold'] = pd.Categorical(kaggle_test['Yr Sold'])

In [827]:
# Candidate non-continuous features; the ordinal ones are filtered out further down.
new_nom_feature = [
    # raw columns
    'Lot Shape', 'Land Contour', 'House Style', 'Heating QC', 'Central Air',
    'Bedroom AbvGr', 'TotRms AbvGrd', 'Mo Sold',
    # engineered flag and sale year
    'has_remod', 'Yr Sold',
    # engineered scores
    'house_score', 'exter_score', 'fireplace_score', 'garage_score',
    'kitchen_score', 'total_baths', 'basement_score',
]

In [828]:
# fix NaNs in scores

# Each score fills its own NaNs with 0. The second line used to read from
# 'garage_score', which silently overwrote 'basement_score' with a copy of the
# garage score (copy-paste slip).
# NOTE(review): if the notebook that produced transformed_train.csv contains
# the same slip, fix it there too so train and test features stay consistent.
kaggle_test['garage_score'] = kaggle_test['garage_score'].fillna(0)
kaggle_test['basement_score'] = kaggle_test['basement_score'].fillna(0)

In [829]:
# combined features 

# Interaction terms: element-wise products of two existing columns each.
kaggle_test['garage_basement_score'] = kaggle_test['garage_score'].mul(kaggle_test['basement_score'])
kaggle_test['house_exter_score'] = kaggle_test['house_score'].mul(kaggle_test['exter_score'])
kaggle_test['lot_area_frontage'] = kaggle_test['Lot Frontage'].mul(kaggle_test['Lot Area'])


In [830]:
# Columns extracted as numerical inputs further down.
updated_numerical_columns = [
    'lot_area_frontage', 'Mas Vnr Area', 'Total Bsmt SF', 'Gr Liv Area',
    'Garage Area', 'outdoor_SF', 'age_since_remodel', 'age_since_built',
    'Heating QC', 'house_exter_score', 'fireplace_score',
    'garage_basement_score', 'kitchen_score',
]

# Ordinal columns, excluded from one-hot encoding.
ordinal_columns = [
    'Lot Shape', 'Heating QC', 'house_score', 'exter_score', 'fireplace_score',
    'garage_score', 'kitchen_score', 'basement_score',
]

# Identifier column carried through to the submission file.
special_column = ['Id']

# Whatever is left of `new_nom_feature` once the ordinal columns are removed.
nominal_columns = list(filter(lambda col: col not in ordinal_columns, new_nom_feature))

In [831]:
# Raw columns treated as nominal (categorical) data.
nominal_features = [
    'MS SubClass', 'MS Zoning', 'Street', 'Alley', 'Lot Shape', 'Land Contour',
    'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
    'Condition 2', 'Bldg Type', 'House Style', 'Overall Qual', 'Overall Cond',
    'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
    'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond',
    'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating',
    'Heating QC', 'Central Air', 'Electrical', 'Bsmt Full Bath',
    'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
    'Kitchen AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Functional',
    'Fireplaces', 'Fireplace Qu', 'Garage Type', 'Garage Finish',
    'Garage Cars', 'Garage Qual', 'Garage Cond', 'Paved Drive', 'Pool QC',
    'Fence', 'Misc Feature', 'Mo Sold', 'Sale Type',
]

# Year-valued columns.
temporal_features = ['Year Built', 'Year Remod/Add', 'Garage Yr Blt', 'Yr Sold']

non_numeric_features = nominal_features + temporal_features

# Convert the nominal features into the category datatype, one column at a time.
for f in nominal_features:
    kaggle_test[f] = pd.Categorical(kaggle_test[f])

In [832]:
# get dummies for the nominal columns
# One-hot encode the nominal columns, dropping the first level of each.
nominal_kaggle = pd.get_dummies(kaggle_test.loc[:, nominal_columns], drop_first=True)

In [833]:
# extract out numerical columns for original dataframe

# Independent copy of the numerical columns.
numerical_kaggle = kaggle_test.loc[:, updated_numerical_columns].copy()

In [834]:
# Dummies and numerical columns side by side, aligned on the row index.
trsfm_kaggle = nominal_kaggle.join(other=numerical_kaggle)

In [835]:
# Attach the 'Id' column so predictions can be matched back to rows.
trsfm_kaggle = trsfm_kaggle.join(other=kaggle_test[special_column])

In [836]:
# Split the identifier from the model inputs.
trsfm_kaggle_id = trsfm_kaggle.loc[:, 'Id']
trsfm_kaggle_features = trsfm_kaggle.drop(columns='Id')

In [837]:
# Work on an independent copy of the test features from here on.
x_test = trsfm_kaggle_features.copy(deep=True)

## Load in Training Set

In [841]:
# Transformed training features plus target.
train = pd.read_csv(filepath_or_buffer='./datasets/transformed_train.csv')
# Raw training file; only its 'Id' column is used below.
train_id = pd.read_csv(filepath_or_buffer='./datasets/train.csv')

In [842]:
# Drop the unnamed index column that came in with the CSV. Non in-place and
# tolerant of the column already being gone, so the cell can be re-run.
train = train.drop(columns=['Unnamed: 0'], errors='ignore')

y_train = train['SalePrice']
# Every column except the target is a model feature. This previously read
# `df.columns`, but no `df` is defined anywhere in the notebook.
features = [col for col in train.columns if col != 'SalePrice']
x_train = train[features]
# Reduce the raw training frame to its 'Id' column; the guard keeps the cell
# re-runnable once `train_id` is already a Series.
if isinstance(train_id, pd.DataFrame):
    train_id = train_id['Id']
x_train_id = x_train.join(train_id)

## Preprocessing

In [851]:
# Dummy / flag columns that are joined to the scaled features without scaling.
# Note: this rebinds `nominal_features`, which named the raw categorical columns earlier.
nominal_features = [
    'has_remod',
    'Land Contour_HLS', 'Land Contour_Low', 'Land Contour_Lvl',
    'House Style_2Story', 'House Style_Split',
    'Central Air_Y',
    'Bedroom AbvGr_1', 'Bedroom AbvGr_2', 'Bedroom AbvGr_3', 'Bedroom AbvGr_>=4',
    'TotRms AbvGrd_6', 'TotRms AbvGrd_7', 'TotRms AbvGrd_<=4', 'TotRms AbvGrd_>=8',
    'Mo Sold_spring', 'Mo Sold_summer', 'Mo Sold_winter',
    'Yr Sold_2007', 'Yr Sold_2008', 'Yr Sold_2009', 'Yr Sold_2010',
    'total_baths_1', 'total_baths_2', 'total_baths_>=3',
]

# Columns passed through the standard scaler.
cont_features = [
    'lot_area_frontage', 'Mas Vnr Area', 'Total Bsmt SF', 'Gr Liv Area',
    'Garage Area', 'outdoor_SF', 'age_since_remodel', 'age_since_built',
    'Heating QC', 'house_exter_score', 'fireplace_score',
    'garage_basement_score', 'kitchen_score',
]



In [930]:
# Median imputation: medians are learned on the training features only and
# then applied to both train and test.
si = SimpleImputer(strategy='median', missing_values=np.nan)
si.fit(x_train[features])
x_train_imputed = si.transform(x_train[features])
x_test_imputed = si.transform(x_test[features])

In [931]:
# Wrap the imputer's output back into labelled frames.
x_train_imputed = pd.DataFrame(data=x_train_imputed, columns=features).copy()
x_test_imputed = pd.DataFrame(data=x_test_imputed, columns=features).copy()

In [932]:
# Replace zero basement areas in the training set with the column median.
# The median (zeros included, as before) is now computed once instead of
# once per zero-valued row inside a lambda.
# NOTE(review): the test set does not get the same replacement — confirm that
# this asymmetry is intended.
x_train_imputed['Total Bsmt SF'] = x_train_imputed['Total Bsmt SF'].replace(
    0, np.median(x_train_imputed['Total Bsmt SF'])
)

In [933]:
# Save the imputed training features together with the target.
export_train_impute = x_train_imputed.join(other=y_train)
export_train_impute.to_csv(path_or_buf='./datasets/imputed_train.csv')

In [847]:
# scale only numerical variables

# Copies of the continuous columns; only these are scaled.
x_train_scaled_numerical = x_train_imputed.loc[:, cont_features].copy()
x_test_scaled_numerical = x_test_imputed.loc[:, cont_features].copy()

In [848]:
ss = StandardScaler()
x_train_scaled_numerical = ss.fit_transform(x_train_scaled_numerical)
x_test_scaled_numerical = ss.transform(x_test_scaled_numerical)

In [849]:
# Wrap the scaler's output back into labelled frames.
x_train_scaled_numerical = pd.DataFrame(data=x_train_scaled_numerical, columns=cont_features).copy()
x_test_scaled_numerical = pd.DataFrame(data=x_test_scaled_numerical, columns=cont_features).copy()

In [852]:
# Final design matrices: scaled continuous columns plus the unscaled dummy columns.
x_train_transformed = x_train_scaled_numerical.join(other=x_train.loc[:, nominal_features])
x_test_transformed = x_test_scaled_numerical.join(other=x_test.loc[:, nominal_features])

## Generate Model

### Baseline

Use LinearRegression to generate a baseline prediction

In [858]:
# Baseline: ordinary least squares on all transformed features.
lr = LinearRegression()
# `fit` returns the estimator itself, so `benchmark` and `lr` are the same object.
benchmark = lr.fit(X=x_train_transformed, y=y_train)
target_benchmark_price = benchmark.predict(x_test_transformed)

In [876]:
# Pair each test Id with its predicted price.
kaggle_benchmark = pd.DataFrame({
    'Id': trsfm_kaggle_id.values,
    'SalePrice': target_benchmark_price,
})

In [881]:
# Index by 'Id' so the CSV is written without an extra index column.
kaggle_benchmark = kaggle_benchmark.set_index(keys='Id')

In [882]:
# Write the baseline submission file.
kaggle_benchmark.to_csv(path_or_buf='./kaggle/benchmark.csv')

## Ridge Regression

Use Ridge Regression to generate an improved model

In [887]:
# Ridge estimator with default settings; `alpha` is tuned by the grid search below.
rr = Ridge()


In [888]:
# 1000 candidate regularisation strengths, log-spaced from 10**0 to 10**20.
r_alphas = np.logspace(start=0, stop=20, num=1000)

In [889]:
# Search space for the grid search: only `alpha` is tuned.
params = dict(alpha=r_alphas)

In [894]:
# 5-fold cross-validated search over `params`, using the estimator's default scorer.
grid_search_ridge = GridSearchCV(
    estimator=rr,
    param_grid=params,
    cv=5,
)

In [895]:
# Run the search; the fitted search object is displayed as the cell output.
grid_search_ridge.fit(X=x_train_transformed, y=y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=False, random_state=None,
                             solver='auto', tol=0.001),
             iid='warn', n_jobs=None,
             param_grid={'alpha': array([1.00000000e+00, 1.04717682e+00, 1.09657929e+00, 1.14831241e+00,
       1.20248614e+00, 1.25921561e+00, 1.31862140e+00, 1.38082977e+00,...
       4.16504425e+19, 4.36153779e+19, 4.56730127e+19, 4.78277202e+19,
       5.00840799e+19, 5.24468875e+19, 5.49211648e+19, 5.75121707e+19,
       6.02254120e+19, 6.30666554e+19, 6.60419396e+19, 6.91575883e+19,
       7.24202233e+19, 7.58367791e+19, 7.94145172e+19, 8.31610415e+19,
       8.70843150e+19, 9.11926760e+19, 9.54948564e+19, 1.00000000e+20])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [896]:
# Best estimator found by the grid search.
best_ridge = grid_search_ridge.best_estimator_

In [899]:
# Predicted sale prices for the Kaggle test set from the tuned ridge model.
target_ridge_price = best_ridge.predict(X=x_test_transformed)

In [900]:
# Pair each test Id with its ridge prediction.
kaggle_ridge = pd.DataFrame({
    'Id': trsfm_kaggle_id.values,
    'SalePrice': target_ridge_price,
})

In [901]:
# Index by 'Id' so the CSV is written without an extra index column.
kaggle_ridge = kaggle_ridge.set_index(keys='Id')

In [902]:
# Write the ridge submission file.
kaggle_ridge.to_csv(path_or_buf='./kaggle/ridge.csv')

## Business Model

In [907]:
# Reduced feature set used for the business model.
rfe_columns = [
    'Total Bsmt SF',
    'Gr Liv Area',
    'age_since_built',
    'house_exter_score',
    'Land Contour_HLS',
    'Land Contour_Low',
    'Land Contour_Lvl',
]

x_train_rfe = x_train_transformed.loc[:, rfe_columns]
x_test_rfe = x_test_transformed.loc[:, rfe_columns]

In [911]:
# Linear regression restricted to the reduced feature set.
lr_rfe = LinearRegression()
# `fit` returns the estimator itself; this rebinds `benchmark` to `lr_rfe`.
benchmark = lr_rfe.fit(X=x_train_rfe, y=y_train)
target_linear_rfe_price = benchmark.predict(x_test_rfe)

In [None]:
# NOTE(review): leftover scratch cell — this only references the bound method
# without calling it and has no effect on the results; consider deleting it.
lr_rfe.predict

In [912]:
# Pair each test Id with the reduced-model prediction.
kaggle_linear_rfe = pd.DataFrame({
    'Id': trsfm_kaggle_id.values,
    'SalePrice': target_linear_rfe_price,
})

In [913]:
# Index by 'Id' so the CSV is written without an extra index column.
kaggle_linear_rfe = kaggle_linear_rfe.set_index(keys='Id')

In [914]:
# Write the reduced-model submission file.
kaggle_linear_rfe.to_csv(path_or_buf='./kaggle/linear_rfe.csv')

In [935]:
# Coefficient table: one row per feature of the reduced model.
prod_model_rfe = pd.DataFrame({
    'feature': rfe_columns,
    'coefficient': lr_rfe.coef_,
})

In [936]:
# Add the intercept as the last row of the coefficient table. Uses pd.concat
# because DataFrame.append was removed in pandas 2.0; passing the existing
# columns keeps the column order aligned with the table.
prod_model_rfe = pd.concat(
    [
        prod_model_rfe,
        pd.DataFrame(
            [{'feature': 'y_intercept', 'coefficient': lr_rfe.intercept_}],
            columns=prod_model_rfe.columns,
        ),
    ],
    ignore_index=True,
)

In [937]:
# Display the coefficient table, including the intercept row.
prod_model_rfe

Unnamed: 0,feature,coefficient
0,Total Bsmt SF,15864.667337
1,Gr Liv Area,30824.380459
2,age_since_built,-20019.703415
3,house_exter_score,25762.168495
4,Land Contour_HLS,48572.70469
5,Land Contour_Low,36453.726086
6,Land Contour_Lvl,12958.893938
7,y_intercept,167047.745662
