# Housing price Prediction
Data Source: https://www.kaggle.com/c/house-prices-advanced-regression-techniques

In [None]:
data_folder = "../data/HousingPrediction/"

In [346]:
#Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Lasso
from sklearn import metrics

#### Read Data

In [347]:
train = pd.read_csv(data_folder+'train.csv')
test = pd.read_csv(data_folder+'test.csv')

#### 'SalePrice' is the target column

In [300]:
#separating SalePrice in Y
y = train['SalePrice']
train.drop(['SalePrice'], axis=1, inplace=True)

In [301]:
#Combine train and test data
data = pd.concat([train,test], axis=0)

In [302]:
## Features to keep
keep = ['MSSubClass', 'MSZoning', 'Neighborhood',
            'OverallQual', 'OverallCond', 'YearRemodAdd',
            'RoofStyle', 'MasVnrType', 'BsmtQual', 'BsmtExposure',
            'HeatingQC', 'CentralAir', '1stFlrSF', 'GrLivArea',
            'BsmtFullBath', 'KitchenQual', 'Fireplaces', 'FireplaceQu',
            'GarageType', 'GarageFinish', 'GarageCars', 'PavedDrive',
            'LotFrontage','YrSold']

In [303]:
data = data[keep].copy()

#### Numerical Imputer

In [305]:
#Imputing Missing values for following cols: ['LotFrontage']
num_features = ['LotFrontage']

for var in num_features:
    data[var].fillna(data[var].mode()[0], inplace=True)
    
    

#### Categorical Imputer

In [307]:
cat_features = ['MasVnrType', 'BsmtQual', 'BsmtExposure','FireplaceQu', 
                'GarageCars','GarageType', 'GarageFinish','MSZoning','BsmtFullBath',
                'KitchenQual']

In [308]:
#check missing values
data[cat_features].isnull().sum()

MasVnrType        24
BsmtQual          81
BsmtExposure      82
FireplaceQu     1420
GarageCars         1
GarageType       157
GarageFinish     159
MSZoning           4
BsmtFullBath       2
KitchenQual        1
dtype: int64

In [309]:
for var in cat_features:
    data[var].fillna(data[var].mode()[0], inplace=True)

In [310]:
data[cat_features].isnull().sum()

MasVnrType      0
BsmtQual        0
BsmtExposure    0
FireplaceQu     0
GarageCars      0
GarageType      0
GarageFinish    0
MSZoning        0
BsmtFullBath    0
KitchenQual     0
dtype: int64

#### Rare Label Categorical Encoder 

In [312]:
#Encoding for rare values : Abover a certain threshold percentage
features_to_encode = ['MSZoning', 'Neighborhood', 'RoofStyle', 'MasVnrType','BsmtQual', 
                      'BsmtExposure', 'HeatingQC', 'CentralAir','KitchenQual', 'FireplaceQu', 
                      'GarageType', 'GarageFinish','PavedDrive']

encoder_dict_ = {}
tol=0.05

for var in features_to_encode:
    # the encoder will learn the most frequent categories
    t = pd.Series(data[var].value_counts() / np.float(len(data)))
    # frequent labels:
    encoder_dict_[var] = list(t[t >= tol].index)
    
for var in features_to_encode:
    data[var] = np.where(data[var].isin(
                encoder_dict_[var]), data[var], 'Rare')


#### Categorical Encoder

In [313]:
features_to_encode = ['MSZoning', 'Neighborhood', 'RoofStyle', 'MasVnrType','BsmtQual', 
                      'BsmtExposure', 'HeatingQC', 'CentralAir','KitchenQual', 'FireplaceQu', 
                      'GarageType', 'GarageFinish','PavedDrive']

In [314]:
data['MSZoning'].value_counts()

RL      2269
RM       460
Rare     190
Name: MSZoning, dtype: int64

In [316]:
encoder_dict_ ={}
for var in features_to_encode:
    t = data[var].value_counts().sort_values(ascending=True).index  #Sorting on freq, should be done on target, just saving some time here
    encoder_dict_[var] = {k:i for i,k in enumerate(t,0)}

In [319]:
## Mapping using the encoder dictionary
for var in features_to_encode:
    data[var] = data[var].map(encoder_dict_[var])


#### Temporal Variable

In [323]:
temporal_features = ['YearRemodAdd']
comparison = 'YrSold'

data['YearRemodAdd'] = data['YearRemodAdd']-data['YrSold']
    


#### Drop Features

In [324]:
drop_features = ['YrSold']
data.drop(drop_features, axis=1, inplace=True)

#### Log Transformations of Numerical Variable

In [325]:
log_features = ['LotFrontage', '1stFlrSF', 'GrLivArea']
for var in log_features:
    data[var] = np.log(data[var])

## Split back to train and test

In [328]:
train_clean = data.iloc[:train.shape[0],:]

In [329]:
train_clean.shape

(1460, 23)

In [330]:
test_clean = data.iloc[train.shape[0]:,:]

In [331]:
test_clean.shape

(1459, 23)

#### Split Train Data

In [333]:
X_train, X_test, y_train, y_test = train_test_split(train_clean, y, random_state=42, test_size=0.15)

In [334]:
y_train = np.log(y_train)
y_test = np.log(y_test)

#### Run Model (Lasso)

In [335]:
model = Lasso(alpha=0.005, random_state=0)

In [336]:
model.fit(X_train,y_train)

Lasso(alpha=0.005, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=0,
      selection='cyclic', tol=0.0001, warm_start=False)

In [340]:
pred = model.predict(X_test)

#### Evaluation


In [345]:

#MSE
print("MSE : ",metrics.mean_squared_error(pred, y_test))
#MAE
print("MAE : ",metrics.mean_absolute_error(pred, y_test))
#RMSE
print("RMSE : ",np.sqrt(metrics.mean_squared_error(pred, y_test)))
#R2
print("R-sq : ",metrics.r2_score(pred, y_test))

MSE :  0.026143722511705714
MAE :  0.11889882181303771
RMSE :  0.1616902053672569
R-sq :  0.8234201723271777


#### Prediction on the actual Test Data

In [352]:
#test_clean is the transformed original test data; x_test is the 15% split from training data, 
#apologies for similar names
pred_test = np.exp(model.predict(test_clean))

In [354]:
pred_test[1:10]

array([134598.29967144, 159588.06090095, 184501.94256249, 185964.4420318 ,
       174491.67904766, 186467.67506267, 169938.0737666 , 198920.80602479,
       118518.53266662])