# Housing price Prediction
### Cleaning up the code
Data Source: https://www.kaggle.com/c/house-prices-advanced-regression-techniques

#### Import Libraries

In [2]:
#Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Lasso
from sklearn import metrics

#### Config Variables
Only once place where all variables are assigned a value to be used later in code.  
This will be useful later when you should keep a separate config file for all these variables  

All the config variables are mentioned in UPPERCASE

In [3]:
DATAPATH = "../data/HousingPrediction/"
TRAIN_FILE = 'train.csv'
TEST_FILE = 'test.csv'

TARGET = 'SalePrice'
## Features to keep
KEEP = ['MSSubClass', 'MSZoning', 'Neighborhood',
            'OverallQual', 'OverallCond', 'YearRemodAdd',
            'RoofStyle', 'MasVnrType', 'BsmtQual', 'BsmtExposure',
            'HeatingQC', 'CentralAir', '1stFlrSF', 'GrLivArea',
            'BsmtFullBath', 'KitchenQual', 'Fireplaces', 'FireplaceQu',
            'GarageType', 'GarageFinish', 'GarageCars', 'PavedDrive',
            'LotFrontage','YrSold'] #Final feature to keep in data

NUMERICAL_FEATURES = ['LotFrontage'] #Numerical
CATEGORICAL_FEATURES = ['MasVnrType', 'BsmtQual', 'BsmtExposure','FireplaceQu', 
                'GarageCars','GarageType', 'GarageFinish','MSZoning','BsmtFullBath',
                'KitchenQual'] #Categorical

FEATURES_TO_ENCODE = ['MSZoning', 'Neighborhood', 'RoofStyle', 'MasVnrType','BsmtQual', 
                      'BsmtExposure', 'HeatingQC', 'CentralAir','KitchenQual', 'FireplaceQu', 
                      'GarageType', 'GarageFinish','PavedDrive'] #Features to Encode

TEMPORAL_FEATURES = ['YearRemodAdd']
TEMPORAL_COMPARISON = 'YrSold'

LOG_FEATURES = ['LotFrontage', '1stFlrSF', 'GrLivArea'] #Features for Log Transform

DROP_FEATURES = ['YrSold'] #Features to Drop


#### Read Data

In [4]:
train = pd.read_csv(DATAPATH+TRAIN_FILE)
test = pd.read_csv(DATAPATH+TEST_FILE)

In [5]:
#separating SalePrice in Y
y = train[TARGET]
train.drop([TARGET], axis=1, inplace=True)

In [6]:
#Combine train and test data
data = pd.concat([train,test], axis=0)

In [7]:
data = data[KEEP].copy()

#### Numerical Imputer

In [8]:
for var in NUMERICAL_FEATURES:
    data[var].fillna(data[var].mode()[0], inplace=True)

#### Categorical Imputer

In [9]:
for var in CATEGORICAL_FEATURES:
    data[var].fillna(data[var].mode()[0], inplace=True)

#### Rare Label Categorical Encoder 

In [10]:
encoder_dict_ = {}
tol=0.05

for var in FEATURES_TO_ENCODE:
    # the encoder will learn the most frequent categories
    t = pd.Series(data[var].value_counts() / np.float(len(data)))
    # frequent labels:
    encoder_dict_[var] = list(t[t >= tol].index)
    
for var in FEATURES_TO_ENCODE:
    data[var] = np.where(data[var].isin(
                encoder_dict_[var]), data[var], 'Rare')

#### Categorical Encoder

In [12]:
encoder_dict_ ={}
for var in FEATURES_TO_ENCODE:
    t = data[var].value_counts().sort_values(ascending=True).index  #Sorting on freq, should be done on target, just saving some time here
    encoder_dict_[var] = {k:i for i,k in enumerate(t,0)}

In [15]:
## Mapping using the encoder dictionary
for var in FEATURES_TO_ENCODE:
    data[var] = data[var].map(encoder_dict_[var])

#### Temporal Variables

In [16]:

for var in TEMPORAL_FEATURES:
    data[var] = data[var]-data[TEMPORAL_COMPARISON]

#### Log Transformation of Numerical Features

In [17]:
for var in LOG_FEATURES:
    data[var] = np.log(data[var])

#### Drop Features

In [18]:
data.drop(DROP_FEATURES, axis=1, inplace=True)

## Split Train and Test

In [19]:
train_clean = data.iloc[:train.shape[0],:]
test_clean = data.iloc[train.shape[0]:,:]

#### Split Train data

In [20]:
X_train, X_test, y_train, y_test = train_test_split(train_clean, y, random_state=42, test_size=0.15)

In [21]:
y_train = np.log(y_train)
y_test = np.log(y_test)

#### Run Model

In [22]:
model = Lasso(alpha=0.005, random_state=0)

In [23]:
model.fit(X_train,y_train)
pred = model.predict(X_test)

#### Model Evaluation

In [24]:
#MSE
print("MSE : ",metrics.mean_squared_error(pred, y_test))
#MAE
print("MAE : ",metrics.mean_absolute_error(pred, y_test))
#RMSE
print("RMSE : ",np.sqrt(metrics.mean_squared_error(pred, y_test)))
#R2
print("R-sq : ",metrics.r2_score(pred, y_test))

MSE :  0.026143722511705714
MAE :  0.11889882181303771
RMSE :  0.1616902053672569
R-sq :  0.8234201723271777


#### Prediction on the actual Train Data

In [25]:
#test_clean is the transformed original test data; x_test is the 15% split from training data, 
#apologies for similar names
pred_test = np.exp(model.predict(test_clean))

In [26]:
pred_test[1:10]

array([134598.29967144, 159588.06090095, 184501.94256249, 185964.4420318 ,
       174491.67904766, 186467.67506267, 169938.0737666 , 198920.80602479,
       118518.53266662])