# Model Summary 

This notebook summarizes the steps used to predict housing prices.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.externals import joblib 

In [2]:
# Read the housing dataset from the working directory and preview its first rows.
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Preprocessing 

### Splitting the data into training and test sets

In [3]:
# Hold out 10% of the rows as a test set; the fixed seed keeps the split reproducible.
# The full frame (including the SalePrice column) is passed as the feature matrix,
# so SalePrice is still present in x_train / x_test after the split.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    data['SalePrice'],
    test_size=0.1,
    random_state=0,
)

In [4]:
# Load the feature names selected earlier. header=None makes pandas treat the
# first row of the file as data, so the names end up in the column labelled 0.
features = pd.read_csv('selected_features.csv', header=None)
features

Unnamed: 0,0
0,MSSubClass
1,MSZoning
2,Neighborhood
3,OverallQual
4,YearRemodAdd
5,RoofStyle
6,MasVnrType
7,ExterQual
8,BsmtQual
9,BsmtExposure


In [5]:
# Working feature list: every name from column 0 of `features`, plus LotFrontage.
ft = [*features[0], 'LotFrontage']

### Filling missing values in categorical variables

In [6]:
# Categorical (object-dtype) features that contain at least one missing value.
# The threshold is "> 0" so that a column with exactly one missing entry is
# also picked up for imputation, matching the rule used for numerical features.
cat_var = [
    var for var in ft
    if data[var].dtypes == 'O' and data[var].isnull().sum() > 0
]

In [7]:
# Report how much of each categorical feature is missing in the training split.
# NOTE: the number printed is the mean of the null mask, i.e. a fraction in [0, 1],
# even though the label text says "%".
for var in cat_var:
    missing_share = x_train[var].isnull().mean()
    print(var, np.round(missing_share, 3), "% of missing values")

MasVnrType 0.005 % of missing values
BsmtQual 0.024 % of missing values
BsmtExposure 0.025 % of missing values
FireplaceQu 0.473 % of missing values
GarageType 0.056 % of missing values
GarageFinish 0.056 % of missing values


In [8]:
def fill_cat_na(data, var):
    """Return a copy of ``data`` where missing entries of ``var`` become "missing".

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is left untouched.
    var : column label or list of column labels
        Column(s) whose missing entries are replaced.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the string "missing" in place of the missing
        entries of the selected column(s).
    """
    filled = data.copy()
    replacement = filled[var].fillna("missing")
    filled[var] = replacement
    return filled

# Apply the same categorical imputation to the training and the test split.
x_train, x_test = (fill_cat_na(split, cat_var) for split in (x_train, x_test))

In [9]:
# Sanity check: count the missing entries left in each imputed categorical column.
for var in cat_var:
    remaining_nulls = x_train[var].isnull().sum()
    print(var, remaining_nulls)

MasVnrType 0
BsmtQual 0
BsmtExposure 0
FireplaceQu 0
GarageType 0
GarageFinish 0


## Filling missing values in numerical variables

In [10]:
# Non-object (numerical) features that contain at least one missing value.
num_var = [
    col for col in ft
    if data[col].dtypes != 'O' and data[col].isnull().sum() > 0
]
num_var

['LotFrontage']

In [11]:
# Imputation value used for each numerical variable, kept so it can be saved and
# inspected later. NOTE: despite the "mean" in the name, the statistic computed
# by fill_na_num is the mode.
mean_var_dict = {}

def fill_na_num(data, var, fill_value=None):
    """Return a copy of ``data`` with the missing entries of ``var`` filled in.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to impute; it is not modified.
    var : column label
        Column whose missing entries are replaced.
    fill_value : scalar, optional
        Value written into the missing entries. When omitted, the first entry
        returned by ``data[var].mode()`` is used.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the missing entries of ``var`` replaced.

    Notes
    -----
    The value that was used is recorded in the module-level
    ``mean_var_dict`` under the key ``var``.
    """
    df = data.copy()
    temp = data[var].mode()[0] if fill_value is None else fill_value
    mean_var_dict[var] = temp
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # df[var]: the chained in-place call acts on a temporary object and does
    # not update df when pandas Copy-on-Write is active.
    df[var] = df[var].fillna(temp)
    return df
    
for var in num_var:
    # fill_na_num derives the fill value from the frame it receives and records
    # it in mean_var_dict; run it on the training split only.
    x_train = fill_na_num(x_train, var)
    train_fill_value = mean_var_dict[var]
    # Impute the test split with the value learned on the training split, so
    # no test-set statistic is used and the stored value is not overwritten.
    x_test = x_test.copy()
    x_test[var] = x_test[var].fillna(train_fill_value)

# np.save stores the dict as a pickled object array; reading it back requires
# np.load(..., allow_pickle=True).
np.save('mean_var_dict.npy', mean_var_dict)

## Temporal Variables 

In [12]:
def elapsed_years(data, var):
    """Return a copy of ``data`` where ``var`` holds ``YrSold`` minus ``var``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``YrSold`` column and the column ``var``; it is
        not modified.
    var : column label
        Column to be replaced by the difference ``YrSold - var``.

    Returns
    -------
    pandas.DataFrame
        The full copied frame (not just the transformed column).
    """
    years_since = data['YrSold'] - data[var]
    out = data.copy()
    out[var] = years_since
    return out

# elapsed_years returns the whole transformed DataFrame, so rebind the frame
# itself; assigning that multi-column frame to the single 'YearRemodAdd'
# column is not a valid column assignment.
x_train = elapsed_years(x_train, 'YearRemodAdd')
x_test = elapsed_years(x_test, 'YearRemodAdd')

## Log transform numerical variables 

In [13]:
# Replace each listed column by its natural logarithm in both splits.
# SalePrice is among them because the target column is still part of the frames.
log_vars = ['LotFrontage', '1stFlrSF', 'GrLivArea', 'SalePrice']
for var in log_vars:
    for split in (x_train, x_test):
        split[var] = np.log(split[var])

## Removing rare categorical variables 