# Model Summary 

This notebook provides a summary of the procedures to predict the housing prices 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.externals import joblib 

In [2]:
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Preprocessing 

### Splitting the data into training and test sets

In [3]:
x_train, x_test, y_train,y_test = train_test_split(data, data['SalePrice'], test_size=0.1, random_state=0)

In [4]:
#Load the features that we already were working with before 
features = pd.read_csv('selected_features.csv', header=None)
features

Unnamed: 0,0
0,MSSubClass
1,MSZoning
2,Neighborhood
3,OverallQual
4,YearRemodAdd
5,RoofStyle
6,MasVnrType
7,ExterQual
8,BsmtQual
9,BsmtExposure


In [5]:
ft = [x for x in features[0]] + ['LotFrontage']

### Missing values fill for categorical variables 

In [6]:
cat_var = [var for var in ft if data[var].dtypes == 'O' and data[var].isnull().sum() > 1]

In [7]:
for var in cat_var:
    print(var, np.round(x_train[var].isnull().mean() , 3), "% of missing values" )

MasVnrType 0.005 % of missing values
BsmtQual 0.024 % of missing values
BsmtExposure 0.025 % of missing values
FireplaceQu 0.473 % of missing values
GarageType 0.056 % of missing values
GarageFinish 0.056 % of missing values


In [8]:
def fill_cat_na(data,var):
    df = data.copy()
    df[var] = df[var].fillna("missing")
    return df

x_train = fill_cat_na(x_train, cat_var)
x_test = fill_cat_na(x_test, cat_var)

In [9]:
for var in cat_var:
    print(var, x_train[var].isnull().sum())

MasVnrType 0
BsmtQual 0
BsmtExposure 0
FireplaceQu 0
GarageType 0
GarageFinish 0


## Missing value fill for numerical data 

In [10]:
num_var = [var for var in ft if data[var].dtypes != 'O' and data[var].isnull().sum() > 0] 
num_var

['LotFrontage']

In [11]:
#Saving the value for analysis 
mean_var_dict = {}

def fill_na_num(data,var):
    df = data.copy()
    temp = data[var].mode()[0]
    mean_var_dict[var] = temp
    df[var].fillna(temp, inplace=True)
    return df
    
for var in num_var:
    x_train = fill_na_num(x_train,var)
    x_test = fill_na_num(x_test, var)
    
np.save('mean_var_dict.npy', mean_var_dict)

## Temporal Variables 

In [12]:
def elapsed_years(data,var):
    df = data.copy()
    df[var] = df['YrSold'] - df[var]
    return df

x_train['YearRemodAdd'] = elapsed_years(x_train, 'YearRemodAdd')
x_test['YearRemodAdd'] = elapsed_years(x_test, 'YearRemodAdd')

## Log transform numerical variables 

In [13]:
for var in ['LotFrontage', '1stFlrSF', 'GrLivArea', 'SalePrice']:
    x_train[var] = np.log(x_train[var])
    x_test[var]= np.log(x_test[var])

## Removing rare categorical variables 

In [18]:
cat_vars = [var for var in ft if x_train[var].dtype == 'O']
cat_vars

['MSZoning',
 'Neighborhood',
 'YearRemodAdd',
 'RoofStyle',
 'MasVnrType',
 'ExterQual',
 'BsmtQual',
 'BsmtExposure',
 'HeatingQC',
 'CentralAir',
 'KitchenQual',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'PavedDrive']

In [29]:
def find_freq(data,var,p):
    df = data.copy()
    calc = df.groupby(var)['SalePrice'].count()/ len(df)
    return calc[calc>p].index

In [30]:
frequent_labels_dict={}

for var in cat_vars:
    frequent = find_freq(x_train,var,0.01)
    frequent_labels_dict[var] = frequent
    
    x_train[var] = np.where(x_train[var].isin(frequent), x_train[var], 'Rare')
    x_test[var] = np.where(x_test[var].isin(frequent), x_test[var], 'Rare')

np.save('FrequentLabels.npy', frequent_labels_dict)

In [34]:

def replace_categories(train, test, var, target):
    train = train.copy()
    test = test.copy()
    
    ordered_labels = train.groupby([var])[target].mean().sort_values().index
    ordinal_label = {k:i for i, k in enumerate(ordered_labels, 0)} 
    
    train[var] = train[var].map(ordinal_label)
    test[var] = test[var].map(ordinal_label)
    
    return ordinal_label, train, test

In [35]:
ordinal_label_dict = {}
for var in cat_vars:
    ordinal_label, x_train, x_test = replace_categories(x_train, x_test, var, 'SalePrice')
    ordinal_label_dict[var] = ordinal_label
    
# now we save the dictionary
np.save('OrdinalLabels.npy', ordinal_label_dict)

In [42]:
# check absence of na
[var for var in ft if x_train[var].isnull().sum()>0]

[]

## Feature Scaling 

In [37]:
y_train = x_train['SalePrice']
y_test = x_test['SalePrice']

In [43]:
scaler = MinMaxScaler() 
scaler.fit(x_train[ft]) 

# we persist the model for future use
joblib.dump(scaler, 'scaler.pkl')

  return self.partial_fit(X, y)


['scaler.pkl']

In [46]:
# transform the train and test set, and add on the Id and SalePrice variables
X_train = pd.DataFrame(scaler.transform(x_train[ft]), columns=ft)
X_test = pd.DataFrame(scaler.transform(x_test[ft]), columns=ft)

In [47]:
# train the model
lin_model = Lasso(alpha=0.005, random_state=0) # remember to set the random_state / seed
lin_model.fit(X_train, y_train)

# we persist the model for future use
joblib.dump(lin_model, 'lasso_regression.pkl')

['lasso_regression.pkl']

In [48]:
pred = lin_model.predict(X_train)
pred = lin_model.predict(X_test)