# Ames housing: Elastic net with sklearn

### Week 3 assignment - starter notebook


Contents
 - Start_. Import packages, create functions and load data
 - Develop a solution
  - Data manipulation - relevant code copied from 02a and 02b
  - Model
    - fit model using ElasticNetCV
    - A quick look at the coefficients
    - predictions
    - performance
 - Transfer the solution to a function
 - Test the function
 - Copy the function to a .py file

Copyright (C) 2020 Alan Chalk  
Please do not distribute or publish without permission.

## Start_.

**Import packages**

In [None]:
import numpy as np
import pandas as pd
import pickle

from sklearn.linear_model import ElasticNetCV, ElasticNet

import matplotlib.pyplot as plt
%matplotlib inline

**Directories and paths**

In [None]:
# Set directories
dirRawData = "../input/"
dirPData   = "../PData/"

**Functions**

In [None]:
def fn_MAE(actuals, predictions):
    """Return the mean absolute error, rounded to zero decimal places."""
    abs_errors = np.abs(predictions - actuals)
    return np.round(np.mean(abs_errors), decimals=0)

def fn_RMSE(actuals, predictions):
    """Return the root mean squared error, rounded to zero decimal places."""
    squared_errors = np.square(predictions - actuals)
    return np.round(np.sqrt(np.mean(squared_errors)), decimals=0)


**Load data**

We will just load the raw data and not do any manipulation here.  Our function will take the df_all created here as input.  Whatever data manipulation we decide to do will become part of our function.

In [None]:
# Column -> dtype map passed to pd.read_csv for the raw Ames file.
# Text columns use the builtin `object`: the `np.object` alias was deprecated
# in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
ames_dtypes = {
    'Order': np.int64,
    'PID': np.int64,
    'MS.SubClass': object,
    'MS.Zoning': object,
    'Lot.Frontage': np.float64,
    'Lot.Area': np.float64,
    'Street': object,
    'Alley': object,
    'Lot.Shape': object,
    'Land.Contour': object,
    'Lot.Config': object,
    'Land.Slope': object,
    'Neighborhood': object,
    'Condition.1': object,
    'Condition.2': object,
    'Bldg.Type': object,
    'House.Style': object,
    'Overall.Qual': np.float64,
    'Overall.Cond': np.float64,
    'Year.Built': np.float64,
    'Year.Remod.Add': np.float64,
    'Roof.Style': object,
    'Roof.Matl': object,
    'Exterior.1st': object,
    'Exterior.2nd': object,
    'Mas.Vnr.Type': object,
    'Mas.Vnr.Area': np.float64,
    'Exter.Qual': object,
    'Exter.Cond': object,
    'Foundation': object,
    'Bsmt.Qual': object,
    'Bsmt.Cond': object,
    'Bsmt.Exposure': object,
    'BsmtFin.Type.1': object,
    'BsmtFin.SF.1': np.float64,
    'BsmtFin.Type.2': object,
    'BsmtFin.SF.2': np.float64,
    'Bsmt.Unf.SF': np.float64,
    'Total.Bsmt.SF': np.float64,
    'Heating': object,
    'Heating.QC': object,
    'Central.Air': object,
    'Electrical': object,
    'X1st.Flr.SF': np.float64,
    'X2nd.Flr.SF': np.float64,
    'Low.Qual.Fin.SF': np.float64,
    'Gr.Liv.Area': np.float64,
    'Bsmt.Full.Bath': np.float64,
    'Bsmt.Half.Bath': np.float64,
    'Full.Bath': np.float64,
    'Half.Bath': np.float64,
    'Bedroom.AbvGr': np.float64,
    'Kitchen.AbvGr': np.float64,
    'Kitchen.Qual': object,
    'TotRms.AbvGrd': np.float64,
    'Functional': object,
    'Fireplaces': np.float64,
    'Fireplace.Qu': object,
    'Garage.Type': object,
    'Garage.Yr.Blt': np.float64,
    'Garage.Finish': object,
    'Garage.Cars': np.float64,
    'Garage.Area': np.float64,
    'Garage.Qual': object,
    'Garage.Cond': object,
    'Paved.Drive': object,
    'Wood.Deck.SF': np.float64,
    'Open.Porch.SF': np.float64,
    'Enclosed.Porch': np.float64,
    'X3Ssn.Porch': np.float64,
    'Screen.Porch': np.float64,
    'Pool.Area': np.float64,
    'Fence': object,
    'Misc.Feature': object,
    'Misc.Val': np.float64,
    'Mo.Sold': np.float64,
    'Yr.Sold': np.float64,
    'Sale.Type': object,
    'Sale.Condition': object,
    'SalePrice': np.float64,
}


In [None]:
# Load the raw data using the explicit dtype map above; the literal string
# 'NA' is read as missing.
# NOTE(review): sep=" " assumes the file is space-delimited - confirm against
# the copy of AmesHousing.txt in dirRawData.
df_all = pd.read_csv(dirRawData + 'AmesHousing.txt', 
                     sep=" ",
                     dtype = ames_dtypes,
                     na_values = 'NA')

### Work from here onwards will become part of our function

**Data manipulation**

In [None]:
# change column names to snake case (lower case, with '.' replaced by '_')
import re
def convert(name):
    """Return *name* in lower case with every '.' replaced by '_'.

    Example: 'MS.SubClass' -> 'ms_subclass'.

    A plain str.replace is used instead of re.sub('\\.', ...): the original
    pattern was written in a non-raw string, which is an invalid escape
    sequence and triggers a warning on current Python versions.
    """
    return name.replace('.', '_').lower()

# Apply convert() to every column name, then tidy up the temporary names
colnames = [convert(col) for col in df_all.columns.values]
df_all.columns = colnames
del colnames, convert

In [None]:
# Column roles: the target, the identifier columns to ignore, and the
# remaining candidate features (all of them, then the non-object subset)
var_dep = ['saleprice']
vars_notToUse = ['order', 'pid']
vars_all = df_all.columns.values

vars_ind = [var for var in vars_all
            if var not in vars_notToUse and var not in var_dep]
vars_ind_numeric = [var for var in vars_ind if df_all[var].dtype != 'object']

In [None]:
# Missing values, as per 02a: drop three columns outright, remove them from
# the feature lists, then drop every row that still contains a missing value
vars_toDrop = ['lot_frontage', 'garage_yr_blt', 'mas_vnr_area']
df_all.drop(columns=vars_toDrop, inplace=True)

vars_ind = [var for var in vars_ind if var not in vars_toDrop]
vars_ind_numeric = [var for var in vars_ind_numeric if var not in vars_toDrop]

df_all.dropna(inplace=True)

In [None]:
# Remove outliers: keep only rows whose gr_liv_area is at most 4000, and
# renumber the rows from zero
df_all = df_all.loc[df_all['gr_liv_area'] <= 4000].reset_index(drop=True)

In [None]:
# create onehot columns
# Every object-dtype column is replaced, in a copy of df_all, by one indicator
# column per category EXCEPT the most frequent category, which is dropped and
# so acts as the reference level.
vars_ind_categorical = df_all.columns[df_all.dtypes == 'object'].tolist()
vars_ind_onehot = []

df_all_onehot = df_all.copy()

for col in vars_ind_categorical:
    
    # use pd.get_dummies on  df_all[col] 
    df_oh = pd.get_dummies(df_all[col], drop_first=False)
    
    # Find the name of the most frequent column 
    # (column sums of the indicators are the category counts)
    col_mostFreq = df_oh.sum(axis = 0).idxmax()

    # Drop the column of the most frequent category (using df_oh.drop)
    df_oh = df_oh.drop(col_mostFreq, axis=1)
            
    # Rename the columns to have the original variable name as a prefix
    oh_names = col + '_' + df_oh.columns
    df_oh.columns = oh_names
    
    # Append the indicator columns, then remove the original categorical column
    df_all_onehot = pd.concat([df_all_onehot, df_oh], axis = 1, sort = False)

    del df_all_onehot[col]
    # Keep track of the generated feature names for the design matrix
    vars_ind_onehot.extend(oh_names)


In [None]:
# Assign every row a fold number in 0..9, drawn with a fixed seed so that
# the split is reproducible
rng = np.random.RandomState(seed=2018)
fold = rng.randint(low=0, high=10, size=len(df_all))
df_all_onehot['fold'] = fold

In [None]:
# Rebind df_all to the one-hot encoded frame: this is the data used for the
# rest of this work. The old name is deleted so it cannot be used by mistake.
df_all = df_all_onehot
del df_all_onehot

Now we define indices for train, val, design and test.  You should get the same numbers in each as in the comments below.  If you do not, then something has gone wrong and you should ask on Moodle.

In [None]:
# Row positions for each subset, selected by fold number:
#   train = folds 0-5, val = folds 6-7, design = train + val, test = folds 8-9
idx_train  = np.flatnonzero(df_all['fold'].isin(list(range(0, 6))))
idx_val    = np.flatnonzero(df_all['fold'].isin([6, 7]))
idx_design = np.flatnonzero(df_all['fold'].isin(list(range(0, 8))))
idx_test   = np.flatnonzero(df_all['fold'].isin([8, 9]))

print(len(idx_train))  # 1,749
print(len(idx_val))    #   586
print(len(idx_design)) # 2,335

**Standardise**

As per our discussion on Moodle, we are giving each of our features a mean of 0 and standard deviation of 1.  This is the same as "standard" normal distribution, hence (presumably?) the name standardise.

In [None]:
# Standardise every numeric feature to mean 0 and standard deviation 1.
for var in vars_ind_numeric:
    # Work on an explicit float copy: the array returned by `.values` can be
    # a read-only view of the frame (pandas copy-on-write), so the in-place
    # arithmetic below must not be applied to it directly.
    x = df_all[var].to_numpy(dtype=float, copy=True)
    x -= np.mean(x, axis=0)
    scale = np.sqrt(np.mean(x ** 2, axis=0))
    # A constant column has zero spread; leave it at 0 rather than dividing
    # by zero and filling the feature with NaN.
    if scale > 0:
        x /= scale
    df_all[var] = x

**Prepare basis functions**

I encourage you to experiment below with the variables that are splined (currently set to minimum cardinality > 8) and the percentiles used (currently set to [10, 20, 40, 60, 80, 90]).  However if you are short of time, the settings below should give a reasonable answer.

In [None]:
# Numeric features with more than 8 distinct values get a spline basis
vars_ind_tospline = [var for var in vars_ind_numeric
                     if df_all[var].nunique() > 8]

def fn_tosplines(x):
    """Expand a named numeric Series into a piecewise-linear spline basis.

    Parameters
    ----------
    x : pandas.Series
        The feature to expand. Its ``name`` is used to label the output
        columns, so it must be a string (as it is for a DataFrame column).

    Returns
    -------
    pandas.DataFrame
        Seven columns with a fresh 0..n-1 index: the original values under
        ``x.name``, followed by ``<name>_0`` .. ``<name>_5`` holding
        ``max(0, x - knot)`` for knots at the 10th, 20th, 40th, 60th, 80th
        and 90th percentiles of the non-zero values of ``x``.
    """
    # Take the column label from the argument itself rather than from a
    # global loop variable, so the function works wherever it is called.
    name = x.name
    values = x.values
    # hack: remove zeros to avoid issues where lots of values are zero
    values_nonzero = values[values != 0]
    ptiles = np.percentile(values_nonzero, [10, 20, 40, 60, 80, 90])
    df_ptiles = pd.DataFrame({name: values})
    for idx, ptile in enumerate(ptiles):
        df_ptiles[name + '_' + str(idx)] = np.maximum(0, values - ptile)
    return df_ptiles

In [None]:
# Replace each splined feature by its basis columns.
# NOTE: the loop variable must stay named `var` - fn_tosplines as written
# reads the global `var` to label its output columns.
for var in vars_ind_tospline:
    df_ptiles = fn_tosplines(df_all[var])
    # Remove the original column (and its name) before appending the basis;
    # df_ptiles carries the original values again under the same name.
    df_all.drop(columns=[var], inplace=True)
    vars_ind_numeric.remove(var)
    df_all = pd.concat([df_all, df_ptiles], axis=1, sort=False)
    vars_ind_numeric.extend(df_ptiles.columns.tolist())

In [None]:
# Final feature list: one-hot indicators first, then numeric/spline columns
vars_ind = vars_ind_onehot + vars_ind_numeric

In [None]:
# Full feature matrix and (n, 1) target array
X = df_all[vars_ind].values
y = df_all[var_dep].values

# Feature matrices for each subset
X_train, X_val = X[idx_train, :], X[idx_val, :]
X_design, X_test = X[idx_design, :], X[idx_test, :]

# Matching 1-D target vectors (index arrays select copies, not views)
y_train  = y[idx_train].ravel()
y_val    = y[idx_val].ravel()
y_design = y[idx_design].ravel()
y_test   = y[idx_test].ravel()

### Elastic net with CV

Below I have set up the code to test one given value of l1_ratio.  You should type in a value, run through the code, experiment a little with lambda and see the performance.  You should then do the same for different values of l1_ratio.  The values I tested for l1_ratio are [0, 0.25, 0.5, 0.75, 0.9, 0.99].

I did not change the list for "alphas" in any of my tests.  This is not ideal.  But I find that sklearn's ElasticNet implementations tend to fail to converge when they do not "like" what you put in - since these values worked and gave a semi-decent answer, I left it like that.

**Instantiate and fit the model**

In [None]:
%%time
# below, alpha is regularisation strength
# note impact of selection='random' vs 'cyclic' on time taken
#del enCV_

# TEMPLATE: this cell does not run until a value is typed after l1_ratio=
enCV_ = ElasticNetCV(
                     # type in whatever value you are testing here
                     l1_ratio=#type value here
    
                     # candidate regularisation strengths: 2**-6 ... 2**4
                     ,alphas=[2**num for num in range(-6,5)]
                     # if you get non-convergence, you may need to increase max_iter
                     ,max_iter=5000 
                     # we already normalised but you may get a better answer if 
                     # you turn this on.  You should get a different answer at least
                     # since we did not normalise the splines (as discussed on Moodle)
                     # NOTE(review): recent scikit-learn releases no longer accept
                     # `normalize` - confirm against the installed version
                     ,normalize=False
                     ,cv=10
                     ,random_state=2018
                     ,selection='random'
                     )

# Cross-validate over the design data (train + val folds)
enCV_.fit(X=X_design, y=y_design)

Inspect the coefficients

In [None]:
# Tabulate the fitted coefficients against their feature names
df_coef = pd.DataFrame({'variable': vars_ind, 'coefficient': enCV_.coef_})
# Report the full count BEFORE filtering; printing it after the filter (as
# before) made this number always equal to the non-zero count below.
print("Total number of coefficients: ", df_coef.shape[0])

# Keep only the features the model actually uses. The .copy() makes the
# filtered table independent, so adding columns to it is unambiguous.
df_coef = df_coef[df_coef['coefficient'] != 0].copy()
df_coef['sign'] = np.where(df_coef['coefficient'].values < 0, 'NEG', 'POS')
df_coef['coefficient_abs'] = np.abs(df_coef['coefficient'])
print("Total number of non-zero coefficients: ", df_coef.shape[0])

print("Largest coefficients...")
df_coef.sort_values('coefficient_abs', ascending=False, inplace=True)
df_coef.head()

**Regularisation strength**

Find the regularisation strength chosen by sklearn's CV.  Please remember that the correct name for regularisation strength is lambda.  sklearn calls it alpha just to avoid confusion with the lambda keyword in Python.  When we move next week to H2O, you will see it called lambda_ (with an underscore).

What did sklearn choose?

In [None]:
# Regularisation strength selected by cross-validation, raw and as log10
# (the plot below uses MINUS log10 on its x-axis)
print(enCV_.alpha_)
print(np.log10(enCV_.alpha_))

Now graph the validation curves, if val performance is very flat near the value that sklearn chose, this may indicate that we can increase the value, get a simpler model with less overfitting but still do OK on test.

Note that these validation curves are based on MSE instead of MAE, I used that too.  This is not ideal.

In [None]:
# x-axis values: MINUS log10 of each alpha that was cross-validated
m_log_lambdas = -np.log10(enCV_.alphas_)

font = {'size': 20}
plt.rc('font', **font)

fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(10, 6))

# One dotted curve per CV fold ...
ax1.plot(m_log_lambdas, enCV_.mse_path_, ':')
# ... and the average over the folds as a solid black line
ax1.plot(m_log_lambdas, np.mean(enCV_.mse_path_, axis=-1),
         color='k',
         linewidth=2,
         label='Average across the folds')

# Mark the alpha that cross-validation selected
ax1.axvline(x=-np.log10(enCV_.alpha_),
            color='k',
            linestyle='--',
            label='alpha: CV estimate')

ax1.set(xlabel='-log10(lambda)',
        ylabel='Mean square error',
        title='Mean square error on each fold')
ax1.legend()
_ = ax1.axis('tight')

Write down the best regularisation strength chosen by sklearn and also one or two higher ones for you to test if you think you might improve your model...

Be careful when reading off the x-axis it is MINUS log10 of alpha.

**Performance over test**

We will try out some different choices on train-val data.

In real life you cannot fit models on test data. Also in Kaggle competitions you cannot do this.  The test data they give you (which is called the public leaderboard data) is not all of the test data, so if you overfit to it, you will do very poorly in the final scoring over the rest of the test data (which is called the private leaderboard). 

However, when finding the best solution, I did "cheat" by looking a few times at models on all of design data and seeing performance on test.  The code below is more correct, but at least for this assignment when you get to checking performance on test, you should try a few of your best models.

So, type in whatever l1_ratio you are testing and then some values for alpha.  See which does best and write it down

In [None]:
# TEMPLATE: this cell does not run until values are typed after alpha= and
# l1_ratio=. The model is fitted on the train folds only, so that the val
# folds remain unseen for the comparison that follows.
en_ = ElasticNet(alpha=#type value here
                 ,l1_ratio=#type value here
                 ,normalize=False
                 ,random_state=2018
                 ,selection='random'
                 ,max_iter=5000
                 )

en_.fit(X=X_train, y=y_train)

In [None]:
# Score the model that was just fitted on the train folds (en_).
# enCV_ was fitted on the design data, which includes the val folds, so
# predicting with it here would make the val MAE look better than it is and
# would ignore the alpha / l1_ratio typed into en_.
pred_train  = en_.predict(X_train)
pred_val    = en_.predict(X_val)
#pred_test   = en_.predict(X_test)

In [None]:
# MAE on train and val, followed by the gap between them (val minus train)
mae_train = fn_MAE(y_train, pred_train)
mae_val = fn_MAE(y_val, pred_val)

print("MAE: train:", mae_train)
print("MAE: val:", mae_val)
print(mae_val - mae_train)
#print("MAE: design:", fn_MAE(y_design, pred_design))
#print("MAE: test:",   fn_MAE(y_test,   pred_test))
#print(fn_MAE(y_test,   pred_test) - fn_MAE(y_design, pred_design))
# standardise everything after splines created gave similar results

**Final model**

Repeat the above process for a few different values of regularisation strength and l1_ratio. There is nothing to stop you trying a few different values here, though as discussed above it is bad practice in general.

In [None]:
# TEMPLATE: this cell does not run until your chosen values are typed after
# alpha= and l1_ratio=. The final model is fitted on all of the design data.
en_ = ElasticNet(alpha=#your choice here
                 ,l1_ratio=#your choice here
                 ,normalize=False
                 ,random_state=2018
                 ,selection='random'
                 ,max_iter=5000
                 )

en_.fit(X=X_design, y=y_design)

In [None]:
# Predictions from the final model on the data it was fitted on (design)
# and on the held-out test folds
pred_design = en_.predict(X_design)
pred_test   = en_.predict(X_test)

In [None]:
# MAE on design, MAE on test, then the gap between them (test minus design)
mae_design = fn_MAE(y_design, pred_design)
mae_test = fn_MAE(y_test, pred_test)

print(mae_design)
print(mae_test)
print(mae_test - mae_design)

### Create a function

Once you have something you are comfortable with, copy all of the code you used to create the models into the function below (making sure it is correctly indented).  To avoid you having to edit the code, we keep the name of the data passed to the function as df_all.

In [None]:
def fn_ames_en(df_all):
    # Prepare the Ames data and fit the final elastic net model.
    #
    # df_all: the raw data frame; presumably as loaded by pd.read_csv with
    #         the original dotted column names - confirm against the caller.
    # Returns (per the template return line at the bottom): the fitted model
    #         en_, the full feature matrix X, the target array y, and the
    #         hard-coded MAE on design and on test.
    #
    # NOTE: this is an unfinished TEMPLATE - the section after the data
    # preparation must be completed before the function is valid Python.
    # NOTE: the caller's frame is modified in place (columns renamed, three
    # columns dropped, rows with missing values dropped) before df_all is
    # rebound to new local frames further down.
    
    import re
    import numpy as np
    import pandas as pd
    import pickle
    
    from sklearn.linear_model import ElasticNetCV, ElasticNet
    
    import matplotlib.pyplot as plt

    # Lower-case a column name and replace '.' with '_'
    def convert(name):
        s1 = re.sub('\.', '_', name)
        return s1.lower()

    # Mean absolute error, rounded to zero decimal places
    def fn_MAE(actuals, predictions):
        return np.round(np.mean(np.abs(predictions - actuals)), 0)

    # Expand a numeric Series into its original values plus six
    # max(0, x - percentile) columns.
    # NOTE: the column labels come from `var` in the enclosing function (the
    # loop variable of the calling loop below), not from the argument.
    def fn_tosplines(x):
        x = x.values
        # hack: remove zeros to avoid issues where lots of values are zero
        x_nonzero = x[x != 0]
        ptiles = np.percentile(x_nonzero, [10, 20, 40, 60, 80, 90])
        #print(var, ptiles)
        df_ptiles = pd.DataFrame({var: x})
        for idx, ptile in enumerate(ptiles):
            df_ptiles[var + '_' + str(idx)] = np.maximum(0, x - ptiles[idx])
        return(df_ptiles)

    
    # change column names to snake case (lower case, '.' replaced by '_')
    colnames = df_all.columns.values
    colnames = list(map(convert, colnames))
    df_all.columns = colnames
    del convert, colnames
    
    # define variables
    vars_all = df_all.columns.values
    var_dep = ['saleprice']
    
    vars_notToUse = ['order', 'pid']
    vars_ind = [var for var in vars_all if var not in (vars_notToUse + var_dep)]
    vars_ind_numeric = list(df_all[vars_ind].columns[df_all[vars_ind].dtypes != 'object'])
    
    # Deal with missings as per 02a
    vars_toDrop = ['lot_frontage', 'garage_yr_blt', 'mas_vnr_area']
    df_all.drop(labels=vars_toDrop,
                axis=1,
                inplace=True)
    
    vars_ind = [var for var in vars_ind if var not in vars_toDrop]
    vars_ind_numeric = [var for var in vars_ind_numeric if var not in vars_toDrop]
    df_all.dropna(inplace = True)
    
    # remove outliers
    df_all = df_all[df_all['gr_liv_area'] <= 4000]
    df_all.reset_index(drop=True, inplace=True)
    
    # create onehot columns
    # (one indicator per category, minus the most frequent category)
    vars_ind_categorical = df_all.columns[df_all.dtypes == 'object'].tolist()
    vars_ind_onehot = []
    
    df_all_onehot = df_all.copy()
    
    for col in vars_ind_categorical:   
        # use pd.get_dummies on  df_all[col]
        df_oh = pd.get_dummies(df_all[col], drop_first=False) 
        # Find the name of the most frequent column
        col_mostFreq = df_oh.sum(axis = 0).idxmax()
        # Drop the column of the most frequent category (using df_oh.drop)
        df_oh = df_oh.drop(col_mostFreq, axis=1)
        # Rename the columns to have the original variable name as a prefix
        oh_names = col + '_' + df_oh.columns
        df_oh.columns = oh_names
        df_all_onehot = pd.concat([df_all_onehot, df_oh], axis = 1, sort = False)
        del df_all_onehot[col]
        vars_ind_onehot.extend(oh_names)
        
    # create fold
    # (fold numbers 0..9 drawn with a fixed seed)
    rng = np.random.RandomState(2018)
    fold = rng.randint(0, 10, df_all.shape[0])
    df_all_onehot['fold'] = fold
        
    # rename df_all_onehot to df_all as this is now the data we will be using for
    # the rest of this work
    df_all = df_all_onehot
    del df_all_onehot
        
    # define index for train, val, design, test
    # (train = folds 0-5, val = 6-7, design = 0-7, test = 8-9)
    idx_train  = np.where(df_all['fold'].isin(np.arange(0,6)))[0]
    idx_val    = np.where(df_all['fold'].isin([6,7]))[0]
    idx_design = np.where(df_all['fold'].isin(np.arange(0,8)))[0]
    idx_test   = np.where(df_all['fold'].isin([8,9]))[0]
   
    # standardise features
    # (subtract the mean, then divide by the root mean square of the result)
    for var in vars_ind_numeric:
        x = df_all[var].values
        x -= np.mean(x, axis=0)
        x /= np.sqrt(np.mean(x ** 2, axis=0))
        df_all[var] = x
            
    # numeric features with more than 8 distinct values get a spline basis
    vars_ind_tospline = df_all[vars_ind_numeric].columns[(df_all[vars_ind_numeric].nunique() > 8)].tolist()
            
    # the loop variable must stay named `var`: fn_tosplines reads it
    for var in vars_ind_tospline:
        df_ptiles = fn_tosplines(df_all[var])
        df_all.drop(columns=[var], inplace=True)
        vars_ind_numeric.remove(var)
        df_all = pd.concat([df_all, df_ptiles], axis=1, sort=False)
        vars_ind_numeric.extend(df_ptiles.columns.tolist())
                
    vars_ind = vars_ind_onehot + vars_ind_numeric
                
    X = df_all[vars_ind].values
    y = df_all[var_dep].values
                
    X_design = X[idx_design, :]
    X_test   = X[idx_test, :]
    y_design = df_all[var_dep].iloc[idx_design].copy().values.ravel()
    y_test   = df_all[var_dep].iloc[idx_test].copy().values.ravel()
                
    # NOTE(review): the statements from here to y_test repeat the
    # assignments just above (and add the train / val subsets)
    X = df_all[vars_ind].values
    y = df_all[var_dep].values
                
    X_train  = X[idx_train, :]
    X_val    = X[idx_val, :]
    X_design = X[idx_design, :]
    X_test   = X[idx_test, :]
                
    y_train  = df_all[var_dep].iloc[idx_train].copy().values.ravel()
    y_val    = df_all[var_dep].iloc[idx_val].copy().values.ravel()
    y_design = df_all[var_dep].iloc[idx_design].copy().values.ravel()
    y_test   = df_all[var_dep].iloc[idx_test].copy().values.ravel()
                
    # Copy enough of your ElasticNetCV code here so that I can see one of your experiments
    # and get an idea of the method you used to tune the hyper parameters

        
    # Now copy the code for your final model here
    # TEMPLATE: everything below is a placeholder to be filled in
    en_ = ElasticNet()
    
    en_ = en_.fit()

    pred_design  =
    pred_test    =
                
    # calculate MAE on test and non test but then hard code in the return statement
    mae_design =
    mae_test =
                
    return en_, X, y, hard code your mae_design here eg 14123, also mae_test eg 13321


Once you have created your function, copy it to a plain text .py file and save it in your PCode directory. Now if you have time, test it with the test notebook provided