
# Basic Overview
The objective is to build a linear regression model by adding and transforming predictors step by step.

Comments/criticisms/appreciations are greatly accepted and appreciated. Do not be shy and send me an email at babinu@gmail.com !

Source of data : https://www.kaggle.com/c/house-prices-advanced-regression-techniques

In [6]:
import pandas as pd
import numpy as np
import sys    
sys.path.append('../../common_routines/')
import numpy as np
from relevant_functions import\
    evaluate_model_score_given_predictions,\
    evaluate_model_score,\
    evaluate_neg_model_score,\
    cross_val_score_given_model,\
    fit_pipeline_and_cross_validate, \
    fit_pipeline_and_evaluate_on_validation_set, \
    print_model_stats_from_pipeline, \
    get_validated_transformed_data

In [7]:
complete_train_data = pd.read_csv("../../input/train.csv")
test_data = pd.read_csv("../../input/test.csv")

In [8]:
# Dump alll the dataframes with one hot encoding.
train_data_one_hot = pd.read_csv('../../cleaned_input/train_data_one_hot.csv')
validation_data_one_hot = pd.read_csv('../../cleaned_input/validation_data_one_hot.to_csv')
test_data_one_hot =pd.read_csv('../../cleaned_input/test_data_one_hot.csv')

# Dump the data frames prior to taking the one hot encoding transformation.
# Remember that we had handled the null values at the stage and hence the model
# does not need to worry about the same.
train_data = pd.read_csv('../../cleaned_input/train_data.csv')
validation_data = pd.read_csv('../../cleaned_input/validation_data.csv')


### Building simple models.

Now that we have finally taken care of data, let us go to the next step by building simple models. This way, we will be able to appreciate the contribution of every variable and will learn step by step model building !!!

#### We build a model after transforming response.

We could build a linear model predicting the sale price directly and then judge how good it is, by evaluating using the logarithm of the sale price. Alternatively, we could take the logarithm of the sale price at first and use that as the predictor variable. 

We will be using the second approach as that looks more appropriate from the data plots for SalePrice.

#### Add LogMiscVal as predictor

In [9]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer
from sklearn import linear_model
fit_pipeline_and_cross_validate(
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data_one_hot , 
    X_columns=['LogMiscVal'])[1] 

0.4022329940035444

In [10]:
train_data_one_hot['LogMiscVal']

0       0.000000
1       0.000000
2       0.000000
3       0.000000
4       0.000000
5       6.552508
6       0.000000
7       5.860786
8       0.000000
9       0.000000
10      0.000000
11      0.000000
12      0.000000
13      0.000000
14      0.000000
15      0.000000
16      6.552508
17      6.216606
18      0.000000
19      0.000000
20      0.000000
21      0.000000
22      0.000000
23      0.000000
24      0.000000
25      0.000000
26      0.000000
27      0.000000
28      0.000000
29      0.000000
          ...   
1065    0.000000
1066    0.000000
1067    0.000000
1068    0.000000
1069    0.000000
1070    0.000000
1071    0.000000
1072    0.000000
1073    0.000000
1074    0.000000
1075    0.000000
1076    6.216606
1077    0.000000
1078    0.000000
1079    0.000000
1080    0.000000
1081    0.000000
1082    0.000000
1083    6.552508
1084    0.000000
1085    0.000000
1086    0.000000
1087    0.000000
1088    0.000000
1089    0.000000
1090    0.000000
1091    0.000000
1092    0.0000

#### Add LogGrLivArea as predictor

In [11]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer
from sklearn import linear_model
fit_pipeline_and_cross_validate(
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data, 
    X_columns=['LogGrLivArea'])[1]

0.27161716540345976

### Checking out categorical variables.

Rather than dumping all the one hot encoded values of a categorical variable to the model, we try a slightly different approach here. 

We group the saleprices per category and try to see which of the categories , would result in a high average sale price. Depending on that we would come up with a synthetic indicator  variable that would be set to 1, if the category belonged to the one corresponding to the  high saleprice and 0 otherwise.

#### Column : MSSubClass

In [12]:
def get_mean_count_per_group(train_data, group_col):
    results_df = pd.DataFrame(train_data[['SalePrice', group_col]].groupby([group_col]).size())
    results_df['mean_SalePrice'] =  train_data[['SalePrice', group_col]].groupby(group_col).SalePrice.mean()
    results_df['mean_LogSalePrice'] =  train_data[['LogSalePrice', group_col]].groupby(group_col).LogSalePrice.mean()

  
    # Added later one.
    if 'LogSalePricePerSqFeet' in train_data.columns:
        results_df['mean_LogSalePricePerSqFeet'] =  \
            train_data[['LogSalePricePerSqFeet', group_col]].groupby(group_col).LogSalePricePerSqFeet.mean()    
        results_df.columns = ['Count', 'mean_SalePrice', 'mean_LogSalePrice', 'mean_LogSalePricePerSqFeet']
    else:
        results_df.columns = ['Count', 'mean_SalePrice', 'mean_LogSalePrice']
    results_df['percent_total_size'] = results_df['Count'] * 100.0/len(train_data)
    return results_df

In [13]:
results_df = get_mean_count_per_group(train_data, 'MSSubClass')

In [14]:
results_df.sort_values(['mean_SalePrice'], ascending=False)

Unnamed: 0_level_0,Count,mean_SalePrice,mean_LogSalePrice,percent_total_size
MSSubClass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
60,232,241847.021552,12.349193,21.187215
75,14,200392.857143,12.099105,1.278539
120,68,198945.705882,12.16666,6.210046
20,403,184686.218362,12.048144,36.803653
80,38,168185.526316,12.009529,3.47032
70,41,166279.463415,11.949679,3.744292
85,15,151113.333333,11.91859,1.369863
50,100,140537.78,11.80006,9.13242
160,49,138363.938776,11.804102,4.474886
90,39,133278.897436,11.779705,3.561644


In [15]:
def transform_column_train_validation_test(train_data_one_hot,
                                           validation_data_one_hot,
                                           test_data_one_hot,
                                           target_col,
                                           source_cols):

    train_data_one_hot[target_col] = 0
    validation_data_one_hot[target_col] = 0
    test_data_one_hot[target_col] = 0
    
    for col in source_cols:
        train_data_one_hot[target_col] += train_data_one_hot[col]
        validation_data_one_hot[target_col] += validation_data_one_hot[col]
        test_data_one_hot[target_col] += test_data_one_hot[col]        


In [16]:
transform_column_train_validation_test(train_data_one_hot, validation_data_one_hot, test_data_one_hot,
                                      'MSSubClass_60_75_120_20', 
                                      ['MSSubClass_120', 'MSSubClass_60', 'MSSubClass_20', 'MSSubClass_75'])

In [17]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer
from sklearn import linear_model
fit_pipeline_and_cross_validate(
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data_one_hot, 
    X_columns=['LogGrLivArea', 'MSSubClass_60_75_120_20'])[1]

0.23998906193907085

#### Another thought ?

Someone might claim that the average sale price for that zone (in logarithmic terms) would measure the importance of that zone and hence that might be a better indicator than the one used above. Let us see if that is the case.

NOTE : I would expect this to be an inferior indicator than the one which we used above, since the values simply look too close to each other and do not look to capture the differences in subclasses sufficiently.

In [18]:
subclass_to_LogSalePrice = dict(zip(results_df.index, results_df.mean_LogSalePrice)) 

In [19]:
subclass_to_LogSalePrice

{20: 12.04814412706041,
 30: 11.43668923670398,
 40: 11.671084169224912,
 45: 11.579032983343444,
 50: 11.800059519050276,
 60: 12.349192704221739,
 70: 11.949678887429084,
 75: 12.09910527148951,
 80: 12.009529285935356,
 85: 11.918589623636617,
 90: 11.779705171526395,
 120: 12.166660247259609,
 160: 11.80410228120793,
 180: 11.407723715624837,
 190: 11.743054692745964}

In [20]:
train_data['MSSubClass_Val'] = train_data['MSSubClass'].apply (lambda x : subclass_to_LogSalePrice.get(x))

In [21]:
fit_pipeline_and_cross_validate(
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data, 
    X_columns=['LogGrLivArea', 'MSSubClass_Val'])[1]

0.25140820156722005

Comment : As expected , this variable does not look to give us a better result and hence we stick with the earlier one.

#### Column : MSZoning

In [22]:
results_df = get_mean_count_per_group(train_data, 'MSZoning')

In [23]:
results_df.sort_values(['mean_SalePrice'], ascending=False)

Unnamed: 0_level_0,Count,mean_SalePrice,mean_LogSalePrice,percent_total_size
MSZoning,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FV,53,216483.132075,12.256738,4.840183
RL,859,191861.142026,12.090554,78.447489
RH,12,133994.5,11.771509,1.09589
RM,162,123783.141975,11.671673,14.794521
C (all),9,75208.888889,11.116607,0.821918


In [24]:
transform_column_train_validation_test(train_data_one_hot,
                                       validation_data_one_hot,
                                       test_data_one_hot,
                                       'MSZoning_FV_RL',
                                       ['MSZoning_FV', 'MSZoning_RL'])


In [25]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer
from sklearn import linear_model
fit_pipeline_and_cross_validate( 
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data_one_hot, 
    X_columns=['LogGrLivArea', 'MSSubClass_60_75_120_20', 'MSZoning_FV_RL'])[1]

0.23218002510204933

#### Add OverallQual


In [26]:
results_df = get_mean_count_per_group(train_data, 'OverallQual')

In [27]:
results_df.sort_values(['mean_SalePrice'], ascending=False)

Unnamed: 0_level_0,Count,mean_SalePrice,mean_LogSalePrice,percent_total_size
OverallQual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10,12,426924.25,12.915566,1.09589
9,37,370448.594595,12.798459,3.378995
8,125,276423.744,12.505475,11.415525
7,242,208365.752066,12.224535,22.100457
6,279,161053.229391,11.964026,25.479452
5,298,132138.312081,11.769357,27.214612
4,83,108032.373494,11.552726,7.579909
3,15,89298.333333,11.352054,1.369863
1,2,50150.0,10.798804,0.182648
2,2,47655.5,10.737025,0.182648


In [28]:
fit_pipeline_and_cross_validate( 
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data_one_hot, 
    X_columns=['LogGrLivArea', 'MSSubClass_60_75_120_20', 'OverallQual'])[1]

0.18337479641918253

#### Add OverallCond


In [29]:
results_df = get_mean_count_per_group(train_data, 'OverallCond')

In [30]:
results_df.sort_values(['mean_SalePrice'], ascending=False)

Unnamed: 0_level_0,Count,mean_SalePrice,mean_LogSalePrice,percent_total_size
OverallCond,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9,14,232800.0,12.281401,1.278539
5,616,204137.683442,12.149918,56.255708
8,56,155671.875,11.913001,5.114155
7,151,155629.172185,11.90348,13.789954
6,192,154531.083333,11.887697,17.534247
2,5,141986.4,11.593353,0.456621
4,41,124027.609756,11.677698,3.744292
3,19,98333.421053,11.392669,1.73516
1,1,61000.0,11.018629,0.091324


In [31]:
fit_pipeline_and_cross_validate( 
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data_one_hot, 
    X_columns=['LogGrLivArea', 'MSSubClass_60_75_120_20', 'OverallQual', 'OverallCond'])[1]

0.18081723445902273

Comment : Can't we combine Qverall quality and condition variables ? Is it necessary to have 2 separate variables here ?

We do look to be some predictive power here, so let us keep both of them. However, at some point in our model building, we may find that the predictive power associated with OverallCond is actually given by other variables as well and hence we do not need it any longer.

#### Add column Neighbourhood

In [32]:
results_df = get_mean_count_per_group(train_data, 'Neighborhood')

In [33]:
results_df.sort_values(['mean_SalePrice'], ascending=False)

Unnamed: 0_level_0,Count,mean_SalePrice,mean_LogSalePrice,percent_total_size
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NridgHt,64,321781.359375,12.637217,5.844749
StoneBr,22,320657.954545,12.617717,2.009132
NoRidge,27,318553.333333,12.637571,2.465753
Timber,28,243325.964286,12.368274,2.557078
Veenker,8,240062.5,12.340842,0.730594
Somerst,65,227413.692308,12.303509,5.936073
ClearCr,24,214159.666667,12.245548,2.191781
Crawfor,34,199796.294118,12.156602,3.105023
CollgCr,114,195479.008772,12.15186,10.410959
Blmngtn,14,194023.357143,12.165725,1.278539


In [34]:
train_data['SalePrice'].describe()

count      1095.000000
mean     181388.136986
std       79728.891687
min       34900.000000
25%      130000.000000
50%      163000.000000
75%      214250.000000
max      755000.000000
Name: SalePrice, dtype: float64

In [35]:
transform_column_train_validation_test(train_data_one_hot, validation_data_one_hot, test_data_one_hot,
                                       'Neighbourhood_Good',
                                       ['Neighborhood_NridgHt', 'Neighborhood_StoneBr', 
                                        'Neighborhood_NoRidge', 'Neighborhood_Timber', 
                                        'Neighborhood_Veenker', 'Neighborhood_Somerst', 
                                        'Neighborhood_ClearCr', 'Neighborhood_Crawfor', 
                                        'Neighborhood_CollgCr', 'Neighborhood_Blmngtn',
                                        'Neighborhood_NWAmes', 'Neighborhood_Gilbert', 'Neighborhood_SawyerW'])

In [36]:
fit_pipeline_and_cross_validate( 
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data_one_hot, 
    X_columns=['LogGrLivArea', 'MSSubClass_60_75_120_20', 'OverallQual', 'OverallCond', 'Neighbourhood_Good'])[1]

0.17366550584852522

Comment : Again , does taking the logarithm of mean of the selling price for each neighbourhood and using that as the indicator value for each neighbourhood help ?


In [37]:
neighbourhood_to_LogSalePrice = dict(zip(results_df.index, results_df.mean_LogSalePrice)) 

In [38]:
def transform_column_train_validation_test_from_dict(train_data,
                                                     validation_data,
                                                     test_data,
                                                     train_data_one_hot,
                                                     validation_data_one_hot,
                                                     test_data_one_hot,
                                                     target_col,
                                                     source_col,
                                                     given_dict):  
  
    train_data[target_col] = train_data[source_col].apply (lambda x : given_dict.get(x))
    validation_data[target_col] = validation_data[source_col].apply (lambda x : given_dict.get(x))
    test_data[target_col] = test_data[source_col].apply (lambda x : given_dict.get(x))    

    train_data_one_hot[target_col] = train_data[target_col]
    validation_data_one_hot[target_col] = validation_data[target_col]
    test_data_one_hot[target_col] = test_data[target_col]


In [39]:
transform_column_train_validation_test_from_dict(train_data,
                                                 validation_data,
                                                 test_data,
                                                 train_data_one_hot,
                                                 validation_data_one_hot,
                                                 test_data_one_hot,
                                                 'Neighborhood_Val',
                                                 'Neighborhood',
                                                 neighbourhood_to_LogSalePrice)                                                 


In [40]:
(my_pipeline, cross_validation_score) = fit_pipeline_and_cross_validate( 
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data_one_hot, 
    X_columns=['LogGrLivArea', 'MSSubClass_60_75_120_20', 'OverallQual', 'OverallCond', 'Neighborhood_Val'])

In [41]:
cross_validation_score

0.16071596412853448

Comment : This was a pleasant surprice. Let us delve into this more deeply.


In [42]:
my_pipeline

Pipeline(memory=None,
     steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('linearregression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))])

In [43]:
my_model = my_pipeline.named_steps['linearregression']

In [44]:
my_model.intercept_

2.9241571949922722

In [45]:
train_data_one_hot['LogSalePrice'].mean()

12.02512502918356

In [46]:
my_model.coef_

array([0.45268743, 0.10927628, 0.09974205, 0.04102905, 0.40766989])

Comment : From first glance ,we can see the following :

(a) The magnitude of coefficients is much higher for LogGrLivArea and Neighborhood_Val , possibly because they are in the same logarithmic response scale as the predictor variable.
(b) The intercept term is roughly 25% of the predictor variable, indicating that there is lot of improvement possible.

However, there is an important caveat here. We have used the average values across the entire data, while doing cross validation, and hence the hold out sets used during cross validation are not 'fully held out' sets in strictest terms. So, before we proceed further, we need to make sure the performance is more or less the same, or completely held out validation set.

In [47]:
(my_pipeline, validation_score) = fit_pipeline_and_evaluate_on_validation_set( 
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data_one_hot, 
    validation_data_one_hot,
    X_columns=['LogGrLivArea', 'MSSubClass_60_75_120_20', 'OverallQual', 'OverallCond', 'Neighborhood_Val'])

In [48]:
validation_score

0.1624076804255766

Comment : As expected, we see a slight bump in the score, but thankfully it is still in the ballpark

### Question regarding OverallQual and OverallCond ?

Now, that we have seen how dramatically the model performance improved when we used the averge home prices as the value for a neighbourhood, can we do the same for OverallQual and OverallCond ? 

Putting it more clearly, rather than directly using the numerical values of these variables, can't we use the mean log sale price grouped per value of OverallQual (or OverallCond) and use that as a proxy for quality/ condition ?

Let us see if this gives us any improvement.

In [49]:
results_df = get_mean_count_per_group(train_data, 'OverallQual')

In [50]:
results_df

Unnamed: 0_level_0,Count,mean_SalePrice,mean_LogSalePrice,percent_total_size
OverallQual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2,50150.0,10.798804,0.182648
2,2,47655.5,10.737025,0.182648
3,15,89298.333333,11.352054,1.369863
4,83,108032.373494,11.552726,7.579909
5,298,132138.312081,11.769357,27.214612
6,279,161053.229391,11.964026,25.479452
7,242,208365.752066,12.224535,22.100457
8,125,276423.744,12.505475,11.415525
9,37,370448.594595,12.798459,3.378995
10,12,426924.25,12.915566,1.09589


In [51]:
overallQual_to_LogSalePrice = dict(zip(results_df.index, results_df.mean_LogSalePrice)) 

In [52]:
transform_column_train_validation_test_from_dict(train_data,
                                                 validation_data,
                                                 test_data,
                                                 train_data_one_hot,
                                                 validation_data_one_hot,
                                                 test_data_one_hot,
                                                 'OverallQual_Val',
                                                 'OverallQual',
                                                 overallQual_to_LogSalePrice)                                                 


In [53]:
(my_pipeline, cross_validation_score) = fit_pipeline_and_cross_validate( 
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data_one_hot, 
    X_columns=['LogGrLivArea', 'MSSubClass_60_75_120_20', 'OverallQual_Val', 'OverallCond', 'Neighborhood_Val'])
print(cross_validation_score)

0.1593692076533335


Comment : That looked decent. Let us see how it performs on the validation set.

In [54]:
(my_pipeline, cross_validation_score) = fit_pipeline_and_evaluate_on_validation_set( 
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data_one_hot, 
    validation_data_one_hot,
    X_columns=['LogGrLivArea', 'MSSubClass_60_75_120_20', 'OverallQual_Val', 'OverallCond', 'Neighborhood_Val'])
print(cross_validation_score)

0.1620311693797422


Comment: This is rather miniscule on the validation set. Let us try the same with OverallCond

In [55]:
transform_column_train_validation_test_from_dict(train_data,
                                                 validation_data,
                                                 test_data,
                                                 train_data_one_hot,
                                                 validation_data_one_hot,
                                                 test_data_one_hot,
                                                 'OverallCond_Val',
                                                 'OverallCond',
                                                 overallQual_to_LogSalePrice)                                                 


In [56]:
(my_pipeline, cross_validation_score) = fit_pipeline_and_cross_validate( 
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data_one_hot, 
    X_columns=['LogGrLivArea', 'MSSubClass_60_75_120_20', 'OverallQual_Val', 'OverallCond', 'Neighborhood_Val'])
print(cross_validation_score)

0.1593692076533335


In [57]:
(my_pipeline, cross_validation_score) = fit_pipeline_and_evaluate_on_validation_set( 
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data_one_hot, 
    validation_data_one_hot,
    X_columns=['LogGrLivArea', 'MSSubClass_60_75_120_20', 'OverallQual_Val', 'OverallCond', 'Neighborhood_Val'])
print(cross_validation_score)

0.1620311693797422


Comment : As expected the results are a lot better while we do cross validation, since we are using the mean values obtained from the same sample, while the improvement in real out of sample testing is scarce. 

One reason is that OverallQual already was a numeric variable, and hence making it further numeric(in a sense, by using mean values) does not give us any real benefit. To make things simple, let us stick with the original numeric value.

#### Type of road access.

This should have an obvious effect on the home price. Let us see if that is the case !

In [58]:
rel_x_cols = ['LogGrLivArea', 'MSSubClass_60_75_120_20', 'OverallQual', 'OverallCond', 'Neighborhood_Val', 'Street_Grvl']

In [59]:
(my_pipeline, cross_validation_score) = fit_pipeline_and_cross_validate( 
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data_one_hot, 
    X_columns=rel_x_cols)
print(cross_validation_score)

0.16159832465959648


In [60]:
(my_pipeline, cross_validation_score) = fit_pipeline_and_evaluate_on_validation_set( 
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data_one_hot, 
    validation_data_one_hot,
    X_columns=rel_x_cols)
print(cross_validation_score)

0.16237566357698213


Comment : Does not provide us any benefit. Let us first have a look at the data (should have done this initially !)

In [61]:
results_df = get_mean_count_per_group(train_data, 'Street')

In [62]:
results_df

Unnamed: 0_level_0,Count,mean_SalePrice,mean_LogSalePrice,percent_total_size
Street,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Grvl,5,118888.6,11.57347,0.456621
Pave,1090,181674.83211,12.027197,99.543379


Comment : This illustrates the point. The number of homes having street as Grvl is extremely small and hence this help us much in predictions !

#### Home functionality.

This is basic !!

In [63]:
results_df = get_mean_count_per_group(train_data, 'Functional')

In [64]:
results_df.sort_values(['mean_SalePrice'], ascending=False)

Unnamed: 0_level_0,Count,mean_SalePrice,mean_LogSalePrice,percent_total_size
Functional,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Typ,1017,184071.882006,12.041407,92.876712
Mod,12,167333.333333,11.822068,1.09589
Maj1,10,155377.4,11.842128,0.913242
Min2,25,149167.28,11.875336,2.283105
Min1,25,142798.0,11.835228,2.283105
Sev,1,129000.0,11.767568,0.091324
Maj2,5,85800.0,11.316555,0.456621


Again, we do not see possibility of much of a benefit as more than 92% of the homes have typical functionality.However , let us check if we can get something out of it.

In [65]:
rel_x_cols = ['LogGrLivArea', 
              'MSSubClass_60_75_120_20', 
              'OverallQual', 
              'OverallCond', 
              'Neighborhood_Val', 
              'Functional_Typ']

In [66]:
(my_pipeline, cross_validation_score) = fit_pipeline_and_cross_validate( 
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data_one_hot, 
    X_columns=rel_x_cols)
print(cross_validation_score)

0.16028394663283646


In [67]:
(my_pipeline, cross_validation_score) = fit_pipeline_and_evaluate_on_validation_set( 
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data_one_hot, 
    validation_data_one_hot,
    X_columns=rel_x_cols)
print(cross_validation_score)

0.16218223743099083


Not much of benefit and hence moving on ....

#### Check out LotShape

Let us check if this helps !!!

In [68]:
results_df = get_mean_count_per_group(train_data, 'LotShape')

In [69]:
results_df.sort_values(['mean_SalePrice'], ascending=False)

Unnamed: 0_level_0,Count,mean_SalePrice,mean_LogSalePrice,percent_total_size
LotShape,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
IR2,36,239407.444444,12.30804,3.287671
IR3,4,219625.0,12.133117,0.365297
IR1,364,203873.071429,12.153877,33.242009
Reg,691,166299.629522,11.941938,63.105023


Okay, there looks to be something here. There looks to be a clear demarcation between regular lots and others. Let us check if this adds any predictive power here.

In [70]:
rel_x_cols = ['LogGrLivArea', 
              'MSSubClass_60_75_120_20', 
              'OverallQual', 
              'OverallCond', 
              'Neighborhood_Val', 
              'LotShape_Reg']

In [71]:
(my_pipeline, cross_validation_score) = fit_pipeline_and_cross_validate( 
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data_one_hot, 
    X_columns=rel_x_cols)
print(cross_validation_score)

0.16007734733870088


In [72]:
(my_pipeline, cross_validation_score) = fit_pipeline_and_evaluate_on_validation_set( 
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data_one_hot, 
    validation_data_one_hot,
    X_columns=rel_x_cols)
print(cross_validation_score)

0.16219009333663442


Okay, now that we do not see much of a benefit, let us see if creating synthetic indicator with grouped mean values, as we did for Neighborhood variable would help.

In [73]:
lotShape_to_LogSalePrice = dict(zip(results_df.index, results_df.mean_LogSalePrice)) 

In [74]:
transform_column_train_validation_test_from_dict(train_data,
                                                 validation_data,
                                                 test_data,
                                                 train_data_one_hot,
                                                 validation_data_one_hot,
                                                 test_data_one_hot,
                                                 'LotShape_Val',
                                                 'OverallCond',
                                                 lotShape_to_LogSalePrice)                                                 


In [75]:
rel_x_cols = ['LogGrLivArea', 
              'MSSubClass_60_75_120_20', 
              'OverallQual', 
              'OverallCond', 
              'Neighborhood_Val', 
              'LotShape_Val']

In [76]:
(my_pipeline, cross_validation_score) = fit_pipeline_and_cross_validate( 
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data_one_hot, 
    X_columns=rel_x_cols)
print(cross_validation_score)

0.16071596412853448


In [77]:
(my_pipeline, cross_validation_score) = fit_pipeline_and_evaluate_on_validation_set( 
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data_one_hot, 
    validation_data_one_hot,
    X_columns=rel_x_cols)
print(cross_validation_score)

0.1624076804255766


The previous indicator was better than this one. There looks a little bit of predictive power here, but not good enough to elicit our interest. Hence, let us move on !

#### Land Contour !

In [78]:
results_df = get_mean_count_per_group(train_data, 'LandContour')

In [79]:
results_df.sort_values(['mean_SalePrice'], ascending=False)

Unnamed: 0_level_0,Count,mean_SalePrice,mean_LogSalePrice,percent_total_size
LandContour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HLS,39,220733.564103,12.210646,3.561644
Low,28,204135.714286,12.09917,2.557078
Lvl,986,180678.94929,12.023793,90.045662
Bnk,42,146337.071429,11.834757,3.835616


Most of properties are flat, and hence we do not be getting much sense from the use of this data !

#### Utilities 

In [80]:
results_df = get_mean_count_per_group(train_data, 'Utilities')

In [81]:
results_df.sort_values(['mean_SalePrice'], ascending=False)

Unnamed: 0_level_0,Count,mean_SalePrice,mean_LogSalePrice,percent_total_size
Utilities,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AllPub,1094,181428.254113,12.025302,99.908676
NoSeWa,1,137500.0,11.831379,0.091324


The conclusion is obvious !

In [82]:
results_df = get_mean_count_per_group(train_data, 'LotConfig')

In [83]:
results_df.sort_values(['mean_SalePrice'], ascending=False)

Unnamed: 0_level_0,Count,mean_SalePrice,mean_LogSalePrice,percent_total_size
LotConfig,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CulDSac,70,217030.242857,12.222853,6.392694
Corner,204,179629.906863,12.024583,18.630137
Inside,786,178854.849873,12.007129,71.780822
FR2,33,177832.727273,12.039213,3.013699
FR3,2,167500.0,12.00013,0.182648


A reasonable number of the properties look to be coming in insids lots. Though we would not expected much , let us still see if this information would yield us something useful.

In [84]:
rel_x_cols = ['LogGrLivArea', 
              'MSSubClass_60_75_120_20', 
              'OverallQual', 
              'OverallCond', 
              'Neighborhood_Val', 
              'LotConfig_Inside']

In [85]:
(my_pipeline, cross_validation_score) = fit_pipeline_and_cross_validate( 
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data_one_hot, 
    X_columns=rel_x_cols)
print(cross_validation_score)

0.16075783908320504


In [86]:
(my_pipeline, cross_validation_score) = fit_pipeline_and_evaluate_on_validation_set( 
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data_one_hot, 
    validation_data_one_hot,
    X_columns=rel_x_cols)
print(cross_validation_score)

0.1621394903119038


As expected, there does not look to be anything much here.

#### LandSlope

In [87]:
results_df = get_mean_count_per_group(train_data, 'LandSlope')

In [88]:
results_df.sort_values(['mean_SalePrice'], ascending=False)

Unnamed: 0_level_0,Count,mean_SalePrice,mean_LogSalePrice,percent_total_size
LandSlope,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Sev,11,203357.272727,12.113266,1.004566
Mod,48,198929.5625,12.093006,4.383562
Gtl,1036,180342.143822,12.021044,94.611872


There is not much point in moving forward here, as a huge majority (more than 94%) of the plots have a gentle slope.

#### Conditions.

Let us have a look at the condition variables here. There are 2 variables here , indicating that we can have more than one condition for a house.

In [89]:
train_data['Condition1'].unique()

array(['Norm', 'Feedr', 'PosN', 'Artery', 'RRAe', 'RRNn', 'RRAn', 'PosA',
       'RRNe'], dtype=object)

In [90]:
train_data['Condition2'].unique()

array(['Norm', 'Artery', 'RRNn', 'Feedr', 'PosN', 'PosA', 'RRAn'],
      dtype=object)

Let us check those cases when we have a genuine case of 2 conditions, that is when one of the conditions is not 'Normal'.

In [91]:
train_data[(train_data['Condition1'] != 'Norm') & 
           (train_data['Condition2'] != 'Norm') &
           (train_data['Condition1'] != train_data['Condition2'])]\
[['Id', 'Condition1', 'Condition2']]

Unnamed: 0,Id,Condition1,Condition2
29,30,Feedr,RRNn
63,64,RRAn,Feedr
184,185,RRAn,Feedr
531,532,RRNn,Feedr
548,549,Feedr,RRNn
583,584,Artery,PosA
589,590,RRAn,Feedr
974,975,RRAn,Feedr
1003,1004,Feedr,RRAn


The number looks too few. Let us check those cases when both conditions are not 'Normal'.

In [92]:
train_data[(train_data['Condition1'] != 'Norm') |
           (train_data['Condition2'] != 'Norm')]\
[['Id', 'Condition1', 'Condition2']]

Unnamed: 0,Id,Condition1,Condition2
1,2,Feedr,Norm
7,8,PosN,Norm
8,9,Artery,Norm
9,10,Artery,Artery
18,19,RRAe,Norm
29,30,Feedr,RRNn
30,31,Feedr,Norm
52,53,RRNn,Norm
60,61,RRAe,Norm
63,64,RRAn,Feedr


Now, this means that roughly 13% of the data, has nothing apart from 'Normal' condition.

This brings into question , the effectiveness of a predictor built around this variable, as it is going to have one value for almost 87% of the data.

Let us have another look at the number of cases when the conditions are listed differently.

In [93]:
train_data[train_data['Condition1'] != train_data['Condition2']][['Id', 'Condition1', 'Condition2']]

Unnamed: 0,Id,Condition1,Condition2
1,2,Feedr,Norm
7,8,PosN,Norm
8,9,Artery,Norm
18,19,RRAe,Norm
29,30,Feedr,RRNn
30,31,Feedr,Norm
52,53,RRNn,Norm
60,61,RRAe,Norm
63,64,RRAn,Feedr
66,67,PosA,Norm


After going through all this, let us have a first stab at using this data. 

Let us start with a variable, that would simply add the norm conditions.

In [94]:
transform_column_train_validation_test(train_data_one_hot, validation_data_one_hot, test_data_one_hot,
                                       'Condition_Norm',
                                       ['Condition1_Norm', 'Condition2_Norm'])



In [95]:
train_data_one_hot['Condition_Norm'] = train_data_one_hot['Condition_Norm'].apply(lambda x : np.bool(x))
validation_data_one_hot['Condition_Norm'] = validation_data_one_hot['Condition_Norm'].apply(lambda x : np.bool(x))
test_data_one_hot['Condition_Norm'] = test_data_one_hot['Condition_Norm'].apply(lambda x : np.bool(x))

In [96]:
train_data_one_hot['Condition_Norm'].describe()

count     1095
unique       2
top       True
freq      1082
Name: Condition_Norm, dtype: object

In [97]:
train_data_one_hot['Condition_Norm'].unique()

array([ True, False])

In [98]:
rel_x_cols = ['LogGrLivArea', 
              'MSSubClass_60_75_120_20', 
              'OverallQual', 
              'OverallCond', 
              'Neighborhood_Val', 
              'Condition_Norm']

In [99]:
(my_pipeline, cross_validation_score) = fit_pipeline_and_cross_validate( 
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data_one_hot, 
    X_columns=rel_x_cols)  
print(cross_validation_score)

0.16087865481119112


In [100]:
(my_pipeline, cross_validation_score) = fit_pipeline_and_evaluate_on_validation_set( 
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data_one_hot, 
    validation_data_one_hot,
    X_columns=rel_x_cols)
print(cross_validation_score)

0.1627002604384478


Comment : Okay, that is an encouragement. We do looking to be getting a boost here. The question is whether we can refine it further.

This is the refined indicator which we propose :

(i) For each value of the Condition variable, compute the average selling price for the cases when one of the variable (Condition1 or Condition2) is equal to the Condition Variable.
(ii) Now, going over each record, do the following :

    (a) Add the average log Sale price for each of the Conditions present in Condition1 and Condition 2 columns.
    
    
This way, rather thna simply adding occurences, we add the mean average log Sale price , for each condition, which makes more sense.

In [101]:
train_data['Condition1'].unique()

array(['Norm', 'Feedr', 'PosN', 'Artery', 'RRAe', 'RRNn', 'RRAn', 'PosA',
       'RRNe'], dtype=object)

In [102]:
train_data['Condition2'].unique()

array(['Norm', 'Artery', 'RRNn', 'Feedr', 'PosN', 'PosA', 'RRAn'],
      dtype=object)

In [103]:
def get_condition_to_logSalePrice(train_data):
    condition_to_logSalePrice = dict()
    conditions = train_data['Condition1'].unique()
    for condition in conditions:
        #print(condition)
        avg_logSalePrice = \
            train_data[
                (train_data['Condition1'] == condition) | 
                (train_data['Condition2'] == condition) ]['LogSalePrice'].mean()
        #print(avg_logSalePrice)
        condition_to_logSalePrice[condition] = avg_logSalePrice
    return condition_to_logSalePrice

In [104]:
condition_to_logSalePrice = get_condition_to_logSalePrice(train_data)

In [105]:
condition_to_logSalePrice

{'Artery': 11.732441553254347,
 'Feedr': 11.812712004906377,
 'Norm': 12.027202075313038,
 'PosA': 12.30290154907993,
 'PosN': 12.226582648618214,
 'RRAe': 11.825098630730913,
 'RRAn': 12.017136649782707,
 'RRNe': 12.15852566893868,
 'RRNn': 11.864747799498494}

In [106]:
def process_conditions(condition_1, condition_2, condition_to_logSalePrice):
    return (condition_to_logSalePrice.get(condition_1) + condition_to_logSalePrice.get(condition_2))

In [107]:
train_data['Condition_Val'] = train_data.apply(
    lambda row: process_conditions(row['Condition1'],
                                   row['Condition2'],
                                   condition_to_logSalePrice), axis=1)
validation_data['Condition_Val'] = validation_data.apply(
    lambda row: process_conditions(row['Condition1'],
                                   row['Condition2'],
                                   condition_to_logSalePrice), axis=1)

test_data['Condition_Val'] = test_data.apply(
    lambda row: process_conditions(row['Condition1'],
                                   row['Condition2'],
                                   condition_to_logSalePrice), axis=1)


In [108]:
train_data_one_hot['Condition_Val'] = train_data['Condition_Val']
validation_data_one_hot['Condition_Val'] = validation_data['Condition_Val']
test_data_one_hot['Condition_Val'] = test_data['Condition_Val']

In [109]:
rel_x_cols = ['LogGrLivArea', 
              'MSSubClass_60_75_120_20', 
              'OverallQual', 
              'OverallCond', 
              'Neighborhood_Val', 
              'Condition_Val']

In [110]:
(my_pipeline, cross_validation_score) = fit_pipeline_and_cross_validate( 
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data_one_hot, 
    X_columns=rel_x_cols)
print(cross_validation_score)

0.16044332453739735


In [111]:
(my_pipeline, cross_validation_score) = fit_pipeline_and_evaluate_on_validation_set( 
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data_one_hot, 
    validation_data_one_hot,
    X_columns=rel_x_cols)
print(cross_validation_score)

0.16068525286452798


### Can't we do better here ?

This is a reasonable question and one needs to keep in mind that we are dealing with a variable that is having one value 87% of the same (both conditions are set to 'Norm'). So definitely, the predictive power will be limited.

Hence, we are stopping on this for now, until new ideas dawn on me !!!

### Another idea ?

In order to dril on more variables, shouldnt't it be better if have a normalized independent variable, say something like price per square feet , instead of square feet.

One might claim that it should not matter since we have GrLivArea in the logarithmic scale itself as one of the predictors.However, this does affect the design of new variables and we have designed many of them using means of sale prices and it might have been more appropriate to design them using means of prices per square feet to get rid of the variability due to living area.


In [112]:
train_data_one_hot['LogSalePricePerSqFeet'] = train_data_one_hot['LogSalePrice'] - train_data_one_hot['LogGrLivArea']
validation_data_one_hot['LogSalePricePerSqFeet'] = \
    validation_data_one_hot['LogSalePrice'] - validation_data_one_hot['LogGrLivArea']

train_data['LogSalePricePerSqFeet'] = train_data['LogSalePrice'] - train_data['LogGrLivArea']
validation_data['LogSalePricePerSqFeet'] = \
    validation_data['LogSalePrice'] - validation_data['LogGrLivArea']


### Start with the first model.

If all is well, this should identical results with the scenario when we used 'LogSalePrice' itself.

In [113]:
fit_pipeline_and_cross_validate(
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data, 
    X_columns=['LogGrLivArea'],
    Y_column='LogSalePricePerSqFeet')[1]

0.27161716540345976

Okay , let us check if this holds up when we add another variable.

In [114]:
fit_pipeline_and_cross_validate(
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data_one_hot, 
    X_columns=['LogGrLivArea', 'MSSubClass_60_75_120_20'],
    Y_column='LogSalePricePerSqFeet')[1]


0.23998906193907085

In [115]:
fit_pipeline_and_cross_validate(
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data, 
    X_columns=['LogGrLivArea', 'MSSubClass_Val'],
    Y_column='LogSalePricePerSqFeet')[1]


0.25140820156722

That looks fine, but let us see if we need another design for MSSubClass variable.

In [116]:
results_df = get_mean_count_per_group(train_data, 'MSSubClass')

In [117]:
results_df.sort_values(['mean_LogSalePricePerSqFeet'], ascending=False) 

Unnamed: 0_level_0,Count,mean_SalePrice,mean_LogSalePrice,mean_LogSalePricePerSqFeet,percent_total_size
MSSubClass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
120,68,198945.705882,12.16666,5.002448,6.210046
85,15,151113.333333,11.91859,4.975857,1.369863
20,403,184686.218362,12.048144,4.881947,36.803653
180,7,92285.714286,11.407724,4.820376,0.639269
80,38,168185.526316,12.009529,4.814403,3.47032
45,12,108591.666667,11.579033,4.807575,1.09589
60,232,241847.021552,12.349193,4.764764,21.187215
30,50,95819.02,11.436689,4.637016,4.56621
160,49,138363.938776,11.804102,4.60786,4.474886
40,3,121500.0,11.671084,4.545775,0.273973


In [118]:
results_df.sort_values(['mean_LogSalePrice'], ascending=False)

Unnamed: 0_level_0,Count,mean_SalePrice,mean_LogSalePrice,mean_LogSalePricePerSqFeet,percent_total_size
MSSubClass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
60,232,241847.021552,12.349193,4.764764,21.187215
120,68,198945.705882,12.16666,5.002448,6.210046
75,14,200392.857143,12.099105,4.384461,1.278539
20,403,184686.218362,12.048144,4.881947,36.803653
80,38,168185.526316,12.009529,4.814403,3.47032
70,41,166279.463415,11.949679,4.513621,3.744292
85,15,151113.333333,11.91859,4.975857,1.369863
160,49,138363.938776,11.804102,4.60786,4.474886
50,100,140537.78,11.80006,4.502164,9.13242
90,39,133278.897436,11.779705,4.479923,3.561644


The values look slightly different here and we may want to do a redesign and see whether that helps. 

Let us add a map, like that we did for neighbourhood and see whether that helps.

In [119]:
mssubclass_to_LogSalePricePerSquareFeet = dict(zip(results_df.index, results_df.mean_LogSalePricePerSqFeet)) 

In [120]:
transform_column_train_validation_test_from_dict(train_data,
                                                 validation_data,
                                                 test_data,
                                                 train_data_one_hot,
                                                 validation_data_one_hot,
                                                 test_data_one_hot,
                                                 'MSSubClass_Val',
                                                 'MSSubClass',
                                                 mssubclass_to_LogSalePricePerSquareFeet)                                                 


In [121]:
fit_pipeline_and_cross_validate(
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data, 
    X_columns=['LogGrLivArea', 'MSSubClass_Val'],
    Y_column='LogSalePricePerSqFeet')[1]


0.21897039833536674

That looks encouraging. Let us add OverallQual

In [122]:
fit_pipeline_and_cross_validate(
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data, 
    X_columns=['LogGrLivArea', 'MSSubClass_Val', 'OverallQual'],
    Y_column='LogSalePricePerSqFeet')[1]


0.17554079176855364

In [123]:
fit_pipeline_and_evaluate_on_validation_set(
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data, 
    validation_data,
    X_columns=['LogGrLivArea', 'MSSubClass_Val', 'OverallQual'],
    Y_column='LogSalePricePerSqFeet')[1]


0.18115854194021802

We look to be doing better than our earlier method, though it must be emphasized that this method of using means to quantify a categorical variable, works better on the training set than on the validation one.

Adding OverallCond

In [124]:
fit_pipeline_and_cross_validate(
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data, 
    X_columns=['LogGrLivArea', 'MSSubClass_Val', 'OverallQual', 'OverallCond'],
    Y_column='LogSalePricePerSqFeet')[1]


0.17214926002605713

In [125]:
fit_pipeline_and_evaluate_on_validation_set(
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data, 
    validation_data,
    X_columns=['LogGrLivArea', 'MSSubClass_Val', 'OverallQual', 'OverallCond'],
    Y_column='LogSalePricePerSqFeet')[1]


0.17541842310833367

Let us add the key predictor of Neighbourhood

In [126]:
results_df = get_mean_count_per_group(train_data, 'Neighborhood')

In [127]:
results_df.sort_values(['mean_LogSalePricePerSqFeet'], ascending=False) 

Unnamed: 0_level_0,Count,mean_SalePrice,mean_LogSalePrice,mean_LogSalePricePerSqFeet,percent_total_size
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
StoneBr,22,320657.954545,12.617717,5.084896,2.009132
NridgHt,64,321781.359375,12.637217,5.081009,5.844749
Veenker,8,240062.5,12.340842,5.050478,0.730594
Somerst,65,227413.692308,12.303509,4.938959,5.936073
Timber,28,243325.964286,12.368274,4.930083,2.557078
Blmngtn,14,194023.357143,12.165725,4.922987,1.278539
CollgCr,114,195479.008772,12.15186,4.908253,10.410959
NoRidge,27,318553.333333,12.637571,4.862386,2.465753
Mitchel,35,161726.742857,11.97094,4.806364,3.196347
ClearCr,24,214159.666667,12.245548,4.805346,2.191781


In [128]:
neighborhood_to_LogSalePricePerSquareFeet = dict(zip(results_df.index, results_df.mean_LogSalePricePerSqFeet)) 

In [129]:
transform_column_train_validation_test_from_dict(train_data,
                                                 validation_data,
                                                 test_data,
                                                 train_data_one_hot,
                                                 validation_data_one_hot,
                                                 test_data_one_hot,
                                                 'Neighborhood_new_Val',
                                                 'Neighborhood',
                                                 neighborhood_to_LogSalePricePerSquareFeet)                                                 


In [130]:
fit_pipeline_and_cross_validate(
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data, 
    X_columns=['LogGrLivArea', 'MSSubClass_Val', 'OverallQual', 'OverallCond', 'Neighborhood_new_Val'],
    Y_column='LogSalePricePerSqFeet')[1]


0.15503440366046023

In [131]:
fit_pipeline_and_evaluate_on_validation_set(
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data, 
    validation_data,
    X_columns=['LogGrLivArea', 'MSSubClass_Val', 'OverallQual', 'OverallCond', 'Neighborhood_new_Val'],
    Y_column='LogSalePricePerSqFeet')[1]


0.16270112741137843

#### Overfitting while doing cross validation.

This once again highlights the issue of us overfitting on the training set, since we use mean values from the set itself while designing predictors.

Though, we are somehow 'insured' against dangerous overfitting by the presence of a validation set, it would still be better if we were to have a clear cross validation procedure which took this into account(that is, during every round of cross validation, it would take mean of the values in the training set and apply them to predict values in the validation set, preventing overfitting).

Another thing which remains to be tested is how the results standard when we properly randomize training and validation sets (we should have done this here, but did not do it !)

In [132]:
fit_pipeline_and_cross_validate(
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data, 
    X_columns=['LogGrLivArea', 
               'MSSubClass_Val', 
               'OverallQual', 
               'OverallCond', 
               'Neighborhood_Val'],
    Y_column='LogSalePricePerSqFeet')[1]


0.15585270045871383

In [133]:
fit_pipeline_and_evaluate_on_validation_set(
    make_pipeline(Imputer(), linear_model.LinearRegression()), 
    train_data, 
    validation_data,
    X_columns=['LogGrLivArea', 
               'MSSubClass_Val', 
               'OverallQual', 
               'OverallCond', 
               'Neighborhood_Val'],
    Y_column='LogSalePricePerSqFeet')[1]


0.15604738233645235

This is echoes the point which we had made earlier. We have this weird set of predictors with MSSubClass classified according to price per square feet and Neighbourhood classified according to price and all these work well together.

Before we proceed any more on designing new predictors , we need to do the following :

1. Write up a thorough cross validaiton routine. When I say thorough, what I mean is that we should be able to test out features, which use mean of prices etc., within the cross validation routine. These features are really tricky, and it would be really difficult to judge them without doing a thorough cross validation.

2. Repeat the tests with randomized input data. This should have done earlier. This is because it is easy to us to end up with a nice training set and end up training the model only on the same. Randomization would ensure that model works on other input data as well and hence increase robustness of the results

### Thorough routine for doing cross validation.

In [134]:
def get_dummy_transformation(train_data, validation_data):
    train_data['LogGrLivArea_Val'] = train_data['LogGrLivArea']
    validation_data['LogGrLivArea_Val'] = validation_data['LogGrLivArea']    
    return 'LogGrLivArea_Val'

In [135]:
def get_MSSubClass_indicator_transformation(train_data, validation_data):
    train_data['MSSubClass_60_75_120_20'] = train_data['MSSubClass_120'] + \
    train_data['MSSubClass_60'] + train_data['MSSubClass_20'] + train_data['MSSubClass_75'] 
    validation_data['MSSubClass_60_75_120_20'] = validation_data['MSSubClass_120'] + \
    validation_data['MSSubClass_60'] + validation_data['MSSubClass_20'] + validation_data['MSSubClass_75']
    return 'MSSubClass_60_75_120_20'

In [136]:
def get_cross_val_output(input_df, X_columns=['LogGrLivArea'], X_column_transform_map={}, Y_column = 'LogSalePricePerSqFeet', nfolds=5):
    partition_indices = np.array_split(np.arange(len(input_df)), nfolds)
    
    cross_validated_scores = np.zeros(nfolds)
    cross_validated_data = pd.DataFrame(columns=input_df.columns)
    for i in range(nfolds):
        cross_validated_set = input_df[partition_indices[i][0]:partition_indices[i][-1] + 1].copy()
        rel_training_data = pd.DataFrame(columns=input_df.columns)
        for j in range(nfolds):
            if j != i:
                training_set = input_df[partition_indices[j][0]:partition_indices[j][-1] + 1]
                rel_training_data = pd.concat([rel_training_data, training_set])

        
        rel_X_cols = list()
        for col in X_columns:
            if col in X_column_transform_map.keys():
                rel_col = X_column_transform_map.get(col)(rel_training_data, cross_validated_set)
            else:
                rel_col = col
            rel_X_cols.append(rel_col)
        X = rel_training_data[rel_X_cols]
        Y = rel_training_data[[Y_column]].values.ravel()
        my_model = linear_model.LinearRegression()
        my_model.fit(X,Y)
        newX = cross_validated_set[rel_X_cols]
        newY = cross_validated_set[[Y_column]]
        cross_validated_score = evaluate_model_score(my_model, newX, newY)
        cross_validated_scores[i] = cross_validated_score
        
    return cross_validated_scores

In [137]:
cross_validation_scores = get_cross_val_output(train_data)
print(cross_validation_scores)
print(cross_validation_scores.mean())
print(cross_validation_scores.std())


[0.26563421 0.25911464 0.27404988 0.27610276 0.28318435]
0.27161716540345976
0.00838984347419547


Let us see how things look after the addition of our initial MSSubClass variable.

In [138]:
transform_column_train_validation_test(train_data_one_hot, validation_data_one_hot, test_data_one_hot,
                                      'MSSubClass_60_75_120_20', 
                                      ['MSSubClass_120', 'MSSubClass_60', 'MSSubClass_20', 'MSSubClass_75'])

In [139]:
cross_validation_scores = get_cross_val_output(train_data_one_hot, X_columns=['LogGrLivArea', 'MSSubClass_60_75_120_20'])
print(cross_validation_scores)
print(cross_validation_scores.mean())
print(cross_validation_scores.std())


[0.2299178  0.22909246 0.24402261 0.24979964 0.24711279]
0.23998906193907085
0.008757063005795292


Now, comes the real tricky part where we use grouped means to get a quantified version of the variable. Let us see how to get it !!

In [140]:
cross_validation_scores = get_cross_val_output(
    train_data_one_hot, 
    X_columns=['LogGrLivArea', 'MSSubClass'],
    X_column_transform_map={'MSSubClass' : get_MSSubClass_indicator_transformation})
print(cross_validation_scores)
print(cross_validation_scores.mean())
print(cross_validation_scores.std())


[0.2299178  0.22909246 0.24402261 0.24979964 0.24711279]
0.23998906193907085
0.008757063005795292


That looks great, now let us make sure that we can get a validation score on a validation set also easily by writing a routine that is almost identical to the one written above.

In [141]:
def get_validation_output(input_df, validation_df, X_columns=['LogGrLivArea'], X_column_transform_map={}, Y_column = 'LogSalePricePerSqFeet', nfolds=5):
    rel_X_cols = list()
    for col in X_columns:
        if col in X_column_transform_map.keys():
            rel_col = X_column_transform_map.get(col)(input_df, validation_df)
        else:
            rel_col = col
        rel_X_cols.append(rel_col)
    X = input_df[rel_X_cols]
    Y = input_df[[Y_column]].values.ravel()
    my_model = linear_model.LinearRegression()
    my_model.fit(X,Y)
    newX = validation_df[rel_X_cols]
    newY = validation_df[[Y_column]]
    cross_validated_score = evaluate_model_score(my_model, newX, newY)
        
    return cross_validated_score 

In [142]:
validation_score = get_validation_output(
    train_data_one_hot, 
    validation_data_one_hot,
    X_columns=['LogGrLivArea', 'MSSubClass'],
    X_column_transform_map={'MSSubClass' : get_MSSubClass_indicator_transformation})
print(validation_score)


0.23849795274552374


Now, let us make sure that we can make predictions on test data as well (applying the same transformations)

In [143]:
def get_test_data_predictions(input_df, test_df, X_columns=['LogGrLivArea'], X_column_transform_map={}, Y_column = 'LogSalePricePerSqFeet', nfolds=5):
    rel_X_cols = list()
    for col in X_columns:
        if col in X_column_transform_map.keys():
            rel_col = X_column_transform_map.get(col)(input_df, test_df)
        else:
            rel_col = col
        rel_X_cols.append(rel_col)
    X = input_df[rel_X_cols]
    Y = input_df[[Y_column]].values.ravel()
    my_model = linear_model.LinearRegression()
    my_model.fit(X,Y)
    newX = test_df[rel_X_cols]
    predicitons =  my_model.predict(newX)
        
    return predicitons

In [144]:
def log_transform_living_area(train_data, validation_data):
    train_data['LogGrLivArea'] = train_data['GrLivArea'].apply(lambda x : np.log(1.0 + x))
    validation_data['LogGrLivArea'] = validation_data['GrLivArea'].apply(lambda x : np.log(1.0 + x))
    return 'LogGrLivArea'

In [145]:
get_test_data_predictions(complete_train_data, 
                          test_data, 
                          X_columns=['GrLivArea'], 
                          X_column_transform_map = {'GrLivArea' : log_transform_living_area},
                          Y_column = 'SalePrice')

array([103138.07938093, 168398.91942366, 202099.99630156, ...,
       154773.06959036, 116272.25198311, 236077.10379031])

That is great. Now let us just redefine the functions, modularizing common code.

In [146]:
def get_rel_X_cols(input_df, validation_df, X_columns, X_column_transform_map):
    rel_X_cols = list()
    for col in X_columns:
        if col in X_column_transform_map.keys():
            rel_col = X_column_transform_map.get(col)(input_df, validation_df)
        else:
            rel_col = col
        rel_X_cols.append(rel_col)
    return rel_X_cols

In [147]:
def get_trained_model(X, Y):
    my_model = linear_model.LinearRegression()
    my_model.fit(X,Y)

    return my_model

In [148]:
def get_cross_val_output(input_df, X_columns=['LogGrLivArea'], X_column_transform_map={}, Y_column = 'LogSalePricePerSqFeet', nfolds=5):
    partition_indices = np.array_split(np.arange(len(input_df)), nfolds)
    
    cross_validated_scores = np.zeros(nfolds)
    cross_validated_data = pd.DataFrame(columns=input_df.columns)
    for i in range(nfolds):
        cross_validated_set = input_df[partition_indices[i][0]:partition_indices[i][-1] + 1].copy()
        rel_training_data = pd.DataFrame(columns=input_df.columns)
        for j in range(nfolds):
            if j != i:
                training_set = input_df[partition_indices[j][0]:partition_indices[j][-1] + 1]
                rel_training_data = pd.concat([rel_training_data, training_set])

        
        rel_X_cols = get_rel_X_cols(rel_training_data, cross_validated_set, X_columns, X_column_transform_map)
        my_model = get_trained_model(rel_training_data[rel_X_cols], 
                                     rel_training_data[[Y_column]].values.ravel())
        newX = cross_validated_set[rel_X_cols]
        newY = cross_validated_set[[Y_column]]
        cross_validated_score = evaluate_model_score(my_model, newX, newY)
        cross_validated_scores[i] = cross_validated_score
        
    return cross_validated_scores

In [149]:
def get_validation_output(input_df, validation_df, X_columns=['LogGrLivArea'], X_column_transform_map={}, Y_column = 'LogSalePricePerSqFeet', nfolds=5):
    rel_X_cols = get_rel_X_cols(input_df, validation_df, X_columns, X_column_transform_map)
    my_model = get_trained_model(input_df[rel_X_cols], 
                                 input_df[[Y_column]].values.ravel())

    newX = validation_df[rel_X_cols]
    newY = validation_df[[Y_column]]
    cross_validated_score = evaluate_model_score(my_model, newX, newY)
        
    return cross_validated_score 

In [150]:
def get_test_data_predictions(input_df, test_df, X_columns=['LogGrLivArea'], X_column_transform_map={}, Y_column = 'LogSalePricePerSqFeet', nfolds=5):
    rel_X_cols = get_rel_X_cols(input_df, test_df, X_columns, X_column_transform_map)        
    my_model = get_trained_model(input_df[rel_X_cols], 
                                 input_df[[Y_column]].values.ravel())
    
    newX = test_df[rel_X_cols]
    predicitons =  my_model.predict(newX)
        
    return predicitons

Looks like we are all set now. Let us continue the rest in a new notebook.