In [1]:
# Importing the libs
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
sns.set() # default configs

%matplotlib inline

In [2]:
# Import the dataset
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.isnull().sum()

Id                  0
MSSubClass          0
MSZoning            0
LotFrontage       259
LotArea             0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType          8
MasVnrArea          8
ExterQual           0
ExterCond           0
Foundation          0
                 ... 
BedroomAbvGr        0
KitchenAbvGr        0
KitchenQual         0
TotRmsAbvGrd        0
Functional          0
Fireplaces          0
FireplaceQu       690
GarageType         81
GarageYrBlt        81
GarageFinish       81
GarageCars          0
GarageArea          0
GarageQual         81
GarageCond         81
PavedDrive

In [4]:
# Checkout dataset
test.isnull().sum()

Id                  0
MSSubClass          0
MSZoning            4
LotFrontage       227
LotArea             0
Street              0
Alley            1352
LotShape            0
LandContour         0
Utilities           2
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         1
Exterior2nd         1
MasVnrType         16
MasVnrArea         15
ExterQual           0
ExterCond           0
Foundation          0
                 ... 
HalfBath            0
BedroomAbvGr        0
KitchenAbvGr        0
KitchenQual         1
TotRmsAbvGrd        0
Functional          2
Fireplaces          0
FireplaceQu       730
GarageType         76
GarageYrBlt        78
GarageFinish       78
GarageCars          1
GarageArea          1
GarageQual         78
GarageCond

In [5]:
def face_plt(feature, limit = False, xmin = None, xmax = None):
    facet = sns.FacetGrid(train, hue = 'SalePrice', aspect = 4)
    facet.map(sns.kdeplot, feature, shade = True)
    facet.set(xlim = (0, train[feature].max()))
    facet.add_legend()
    if limit is True:
        plt.xlim(xmin,xmax)
    plt.show()

In [6]:
dist_plt = lambda feature,dataset : sns.distplot(train[feature]) if dataset == 0 else sns.distplot(test[feature])

In [7]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [8]:
# train['MSZoning'].fillna(train.groupby('Neighborhood')['MSZoning'].mode(), inplace = True)

In [9]:
train.groupby('Neighborhood')['MSZoning'].value_counts()

Neighborhood  MSZoning
Blmngtn       RL           16
              RM            1
Blueste       RM            2
BrDale        RM           16
BrkSide       RM           30
              RL           28
ClearCr       RL           28
CollgCr       RL          140
              RM           10
Crawfor       RL           46
              RM            3
              RH            2
Edwards       RL           90
              RM            8
              RH            2
Gilbert       RL           79
IDOTRR        RM           28
              C (all)       9
MeadowV       RM           17
Mitchel       RL           44
              RM            5
NAmes         RL          223
              RH            2
NPkVill       RL            9
NWAmes        RL           73
NoRidge       RL           41
NridgHt       RL           76
              RM            1
OldTown       RM           95
              RL           17
              C (all)       1
SWISU         RL           20
              RH 

In [10]:
train.MSZoning.value_counts()

RL         1151
RM          218
FV           65
RH           16
C (all)      10
Name: MSZoning, dtype: int64

In [11]:
'''
RL: 0
RM: 1
FV: 2
RH: 3
C (all): 4
'''

'\nRL: 0\nRM: 1\nFV: 2\nRH: 3\nC (all): 4\n'

In [12]:
train_test_data = [train, test]

In [13]:
mszoning_mapping = {'RL': 0, 'RM': 1, 'FV': 2, 'RH': 3, 'C (all)': 4}

for dataset in train_test_data:
    dataset['MSZoning_categories'] = dataset['MSZoning'].map(mszoning_mapping)

In [14]:
test.MSZoning_categories.isnull().sum()

4

In [15]:
# # Taking care of missing data
# from sklearn.preprocessing import Imputer
# imputer = Imputer(missing_values = 'NaN', strategy = 'most_frequent', axis = 0)
# data = test.iloc[:, [test.columns.get_loc('MSZoning_categories')]].values
# test['MSZoning_categories'] = imputer.fit_transform(data)

In [16]:
neighbors = test.loc[pd.isnull(test['MSZoning_categories']), 'Neighborhood'].tolist()
neighbors = set(neighbors)

In [17]:
neighbors = list(neighbors)
print(neighbors)

['Mitchel', 'IDOTRR']


In [18]:
from scipy import stats
modes_arr = test.groupby('Neighborhood')['MSZoning_categories'].apply(lambda x : stats.mode(x)[0][0])[neighbors]
values = list(modes_arr)

dict_ms_zoning = {}
for i in range(len(neighbors)):
    dict_ms_zoning[neighbors[i]] = values[i]

print (dict_ms_zoning)

{'Mitchel': 0.0, 'IDOTRR': 1.0}


In [19]:
# MSZoning filling NaN values 
test.loc[(pd.isnull(test['MSZoning_categories'])) & (test['Neighborhood'] == 'Mitchel'), 'MSZoning_categories'] = dict_ms_zoning['Mitchel']
test.loc[(pd.isnull(test['MSZoning_categories'])) & (test['Neighborhood'] == 'IDOTRR'), 'MSZoning_categories'] = dict_ms_zoning['IDOTRR']

In [20]:
test.MSZoning_categories.isnull().sum()

0

In [21]:
# correlation wrt one feature in acending or decending order as prefferred
corr_df = pd.DataFrame(train.corr()['SalePrice'])
corr_df.sort_values(by = 'SalePrice', ascending = False)

Unnamed: 0,SalePrice
SalePrice,1.0
OverallQual,0.790982
GrLivArea,0.708624
GarageCars,0.640409
GarageArea,0.623431
TotalBsmtSF,0.613581
1stFlrSF,0.605852
FullBath,0.560664
TotRmsAbvGrd,0.533723
YearBuilt,0.522897


In [22]:
train['LotFrontage'].fillna(train.groupby(['Neighborhood', 'BldgType'])['LotFrontage'].transform('median'), inplace = True)
test['LotFrontage'].fillna(test.groupby(['Neighborhood', 'BldgType'])['LotFrontage'].transform('median'), inplace = True)

In [23]:
test.LotFrontage.isnull().sum()

1

In [24]:
# Still NaN means, those houses dont have the frontage
train['LotFrontage'].fillna(0, inplace = True)
test['LotFrontage'].fillna(0, inplace = True)

In [25]:
test.LotFrontage.isnull().sum()

0

In [26]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 82 columns):
Id                     1460 non-null int64
MSSubClass             1460 non-null int64
MSZoning               1460 non-null object
LotFrontage            1460 non-null float64
LotArea                1460 non-null int64
Street                 1460 non-null object
Alley                  91 non-null object
LotShape               1460 non-null object
LandContour            1460 non-null object
Utilities              1460 non-null object
LotConfig              1460 non-null object
LandSlope              1460 non-null object
Neighborhood           1460 non-null object
Condition1             1460 non-null object
Condition2             1460 non-null object
BldgType               1460 non-null object
HouseStyle             1460 non-null object
OverallQual            1460 non-null int64
OverallCond            1460 non-null int64
YearBuilt              1460 non-null int64
YearRemodAdd        

In [27]:
train['Alley'].fillna('NoAlley', inplace = True)
test['Alley'].fillna('NoAlley', inplace = True)

In [28]:
'''
BrkCmn : 0
BrkFace : 1
CBlock: 2
None: 3
Stone: 4
'''

'\nBrkCmn : 0\nBrkFace : 1\nCBlock: 2\nNone: 3\nStone: 4\n'

In [29]:
masvn_mapping = {"BrkCmn" : 0, "BrkFace" : 1, "CBlock": 2, "None": 3, "Stone": 4}
for dataset in train_test_data:
    dataset['MasVnrType_values'] = dataset['MasVnrType'].map(masvn_mapping)

In [30]:
train['MasVnrType_values'].fillna(train.groupby(['Neighborhood', 'BldgType'])['MasVnrType_values'].transform('median'), inplace = True)
test['MasVnrType_values'].fillna(test.groupby(['Neighborhood', 'BldgType'])['MasVnrType_values'].transform('median'), inplace = True)

In [31]:
train['MasVnrArea'].isnull().sum()

8

In [32]:
test['MasVnrArea'].isnull().sum()

15

In [33]:
train['MasVnrArea'].fillna(train.groupby(['Neighborhood', 'BldgType'])['MasVnrArea'].transform('median'), inplace = True)
test['MasVnrArea'].fillna(test.groupby(['Neighborhood', 'BldgType'])['MasVnrArea'].transform('median'), inplace = True)

In [34]:
train['MasVnrArea'].isnull().sum()

0

In [35]:
test['MasVnrArea'].isnull().sum()

0

## Remove rows with min. NaN column values

In [36]:
train['BsmtQual'].fillna('NA', inplace = True)
test['BsmtQual'].fillna('NA', inplace = True)

train['BsmtCond'].fillna('NA', inplace = True)
test['BsmtCond'].fillna('NA', inplace = True)

train['BsmtExposure'].fillna('NA', inplace = True)
test['BsmtExposure'].fillna('NA', inplace = True)

train['BsmtFinType1'].fillna('NA', inplace = True)
test['BsmtFinType1'].fillna('NA', inplace = True)

train['BsmtFinType2'].fillna('NA', inplace = True)
test['BsmtFinType2'].fillna('NA', inplace = True)

train['FireplaceQu'].fillna('NA', inplace = True)

train['GarageType'].fillna('NA', inplace = True)

train['GarageYrBlt'].fillna(0, inplace = True)

train['GarageFinish'].fillna('NA', inplace = True)

train['GarageQual'].fillna('NA', inplace = True)

train['GarageCond'].fillna('NA', inplace = True)

train['PoolQC'].fillna('NA', inplace = True)

train['Fence'].fillna('NA', inplace = True)

train['MiscFeature'].fillna('NA', inplace = True)

In [37]:
# only 1 row is NaN hence filling it with most type of electricals in that type of house - : no explicit queries
train['Electrical'].fillna('SBrkr', inplace = True)

In [38]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 83 columns):
Id                     1460 non-null int64
MSSubClass             1460 non-null int64
MSZoning               1460 non-null object
LotFrontage            1460 non-null float64
LotArea                1460 non-null int64
Street                 1460 non-null object
Alley                  1460 non-null object
LotShape               1460 non-null object
LandContour            1460 non-null object
Utilities              1460 non-null object
LotConfig              1460 non-null object
LandSlope              1460 non-null object
Neighborhood           1460 non-null object
Condition1             1460 non-null object
Condition2             1460 non-null object
BldgType               1460 non-null object
HouseStyle             1460 non-null object
OverallQual            1460 non-null int64
OverallCond            1460 non-null int64
YearBuilt              1460 non-null int64
YearRemodAdd      

In [39]:
test['Utilities'].fillna('AllPub', inplace = True)

In [40]:
test['Exterior2nd'].value_counts()

VinylSd    510
MetalSd    233
HdBoard    199
Wd Sdng    194
Plywood    128
CmentBd     66
Wd Shng     43
BrkFace     22
Stucco      21
AsbShng     18
Brk Cmn     15
ImStucc      5
CBlock       2
Stone        1
AsphShn      1
Name: Exterior2nd, dtype: int64

In [41]:
exterior_mapping = {"AsbShng": 0, "AsphShn": 1, "BrkComm": 2, "BrkFace": 3, "CBlock": 4, "CemntBd": 5, "HdBoard": 6, "ImStucc": 7, "MetalSd": 8, "Other": 9, "Plywood": 10, "PreCast": 11, "Stone": 12, "Stucco": 13, "VinylSd": 14, "Wd Sdng": 15, "WdShing": 16}
for dataset in train_test_data:
    dataset['Exterior1st_values'] = dataset['Exterior1st'].map(exterior_mapping)

In [42]:
exterior_mapping = {"AsbShng": 0, "AsphShn": 1, "Brk Cmn": 2, "BrkFace": 3, "CBlock": 4, "CmentBd": 5, "HdBoard": 6, "ImStucc": 7, "MetalSd": 8, "Other": 9, "Plywood": 10, "PreCast": 11, "Stone": 12, "Stucco": 13, "VinylSd": 14, "Wd Sdng": 15, "Wd Shng": 16}
for dataset in train_test_data:
    dataset['Exterior2nd_values'] = dataset['Exterior2nd'].map(exterior_mapping)

In [43]:
test['Exterior1st_values'].fillna(test.groupby(['Neighborhood', 'BldgType'])['Exterior1st_values'].transform('median'), inplace = True)
test['Exterior2nd_values'].fillna(test.groupby(['Neighborhood', 'BldgType'])['Exterior2nd_values'].transform('median'), inplace = True)

In [44]:
test['Exterior2nd_values'].unique().size

16

In [45]:
test['BsmtFinSF1'].fillna(test.groupby(['Neighborhood', 'BldgType'])['BsmtFinSF1'].transform('median'), inplace = True)
test['BsmtFinSF2'].fillna(test.groupby(['Neighborhood', 'BldgType'])['BsmtFinSF2'].transform('median'), inplace = True)
test['BsmtUnfSF'].fillna(test.groupby(['Neighborhood', 'BldgType'])['BsmtUnfSF'].transform('median'), inplace = True)
test['TotalBsmtSF'].fillna(test.groupby(['Neighborhood', 'BldgType'])['TotalBsmtSF'].transform('median'), inplace = True)
test['BsmtFullBath'].fillna(test.groupby(['Neighborhood', 'BldgType'])['BsmtFullBath'].transform('median'), inplace = True)
test['BsmtHalfBath'].fillna(test.groupby(['Neighborhood', 'BldgType'])['BsmtHalfBath'].transform('median'), inplace = True)

In [46]:
kitchen_mapping = {"EX": 0, "Gd": 1, "Td": 2, "TA": 3, "Fa": 4, "Po": 5}
for dataset in train_test_data:
    dataset['KitchenQual_values'] = dataset['KitchenQual'].map(kitchen_mapping)

In [47]:
test['KitchenQual_values'].fillna(test.groupby(['Neighborhood', 'BldgType'])['KitchenQual_values'].transform('median'), inplace = True)

In [48]:
functional_mapping = { "Typ": 0, "Min1": 1, "Min2": 2, "Mod": 3, "Maj1": 4, "Maj2": 5, "Sev": 6, "Sal": 7}
for dataset in train_test_data:
    dataset['Functional_values'] = dataset['Functional'].map(functional_mapping)

In [49]:
test['Functional_values'].fillna(test.groupby(['Neighborhood', 'BldgType'])['Functional_values'].transform('median'), inplace = True)

In [50]:
test['FireplaceQu'].fillna('NA', inplace = True)
test['GarageType'].fillna('NA', inplace = True)
test['GarageYrBlt'].fillna(0, inplace = True)
test['GarageArea'].fillna(0, inplace = True)
test['GarageCars'].fillna(0, inplace = True)
test['GarageFinish'].fillna('NA', inplace = True)
test['GarageCond'].fillna('NA', inplace = True)
test['GarageQual'].fillna('NA', inplace = True)
test['PoolQC'].fillna('NA', inplace = True)
test['MiscFeature'].fillna('NA', inplace = True)
test['Fence'].fillna('NA', inplace = True)

In [51]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 86 columns):
Id                     1459 non-null int64
MSSubClass             1459 non-null int64
MSZoning               1455 non-null object
LotFrontage            1459 non-null float64
LotArea                1459 non-null int64
Street                 1459 non-null object
Alley                  1459 non-null object
LotShape               1459 non-null object
LandContour            1459 non-null object
Utilities              1459 non-null object
LotConfig              1459 non-null object
LandSlope              1459 non-null object
Neighborhood           1459 non-null object
Condition1             1459 non-null object
Condition2             1459 non-null object
BldgType               1459 non-null object
HouseStyle             1459 non-null object
OverallQual            1459 non-null int64
OverallCond            1459 non-null int64
YearBuilt              1459 non-null int64
YearRemodAdd      

In [52]:
# Only one value missing, so did the manual analysis - below
test['SaleType'].fillna('WD', inplace = True)

In [53]:
test.groupby(['Neighborhood', 'BldgType'])['SaleType'].value_counts()

Neighborhood  BldgType  SaleType
Blmngtn       1Fam      New          2
              TwnhsE    WD           7
                        New          2
Blueste       Twnhs     WD           4
              TwnhsE    WD           4
BrDale        Twnhs     WD          11
                        COD          1
              TwnhsE    WD           2
BrkSide       1Fam      WD          45
                        COD          1
                        CWD          1
                        ConLD        1
                        ConLI        1
                        ConLw        1
ClearCr       1Fam      WD          13
                        COD          2
              Twnhs     WD           1
CollgCr       1Fam      WD          98
                        New         14
                        Con          1
              Duplex    WD           1
              TwnhsE    WD           3
Crawfor       1Fam      WD          40
                        CWD          2
              2fmCon    WD     

In [54]:
test[pd.isnull(test['SaleType'])][['Neighborhood', 'BldgType']]

Unnamed: 0,Neighborhood,BldgType


In [55]:
'''
MSZoning_categories    1459 non-null float64
MasVnrType_values      1459 non-null float64
Exterior1st_values     1459 non-null float64
Exterior2nd_values     1459 non-null float64
KitchenQual_values     1459 non-null float64
Functional_values 
'''

'\nMSZoning_categories    1459 non-null float64\nMasVnrType_values      1459 non-null float64\nExterior1st_values     1459 non-null float64\nExterior2nd_values     1459 non-null float64\nKitchenQual_values     1459 non-null float64\nFunctional_values \n'

In [56]:
test.drop(['MSZoning', 'MasVnrType', 'Exterior1st', 'Exterior2nd', 'KitchenQual', 'Functional'], axis = 1, inplace = True)
train.drop(['MSZoning', 'MasVnrType', 'Exterior1st', 'Exterior2nd', 'KitchenQual', 'Functional'], axis = 1, inplace = True)

# DANGAL

In [57]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id                     1460 non-null int64
MSSubClass             1460 non-null int64
LotFrontage            1460 non-null float64
LotArea                1460 non-null int64
Street                 1460 non-null object
Alley                  1460 non-null object
LotShape               1460 non-null object
LandContour            1460 non-null object
Utilities              1460 non-null object
LotConfig              1460 non-null object
LandSlope              1460 non-null object
Neighborhood           1460 non-null object
Condition1             1460 non-null object
Condition2             1460 non-null object
BldgType               1460 non-null object
HouseStyle             1460 non-null object
OverallQual            1460 non-null int64
OverallCond            1460 non-null int64
YearBuilt              1460 non-null int64
YearRemodAdd           1460 non-null int64
RoofStyle          

In [58]:
train = train.drop('Id', axis = 1).copy()
df_ids = test['Id']
test = test.drop('Id', axis = 1).copy()

# Model stuff below

In [59]:
X = train.drop('SalePrice', axis = 1).copy()
y = train['SalePrice']

In [60]:
features_to_be_endcoded = [key for key in dict(train.dtypes) if dict(train.dtypes)[key] in ['object']] 
features_to_be_endcoded.extend(['MSZoning_categories', 'MasVnrType_values', 'Exterior1st_values', 'Exterior2nd_values', 'KitchenQual_values', 'Functional_values'])
train_objs_num = len(X)
dataset = pd.concat(objs=[X, test], axis=0)
dataset = pd.get_dummies(dataset, columns = features_to_be_endcoded, prefix = features_to_be_endcoded, drop_first= True)
train = pd.DataFrame.copy(dataset[:train_objs_num])
test = pd.DataFrame.copy(dataset[train_objs_num:])

In [61]:
print(train.shape)
print(test.shape)

(1460, 260)
(1459, 260)


In [62]:
new_df = pd.concat(objs = [train, y], axis = 1)

In [63]:
corr_df = pd.DataFrame(new_df.corr()['SalePrice'])
corr_df.sort_values(by = 'SalePrice', ascending= False)

Unnamed: 0,SalePrice
SalePrice,1.000000
OverallQual,0.790982
GrLivArea,0.708624
GarageCars,0.640409
GarageArea,0.623431
TotalBsmtSF,0.613581
1stFlrSF,0.605852
FullBath,0.560664
TotRmsAbvGrd,0.533723
YearBuilt,0.522897


In [64]:
threshold = 0.30
df  = corr_df['SalePrice'].gt(threshold, 0)

In [65]:
columns = []
dict(df)
for k,v in df.items():
    if v:
        columns.append(k)

columns.remove('SalePrice')

In [66]:
train = train[columns]
test = test[columns]

In [67]:
# Splitting the dataset into the Training set and Test set
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(train, y, test_size = 0.2, random_state = 0)

### Adj R - square

In [68]:
from statsmodels.tools.tools import add_constant
import statsmodels.formula.api as sm
# added x0 values with 1's
train = add_constant(data = train)

In [69]:
# First model
train_opt = train.iloc[:,:]
regressor_OLS = sm.OLS(endog = y, exog = train_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.819
Model:,OLS,Adj. R-squared:,0.816
Method:,Least Squares,F-statistic:,216.0
Date:,"Mon, 05 Feb 2018",Prob (F-statistic):,0.0
Time:,15:44:23,Log-Likelihood:,-17295.0
No. Observations:,1460,AIC:,34650.0
Df Residuals:,1429,BIC:,34820.0
Df Model:,30,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-7.953e+05,1.46e+05,-5.457,0.000,-1.08e+06,-5.09e+05
LotFrontage,41.7671,46.601,0.896,0.370,-49.646,133.180
OverallQual,1.603e+04,1178.411,13.604,0.000,1.37e+04,1.83e+04
YearBuilt,103.5590,53.507,1.935,0.053,-1.402,208.520
YearRemodAdd,274.8571,61.397,4.477,0.000,154.419,395.295
MasVnrArea,9.7907,6.105,1.604,0.109,-2.184,21.766
BsmtFinSF1,13.7214,2.870,4.781,0.000,8.091,19.352
TotalBsmtSF,7.1097,4.114,1.728,0.084,-0.961,15.181
1stFlrSF,23.3449,19.677,1.186,0.236,-15.253,61.943

0,1,2,3
Omnibus:,715.745,Durbin-Watson:,1.947
Prob(Omnibus):,0.0,Jarque-Bera (JB):,103906.455
Skew:,-1.226,Prob(JB):,0.0
Kurtosis:,44.256,Cond. No.,597000.0


In [70]:
train_opt =  train_opt.drop('GarageType_Attchd', axis = 1)
regressor_OLS = sm.OLS(endog = y, exog = train_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.819
Model:,OLS,Adj. R-squared:,0.816
Method:,Least Squares,F-statistic:,223.6
Date:,"Mon, 05 Feb 2018",Prob (F-statistic):,0.0
Time:,15:44:23,Log-Likelihood:,-17295.0
No. Observations:,1460,AIC:,34650.0
Df Residuals:,1430,BIC:,34810.0
Df Model:,29,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-7.931e+05,1.44e+05,-5.516,0.000,-1.08e+06,-5.11e+05
LotFrontage,41.1674,46.138,0.892,0.372,-49.339,131.674
OverallQual,1.602e+04,1171.717,13.672,0.000,1.37e+04,1.83e+04
YearBuilt,102.3738,51.956,1.970,0.049,0.455,204.292
YearRemodAdd,274.9415,61.369,4.480,0.000,154.558,395.324
MasVnrArea,9.7996,6.102,1.606,0.108,-2.170,21.769
BsmtFinSF1,13.7073,2.865,4.784,0.000,8.087,19.328
TotalBsmtSF,7.0974,4.111,1.726,0.084,-0.967,15.161
1stFlrSF,23.2186,19.623,1.183,0.237,-15.275,61.712

0,1,2,3
Omnibus:,714.201,Durbin-Watson:,1.947
Prob(Omnibus):,0.0,Jarque-Bera (JB):,103614.019
Skew:,-1.221,Prob(JB):,0.0
Kurtosis:,44.198,Cond. No.,589000.0


In [71]:
train_opt =  train_opt.drop('FullBath', axis = 1)
regressor_OLS = sm.OLS(endog = y, exog = train_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.819
Model:,OLS,Adj. R-squared:,0.816
Method:,Least Squares,F-statistic:,231.8
Date:,"Mon, 05 Feb 2018",Prob (F-statistic):,0.0
Time:,15:44:23,Log-Likelihood:,-17295.0
No. Observations:,1460,AIC:,34650.0
Df Residuals:,1431,BIC:,34800.0
Df Model:,28,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-7.873e+05,1.4e+05,-5.608,0.000,-1.06e+06,-5.12e+05
LotFrontage,41.6543,46.049,0.905,0.366,-48.676,131.985
OverallQual,1.601e+04,1171.056,13.676,0.000,1.37e+04,1.83e+04
YearBuilt,100.0255,50.389,1.985,0.047,1.182,198.870
YearRemodAdd,274.2544,61.238,4.479,0.000,154.129,394.380
MasVnrArea,9.8229,6.098,1.611,0.107,-2.140,21.786
BsmtFinSF1,13.7741,2.842,4.847,0.000,8.200,19.349
TotalBsmtSF,7.1393,4.103,1.740,0.082,-0.910,15.188
1stFlrSF,23.0017,19.582,1.175,0.240,-15.411,61.414

0,1,2,3
Omnibus:,712.425,Durbin-Watson:,1.946
Prob(Omnibus):,0.0,Jarque-Bera (JB):,103081.653
Skew:,-1.216,Prob(JB):,0.0
Kurtosis:,44.092,Cond. No.,576000.0


In [72]:
train_opt =  train_opt.drop('MasVnrType_values_4.0', axis = 1)
regressor_OLS = sm.OLS(endog = y, exog = train_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.819
Model:,OLS,Adj. R-squared:,0.816
Method:,Least Squares,F-statistic:,240.5
Date:,"Mon, 05 Feb 2018",Prob (F-statistic):,0.0
Time:,15:44:23,Log-Likelihood:,-17295.0
No. Observations:,1460,AIC:,34650.0
Df Residuals:,1432,BIC:,34790.0
Df Model:,27,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-7.883e+05,1.4e+05,-5.619,0.000,-1.06e+06,-5.13e+05
LotFrontage,41.0443,45.983,0.893,0.372,-49.157,131.246
OverallQual,1.603e+04,1168.877,13.717,0.000,1.37e+04,1.83e+04
YearBuilt,99.9939,50.373,1.985,0.047,1.182,198.806
YearRemodAdd,274.6820,61.199,4.488,0.000,154.632,394.732
MasVnrArea,9.9638,6.076,1.640,0.101,-1.955,21.883
BsmtFinSF1,13.8549,2.826,4.902,0.000,8.311,19.399
TotalBsmtSF,7.1421,4.102,1.741,0.082,-0.904,15.189
1stFlrSF,23.0012,19.576,1.175,0.240,-15.399,61.401

0,1,2,3
Omnibus:,709.001,Durbin-Watson:,1.946
Prob(Omnibus):,0.0,Jarque-Bera (JB):,102518.369
Skew:,-1.204,Prob(JB):,0.0
Kurtosis:,43.981,Cond. No.,575000.0


In [73]:
train_opt =  train_opt.drop('SaleCondition_Partial', axis = 1)
regressor_OLS = sm.OLS(endog = y, exog = train_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.819
Model:,OLS,Adj. R-squared:,0.816
Method:,Least Squares,F-statistic:,249.9
Date:,"Mon, 05 Feb 2018",Prob (F-statistic):,0.0
Time:,15:44:24,Log-Likelihood:,-17295.0
No. Observations:,1460,AIC:,34640.0
Df Residuals:,1433,BIC:,34790.0
Df Model:,26,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-7.879e+05,1.4e+05,-5.618,0.000,-1.06e+06,-5.13e+05
LotFrontage,41.2130,45.965,0.897,0.370,-48.953,131.379
OverallQual,1.603e+04,1168.503,13.721,0.000,1.37e+04,1.83e+04
YearBuilt,99.9341,50.356,1.985,0.047,1.155,198.714
YearRemodAdd,274.5317,61.178,4.487,0.000,154.525,394.539
MasVnrArea,9.9846,6.074,1.644,0.100,-1.930,21.899
BsmtFinSF1,13.8700,2.825,4.910,0.000,8.328,19.412
TotalBsmtSF,7.1159,4.100,1.736,0.083,-0.926,15.158
1stFlrSF,22.9974,19.569,1.175,0.240,-15.390,61.385

0,1,2,3
Omnibus:,708.976,Durbin-Watson:,1.946
Prob(Omnibus):,0.0,Jarque-Bera (JB):,102440.417
Skew:,-1.204,Prob(JB):,0.0
Kurtosis:,43.965,Cond. No.,575000.0


In [74]:
train_opt =  train_opt.drop('ExterQual_Gd', axis = 1)
regressor_OLS = sm.OLS(endog = y, exog = train_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.819
Model:,OLS,Adj. R-squared:,0.816
Method:,Least Squares,F-statistic:,260.0
Date:,"Mon, 05 Feb 2018",Prob (F-statistic):,0.0
Time:,15:44:24,Log-Likelihood:,-17295.0
No. Observations:,1460,AIC:,34640.0
Df Residuals:,1434,BIC:,34780.0
Df Model:,25,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-7.721e+05,1.37e+05,-5.656,0.000,-1.04e+06,-5.04e+05
LotFrontage,42.0925,45.918,0.917,0.359,-47.982,132.167
OverallQual,1.594e+04,1154.003,13.816,0.000,1.37e+04,1.82e+04
YearBuilt,95.9622,49.697,1.931,0.054,-1.525,193.449
YearRemodAdd,270.6122,60.645,4.462,0.000,151.650,389.574
MasVnrArea,10.3634,6.023,1.721,0.086,-1.452,22.179
BsmtFinSF1,14.0214,2.808,4.994,0.000,8.514,19.529
TotalBsmtSF,7.0361,4.095,1.718,0.086,-0.998,15.070
1stFlrSF,23.3683,19.550,1.195,0.232,-14.981,61.718

0,1,2,3
Omnibus:,702.488,Durbin-Watson:,1.947
Prob(Omnibus):,0.0,Jarque-Bera (JB):,101647.174
Skew:,-1.182,Prob(JB):,0.0
Kurtosis:,43.808,Cond. No.,560000.0


In [75]:
train_opt =  train_opt.drop('Foundation_PConc', axis = 1)
regressor_OLS = sm.OLS(endog = y, exog = train_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.819
Model:,OLS,Adj. R-squared:,0.816
Method:,Least Squares,F-statistic:,271.0
Date:,"Mon, 05 Feb 2018",Prob (F-statistic):,0.0
Time:,15:44:24,Log-Likelihood:,-17295.0
No. Observations:,1460,AIC:,34640.0
Df Residuals:,1435,BIC:,34770.0
Df Model:,24,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-7.912e+05,1.31e+05,-6.061,0.000,-1.05e+06,-5.35e+05
LotFrontage,39.6922,45.632,0.870,0.385,-49.820,129.204
OverallQual,1.601e+04,1145.284,13.979,0.000,1.38e+04,1.83e+04
YearBuilt,102.8089,47.583,2.161,0.031,9.469,196.149
YearRemodAdd,273.4747,60.333,4.533,0.000,155.124,391.826
MasVnrArea,10.0123,5.977,1.675,0.094,-1.712,21.737
BsmtFinSF1,13.8647,2.788,4.974,0.000,8.396,19.333
TotalBsmtSF,7.2100,4.078,1.768,0.077,-0.790,15.210
1stFlrSF,22.9271,19.523,1.174,0.240,-15.369,61.224

0,1,2,3
Omnibus:,698.803,Durbin-Watson:,1.947
Prob(Omnibus):,0.0,Jarque-Bera (JB):,100562.401
Skew:,-1.171,Prob(JB):,0.0
Kurtosis:,43.591,Cond. No.,536000.0


In [76]:
train_opt =  train_opt.drop('GarageArea', axis = 1)
regressor_OLS = sm.OLS(endog = y, exog = train_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.819
Model:,OLS,Adj. R-squared:,0.816
Method:,Least Squares,F-statistic:,282.9
Date:,"Mon, 05 Feb 2018",Prob (F-statistic):,0.0
Time:,15:44:24,Log-Likelihood:,-17295.0
No. Observations:,1460,AIC:,34640.0
Df Residuals:,1436,BIC:,34770.0
Df Model:,23,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-7.876e+05,1.3e+05,-6.044,0.000,-1.04e+06,-5.32e+05
LotFrontage,42.5540,45.270,0.940,0.347,-46.248,131.356
OverallQual,1.601e+04,1144.986,13.983,0.000,1.38e+04,1.83e+04
YearBuilt,101.9224,47.539,2.144,0.032,8.670,195.175
YearRemodAdd,272.4815,60.286,4.520,0.000,154.224,390.740
MasVnrArea,10.1384,5.970,1.698,0.090,-1.573,21.850
BsmtFinSF1,14.0321,2.767,5.071,0.000,8.604,19.461
TotalBsmtSF,7.2571,4.076,1.780,0.075,-0.739,15.253
1stFlrSF,22.9094,19.518,1.174,0.241,-15.377,61.196

0,1,2,3
Omnibus:,694.189,Durbin-Watson:,1.947
Prob(Omnibus):,0.0,Jarque-Bera (JB):,98778.229
Skew:,-1.159,Prob(JB):,0.0
Kurtosis:,43.229,Cond. No.,530000.0


In [77]:
train_opt =  train_opt.drop('OpenPorchSF', axis = 1)
regressor_OLS = sm.OLS(endog = y, exog = train_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.819
Model:,OLS,Adj. R-squared:,0.816
Method:,Least Squares,F-statistic:,295.9
Date:,"Mon, 05 Feb 2018",Prob (F-statistic):,0.0
Time:,15:44:24,Log-Likelihood:,-17296.0
No. Observations:,1460,AIC:,34640.0
Df Residuals:,1437,BIC:,34760.0
Df Model:,22,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-7.916e+05,1.3e+05,-6.083,0.000,-1.05e+06,-5.36e+05
LotFrontage,43.1310,45.252,0.953,0.341,-45.637,131.898
OverallQual,1.605e+04,1143.435,14.034,0.000,1.38e+04,1.83e+04
YearBuilt,101.9119,47.529,2.144,0.032,8.678,195.146
YearRemodAdd,274.4029,60.204,4.558,0.000,156.306,392.499
MasVnrArea,9.9816,5.964,1.674,0.094,-1.718,21.682
BsmtFinSF1,14.0172,2.767,5.066,0.000,8.590,19.444
TotalBsmtSF,7.4984,4.059,1.847,0.065,-0.464,15.460
1stFlrSF,22.8432,19.514,1.171,0.242,-15.435,61.122

0,1,2,3
Omnibus:,692.97,Durbin-Watson:,1.948
Prob(Omnibus):,0.0,Jarque-Bera (JB):,97658.001
Skew:,-1.157,Prob(JB):,0.0
Kurtosis:,43.0,Cond. No.,530000.0


In [78]:
train_opt =  train_opt.drop('Exterior1st_values_14.0', axis = 1)
regressor_OLS = sm.OLS(endog = y, exog = train_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.819
Model:,OLS,Adj. R-squared:,0.816
Method:,Least Squares,F-statistic:,310.0
Date:,"Mon, 05 Feb 2018",Prob (F-statistic):,0.0
Time:,15:44:24,Log-Likelihood:,-17296.0
No. Observations:,1460,AIC:,34640.0
Df Residuals:,1438,BIC:,34750.0
Df Model:,21,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-7.848e+05,1.3e+05,-6.044,0.000,-1.04e+06,-5.3e+05
LotFrontage,44.7259,45.203,0.989,0.323,-43.944,133.396
OverallQual,1.609e+04,1141.837,14.094,0.000,1.39e+04,1.83e+04
YearBuilt,99.1491,47.397,2.092,0.037,6.174,192.125
YearRemodAdd,273.5021,60.186,4.544,0.000,155.441,391.563
MasVnrArea,10.1310,5.961,1.700,0.089,-1.562,21.824
BsmtFinSF1,14.0639,2.766,5.085,0.000,8.639,19.489
TotalBsmtSF,7.3697,4.055,1.817,0.069,-0.585,15.324
1stFlrSF,22.8611,19.511,1.172,0.242,-15.412,61.135

0,1,2,3
Omnibus:,693.259,Durbin-Watson:,1.949
Prob(Omnibus):,0.0,Jarque-Bera (JB):,97698.284
Skew:,-1.158,Prob(JB):,0.0
Kurtosis:,43.008,Cond. No.,529000.0


In [79]:
train_opt =  train_opt.drop('Exterior2nd_values_14.0', axis = 1)
regressor_OLS = sm.OLS(endog = y, exog = train_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.819
Model:,OLS,Adj. R-squared:,0.817
Method:,Least Squares,F-statistic:,325.7
Date:,"Mon, 05 Feb 2018",Prob (F-statistic):,0.0
Time:,15:44:24,Log-Likelihood:,-17296.0
No. Observations:,1460,AIC:,34630.0
Df Residuals:,1439,BIC:,34750.0
Df Model:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-8.053e+05,1.2e+05,-6.727,0.000,-1.04e+06,-5.7e+05
LotFrontage,45.6986,45.127,1.013,0.311,-42.822,134.219
OverallQual,1.608e+04,1141.124,14.092,0.000,1.38e+04,1.83e+04
YearBuilt,104.3522,45.637,2.287,0.022,14.831,193.873
YearRemodAdd,278.8182,58.741,4.747,0.000,163.590,394.046
MasVnrArea,10.0824,5.958,1.692,0.091,-1.605,21.769
BsmtFinSF1,13.8932,2.733,5.083,0.000,8.532,19.255
TotalBsmtSF,7.5275,4.036,1.865,0.062,-0.389,15.444
1stFlrSF,22.6071,19.496,1.160,0.246,-15.636,60.850

0,1,2,3
Omnibus:,694.74,Durbin-Watson:,1.95
Prob(Omnibus):,0.0,Jarque-Bera (JB):,97613.797
Skew:,-1.164,Prob(JB):,0.0
Kurtosis:,42.99,Cond. No.,487000.0


In [80]:
train_opt =  train_opt.drop('2ndFlrSF', axis = 1)
regressor_OLS = sm.OLS(endog = y, exog = train_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.819
Model:,OLS,Adj. R-squared:,0.817
Method:,Least Squares,F-statistic:,342.8
Date:,"Mon, 05 Feb 2018",Prob (F-statistic):,0.0
Time:,15:44:25,Log-Likelihood:,-17296.0
No. Observations:,1460,AIC:,34630.0
Df Residuals:,1440,BIC:,34740.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-8.149e+05,1.19e+05,-6.838,0.000,-1.05e+06,-5.81e+05
LotFrontage,45.0310,45.115,0.998,0.318,-43.468,133.530
OverallQual,1.608e+04,1141.007,14.097,0.000,1.38e+04,1.83e+04
YearBuilt,109.3000,45.259,2.415,0.016,20.520,198.080
YearRemodAdd,278.8496,58.736,4.748,0.000,163.633,394.066
MasVnrArea,10.3612,5.948,1.742,0.082,-1.307,22.029
BsmtFinSF1,13.9912,2.730,5.124,0.000,8.635,19.347
TotalBsmtSF,7.3969,4.032,1.834,0.067,-0.513,15.307
1stFlrSF,6.5037,4.518,1.440,0.150,-2.359,15.366

0,1,2,3
Omnibus:,688.938,Durbin-Watson:,1.949
Prob(Omnibus):,0.0,Jarque-Bera (JB):,95953.803
Skew:,-1.147,Prob(JB):,0.0
Kurtosis:,42.649,Cond. No.,483000.0


In [81]:
train_opt =  train_opt.drop('LotFrontage', axis = 1)
regressor_OLS = sm.OLS(endog = y, exog = train_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.819
Model:,OLS,Adj. R-squared:,0.817
Method:,Least Squares,F-statistic:,361.8
Date:,"Mon, 05 Feb 2018",Prob (F-statistic):,0.0
Time:,15:44:25,Log-Likelihood:,-17297.0
No. Observations:,1460,AIC:,34630.0
Df Residuals:,1441,BIC:,34730.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-8.088e+05,1.19e+05,-6.796,0.000,-1.04e+06,-5.75e+05
OverallQual,1.603e+04,1139.844,14.066,0.000,1.38e+04,1.83e+04
YearBuilt,109.5348,45.258,2.420,0.016,20.756,198.313
YearRemodAdd,276.2698,58.679,4.708,0.000,161.165,391.375
MasVnrArea,10.1451,5.944,1.707,0.088,-1.515,21.805
BsmtFinSF1,14.1683,2.725,5.200,0.000,8.824,19.513
TotalBsmtSF,7.6229,4.026,1.894,0.058,-0.274,15.520
1stFlrSF,7.0746,4.481,1.579,0.115,-1.716,15.865
GrLivArea,36.8778,3.931,9.382,0.000,29.167,44.589

0,1,2,3
Omnibus:,667.386,Durbin-Watson:,1.949
Prob(Omnibus):,0.0,Jarque-Bera (JB):,89492.997
Skew:,-1.085,Prob(JB):,0.0
Kurtosis:,41.294,Cond. No.,482000.0


In [82]:
columns = train_opt.columns.tolist()

In [83]:
columns.remove('const')
train = train[columns]
test = test[columns]

In [84]:
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree = 2)
train_poly = poly_reg.fit_transform(train)
test_poly = poly_reg.transform(test)

In [85]:
# Fitting Simple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(train_poly, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [86]:
y_kaggle = regressor.predict(test_poly)

In [87]:
submission = pd.DataFrame({
        "Id": df_ids,
        "SalePrice": y_kaggle
    })

submission.to_csv('submission.csv', index=False)

In [88]:
# visualize the relationship between the features and the response using scatterplots
sns.pairplot(pd.concat(objs=[train, y], axis = 1), x_vars=columns, y_vars='SalePrice', size=7, aspect=0.7)

SyntaxError: invalid syntax (<ipython-input-88-25412c07e1bc>, line 1)