- Adding Dummy Variables/Handle Dummy Variable Trap
- Scale Data
- Stats OLS Model 
- Lasso
- Ridge

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 
from statsmodels.api import OLS
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression, Lasso, Ridge

from scipy import stats

In [2]:
data = pd.read_pickle('data/train_no_missing_data.pkl')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   Id             1460 non-null   int64   
 1   MSSubClass     1460 non-null   int64   
 2   MSZoning       1460 non-null   category
 3   LotFrontage    1460 non-null   float64 
 4   LotArea        1460 non-null   int64   
 5   Street         1460 non-null   category
 6   Alley          1460 non-null   category
 7   LotShape       1460 non-null   category
 8   LandContour    1460 non-null   category
 9   Utilities      1460 non-null   category
 10  LotConfig      1460 non-null   category
 11  LandSlope      1460 non-null   category
 12  Neighborhood   1460 non-null   category
 13  Condition1     1460 non-null   category
 14  Condition2     1460 non-null   category
 15  BldgType       1460 non-null   category
 16  HouseStyle     1460 non-null   category
 17  OverallQual    1460 non-null   in

In [3]:
X = data.iloc[:, 1:-1]  # dropping Id column as its not a predictor 
y = data.iloc[:, -1].values

X.shape, y.shape

((1460, 79), (1460,))

In [4]:
cat_columns = X.select_dtypes('category').columns
cat_columns , len(cat_columns)

(Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
        'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
        'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
        'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
        'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
        'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
        'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
        'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
        'SaleType', 'SaleCondition'],
       dtype='object'),
 43)

## Adding Dummy Variables/Handle Dummy Variable Trap

In [5]:
def add_dummy(X, cat_columns):
    dropped_dummy_name =[]

    for col in cat_columns:
        col_dummy = pd.get_dummies(X[col], prefix=col)

        # drop last column to avoid dummy variable trap 
        dropped_dummy_name.append(col_dummy.columns[-1])
        #col_dummy = col_dummy.iloc[:,0:-1]            

        X = X.join(col_dummy)
        
    return X, dropped_dummy_name

In [6]:
X_new , dropped_dummy_name = add_dummy(X, cat_columns)
len(dropped_dummy_name)  

43

In [7]:
len(X_new.columns)

345

In [8]:
col_to_be_removed = []
col_to_be_removed.append(cat_columns)
col_to_be_removed.append(dropped_dummy_name)
col_to_be_removed = np.concatenate(col_to_be_removed)

len(col_to_be_removed)


86

In [9]:
# drop the columns for which dummy variables have been added
final_column = [col for col in X_new.columns if col not in col_to_be_removed]
len(final_column)

259

In [10]:
X_final = X_new[final_column].values

In [11]:
len(X_final[0,:])

259

## Scale Data

In [12]:
sc = StandardScaler()
X_scaled_final = sc.fit_transform(X_final)

## Stats OLS Model 

## Test/Train split 

In [13]:
X_scaled_final.shape

(1460, 259)

In [14]:
#Adding X_0 =1
X_ff = np.hstack((np.ones((X_scaled_final.shape[0], 1)), X_scaled_final ))
X_ff.shape

(1460, 260)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_ff, y, test_size =0.2, random_state =0)

In [16]:
'''Fitting regressor on all variables'''
model = OLS(y_train,X_train)
regressor = model.fit()
regressor.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.945
Model:,OLS,Adj. R-squared:,0.931
Method:,Least Squares,F-statistic:,65.3
Date:,"Mon, 20 Jul 2020",Prob (F-statistic):,0.0
Time:,18:07:19,Log-Likelihood:,-13125.0
No. Observations:,1168,AIC:,26740.0
Df Residuals:,923,BIC:,27980.0
Df Model:,244,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.806e+05,625.787,288.563,0.000,1.79e+05,1.82e+05
x1,-2643.9701,3550.089,-0.745,0.457,-9611.152,4323.212
x2,2529.9099,1030.873,2.454,0.014,506.783,4553.037
x3,7342.6144,1090.344,6.734,0.000,5202.773,9482.456
x4,8571.9076,1451.461,5.906,0.000,5723.361,1.14e+04
x5,5799.9719,1006.571,5.762,0.000,3824.538,7775.405
x6,9773.6006,2546.201,3.839,0.000,4776.586,1.48e+04
x7,2303.2508,1228.226,1.875,0.061,-107.189,4713.691
x8,4917.7368,1062.599,4.628,0.000,2832.346,7003.128

0,1,2,3
Omnibus:,313.554,Durbin-Watson:,2.028
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7009.031
Skew:,0.684,Prob(JB):,0.0
Kurtosis:,14.923,Cond. No.,4.72e+16


In [17]:
y_pred = regressor.predict(X_test)

In [18]:
print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))

RMSE:  54381.78335187738


In [19]:
regressor.save('data/ols_model.pkl')

## LASSO Model 

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled_final, y, test_size =0.2, random_state =0)

In [21]:
lasso = Lasso(alpha= 0.5, max_iter=20000, random_state=0) # tune max_iter, alpha (regularization_parameter)

In [22]:
lasso.fit(X_train, y_train)

Lasso(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=20000,
      normalize=False, positive=False, precompute=False, random_state=0,
      selection='cyclic', tol=0.0001, warm_start=False)

In [23]:
y_pred = lasso.predict(X_test)

In [24]:
print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))

RMSE:  59135.275288111596


## Ridge Model

In [25]:
ridge = Ridge(alpha =0.5, random_state=0, max_iter=20000)

In [26]:
ridge.fit(X_train, y_train)

Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=20000,
      normalize=False, random_state=0, solver='auto', tol=0.001)

In [27]:
y_pred = ridge.predict(X_test)

In [28]:
print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))

RMSE:  59356.2747811077


### check on test data

In [29]:
test_data = pd.read_csv('data/test.csv')
test_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [30]:
col_with_na = [col for col in test_data.columns if test_data[col].isnull().sum() > 0]

for col in col_with_na:
    print('\n', col, ' <--> ', test_data[col].unique() , ' <--> ', test_data[col].isnull().sum())


 MSZoning  <-->  ['RH' 'RL' 'RM' 'FV' 'C (all)' nan]  <-->  4

 LotFrontage  <-->  [ 80.  81.  74.  78.  43.  75.  nan  63.  85.  70.  26.  21.  24. 102.
  94.  90.  79. 110. 105.  41. 100.  67.  60.  73.  92.  84.  39.  88.
  25.  30.  57.  68.  98. 120.  87. 119.  65.  56.  69.  50.  53.  52.
  51.  72.  86. 124.  44.  83.  64.  82.  38.  89.  35.  58.  66.  93.
  31.  76.  28.  61.  95. 129.  59.  77.  96.  47.  34. 117.  48.  62.
  42. 106. 112.  32. 115.  71.  45. 109. 113. 125. 101. 104. 108. 130.
 135.  36.  55. 136.  97.  91.  37.  22. 103.  99.  40. 123.  54. 107.
 150. 160. 195. 128.  33. 118. 134. 155. 126.  46. 149. 200. 121. 131.
 114.  49. 133. 140.]  <-->  227

 Alley  <-->  [nan 'Pave' 'Grvl']  <-->  1352

 Utilities  <-->  ['AllPub' nan]  <-->  2

 Exterior1st  <-->  ['VinylSd' 'Wd Sdng' 'HdBoard' 'Plywood' 'MetalSd' 'CemntBd' 'WdShing'
 'BrkFace' 'AsbShng' 'BrkComm' 'Stucco' 'AsphShn' nan 'CBlock']  <-->  1

 Exterior2nd  <-->  ['VinylSd' 'Wd Sdng' 'HdBoard' 'Plywood

In [31]:
test_X = test_data.iloc[:, 1:]  # dropping Id column as its not a predictor 

test_X_new , _ = add_dummy(test_X, cat_columns)


In [32]:
test_X.shape, test_X_new.shape

((1459, 79), (1459, 313))

In [33]:
test_X

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
4,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2006,WD,Normal
1455,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2006,WD,Abnorml
1456,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,9,2006,WD,Abnorml
1457,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [34]:

test_X_final = test_X_new[final_column].values

KeyError: "['Heating_Floor', 'Condition2_RRAn', 'BsmtCond_No Basement', 'Exterior1st_Stone', 'BsmtFinType2_No Basement', 'RoofMatl_Membran', 'GarageQual_No Garage', 'Alley_No Alley', 'Heating_OthW', 'MiscFeature_None', 'GarageCond_No Garage', 'RoofMatl_Roll', 'FireplaceQu_No Fireplace', 'PoolQC_Fa', 'GarageQual_Ex', 'Electrical_Mix', 'GarageFinish_No Garage', 'BsmtQual_No Basement', 'RoofMatl_ClyTile', 'Condition2_RRAe', 'Exterior1st_ImStucc', 'BsmtFinType1_No Basement', 'RoofMatl_Metal', 'Exterior2nd_Other', 'HouseStyle_2.5Fin'] not in index"