# Loading libraries

In [1]:
import pandas as pd
import numpy as np
import warnings
# Blanket filter: suppresses every warning (including deprecation notices from
# pandas / scikit-learn) for the rest of the session.
warnings.filterwarnings("ignore")

# Load data

In [4]:
# Read the housing-prices CSV; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="Data/DATA_Housing_Prices.csv")
# Preview the first five rows.
data.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# Defining X, y

In [5]:
# Features: every column except the row identifier and the target.
X = data.drop(columns=['Id', 'SalePrice'])
# Target: natural logarithm of the sale price.
y = data['SalePrice'].pipe(np.log)

# Data splitting

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
# train_test_split keeps pandas inputs as pandas objects (same columns, original
# row index), so the frames do not need to be re-wrapped in pd.DataFrame.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

In [7]:
# Summary statistics of the training features; a `count` below the number of
# rows means that column contains missing values.
X_train.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,1168.0,951.0,1168.0,1168.0,1168.0,1168.0,1168.0,1162.0,1168.0,1168.0,...,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0,1168.0
mean,56.849315,70.343849,10689.642123,6.121575,5.58476,1970.965753,1984.89726,103.771945,446.023973,45.152397,...,476.273973,95.946918,49.578767,21.839041,3.8125,15.407534,2.955479,51.267123,6.356164,2007.818493
std,42.531862,24.897021,10759.366198,1.367619,1.116062,30.675495,20.733955,173.032238,459.070977,158.217499,...,211.095373,129.685939,69.43358,62.083227,31.519664,55.881148,41.648504,553.039684,2.670707,1.322639
min,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,20.0,59.0,7587.25,5.0,5.0,1953.0,1966.0,0.0,0.0,0.0,...,341.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0
50%,50.0,70.0,9600.0,6.0,5.0,1972.0,1994.0,0.0,384.5,0.0,...,482.0,0.0,27.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,70.0,80.0,11700.0,7.0,6.0,2001.0,2004.0,166.0,721.0,0.0,...,576.0,168.0,74.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1378.0,5644.0,1127.0,...,1418.0,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0


# Variance threshold method

Univariate method

In [18]:
from sklearn.feature_selection import VarianceThreshold # It only works with numerical features

# Features whose training-set variance is below this value are removed.
# Variance is scale-dependent, so this cut-off is only meaningful for the raw
# (unscaled) units of these columns.
VARIANCE_THRESHOLD = 100

# VarianceThreshold only accepts numeric input, so restrict both splits first.
X_train = X_train.select_dtypes(include=np.number)
X_test  = X_test.select_dtypes(include=np.number)

print("Initial number of numerical columns: ",X_train.shape)
print()

selector = VarianceThreshold(threshold=VARIANCE_THRESHOLD) # Default threshold value is 0
# Fit on the training split only; the same column mask is then applied to test.
selector.fit(X_train)

# get_support() returns a boolean mask over the input columns (True = kept).
kept_features = list(X_train.columns[selector.get_support()])

# transform() yields plain arrays; rebuild the frames with the surviving column
# names AND the original row index so rows stay aligned with y_train / y_test.
X_train = pd.DataFrame(selector.transform(X_train), columns=kept_features, index=X_train.index)
X_test  = pd.DataFrame(selector.transform(X_test), columns=kept_features, index=X_test.index)

print("Final number of numerical columns: ",X_train.shape)
print()
X_train

Initial number of numerical columns:  (1022, 36)

Final number of numerical columns:  (1022, 23)



Unnamed: 0,MSSubClass,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,...,GrLivArea,GarageYrBlt,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal
0,20.0,80.0,10400.0,1970.0,1970.0,288.0,0.0,0.0,1304.0,1304.0,...,1682.0,1970.0,530.0,98.0,0.0,0.0,0.0,0.0,0.0,0.0
1,180.0,35.0,3675.0,2005.0,2005.0,80.0,547.0,0.0,0.0,547.0,...,1072.0,2005.0,525.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0
2,60.0,72.0,8640.0,2009.0,2009.0,0.0,24.0,0.0,732.0,756.0,...,1547.0,2009.0,614.0,169.0,45.0,0.0,0.0,0.0,0.0,0.0
3,20.0,84.0,11670.0,2006.0,2006.0,302.0,0.0,0.0,1905.0,1905.0,...,1905.0,2006.0,788.0,0.0,191.0,0.0,0.0,0.0,0.0,0.0
4,60.0,43.0,10667.0,1996.0,1996.0,0.0,385.0,344.0,70.0,799.0,...,1661.0,1996.0,550.0,158.0,61.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017,20.0,78.0,9317.0,2006.0,2006.0,0.0,24.0,0.0,1290.0,1314.0,...,1314.0,2006.0,440.0,0.0,22.0,0.0,0.0,0.0,0.0,0.0
1018,50.0,65.0,7804.0,1928.0,1950.0,0.0,622.0,0.0,500.0,1122.0,...,1981.0,1981.0,576.0,431.0,44.0,0.0,0.0,0.0,0.0,0.0
1019,20.0,60.0,8172.0,1955.0,1990.0,0.0,167.0,0.0,697.0,864.0,...,864.0,1957.0,572.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1020,50.0,55.0,7642.0,1918.0,1998.0,0.0,0.0,0.0,912.0,912.0,...,1426.0,1925.0,216.0,0.0,240.0,0.0,0.0,0.0,0.0,0.0


# Correlation matrix

Univariate method

In [19]:
# Absolute pairwise correlations between the numeric columns of the raw data.
# Computed in this cell so it runs on a fresh kernel (Restart & Run All) instead
# of relying on a `c` left over from a cell executed out of order.
c = data.select_dtypes(include=np.number).corr().abs()
c

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
Id,1.0,0.011156,0.010601,0.033226,0.028365,0.012609,0.012713,0.021998,0.050298,0.005024,...,0.029643,0.000477,0.002889,0.046635,0.00133,0.057044,0.006242,0.021172,0.000712,0.021917
MSSubClass,0.011156,1.0,0.386347,0.139781,0.032628,0.059316,0.02785,0.040581,0.022936,0.069836,...,0.012579,0.0061,0.012037,0.043825,0.02603,0.008283,0.007683,0.013585,0.021407,0.084284
LotFrontage,0.010601,0.386347,1.0,0.426095,0.251646,0.059213,0.123349,0.088866,0.193458,0.233633,...,0.088521,0.151972,0.0107,0.070029,0.041383,0.206167,0.003368,0.0112,0.00745,0.351799
LotArea,0.033226,0.139781,0.426095,1.0,0.105806,0.005636,0.014228,0.013788,0.10416,0.214103,...,0.171698,0.084774,0.01834,0.020423,0.04316,0.077672,0.038068,0.001205,0.014261,0.263843
OverallQual,0.028365,0.032628,0.251646,0.105806,1.0,0.091932,0.572323,0.550684,0.411876,0.239666,...,0.238923,0.308819,0.113937,0.030371,0.064886,0.065166,0.031406,0.070815,0.027347,0.790982
OverallCond,0.012609,0.059316,0.059213,0.005636,0.091932,1.0,0.375983,0.073741,0.128101,0.046231,...,0.003334,0.032589,0.070356,0.025504,0.054811,0.001985,0.068777,0.003511,0.04395,0.077856
YearBuilt,0.012713,0.02785,0.123349,0.014228,0.572323,0.375983,1.0,0.592855,0.315707,0.249503,...,0.22488,0.188686,0.387268,0.031355,0.050364,0.00495,0.034383,0.012398,0.013618,0.522897
YearRemodAdd,0.021998,0.040581,0.088866,0.013788,0.550684,0.073741,0.592855,1.0,0.179618,0.128451,...,0.205726,0.226298,0.193919,0.045286,0.03874,0.005829,0.010286,0.02149,0.035743,0.507101
MasVnrArea,0.050298,0.022936,0.193458,0.10416,0.411876,0.128101,0.315707,0.179618,1.0,0.264736,...,0.159718,0.125703,0.110204,0.018796,0.061466,0.011723,0.029815,0.005965,0.008201,0.477493
BsmtFinSF1,0.005024,0.069836,0.233633,0.214103,0.239666,0.046231,0.249503,0.128451,0.264736,1.0,...,0.204306,0.111761,0.102303,0.026451,0.062021,0.140491,0.003571,0.015727,0.014359,0.38642


In [22]:
import seaborn as sns
import matplotlib.pyplot as plt

# Minimum absolute correlation with the target for a feature to be kept.
c_thr = .3

# Restrict to numeric columns explicitly: from pandas 2.0 on, DataFrame.corr()
# no longer drops non-numeric columns silently and raises on string columns.
c = data.select_dtypes(include=np.number).corr().abs()

# Optional: visualise the full matrix.
#fig, ax = plt.subplots(figsize=(14,14))
#sns.heatmap(c, annot=True);

# Absolute correlation of every numeric column with the target, strongest first.
c_last = c['SalePrice'].sort_values(ascending=False)

# Keep the features above the threshold (strongest first) and put the target
# last. The target is excluded by name rather than by assuming it is always the
# first entry of the sorted series.
above_thr = c_last[c_last > c_thr].index
cols_to_keep = [col for col in above_thr if col != 'SalePrice'] + ['SalePrice']
print(cols_to_keep)

data[cols_to_keep]

['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'MasVnrArea', 'Fireplaces', 'BsmtFinSF1', 'LotFrontage', 'WoodDeckSF', '2ndFlrSF', 'OpenPorchSF', 'SalePrice']


Unnamed: 0,OverallQual,GrLivArea,GarageCars,GarageArea,TotalBsmtSF,1stFlrSF,FullBath,TotRmsAbvGrd,YearBuilt,YearRemodAdd,GarageYrBlt,MasVnrArea,Fireplaces,BsmtFinSF1,LotFrontage,WoodDeckSF,2ndFlrSF,OpenPorchSF,SalePrice
0,7,1710,2,548,856,856,2,8,2003,2003,2003.0,196.0,0,706,65.0,0,854,61,208500
1,6,1262,2,460,1262,1262,2,6,1976,1976,1976.0,0.0,1,978,80.0,298,0,0,181500
2,7,1786,2,608,920,920,2,6,2001,2002,2001.0,162.0,1,486,68.0,0,866,42,223500
3,7,1717,3,642,756,961,1,7,1915,1970,1998.0,0.0,1,216,60.0,0,756,35,140000
4,8,2198,3,836,1145,1145,2,9,2000,2000,2000.0,350.0,1,655,84.0,192,1053,84,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,6,1647,2,460,953,953,2,7,1999,2000,1999.0,0.0,1,0,62.0,0,694,40,175000
1456,6,2073,2,500,1542,2073,2,7,1978,1988,1978.0,119.0,2,790,85.0,349,0,0,210000
1457,7,2340,1,252,1152,1188,2,9,1941,2006,1941.0,0.0,2,275,66.0,0,1152,60,266500
1458,5,1078,1,240,1078,1078,1,5,1950,1996,1950.0,0.0,0,49,68.0,366,0,0,142125


# Recursive feature elimination

In [23]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

# Start again from a fresh split so this cell does not depend on earlier filtering.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Linear regression needs numeric input.
X_train = X_train.select_dtypes(include=np.number)
X_test  = X_test.select_dtypes(include=np.number)

# Do NOT re-wrap with pd.DataFrame(..., columns=X.columns) here: that reindexes
# back to every original column and re-adds the non-numeric ones as all-NaN.

# Count missing values per training column.
nulls = X_train.isna().sum().rename_axis('Column').reset_index(name='nas')
# Too drastic, but made on purpose for quick filtering (don't do this in production!!)
cols_to_drop = nulls.loc[nulls['nas'] > 0, 'Column']

# The drop list comes from the training split only and is applied to both splits.
X_train = X_train.drop(columns=cols_to_drop)
X_test  = X_test.drop(columns=cols_to_drop)

lm = LinearRegression()

# RFE refits the estimator repeatedly, removing `step` feature(s) per round,
# until `n_features_to_select` remain.
# NOTE(review): the features are unscaled, so the coefficient-based ranking
# probably favours small-range columns; consider standardising first.
selector = RFE(lm, n_features_to_select= 8, step = 1, verbose = 1) # Step is how many features to drop every round
selector.fit(X_train, y_train)

# get_support() returns a boolean mask over the input columns (True = kept).
kept_features = list(X_train.columns[selector.get_support()])

# transform() yields plain arrays; rebuild the frames with the surviving column
# names and the original row index so rows stay aligned with y_train / y_test.
X_train = pd.DataFrame(selector.transform(X_train), columns=kept_features, index=X_train.index)
X_test  = pd.DataFrame(selector.transform(X_test), columns=kept_features, index=X_test.index)

print("Final selected features: ")
display(X_train)


Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.
Fitting estimator with 24 features.
Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Final selected features: 


Unnamed: 0,OverallQual,BsmtFullBath,BsmtHalfBath,FullBath,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars
0,5,1,0,1,1,5,0,1
1,6,0,0,2,1,7,1,2
2,5,0,0,1,1,4,0,0
3,5,1,0,1,1,7,2,1
4,5,0,0,2,1,6,1,1
...,...,...,...,...,...,...,...,...
1163,6,0,0,2,1,6,1,2
1164,4,1,0,2,1,7,2,2
1165,5,1,0,1,1,5,0,2
1166,7,0,0,1,1,7,1,1


## Embedded Methods

In [16]:
from sklearn.linear_model import Lasso,Ridge,ElasticNet, LinearRegression

# Fresh 70/30 split for the embedded-method demo.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Lasso needs numeric input.
X_train = X_train.select_dtypes(include=np.number)
X_test  = X_test.select_dtypes(include=np.number)

# The numeric features still contain missing values and scikit-learn's linear
# models reject NaN input, so fill the gaps first. The medians are computed on
# the training split only and reused for the test split to avoid leakage.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test  = X_test.fillna(train_medians)

# Lasso's L1 penalty can drive coefficients to exactly zero, which is what makes
# it an embedded feature-selection method.
model=Lasso()

model.fit(X_train, y_train)
print(f"{model.__class__.__name__}: Train -> {model.score(X_train, y_train)}, Test -> {model.score(X_test, y_test)}")