In [183]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import classification_report,confusion_matrix,  mean_squared_error
from sklearn.model_selection import train_test_split


## Get the Dataset

In [100]:
data=pd.read_csv('/Users/jafa/Downloads/Housing Pricing.csv')

In [101]:
data.head(2)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500


In [102]:
data.shape

(1460, 77)

In [103]:
data.drop('Id',inplace=True, axis=1)

In [104]:
data.head(1)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2008,WD,Normal,208500


In [105]:
original_features=list(data.columns)
len(original_features)

76

In [106]:
features_missing_values=list(data.columns[data.isna().any()])
len(features_missing_values)

15

In [149]:
features_missing_values

['LotFrontage',
 'MasVnrType',
 'MasVnrArea',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageQual',
 'GarageCond']

In [107]:
len(data)

1460

## Fill in Missing Features

In [108]:
bfill,mean_fill = [],[]

In [109]:
for feature in features_missing_values:
  print(feature,' ',data[feature].isna().sum()/len(data)*100,' ',data[feature].dtype)
  if data[feature].dtype=='float64':
    mean_fill.append(feature)
  else:
    bfill.append(feature)

LotFrontage   17.73972602739726   float64
MasVnrType   0.547945205479452   object
MasVnrArea   0.547945205479452   float64
BsmtQual   2.5342465753424657   object
BsmtCond   2.5342465753424657   object
BsmtExposure   2.6027397260273974   object
BsmtFinType1   2.5342465753424657   object
BsmtFinType2   2.6027397260273974   object
Electrical   0.0684931506849315   object
FireplaceQu   47.26027397260274   object
GarageType   5.5479452054794525   object
GarageYrBlt   5.5479452054794525   float64
GarageFinish   5.5479452054794525   object
GarageQual   5.5479452054794525   object
GarageCond   5.5479452054794525   object


In [110]:
mean_fill

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [111]:
for feature in mean_fill:
  data[feature].fillna(data[feature].mean(),inplace=True)

In [112]:
for feature in bfill:
  data[feature].fillna(method='bfill',inplace=True)

In [113]:
data.columns[data.isna().any()]

Index(['FireplaceQu'], dtype='object')

In [114]:
data['FireplaceQu'].isna().sum()

2

In [115]:
data['FireplaceQu'].fillna(method='ffill',inplace=True)

# Encode the Dataset

In [116]:
categorical_features=list(data.select_dtypes(include=['object']).columns)
categorical_features

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [117]:
numerical=list(set(original_features)-set(categorical_features))
numerical

['OverallCond',
 'YearBuilt',
 'HalfBath',
 'BedroomAbvGr',
 'GrLivArea',
 'MoSold',
 'OpenPorchSF',
 'GarageArea',
 '3SsnPorch',
 'BsmtFinSF1',
 'TotRmsAbvGrd',
 '1stFlrSF',
 'PoolArea',
 'OverallQual',
 'MSSubClass',
 'LotArea',
 'FullBath',
 'ScreenPorch',
 'BsmtUnfSF',
 'LotFrontage',
 'WoodDeckSF',
 'EnclosedPorch',
 'YearRemodAdd',
 'BsmtFinSF2',
 'YrSold',
 'BsmtHalfBath',
 'SalePrice',
 'MiscVal',
 'MasVnrArea',
 'Fireplaces',
 'GarageYrBlt',
 'BsmtFullBath',
 'GarageCars',
 '2ndFlrSF',
 'KitchenAbvGr',
 'LowQualFinSF',
 'TotalBsmtSF']

In [118]:
categorical_features

['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'SaleType',
 'SaleCondition']

In [119]:
nominal=['MSZoning','LandContour','Neighborhood']
ordinal=list(set(categorical_features)-set(nominal))

In [120]:
target=['SalePrice']

In [121]:
df_nominal=pd.get_dummies(data[nominal])

In [122]:
for feature in ordinal:
  data[feature]=data[feature].astype('category').cat.codes

df_ordinal=data[ordinal]

In [123]:
data[numerical]

Unnamed: 0,OverallCond,YearBuilt,HalfBath,BedroomAbvGr,GrLivArea,MoSold,OpenPorchSF,GarageArea,3SsnPorch,BsmtFinSF1,...,MiscVal,MasVnrArea,Fireplaces,GarageYrBlt,BsmtFullBath,GarageCars,2ndFlrSF,KitchenAbvGr,LowQualFinSF,TotalBsmtSF
0,5,2003,1,3,1710,2,61,548,0,706,...,0,196.0,0,2003.0,1,2,854,1,0,856
1,8,1976,0,3,1262,5,0,460,0,978,...,0,0.0,1,1976.0,0,2,0,1,0,1262
2,5,2001,1,3,1786,9,42,608,0,486,...,0,162.0,1,2001.0,1,2,866,1,0,920
3,5,1915,0,3,1717,2,35,642,0,216,...,0,0.0,1,1998.0,1,3,756,1,0,756
4,5,2000,1,4,2198,12,84,836,0,655,...,0,350.0,1,2000.0,1,3,1053,1,0,1145
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,5,1999,1,3,1647,8,40,460,0,0,...,0,0.0,1,1999.0,0,2,694,1,0,953
1456,6,1978,0,3,2073,2,0,500,0,790,...,0,119.0,2,1978.0,1,2,0,1,0,1542
1457,9,1941,0,4,2340,5,60,252,0,275,...,2500,0.0,2,1941.0,0,1,1152,1,0,1152
1458,6,1950,0,2,1078,4,0,240,0,49,...,0,0.0,0,1950.0,1,1,0,1,0,1078


In [124]:
new_data=pd.concat([df_nominal,df_ordinal,data[numerical]],axis=1)

In [125]:
new_data.shape

(1460, 107)

# Standardizing the dataset

In [126]:
new_data.drop('SalePrice',axis=1,inplace=True)

In [127]:
X=new_data.to_numpy()

In [152]:
X

array([[-0.08304548, -0.21585871, -0.10526316, ..., -0.21145358,
        -0.12024172, -0.45930254],
       [-0.08304548, -0.21585871, -0.10526316, ..., -0.21145358,
        -0.12024172,  0.46646492],
       [-0.08304548, -0.21585871, -0.10526316, ..., -0.21145358,
        -0.12024172, -0.31336875],
       ...,
       [-0.08304548, -0.21585871, -0.10526316, ..., -0.21145358,
        -0.12024172,  0.21564122],
       [-0.08304548, -0.21585871, -0.10526316, ..., -0.21145358,
        -0.12024172,  0.04690528],
       [-0.08304548, -0.21585871, -0.10526316, ..., -0.21145358,
        -0.12024172,  0.45278362]])

In [129]:
X=StandardScaler().fit_transform(X)

In [130]:
X[0]

array([-0.08304548, -0.21585871, -0.10526316,  0.51813339, -0.41895507,
       -0.21235968, -0.18831089, -0.15899968,  0.33712564, -0.10854037,
       -0.03703704, -0.10526316, -0.20339487, -0.1398323 ,  2.95522137,
       -0.19025216, -0.27116307, -0.23917551, -0.16124951, -0.10854037,
       -0.1863522 , -0.42683279, -0.07875671, -0.22941573, -0.16998114,
       -0.23595776, -0.28963792, -0.13199092, -0.23106504, -0.20521398,
       -0.25018188, -0.13199092, -0.16347148, -0.08712888, -0.03628912,
        0.2085023 ,  0.25597761,  0.74309239,  1.11993319, -0.23012211,
        1.02668924, -0.03174026, -0.22571613,  0.22403433, -0.77797579,
       -0.49151573,  0.64046232, -0.40936914,  0.28974476, -1.23804229,
       -0.12304604, -0.40779461,  0.26381257,  0.83559903,  0.30941909,
       -0.29055229,  0.303692  ,  0.18797343,  0.28780036, -0.88465767,
        0.06423821,  0.75162623,  0.31386709,  0.60466978, -0.41169079,
       -0.71775144,  0.75073056, -0.02618016, -0.12579688,  0.36

In [131]:
y=data[target]

In [132]:
y=y.to_numpy()

In [133]:
y.shape

(1460, 1)

In [134]:
y

array([[208500],
       [181500],
       [223500],
       ...,
       [266500],
       [142125],
       [147500]])

In [135]:
y=StandardScaler().fit_transform(y)

In [136]:
y

array([[ 0.34727322],
       [ 0.00728832],
       [ 0.53615372],
       ...,
       [ 1.07761115],
       [-0.48852299],
       [-0.42084081]])

In [137]:
X.shape

(1460, 106)

In [211]:
y=data[target]
y

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000
...,...
1455,175000
1456,210000
1457,266500
1458,142125


# Feature Selection Using L1

In [212]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)

In [None]:
reg = LogisticRegression()
reg.fit(X_train,y_train)

  return f(*args, **kwargs)


In [None]:
print('Learnt coefficients:', reg.coef_)
print('Learnt intercept:', reg.intercept_)


In [197]:
y_hat=reg.predict(X_test)

In [198]:
len(y_hat)

438

In [199]:
len(y_test)

438

In [203]:
mean_squared_error(y_test,y_hat, squared=False)

39221.14651445418