In [1]:
import pandas as pd

Problem Statement: https://www.kaggle.com/c/house-prices-advanced-regression-techniques
Dataset: https://github.com/abulbasar/data/tree/master/kaggle-houseprice

In [2]:
df = pd.read_csv("/data/kaggle/data_combined_cleaned.csv")
df = df.drop(columns=["Id"])
df = df[~df.SalesPrice.isnull()]
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,LotConfig,LandSlope,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalesPrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,Inside,Gtl,...,0,,,,0,2,2008,WD,Normal,208500.0
1,20,RL,80.0,9600,Pave,,Reg,Lvl,FR2,Gtl,...,0,,,,0,5,2007,WD,Normal,181500.0
2,60,RL,68.0,11250,Pave,,IR1,Lvl,Inside,Gtl,...,0,,,,0,9,2008,WD,Normal,223500.0
3,70,RL,60.0,9550,Pave,,IR1,Lvl,Corner,Gtl,...,0,,,,0,2,2006,WD,Abnorml,140000.0
4,60,RL,84.0,14260,Pave,,IR1,Lvl,FR2,Gtl,...,0,,,,0,12,2008,WD,Normal,250000.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 0 to 1459
Data columns (total 79 columns):
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1460 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            1460 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 non-null object
MasVnrType       1460 no

# Exercise:
- Build a model to predict the SalesPrice using other columns as features. Split the dataset (1460 records) into training and test sets with 70/30 ratio. Take log of SalesPrice for prediction to reduce the impact of the outliers.
- What is the R2 score and rmse score? 
- Fit Lasso model. Test a few alpha values (>0) and see which gives a good r2 score. 

In [4]:
import numpy as np
from sklearn import preprocessing, model_selection, linear_model, metrics, pipeline

In [5]:
target = "SalesPrice"
X = df.drop(columns=[target])
X = pd.get_dummies(X, drop_first=True)
columns = X.columns
X = X.values.astype("float")
y = np.log(df[target])

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y,
                           test_size = 0.3, random_state = 1)

pipe = pipeline.Pipeline([
    ("poly", preprocessing.PolynomialFeatures(degree=1
                                    , include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    ("est", linear_model.LinearRegression())
])


pipe.fit(X_train, y_train)

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

print("training r2", metrics.r2_score(y_train, y_train_pred))
print("test r2", metrics.r2_score(y_test, y_test_pred))

print("training rmse:"
    , np.sqrt(metrics.mean_squared_error(y_train, y_train_pred)))
print("test rmse:"
      , np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))
est = pipe.steps[-1][-1]
pd.DataFrame({"feature": columns, "coefficient": est.coef_})

training r2 0.9494518297088844
test r2 -3.0715943247509893e+22
training rmse: 0.08606817747710854
test rmse: 76190488351.17424


Unnamed: 0,feature,coefficient
0,MSSubClass,1.248159e-02
1,LotFrontage,1.963124e-03
2,LotArea,3.151273e-02
3,OverallQual,5.675519e-02
4,OverallCond,3.468973e-02
5,YearBuilt,5.024337e-02
6,YearRemodAdd,1.540301e-02
7,MasVnrArea,-3.995381e-03
8,BsmtFinSF1,4.530257e+10
9,BsmtFinSF2,1.612199e+10


In [6]:
target = "SalesPrice"
X = df.drop(columns=[target])
X = pd.get_dummies(X, drop_first=True)
columns = X.columns
X = X.values.astype("float")
y = np.log(df[target])

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y,
                           test_size = 0.3, random_state = 1)

pipe = pipeline.Pipeline([
    ("poly", preprocessing.PolynomialFeatures(degree=1
                                    , include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    ("est", linear_model.Lasso(alpha=0.01, random_state=1))
])


pipe.fit(X_train, y_train)

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

print("training r2", metrics.r2_score(y_train, y_train_pred))
print("test r2", metrics.r2_score(y_test, y_test_pred))

print("training rmse:"
    , np.sqrt(metrics.mean_squared_error(y_train, y_train_pred)))
print("test rmse:"
      , np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))
est = pipe.steps[-1][-1]


summary = pd.DataFrame({"feature": columns, "coefficient": est.coef_})
summary["coef_mag"] = np.abs(summary.coefficient)
summary = summary.sort_values("coef_mag", ascending = False)
summary = summary[summary.coef_mag>0]
summary

training r2 0.8940480673803641
test r2 0.8785389987905549
training rmse: 0.12460754170603186
test rmse: 0.15150859029885974


Unnamed: 0,feature,coefficient,coef_mag
3,OverallQual,0.104135,0.104135
15,GrLivArea,0.095563,0.095563
25,GarageCars,0.055896,0.055896
5,YearBuilt,0.029465,0.029465
39,MSZoning_RM,-0.026432,0.026432
4,OverallCond,0.023309,0.023309
6,YearRemodAdd,0.021713,0.021713
70,Neighborhood_NridgHt,0.020148,0.020148
60,Neighborhood_Crawfor,0.019489,0.019489
16,BsmtFullBath,0.01803,0.01803


In [7]:
summary = pd.DataFrame({"feature": columns, "coefficient": est.coef_})
summary["coef_mag"] = np.abs(summary.coefficient)
summary = summary.sort_values("coef_mag", ascending = False)
summary = summary[summary.coef_mag>0]
summary

Unnamed: 0,feature,coefficient,coef_mag
3,OverallQual,0.104135,0.104135
15,GrLivArea,0.095563,0.095563
25,GarageCars,0.055896,0.055896
5,YearBuilt,0.029465,0.029465
39,MSZoning_RM,-0.026432,0.026432
4,OverallCond,0.023309,0.023309
6,YearRemodAdd,0.021713,0.021713
70,Neighborhood_NridgHt,0.020148,0.020148
60,Neighborhood_Crawfor,0.019489,0.019489
16,BsmtFullBath,0.01803,0.01803


In [8]:
target = "SalesPrice"
X = df.drop(columns=[target])
X = pd.get_dummies(X, drop_first=True)
columns = X.columns
X = X.values.astype("float")
y = np.log(df[target])

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y,
                           test_size = 0.3, random_state = 1)

pipe = pipeline.Pipeline([
    ("poly", preprocessing.PolynomialFeatures(degree=1
                                    , include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    ("est", linear_model.Ridge(alpha=40, random_state=1))
])


pipe.fit(X_train, y_train)

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

print("training r2", metrics.r2_score(y_train, y_train_pred))
print("test r2", metrics.r2_score(y_test, y_test_pred))

print("training rmse:"
    , np.sqrt(metrics.mean_squared_error(y_train, y_train_pred)))
print("test rmse:"
      , np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))
est = pipe.steps[-1][-1]


summary = pd.DataFrame({"feature": columns, "coefficient": est.coef_})
summary["coef_mag"] = np.abs(summary.coefficient)
summary = summary.sort_values("coef_mag", ascending = False)
summary = summary[summary.coef_mag>0]
summary

training r2 0.9388343768471614
test r2 0.8776469231977766
training rmse: 0.09467680401312785
test rmse: 0.15206395314993107


Unnamed: 0,feature,coefficient,coef_mag
3,OverallQual,0.060435,0.060435
110,RoofMatl_CompShg,0.058395,0.058395
15,GrLivArea,0.053418,0.053418
32,PoolArea,0.040313,0.040313
114,RoofMatl_Tar&Grv,0.038677,0.038677
12,1stFlrSF,0.036038,0.036038
116,RoofMatl_WdShngl,0.034974,0.034974
4,OverallCond,0.031776,0.031776
13,2ndFlrSF,0.031732,0.031732
70,Neighborhood_NridgHt,0.029292,0.029292


In [9]:
from sklearn import feature_selection

In [18]:
scaler = preprocessing.StandardScaler()
X_train_std = scaler.fit_transform(X_train)
_, pvals = feature_selection.f_regression(X_train_std, y_train)
summary = pd.DataFrame({"feature": columns, "pval": pvals})
summary = summary.sort_values("pval")
summary = summary[summary.pval < 0.01] 
# pval < 0.01 are considered significant
# closer to 0 is more significant features
summary.iloc[:10,:]

  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


Unnamed: 0,feature,pval
3,OverallQual,1.54361e-232
15,GrLivArea,1.502663e-147
25,GarageCars,8.980028000000001e-140
26,GarageArea,9.673012e-121
18,FullBath,7.435435000000001e-106
11,TotalBsmtSF,4.707153e-98
151,ExterQual_TA,1.996594e-96
12,1stFlrSF,1.957045e-95
5,YearBuilt,2.946855e-85
6,YearRemodAdd,3.840149e-80


In [21]:
target = "SalesPrice"
X = df.drop(columns=[target])
X = pd.get_dummies(X, drop_first=True)
columns = X.columns
X = X.values.astype("float")
y = np.log(df[target])

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y,
                           test_size = 0.3, random_state = 1)

pipe = pipeline.Pipeline([
    ("poly", preprocessing.PolynomialFeatures(degree=1
                                    , include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    ("est", linear_model.Lasso(alpha=0.07, random_state=1))
])


pipe.fit(X_train, y_train)

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

print("training r2", metrics.r2_score(y_train, y_train_pred))
print("test r2", metrics.r2_score(y_test, y_test_pred))

print("training rmse:"
    , np.sqrt(metrics.mean_squared_error(y_train, y_train_pred)))
print("test rmse:"
      , np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)))
est = pipe.steps[-1][-1]


summary = pd.DataFrame({"feature": columns, "coefficient": est.coef_})
summary["coef_mag"] = np.abs(summary.coefficient)
summary = summary.sort_values("coef_mag", ascending = False)
summary = summary[summary.coef_mag>0]
summary

training r2 0.7395929485859631
test r2 0.746971064732701
training rmse: 0.19535127802592958
test rmse: 0.2186774107558645


Unnamed: 0,feature,coefficient,coef_mag
3,OverallQual,0.140629,0.140629
15,GrLivArea,0.069171,0.069171
25,GarageCars,0.055262,0.055262
11,TotalBsmtSF,0.014343,0.014343
5,YearBuilt,0.01273,0.01273
6,YearRemodAdd,0.009382,0.009382
12,1stFlrSF,0.006673,0.006673
39,MSZoning_RM,-0.005534,0.005534
23,Fireplaces,0.000774,0.000774
