Exercise: Build a prediction model for the sale price (`SalesPrice`) based on all features. Report the R² and MSE scores on both the training data and the test data.

https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import model_selection, linear_model, preprocessing, metrics, pipeline, feature_selection

%matplotlib inline

In [2]:
# Load the house-price CSV and drop the Id column in one step.
df = pd.read_csv("/data/kaggle/house-prices/data_combined_cleaned.csv").drop("Id", axis=1)
# Keep only the rows whose sale price is present.
# NOTE(review): presumably the rows with a missing price are the Kaggle test
# set from the combined file — confirm against the data-preparation step.
has_price = df["SalesPrice"].notnull()
df = df[has_price]
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,LotConfig,LandSlope,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalesPrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,Inside,Gtl,...,0,,,,0,2,2008,WD,Normal,208500.0
1,20,RL,80.0,9600,Pave,,Reg,Lvl,FR2,Gtl,...,0,,,,0,5,2007,WD,Normal,181500.0
2,60,RL,68.0,11250,Pave,,IR1,Lvl,Inside,Gtl,...,0,,,,0,9,2008,WD,Normal,223500.0
3,70,RL,60.0,9550,Pave,,IR1,Lvl,Corner,Gtl,...,0,,,,0,2,2006,WD,Abnorml,140000.0
4,60,RL,84.0,14260,Pave,,IR1,Lvl,FR2,Gtl,...,0,,,,0,12,2008,WD,Normal,250000.0


In [3]:
# List every column with its dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 0 to 1459
Data columns (total 79 columns):
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1460 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            1460 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 non-null object
MasVnrType       1460 no

In [4]:
# Baseline: ordinary least squares on every feature, predicting log(SalesPrice).
# NOTE(review): the recorded run shows train R² ~0.95 but test MSE ~8.7e19, so
# this unregularised fit does not generalise — presumably near-collinear dummy
# columns; confirm before relying on it.
target = "SalesPrice"
y = np.log(df[target])
# One-hot encode the categorical columns, dropping the first level of each.
X = pd.get_dummies(df.drop(target, axis=1), drop_first=True)

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=1
)

# degree=1 without a bias column passes the features through unchanged, so
# this pipeline is standardisation followed by a linear fit.
pipe = pipeline.Pipeline(
    [
        ("poly", preprocessing.PolynomialFeatures(degree=1, include_bias=False)),
        ("scaler", preprocessing.StandardScaler()),
        ("est", linear_model.LinearRegression()),
    ]
)
pipe.fit(X_train, y_train)

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

# MSE and R² are both measured on the log scale of the target.
train_mse = metrics.mean_squared_error(y_train, y_train_pred)
test_mse = metrics.mean_squared_error(y_test, y_test_pred)
train_r2 = metrics.r2_score(y_train, y_train_pred)
test_r2 = metrics.r2_score(y_test, y_test_pred)

print("training mse:", train_mse,
      "\ntesting mse:", test_mse,
      "\ntraining r2: ", train_r2,
      "\ntesting r2: ", test_r2)

training mse: 0.007406285347507605 
testing mse: 8.727385076639865e+19 
training r2:  0.9494616956035435 
testing r2:  -4.617920804724029e+20


In [5]:
# Same least-squares model, but with degree-2 polynomial features (squares and
# pairwise products of the encoded columns) added before scaling.
# NOTE(review): the recorded run fits the training data exactly (R² = 1.0)
# while test R² is ~0.80, which looks like overfitting — confirm.
target = "SalesPrice"
y = np.log(df[target])
# One-hot encode the categorical columns, dropping the first level of each.
X = pd.get_dummies(df.drop(target, axis=1), drop_first=True)

# Same 70/30 split and seed as the degree-1 baseline.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=1
)

pipe = pipeline.Pipeline(
    [
        ("poly", preprocessing.PolynomialFeatures(degree=2, include_bias=False)),
        ("scaler", preprocessing.StandardScaler()),
        ("est", linear_model.LinearRegression()),
    ]
)
pipe.fit(X_train, y_train)

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

# MSE and R² are both measured on the log scale of the target.
train_mse = metrics.mean_squared_error(y_train, y_train_pred)
test_mse = metrics.mean_squared_error(y_test, y_test_pred)
train_r2 = metrics.r2_score(y_train, y_train_pred)
test_r2 = metrics.r2_score(y_test, y_test_pred)

print("training mse:", train_mse,
      "\ntesting mse:", test_mse,
      "\ntraining r2: ", train_r2,
      "\ntesting r2: ", test_r2)

training mse: 6.996007534795658e-29 
testing mse: 0.038670365461414107 
training r2:  1.0 
testing r2:  0.7953835156551845


In [42]:
# L1-regularised (Lasso) regression on the standardised features, predicting
# log(SalesPrice); alpha=0.1 sets the strength of the penalty.
target = "SalesPrice"
y = np.log(df[target])
# One-hot encode the categorical columns, dropping the first level of each.
X = pd.get_dummies(df.drop(target, axis=1), drop_first=True)

# 70/30 split; note this cell uses a different seed (123) from the OLS cells.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=123
)

# degree=1 without a bias column passes the features through unchanged.
pipe = pipeline.Pipeline(
    [
        ("poly", preprocessing.PolynomialFeatures(degree=1, include_bias=False)),
        ("scaler", preprocessing.StandardScaler()),
        ("est", linear_model.Lasso(alpha=0.1)),
    ]
)
pipe.fit(X_train, y_train)

y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

# MSE and R² are both measured on the log scale of the target.
train_mse = metrics.mean_squared_error(y_train, y_train_pred)
test_mse = metrics.mean_squared_error(y_test, y_test_pred)
train_r2 = metrics.r2_score(y_train, y_train_pred)
test_r2 = metrics.r2_score(y_test, y_test_pred)

print("training mse:", train_mse,
      "\ntesting mse:", test_mse,
      "\ntraining r2: ", train_r2,
      "\ntesting r2: ", test_r2)

training mse: 0.05056161332405411 
testing mse: 0.045779442092841426 
training r2:  0.6944462432208631 
testing r2:  0.6839646644790083


In [22]:
# Pair each encoded column with the coefficient fitted by the "est" step and
# show the largest coefficients first.
# NOTE(review): the pairing with X.columns only lines up when the "poly" step
# is degree=1 (one coefficient per column of X) — confirm the degree-1
# pipeline is the one currently held in `pipe`.
fitted_est = pipe.named_steps["est"]
coef_table = pd.DataFrame({"feature": X.columns, "coef": fitted_est.coef_})
coef_table.sort_values(by="coef", ascending=False)

Unnamed: 0,coef,feature
3,0.143914,OverallQual
15,0.053292,GrLivArea
25,0.048793,GarageCars
11,0.005756,TotalBsmtSF
12,0.001284,1stFlrSF
0,-0.000000,MSSubClass
177,-0.000000,BsmtFinType1_Rec
176,-0.000000,BsmtFinType1_None
175,-0.000000,BsmtFinType1_LwQ
174,0.000000,BsmtFinType1_GLQ


In [39]:
# Find the statistically significant variables

In [40]:
# Univariate F-test of each encoded column of X against y (log of SalesPrice);
# yields one F-statistic and one p-value per column.
fvals, pvals = feature_selection.f_regression(X, y, center=True)

In [41]:
# Tabulate the F-test results per feature, strongest F-statistic first.
f_test_columns = {"feature": X.columns, "F-statistic": fvals, "pval": pvals}
pd.DataFrame(f_test_columns).sort_values(by="F-statistic", ascending=False)

Unnamed: 0,F-statistic,feature,pval
3,2930.795149,OverallQual,0.000000e+00
15,1408.119051,GrLivArea,3.060209e-216
25,1258.349043,GarageCars,3.093756e-199
26,1071.733084,GarageArea,1.106255e-176
11,873.710639,TotalBsmtSF,7.534551e-151
12,807.334244,1stFlrSF,1.074698e-141
151,802.726819,ExterQual_TA,4.749723e-141
18,798.099817,FullBath,2.118958e-140
5,764.780024,YearBuilt,1.103567e-135
24,696.939475,GarageYrBlt,7.405437e-126
