In [3]:
import pandas as pd
import numpy as np
%matplotlib inline

In [2]:
# Read the house-prices CSV into `df`, then display how many missing values
# each column contains.
df = pd.read_csv("/data/kaggle/house-prices/data_combined_cleaned.csv")
pd.isnull(df).sum()

Id                  0
MSSubClass          0
MSZoning            0
LotFrontage         0
LotArea             0
Street              0
Alley               0
LotShape            0
LandContour         0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType          0
MasVnrArea          0
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual            0
                 ... 
BedroomAbvGr        0
KitchenAbvGr        0
KitchenQual         0
TotRmsAbvGrd        0
Functional          0
Fireplaces          0
FireplaceQu         0
GarageType          0
GarageYrBlt         0
GarageFinish        0
GarageCars          0
GarageArea          0
GarageQual          0
GarageCond          0
PavedDrive

In [4]:
# Keep only the rows whose SalesPrice is present, then print a summary of
# the remaining frame (row count, dtypes, non-null counts).
df = df[df.SalesPrice.notnull()]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 0 to 1459
Data columns (total 80 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1460 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            1460 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 non

In [10]:
# Number of distinct Id values. This only works while `df` still has an `Id`
# column, i.e. before the cell that removes it has been executed.
df.Id.unique().size

AttributeError: 'DataFrame' object has no attribute 'Id'

In [9]:
# Remove the Id column so it is not used as a feature.
# `drop(..., errors="ignore")` replaces the in-place `del`, which raised
# KeyError when the cell was executed a second time; the cell can now be
# re-run safely and `df` is rebound to the result.
df = df.drop("Id", axis=1, errors="ignore")

In [5]:
# Replace SalesPrice with its natural logarithm.
# NOTE: this overwrites the column, so executing the cell more than once
# applies the log repeatedly.
df["SalesPrice"] = np.log(df.SalesPrice)

In [6]:
from sklearn.model_selection import train_test_split

In [13]:
# One-hot encode the predictor columns.
# The target is excluded by name instead of by position: the previous
# `df.iloc[:, :-1]` only worked if SalesPrice happened to be the last column,
# and would otherwise leak the target into the features and drop a real one.
# drop_first=True omits the first level of each encoded categorical.
df_dummy = pd.get_dummies(df.drop("SalesPrice", axis=1), drop_first=True)

In [14]:
# Plain NumPy arrays for scikit-learn: the encoded feature matrix and the
# SalesPrice target vector.
X, y = df_dummy.values, df["SalesPrice"].values

In [18]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import PolynomialFeatures

In [29]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`.

    Computed as the square root of `mean_squared_error(y_true, y_pred)`.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [33]:
# Baseline: ordinary least squares on the unscaled, dummy-encoded features.
# The `normalize=False` argument was removed from the constructor call: False
# is the default, and the keyword no longer exists in scikit-learn >= 1.2
# (passing it raises TypeError there).
pipeline = Pipeline([
    ("lr", LinearRegression())
])
pipeline.fit(X_train, y_train)

# Report R^2 (Pipeline.score) and RMSE (rmse helper) on both splits.
print("Training R2:", pipeline.score(X_train, y_train),
      "\nTesting R2:", pipeline.score(X_test, y_test))
print("Training rmse: ", rmse(y_train, pipeline.predict(X_train)),
      "\nTesting rmse: ", rmse(y_test, pipeline.predict(X_test)))

Training R2: 0.949458309999 
Testing R2: 0.805791354315
Training rmse:  0.0860626603177 
Testing rmse:  0.191581296892


In [35]:
# Degree-2 polynomial expansion (no bias column) followed by unregularised
# least squares.
# `normalize=False` was dropped from LinearRegression: it is the default and
# the keyword was removed in scikit-learn 1.2, where it raises TypeError.
pipeline = Pipeline([
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("lr", LinearRegression())
])
pipeline.fit(X_train, y_train)

# Report R^2 (Pipeline.score) and RMSE (rmse helper) on both splits.
print("Training R2:", pipeline.score(X_train, y_train),
      "\nTesting R2:", pipeline.score(X_test, y_test))
print("Training rmse: ", rmse(y_train, pipeline.predict(X_train)),
      "\nTesting rmse: ", rmse(y_test, pipeline.predict(X_test)))

Training R2: 1.0 
Testing R2: -0.0940654372265
Training rmse:  1.8677334443e-10 
Testing rmse:  0.454716253677


In [39]:
from sklearn.linear_model import Lasso, Ridge

In [37]:
# Degree-2 polynomial features, standardised, then Lasso with alpha=0.1.
# `normalize=False` was dropped from Lasso: it is the default and the keyword
# was removed in scikit-learn 1.2, where it raises TypeError. Scaling is done
# explicitly by the StandardScaler step instead.
pipeline = Pipeline([
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("scaler", StandardScaler()),
    ("lasso", Lasso(alpha=0.1, max_iter=1000, random_state=1))
])
pipeline.fit(X_train, y_train)

# Report R^2 (Pipeline.score) and RMSE (rmse helper) on both splits.
print("Training R2:", pipeline.score(X_train, y_train),
      "\nTesting R2:", pipeline.score(X_test, y_test))
print("Training rmse: ", rmse(y_train, pipeline.predict(X_train)),
      "\nTesting rmse: ", rmse(y_test, pipeline.predict(X_test)))

Training R2: 0.71138896464 
Testing R2: 0.70052128895
Training rmse:  0.205658352862 
Testing rmse:  0.237904034246


In [50]:
# Left disabled: when uncommented, lists the coefficients of the estimator
# stored in the third step (index 2) of whichever `pipeline` is current.
#list(pipeline.steps[2][1].coef_)

In [40]:
# Same pipeline as the alpha=0.1 Lasso run, with a weaker penalty (alpha=0.05).
# `normalize=False` was dropped from Lasso: it is the default and the keyword
# was removed in scikit-learn 1.2, where it raises TypeError.
pipeline = Pipeline([
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("scaler", StandardScaler()),
    ("lasso", Lasso(alpha=0.05, max_iter=1000, random_state=1))
])
pipeline.fit(X_train, y_train)

# Report R^2 (Pipeline.score) and RMSE (rmse helper) on both splits.
print("Training R2:", pipeline.score(X_train, y_train),
      "\nTesting R2:", pipeline.score(X_test, y_test))
print("Training rmse: ", rmse(y_train, pipeline.predict(X_train)),
      "\nTesting rmse: ", rmse(y_test, pipeline.predict(X_test)))

Training R2: 0.82689423903 
Testing R2: 0.817390485556
Training rmse:  0.159274278891 
Testing rmse:  0.185772116843


In [51]:
# Degree-2 polynomial features, standardised, then Ridge with alpha=0.05.
# `normalize=False` was dropped from Ridge: it is the default and the keyword
# was removed in scikit-learn 1.2, where it raises TypeError.
# NOTE: the final step is still keyed "lasso" although it holds a Ridge
# estimator; the key is kept unchanged so lookups by step name keep working.
pipeline = Pipeline([
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("scaler", StandardScaler()),
    ("lasso", Ridge(alpha=0.05, max_iter=1000, random_state=1))
])
pipeline.fit(X_train, y_train)

# Report R^2 (Pipeline.score) and RMSE (rmse helper) on both splits.
print("Training R2:", pipeline.score(X_train, y_train),
      "\nTesting R2:", pipeline.score(X_test, y_test))
print("Training rmse: ", rmse(y_train, pipeline.predict(X_train)),
      "\nTesting rmse: ", rmse(y_test, pipeline.predict(X_test)))

Training R2: 0.999998258593 
Testing R2: 0.855496865858
Training rmse:  0.000505172839309 
Testing rmse:  0.165256085925


In [55]:
# Degree-2 polynomial features, standardised, then Ridge with a stronger
# penalty (alpha=10.0).
# `normalize=False` was dropped from Ridge: it is the default and the keyword
# was removed in scikit-learn 1.2, where it raises TypeError.
# NOTE: the final step is still keyed "lasso" although it holds a Ridge
# estimator; the key is kept unchanged so lookups by step name keep working.
pipeline = Pipeline([
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("scaler", StandardScaler()),
    ("lasso", Ridge(alpha=10.0, max_iter=1000, random_state=1))
])
pipeline.fit(X_train, y_train)

# Report R^2 (Pipeline.score) and RMSE (rmse helper) on both splits.
print("Training R2:", pipeline.score(X_train, y_train),
      "\nTesting R2:", pipeline.score(X_test, y_test))
print("Training rmse: ", rmse(y_train, pipeline.predict(X_train)),
      "\nTesting rmse: ", rmse(y_test, pipeline.predict(X_test)))

Training R2: 0.9999515295 
Testing R2: 0.861169940744
Training rmse:  0.00266519282959 
Testing rmse:  0.161979697473
