In [336]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn import metrics
import statsmodels.formula.api as sm
%matplotlib inline

In [337]:
train = pd.read_csv('../../../Data/train.csv')
test = pd.read_csv('../../../Data/test.csv')

In [338]:
train = train.drop(columns = ['Id'], axis = 1)
test = test.drop(columns = ['Id'], axis = 1)

In [339]:
train = train[train['GrLivArea']<4500]
train['SalePrice'] = np.log1p(train['SalePrice'])
ntrain = train.shape[0]
ntest = test.shape[0]
y_train = train.SalePrice
features = pd.concat([train, test], sort = True).reset_index(drop=True)
features.drop(['SalePrice'], axis=1, inplace=True)

In [340]:
def missing_values(data):
    total = data.isnull().sum().sort_values(ascending = False)
    percent = (data.isnull().sum()/data.isnull().count()*100).sort_values(ascending = False)
    ms = pd.concat([total, percent], axis = 1, keys = ['Total', 'Percent'])
    ms = ms[ms['Percent']>0]
    return ms

In [341]:
missing_values(features)

Unnamed: 0,Total,Percent
PoolQC,2908,99.691464
MiscFeature,2812,96.400411
Alley,2719,93.212204
Fence,2346,80.425094
FireplaceQu,1420,48.680151
LotFrontage,486,16.660953
GarageFinish,159,5.450806
GarageQual,159,5.450806
GarageYrBlt,159,5.450806
GarageCond,159,5.450806


In [342]:
features = features.dropna(thresh = len(features)*0.70,axis =1)

In [343]:
for i in features.select_dtypes(include = np.number):
    features[i].fillna(features[i].mean(), inplace = True)

In [344]:
for i in features.select_dtypes(include = 'object'):
    features[i].fillna(features[i].mode()[0], inplace = True)

In [345]:
features['SalePrice'] = y_train

In [346]:
corr = features.corr()['SalePrice'][:-1]

In [347]:
golden_features = corr[abs(corr)>=0.3].sort_values(ascending = False)

In [348]:
features['Haspool'] = features['PoolArea'].apply(lambda x:1 if x>0 else 0)
features['HasFireplace'] = features['Fireplaces'].apply(lambda x: 1 if x>0 else 0)
features['HasBsmt'] = features['BsmtFinSF1'].apply(lambda x:1 if x>0 else 0)
features['HasGarage'] = features['GarageArea'].apply(lambda x: 1 if x>0 else 0)
features['HasPool'] = features['PoolArea'].apply(lambda x: 1 if x>0 else 0)
features['Has2ndFloor'] = features['2ndFlrSF'].apply(lambda x: 1 if x>0 else 0)

In [349]:
features.corr()['SalePrice'].sort_values(ascending = False)

SalePrice        1.000000
OverallQual      0.302483
GarageCars       0.276725
GrLivArea        0.275293
GarageArea       0.264268
TotalBsmtSF      0.237815
FullBath         0.234389
YearBuilt        0.230227
1stFlrSF         0.224476
HasFireplace     0.223300
Fireplaces       0.218381
GarageYrBlt      0.218089
YearRemodAdd     0.213907
MasVnrArea       0.187499
TotRmsAbvGrd     0.187203
LotFrontage      0.162804
BsmtFinSF1       0.160902
BsmtFullBath     0.134110
2ndFlrSF         0.131564
WoodDeckSF       0.126298
LotArea          0.115622
HalfBath         0.108760
HasGarage        0.106521
BedroomAbvGr     0.091647
OpenPorchSF      0.064289
HasBsmt          0.063468
BsmtUnfSF        0.059725
Has2ndFloor      0.051744
ScreenPorch      0.046841
MoSold           0.036548
HasPool          0.035872
Haspool          0.035872
PoolArea         0.030601
BsmtFinSF2       0.016142
3SsnPorch        0.013647
BsmtHalfBath    -0.018617
MiscVal         -0.019472
LowQualFinSF    -0.021352
OverallCond 

In [350]:
features.shape

(2917, 81)

In [351]:
final_features = pd.get_dummies(features)

In [352]:
final_features.corr()['SalePrice'].sort_values(ascending = False).head()

SalePrice      1.000000
OverallQual    0.302483
GarageCars     0.276725
GrLivArea      0.275293
GarageArea     0.264268
Name: SalePrice, dtype: float64

In [353]:
X = final_features.drop('SalePrice', axis = 1)
X = X.iloc[len(y_train):-1]

In [354]:
X_train, X_test, y_train, y_test = train_test_split(X,y_train, test_size = 0.2, shuffle = False)

In [356]:
lm = LinearRegression()

In [357]:
lm.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [358]:
b = float(lm.intercept_)

In [359]:
coeff = pd.DataFrame(lm.coef_, X.columns, columns=['Coefficient'])

In [360]:
coeff.sort_values(by = 'Coefficient',ascending = False)

Unnamed: 0,Coefficient
TotalBsmtSF,4.993354e+07
Haspool,3.395787e+05
CentralAir_Y,2.585833e+05
CentralAir_N,2.585832e+05
BsmtExposure_Av,1.646330e+05
BsmtExposure_No,1.646330e+05
BsmtExposure_Gd,1.646330e+05
BsmtExposure_Mn,1.646330e+05
PavedDrive_Y,1.556431e+05
PavedDrive_N,1.556430e+05


In [361]:
train_lm = lm.predict(X_train)

print('Training MSE')
print('Linear:', metrics.mean_squared_error(y_train, train_lm))

Training MSE
Linear: 0.13002128988736106


In [362]:
test_lm = lm.predict(X_test)

print('Testing MSE')
print('Linear:', metrics.mean_squared_error(y_test, test_lm))

Testing MSE
Linear: 35098372.865878925


In [368]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [370]:
ridge = Ridge()
lasso = Lasso(alpha=0.01)

In [371]:
ridge.fit(X_train, y_train)
lasso.fit(X_train, y_train)

Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [366]:
# check training accuracy
train_lm = lm.predict(X_train)
train_ridge = ridge.predict(X_train)

print('Training MSE')
print('Linear:', metrics.mean_squared_error(y_train, train_lm))
print('Ridge :', metrics.mean_squared_error(y_train, train_ridge))

Training MSE
Linear: 0.13002128988736106
Ridge : 0.1307919644962758


In [367]:
test_lm = lm.predict(X_test)
test_ridge = ridge.predict(X_test)

print('Testing MSE')
print('Linear:', metrics.mean_squared_error(y_test, test_lm))
print('Ridge :', metrics.mean_squared_error(y_test, test_ridge))

Testing MSE
Linear: 35098372.865878925
Ridge : 0.17471548307049042


In [373]:
# check training accuracy
train_lasso = lasso.predict(X_train)

print('Training MSE')
print('Lasso :', metrics.mean_squared_error(y_train, train_lasso))

Training MSE
Lasso : 0.15718773785788623


In [374]:
test_lasso = lasso.predict(X_test)

print('Testing MSE')
print('Lasso :', metrics.mean_squared_error(y_test, test_lasso))

Testing MSE
Lasso : 0.15164721531913367
