In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import numpy as np

In [2]:
# Import traing and testing data
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [3]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
train_cleaned = train[:]

In [5]:
train_cleaned.shape

(1460, 81)

In [6]:
# Extract feature columns from both datasets and combine the two sets of features together
train_features = train_cleaned.iloc[:,1:80]
test_features = test.iloc[:,1:80]
whole_features = pd.concat([train_features,test_features])
whole_features.shape

(2919, 79)

In [7]:
# Create new features
whole_features['AgeBuilt'] = whole_features['YrSold'] - whole_features['YearBuilt']
whole_features['AgeRemodAdd'] = whole_features['YrSold'] - whole_features['YearRemodAdd']
whole_features['AgeGararge'] = whole_features['YrSold'] - whole_features['GarageYrBlt']

In [8]:
del whole_features['YearBuilt']
del whole_features['YearRemodAdd']
del whole_features['GarageYrBlt']

In [9]:
del whole_features['YrSold']

In [10]:
# Show the null values in train_cleaned
for key in whole_features.isnull().sum().iteritems():
    if key[1] != 0:
        print(key)

('MSZoning', 4)
('LotFrontage', 486)
('Alley', 2721)
('Utilities', 2)
('Exterior1st', 1)
('Exterior2nd', 1)
('MasVnrType', 24)
('MasVnrArea', 23)
('BsmtQual', 81)
('BsmtCond', 82)
('BsmtExposure', 82)
('BsmtFinType1', 79)
('BsmtFinSF1', 1)
('BsmtFinType2', 80)
('BsmtFinSF2', 1)
('BsmtUnfSF', 1)
('TotalBsmtSF', 1)
('Electrical', 1)
('BsmtFullBath', 2)
('BsmtHalfBath', 2)
('KitchenQual', 1)
('Functional', 2)
('FireplaceQu', 1420)
('GarageType', 157)
('GarageFinish', 159)
('GarageCars', 1)
('GarageArea', 1)
('GarageQual', 159)
('GarageCond', 159)
('PoolQC', 2909)
('Fence', 2348)
('MiscFeature', 2814)
('SaleType', 1)
('AgeGararge', 159)


In [11]:
# Fill the quantitative columns of which the corresponding qualitative columns are None with 0s
for feature in ['MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','BsmtFullBath','BsmtHalfBath',
                'GarageFinish']:
    whole_features[feature] = whole_features[feature].fillna(0)

In [12]:
# Remove the features with too many null values
for feature in ['PoolQC','Fence','MiscFeature','Alley']:
    del whole_features[feature]

In [22]:
data_stat = pd.concat([train['SalePrice'],whole_features[['GrLivArea','GarageArea',
                                                         'OverallQual','AgeBuilt','AgeGararge','Fireplaces','OverallCond']][0:1460]],axis=1)
data_stat = data_stat.dropna()
data_stat[0:300].to_csv('data/stat.csv')

In [23]:
# Fill some null values with default values
whole_features['SaleType'] = whole_features['SaleType'].fillna('oth')
whole_features['Functional'] = whole_features['Functional'].fillna('Typ')
whole_features['Exterior1st'] = whole_features['Exterior1st'].fillna('Other')
whole_features['Exterior2nd'] = whole_features['Exterior2nd'].fillna('Other')

In [14]:
# Fill quantitative null values with the mean of corresponding column
whole_features = whole_features.fillna(whole_features.mean())
    
# Fill qualitative null values with 'NoValue'
whole_features = whole_features.fillna('NoValue')


In [15]:
# Get dummy values
whole_features = pd.get_dummies(whole_features)

In [16]:
whole_features.shape

(2919, 291)

In [17]:
# Delete colmns with s of 0 in training set
for col in whole_features.columns:
    if whole_features[col][0:1460].std() == 0:
        del whole_features[col]
whole_features.shape

(2919, 286)

In [18]:
# Standardize quantitative variables
whole_features_std = whole_features[:]
for col in whole_features.columns:
    if not (whole_features[col].mean()<=1 and whole_features[col].mean()>=0):
        whole_features_std[col] = (whole_features[col]-whole_features[col].mean())/whole_features[col].std()

In [19]:
# Define X and y for modeling
y = train_cleaned.iloc[:,80:81]
X = whole_features[0:1460]

In [20]:
# Define standardized X
X_std = whole_features_std[0:1460]

In [224]:
# Show features highly correlated with target value
target_corr = pd.concat([X,y],axis=1).corr()['SalePrice']
target_corr[abs(target_corr.values)>0.5].sort_values()

ExterQual_TA     -0.589044
AgeBuilt         -0.523350
KitchenQual_TA   -0.519298
AgeRemodAdd      -0.509079
KitchenQual_Ex    0.504094
TotRmsAbvGrd      0.533723
BsmtQual_Ex       0.553105
FullBath          0.560664
1stFlrSF          0.605852
TotalBsmtSF       0.613581
GarageArea        0.623431
GarageCars        0.640409
GrLivArea         0.708624
OverallQual       0.790982
SalePrice         1.000000
Name: SalePrice, dtype: float64

In [22]:
# Try some basic linear regression models
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

In [23]:
# Ridge model

# Select the best regularizing parameter (alpha)
reg_ridge = linear_model.RidgeCV(alphas=[0.1,0.2,10,10.5,11,11.5,12,13,14,15,16,17,20])
reg_ridge.fit(X_std,y)
print('Best alpha for Ridge model: '+str(reg_ridge.alpha_))
# The best alpha to this model is 15

# Train the model based on train_cleaned
reg_ridge = linear_model.Ridge(alpha=15)
reg_ridge.fit(X_std,y)

np.sqrt(-cross_val_score(reg_ridge,X_std,y,scoring='neg_mean_squared_log_error')).mean()

Best alpha for Ridge model: 15.0


0.15652586601040683

In [24]:
# Lasso model

# Select the best regularizing parameter (alpha)
reg_lasso = linear_model.LassoCV(alphas=[5, 10, 15, 30, 50,75,100,105,110,120,125],tol=0.001) 
# The defalt tol is 0.0001, but the model does not converge until it is increased to 0.001
reg_lasso.fit(X_std,y)
print('Best alpha for Lasso model: '+str(reg_lasso.alpha_))

# Train the model based on train_cleaned
reg_lasso = linear_model.Lasso(alpha=75,tol=0.001)
reg_lasso.fit(X_std,y)
print(np.sqrt(-cross_val_score(reg_lasso,X_std,y,scoring='neg_mean_squared_log_error')).mean())

# Record the features excluded by Lasso
excluded = []
for i in range(len(X.columns)):
    if reg_lasso.coef_[i] == 0:
        excluded.append(X.columns[i])

  y = column_or_1d(y, warn=True)


Best alpha for Lasso model: 100
0.146814037608


In [25]:
# Exclude features based on Lasso's selection
whole_selected = whole_features[:]
for col in whole_selected.columns:
    if col in excluded:
        del whole_selected[col]

In [26]:
whole_selected_std = whole_features_std[:]
for col in whole_selected_std.columns:
    if col in excluded:
        del whole_selected_std[col]

In [27]:
X_selected = whole_selected[0:1460]

In [28]:
X_selected_std = whole_selected_std[0:1460]

In [29]:
# Random Forest
from sklearn.ensemble import RandomForestRegressor
reg_RF = RandomForestRegressor()
reg_RF.fit(X,y)
print(np.sqrt(-cross_val_score(reg_RF,X,y,scoring='neg_mean_squared_log_error')).mean())

  after removing the cwd from sys.path.
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


0.155329077449


  estimator.fit(X_train, y_train, **fit_params)


In [30]:
# Try Random Forest with selected X
reg_RF_sel = RandomForestRegressor()
reg_RF_sel.fit(X_selected,y)
print(np.sqrt(-cross_val_score(reg_RF_sel,X_selected,y,scoring='neg_mean_squared_log_error')).mean())

  This is separate from the ipykernel package so we can avoid doing imports until
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


0.157811390517


In [31]:
# Try Ridge with selected X
reg_ridge_sel = linear_model.RidgeCV(alphas=[0.1,0.2,10,10.5,11,11.5,12,13,14,15,16,17,20])
reg_ridge_sel.fit(X_selected_std,y)
print('Best alpha for Ridge model: '+str(reg_ridge_sel.alpha_))
# The best alpha to this model is 10

# Train the model based on train_cleaned
reg_ridge_sel = linear_model.Ridge(alpha=10)
reg_ridge_sel.fit(X_selected_std,y)

np.sqrt(-cross_val_score(reg_ridge,X_selected_std,y,scoring='neg_mean_squared_log_error')).mean()

Best alpha for Ridge model: 10.0


0.14959002873071567

In [43]:
# Try Lasso with selected X

# Select the best regularizing parameter (alpha)
reg_lasso_selected = linear_model.LassoCV(alphas=[5, 10, 15, 30, 50,75,100,105,110,120,125],tol=0.001) 
# The defalt tol is 0.0001, but the model does not converge until it is increased to 0.001
reg_lasso_selected.fit(X_selected_std,y)
print('Best alpha for Lasso model: '+str(reg_lasso_selected.alpha_))

# Train the model based on train_cleaned
reg_lasso_selected = linear_model.Lasso(alpha=50,tol=0.001)
reg_lasso_selected.fit(X_selected_std,y)
print(np.sqrt(-cross_val_score(reg_lasso,X_selected_std,y,scoring='neg_mean_squared_log_error')).mean())

# Show the features excluded by Lasso
excluded_2 = []
for i in range(len(X_selected.columns)):
    if reg_lasso_selected.coef_[i] == 0:
        excluded.append(X_selected.columns[i])
len(excluded_2)

Best alpha for Lasso model: 50
0.143871898546


  y = column_or_1d(y, warn=True)


0

In [36]:
# Predict on test set 
X_test = whole_features[1460:2919]
sub = pd.DataFrame(columns=['Id'],data=test['Id'])

In [37]:
X_test_std = whole_features_std[1460:2919]

In [38]:
X_test_selected = whole_selected[1460:2919]
X_test_selected_std = whole_selected_std[1460:2919]

In [238]:
# Predict using Ridge model
sub_ridge = sub
sub_ridge['SalePrice'] = reg_ridge.predict(X_test_std)
sub_ridge.to_csv('Data/sub_ridge.csv',index=None)
# Score on leaderboard: 0.14547

In [240]:
# Predict using Ridge model with selected features
sub_ridge_selected = sub
sub_ridge_selected['SalePrice'] = reg_ridge_sel.predict(X_test_selected_std)
sub_ridge_selected.to_csv('Data/sub_ridge_selected.csv',index=None)

# Socre on leaderboard: 0.14547

In [243]:
# Predict using Lasso model
sub_lasso = sub
sub_lasso['SalePrice'] = reg_lasso.predict(X_test_std)
sub_lasso.to_csv('Data/sub_lasso.csv',index=None)

# Score on leaderboard: 0.13611

In [40]:
# Predict using Lasso model with selected features
sub_lasso_selected = sub
sub_lasso_selected['SalePrice'] = reg_lasso_selected.predict(X_test_selected_std)
sub_lasso_selected.to_csv('data/sub_lasso_selected.csv',index=None)

# Score on leaderboard: 0.13913

In [250]:
# Predict using Random Forest
sub_RF = sub
sub_RF['SalePrice'] = reg_RF.predict(X_test)
sub_RF.to_csv('Data/sub_RF.csv',index=None)

# Socre on leaderboard: 0.15748

In [248]:
# Predict using Random Forest with selected features
sub_RF_selected = sub
sub_RF_selected['SalePrice'] = reg_RF_sel.predict(X_test_selected)
sub_RF_selected.to_csv('Data/sub_RF_selected.csv',index=None)

# Socre on leaderboard: 0.15145