In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

raw_train_df = pd.read_csv('data/train.csv')
raw_test_df = pd.read_csv('data/test.csv')
# Separate target variable 'SalePric' from training set.
prices = raw_train_df['SalePrice']

In [2]:
# Merge training set with test set
all_df = pd.concat([raw_train_df.drop('SalePrice', axis=1), raw_test_df])
all_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2919 entries, 0 to 1458
Data columns (total 80 columns):
Id               2919 non-null int64
MSSubClass       2919 non-null int64
MSZoning         2915 non-null object
LotFrontage      2433 non-null float64
LotArea          2919 non-null int64
Street           2919 non-null object
Alley            198 non-null object
LotShape         2919 non-null object
LandContour      2919 non-null object
Utilities        2917 non-null object
LotConfig        2919 non-null object
LandSlope        2919 non-null object
Neighborhood     2919 non-null object
Condition1       2919 non-null object
Condition2       2919 non-null object
BldgType         2919 non-null object
HouseStyle       2919 non-null object
OverallQual      2919 non-null int64
OverallCond      2919 non-null int64
YearBuilt        2919 non-null int64
YearRemodAdd     2919 non-null int64
RoofStyle        2919 non-null object
RoofMatl         2919 non-null object
Exterior1st      2918 non-

In [3]:
# Define a functino which cleans our data. Most of missing values for continius data we fill with median value,
# but some of them are filled with zero value. Categorical data we fill with mode values.
def clean_data(df):
    df['BsmtFinSF1'].fillna(df['BsmtFinSF1'].median(), inplace=True)
    df['BsmtFinSF2'].fillna(df['BsmtFinSF2'].median(), inplace=True)
    df['BsmtUnfSF'].fillna(df['BsmtUnfSF'].median(), inplace=True)
    df['BsmtFinType1'].fillna(df['BsmtFinType1'].mode().values[0], inplace=True)
    df['BsmtFinType1'].fillna(df['BsmtFinType2'].mode().values[0], inplace=True)
    df['GarageYrBlt'].fillna(df['GarageYrBlt'].mode().values[0], inplace=True)
    df['LotFrontage'].fillna(df['LotFrontage'].median(), inplace=True)
    df['MasVnrType'].fillna('None', inplace=True)
    df['MasVnrArea'].fillna(0, inplace=True)
    df['Electrical'].fillna(df['Electrical'].mode().values[0], inplace=True)
    df['MSSubClass'] = df['MSSubClass'].astype(str)
    df['OverallCond'] = df['OverallCond'].astype(str)
    df['MoSold'] = df['MoSold'].astype(str)
    df['TotalBsmtSF'].fillna(0, inplace=True)
    df['BsmtFullBath'].fillna(0, inplace=True)
    df['BsmtHalfBath'].fillna(0, inplace=True)
    df['GarageCars'].fillna(0, inplace=True)
    df['GarageArea'].fillna(0, inplace=True)
    #df.drop(['Exterior2nd', 'EnclosedPorch', 'RoofMatl', 'PoolQC', 'BsmtHalfBath', 'RoofStyle', 'PoolArea', 'MoSold', 'Alley', 'Fence', 'LandContour', 'MasVnrType', '3SsnPorch', 'LandSlope'], axis=1, inplace=True)
    return df

In [4]:
df = clean_data(all_df)
# Convert categorical features into dummy variables
df = pd.get_dummies(df)

In [5]:
# Split back our whole dataset into training and test datasets.
train = df.values[0:1460, :]
test = df.values[1460:, :]

In [6]:
y = prices.values

In [7]:
X = train

In [8]:
# Using sklearn.ensemble.IsolationForest we find and delete the outliers.
from sklearn.ensemble import IsolationForest as IF
i = IF().fit(X)
i = i.predict(X)
outliers_indicies = []
new_y = []
for idx, val in enumerate(i):
    if val == -1:
        outliers_indicies.append(idx)
    else:
        new_y.append(y[idx])
outliers_indicies
y = new_y
X = df[:1460].drop(outliers_indicies).values

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV



In [10]:
def grid_search(estimator, params, X, y, v=3):
    grid_search = GridSearchCV(estimator, params, cv=5, verbose=v, n_jobs=-1)
    grid_search.fit(X, y)
    sorted(grid_search.grid_scores_, key=lambda x: x.mean_validation_score)
    print("Best score equals to " + str(grid_search.best_score_))
    return grid_search.best_params_

In [11]:
# Train the RandomForestRegressor
params = {
    'n_estimators': [10, 50, 100, 150],
    'n_jobs': [-1]
}
rfr_best_params = grid_search(RandomForestRegressor(), params, X, y, 1)
rfr_best_params

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    7.6s finished


Best score equals to 0.8842453591850488


{'n_estimators': 150, 'n_jobs': -1}

In [12]:
# Train the Lasso model
from sklearn.linear_model import Lasso

lasso_params = {
    'max_iter': [3000],
    'normalize': [True, False]
}
best_lasso_params = grid_search(Lasso(), lasso_params, X, y, 1)
best_lasso_params

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    1.8s remaining:    1.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.3s finished


Best score equals to 0.901640574596752


{'max_iter': 3000, 'normalize': True}

In [13]:
# Train the Ridge model
from sklearn.linear_model import Ridge
ridge_params = {
    'alpha': [0.1, 0.5, 1, 5, 10],
    'fit_intercept': [True, False],
    'normalize': [True, False]
}

best_ridge_params = grid_search(Ridge(), ridge_params, X, y, 1)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best score equals to 0.9119676857758849


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.8s finished


In [14]:
best_ridge_params

{'alpha': 10, 'fit_intercept': False, 'normalize': True}

In [15]:
# Ridge model works better, than other, so we use it.
ridge_regr = Ridge(alpha=best_ridge_params['alpha'], 
                   fit_intercept=best_ridge_params['fit_intercept'], 
                   normalize=best_ridge_params['normalize']).fit(X, y)

In [16]:
# Make prediction and formatting them in order to prepare for submission.
predicted = ridge_regr.predict(test)
predicted_matrix = np.c_[[raw_test_df.values[:,0].astype(int), predicted.astype(int)]]
predicted_df = pd.DataFrame(predicted_matrix.T, columns=['Id', 'SalePrice'])
predicted_df.head()

Unnamed: 0,Id,SalePrice
0,1461,120391
1,1462,170566
2,1463,188893
3,1464,196965
4,1465,188261


In [16]:
predicted_df.to_csv('results/house_price_1-0.csv', index=False)