# Introduction

This data set describes the sale of individual residential properties in Ames, Iowa from 2006 to 2010. It contains 2919 observations and a large number of explanatory variables (23 nominal, 23 ordinal, 14 discrete, and 20 continuous) used in assessing home values.

In [1]:
import pandas as pd

# Read both CSV files and report their (rows, columns) shapes.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
train_shape, test_shape = train.shape, test.shape
print(test_shape)
print(train_shape)

(1459, 80)
(1460, 81)


In [2]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# Feature Engineering

In [3]:
# Drop every training column in which more than 30% of the rows are null.
num_missing = train.isna().sum()
missing_cutoff = len(train) * 0.3
drop_missing_cols = num_missing.loc[num_missing > missing_cutoff].index
train = train.drop(columns=drop_missing_cols)

# Remaining null counts per column, smallest first.
train.isna().sum().sort_values()

Id                 0
SaleCondition      0
Heating            0
HeatingQC          0
CentralAir         0
                ... 
GarageQual        81
GarageCond        81
GarageType        81
GarageFinish      81
LotFrontage      259
Length: 76, dtype: int64

In [4]:
# Make sure the numeric columns below are stored as float.
# `astype` returns a new object instead of converting in place, so the result
# must be assigned back to `train` for the conversion to take effect.
float_cols = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
train[float_cols] = train[float_cols].astype(float)

# Display one of the converted columns to confirm its dtype.
train['GarageYrBlt']

0       2003.0
1       1976.0
2       2001.0
3       1998.0
4       2000.0
         ...  
1455    1999.0
1456    1978.0
1457    1941.0
1458    1950.0
1459    1965.0
Name: GarageYrBlt, Length: 1460, dtype: float64

In [5]:
# Replace nulls in the categorical (object) columns with the most frequent value.
# The column list comes from `train`; each frame is filled with its own most
# frequent value for that column.
num_text_missing = train.select_dtypes(include=["object"]).isnull().sum()

for frame in (train, test):
    for col in num_text_missing.index:
        most_frequent = frame[col].value_counts().index[0]
        # Assign the filled column back instead of calling
        # `frame[col].fillna(..., inplace=True)`: the in-place form operates on
        # a temporary Series and does not reliably update the DataFrame.
        frame[col] = frame[col].fillna(most_frequent)

In [6]:
# Replace nulls in the numeric columns with each column's mean.
# `numeric_only=True` is required because both frames still contain text
# columns at this point; without it `DataFrame.mean()` raises a TypeError on
# recent pandas versions instead of silently skipping the text columns.
train = train.fillna(train.mean(numeric_only=True))
test = test.fillna(test.mean(numeric_only=True))

# Null counts per training column after imputation.
train.isnull().sum().sort_values()

Id              0
Functional      0
TotRmsAbvGrd    0
KitchenQual     0
KitchenAbvGr    0
               ..
Exterior1st     0
RoofMatl        0
RoofStyle       0
ExterCond       0
SalePrice       0
Length: 76, dtype: int64

In [7]:
# Drop text columns in which a single category covers more than 95% of the rows.
text_columns = train.select_dtypes(include=["object"]).columns
dominance_cutoff = len(train) * 0.95

dominant_columns = [
    col for col in text_columns
    if max(train[col].value_counts()) > dominance_cutoff
]
train = train.drop(columns=dominant_columns)

### Change year columns to useful information

In [8]:
# Express each year column as the number of years elapsed at the time of sale.
train_age_sources = {
    "years_after_built": "YearBuilt",
    "years_after_remod": "YearRemodAdd",
    "years_after_garage": "GarageYrBlt",
}
for age_col, year_col in train_age_sources.items():
    train[age_col] = train["YrSold"] - train[year_col]

# Drop "YearRemodAdd", "YearBuilt" and "GarageYrBlt" now that the derived
# columns exist; "YrSold" is not dropped in this cell.
train = train.drop(columns=["YearRemodAdd", "YearBuilt", "GarageYrBlt"])

In [9]:
# Express each year column as the number of years elapsed at the time of sale.
test_age_sources = {
    "years_after_built": "YearBuilt",
    "years_after_remod": "YearRemodAdd",
    "years_after_garage": "GarageYrBlt",
}
for age_col, year_col in test_age_sources.items():
    test[age_col] = test["YrSold"] - test[year_col]

# Drop "YearRemodAdd", "YearBuilt" and "GarageYrBlt" now that the derived
# columns exist; "YrSold" is not dropped in this cell.
test = test.drop(columns=["YearRemodAdd", "YearBuilt", "GarageYrBlt"])

### Handle target leakages and useless columns
   - Since the target variable is SalePrice, information about the sale itself is not available at prediction time, so the columns that describe the sale are dropped.

In [10]:
# Remove the columns that describe the final sale and would leak it to the model.
sale_info_cols = ["MoSold", "SaleCondition", "SaleType", "YrSold"]
train = train.drop(columns=sale_info_cols)

# Feature Selection

In [11]:
# Work on a deep copy so the encoding done for feature selection leaves `train` untouched.
train_copy = train.copy(deep=True)
# Text (object-dtype) columns of the copy, and just their names.
text_cols = train_copy.select_dtypes(include='object')
text_c = text_cols.columns

In [12]:
from sklearn.preprocessing import LabelEncoder

# A single encoder instance is refit for every text column in turn.
class_le = LabelEncoder()

encoded_columns = {}
for col in text_c:
    encoded_columns[col] = class_le.fit_transform(train_copy[col].to_numpy())

# Replace each text column of the copy with its integer codes.
for col, codes in encoded_columns.items():
    train_copy[col] = codes

In [13]:
# Absolute correlation of every (label-encoded) column with the target, ascending.
abs_corr_coeffs = train_copy.corr()['SalePrice'].abs().sort_values()

# Columns whose absolute correlation with SalePrice is below 0.05.
drop_cols = abs_corr_coeffs[abs_corr_coeffs < 0.05].index

# 'Id' must stay in the frame even when it is weakly correlated.
# errors='ignore' avoids a KeyError when 'Id' is not among the
# low-correlation columns in the first place.
drop_cols = drop_cols.drop('Id', errors='ignore')
drop_cols

Index(['BsmtFinSF2', 'LandContour', 'BsmtHalfBath', 'MasVnrType', 'MiscVal',
       'LowQualFinSF', 'BsmtFinType2', '3SsnPorch'],
      dtype='object')

In [14]:
# Remove the weakly correlated columns from the original (non-encoded) training frame.
train = train.drop(columns=drop_cols)

In [15]:
# One-hot encode the remaining text columns of the training frame.
# `get_dummies` replaces each listed column with its dummy columns and appends
# them after the untouched columns.
text_cols = train.select_dtypes(include=['object'])
train = pd.get_dummies(train, columns=list(text_cols.columns))

# Same treatment for the test frame, using its own text columns.
text_cols_t = test.select_dtypes(include=['object'])
test = pd.get_dummies(test, columns=list(text_cols_t.columns))

# Modeling

In [16]:
# Every training column except the target and the row identifier is a model feature.
non_feature_cols = ('SalePrice', 'Id')
features = [name for name in train.columns if name not in non_feature_cols]

In [17]:
# Dummy columns to exclude from the feature list.
# NOTE(review): presumably these are categories that occur in `train` but not
# in `test` — confirm against the test columns.
remove = ['HouseStyle_2.5Fin', 'Exterior1st_ImStucc', 'Exterior1st_Stone', 'Exterior2nd_Other', 'Electrical_Mix']

# Filter the existing list object in place.
features[:] = [name for name in features if name not in remove]

In [18]:
from sklearn.preprocessing import MinMaxScaler, minmax_scale

# Add a min-max scaled copy of every feature as "<feature>_scaled".
# The scaler is fitted on the training features only and then applied to both
# frames, so a given raw value maps to the same scaled value in `train` and
# `test`. Scaling each frame with its own min/max would put the two frames on
# different scales.
scaled_names = [col + "_scaled" for col in features]
scaler = MinMaxScaler().fit(train[features])

train_scaled = pd.DataFrame(scaler.transform(train[features]),
                            columns=scaled_names, index=train.index)
test_scaled = pd.DataFrame(scaler.transform(test[features]),
                           columns=scaled_names, index=test.index)

# Attach all scaled columns in one concat (instead of inserting them one by
# one), replacing any left over from a previous run of this cell.
train = pd.concat([train.drop(columns=scaled_names, errors="ignore"), train_scaled], axis=1)
test = pd.concat([test.drop(columns=scaled_names, errors="ignore"), test_scaled], axis=1)

# NOTE(review): the cell that builds `all_X` / `holdout_X` selects `features`,
# not these "_scaled" columns, so the models are trained on unscaled values —
# confirm whether that is intended.

In [19]:
# Training target, training design matrix, and the matching holdout matrix.
# NOTE(review): `features` holds the unscaled column names; the "_scaled"
# columns created in the previous cell are not selected here — confirm intent.
all_y = train['SalePrice']
all_X = train.loc[:, features]
holdout_X = test.loc[:, features]

### Linear Regression

In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFECV

# Recursive feature elimination (10-fold CV) driven by a plain linear model.
lr = LinearRegression()
selector = RFECV(estimator=lr, cv=10)
selector.fit(all_X, all_y)

# Keep only the columns the selector marked as supported.
support_mask = selector.get_support()
optimized_columns = all_X.columns[support_mask]
all_X_lr, holdout_X_lr = all_X[optimized_columns], holdout_X[optimized_columns]

In [21]:
from sklearn.model_selection import cross_val_score
import numpy as np

from sklearn.metrics import mean_squared_error

# Fit a fresh linear model on the selected columns.
lr = LinearRegression()
lr.fit(all_X_lr, all_y)

# NOTE(review): the RMSE below is computed on the same rows the model was
# fitted on (in-sample); only the R-Squared figure is cross-validated.
predictions = lr.predict(all_X_lr)
mse_value = mean_squared_error(all_y, predictions)
lr_rmse = np.sqrt(mse_value)
print('RMSE:', lr_rmse)

lr_cv_scores = cross_val_score(lr, all_X_lr, all_y, cv=10)
print('R-Squared', lr_cv_scores.mean())

RMSE: 26517.316172849005
R-Squared 0.84395981370928


In [22]:
# Predict the holdout rows with the linear model and write the submission file.
holdout_ids = test["Id"]
holdout_predictions = lr.predict(holdout_X_lr)

submission_df = dict(Id=holdout_ids, SalePrice=holdout_predictions)
submission = pd.DataFrame(submission_df)
submission.to_csv("submission_lr.csv", index=False)
# 0.16935  (presumably the score obtained for this submission — TODO confirm)

### Penalized Regression - Ridge

In [23]:
from sklearn.linear_model import RidgeCV

# Recursive feature elimination (10-fold CV) driven by a ridge model.
ridge = RidgeCV()
selector = RFECV(estimator=ridge, cv=10)
selector.fit(all_X, all_y)

# Keep only the columns the selector marked as supported.
support_mask = selector.get_support()
optimized_columns = all_X.columns[support_mask]
all_X_rd, holdout_X_rd = all_X[optimized_columns], holdout_X[optimized_columns]

In [24]:
# Fit a ridge model (10-fold internal CV) on the selected columns.
ridge = RidgeCV(cv=10)
ridge.fit(all_X_rd, all_y)

# NOTE(review): the RMSE below is computed on the same rows the model was
# fitted on (in-sample); only the R-Squared figure is cross-validated.
predictions = ridge.predict(all_X_rd)
mse_value = mean_squared_error(all_y, predictions)
ridge_rmse = np.sqrt(mse_value)
print('RMSE:', ridge_rmse)

ridge_cv_scores = cross_val_score(ridge, all_X_rd, all_y, cv=10)
print('R-Squared', ridge_cv_scores.mean())

RMSE: 27135.539798549333
R-Squared 0.8583587600692864


In [25]:
# Predict the holdout rows with the ridge model and write the submission file.
holdout_ids = test["Id"]
holdout_predictions = ridge.predict(holdout_X_rd)

submission_df = dict(Id=holdout_ids, SalePrice=holdout_predictions)
submission = pd.DataFrame(submission_df)
submission.to_csv("submission_rd.csv", index=False)
# 0.15594  (presumably the score obtained for this submission — TODO confirm)

### Random Forest

In [26]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestRegressor

# Recursive feature elimination (3-fold CV) driven by a random forest.
# The forest is seeded so that the selected columns, and every score computed
# from `rf` afterwards, are the same on each run of the notebook.
rf = RandomForestRegressor(random_state=1)
selector = RFECV(rf, cv=3)
selector.fit(all_X, all_y)

# Keep only the columns the selector marked as supported.
optimized_columns = all_X.columns[selector.support_]
all_X_rf = all_X[optimized_columns]
holdout_X_rf = holdout_X[optimized_columns]

In [27]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import numpy as np

from sklearn.ensemble import RandomForestRegressor

# Fit the existing `rf` estimator on the selected columns.
rf.fit(all_X_rf, all_y)

# NOTE(review): the RMSE below is computed on the same rows the model was
# fitted on (in-sample); only the R-Squared figure is cross-validated.
predictions = rf.predict(all_X_rf)
mse_value = mean_squared_error(all_y, predictions)
rf_rmse = np.sqrt(mse_value)
print('RMSE:', rf_rmse)

rf_cv_scores = cross_val_score(rf, all_X_rf, all_y, cv=10)
print('R-Squared', rf_cv_scores.mean())

RMSE: 10826.338593754937
R-Squared 0.8703153088962117


In [28]:
# Predict the holdout rows with the random forest and write the submission file.
holdout_ids = test["Id"]
holdout_predictions = rf.predict(holdout_X_rf)

submission_df = dict(Id=holdout_ids, SalePrice=holdout_predictions)
submission = pd.DataFrame(submission_df)
submission.to_csv("submission_rf.csv", index=False)
# 0.14731  (presumably the score obtained for this submission — TODO confirm)

### Neural Networks

Let's tune the hyperparameters by hand, since a grid search takes too long to run.

In [29]:
from sklearn.neural_network import MLPRegressor

# Hand-picked hyperparameters for the multi-layer perceptron.
mlp_params = dict(
    hidden_layer_sizes=29,
    learning_rate_init=0.49,
    max_iter=100,
    early_stopping=True,
    random_state=1,
)
regr = MLPRegressor(**mlp_params)
regr.fit(all_X, all_y)

# NOTE(review): the RMSE below is computed on the same rows the model was
# fitted on (in-sample); only the R-Squared figure is cross-validated.
predictions = regr.predict(all_X)
mse_value = mean_squared_error(all_y, predictions)
nn_rmse = np.sqrt(mse_value)

print('RMSE:' , nn_rmse)
nn_cv_scores = cross_val_score(regr, all_X, all_y, cv=10)
print('R-Squared', nn_cv_scores.mean())

# Manual search log — presumably "hidden_layer_sizes: RMSE / R-Squared"
# (the entry for 29 matches the values printed by this cell).
# 9:35956.01431678097 / 0.743612285628984
# 25:34017.14383713157 / 0.7260730285099377
# 29:30221.289187573217 / 0.741503115279839



RMSE: 30221.289187573217




R-Squared 0.741503115279839


In [30]:
# Predict the holdout rows with the neural network and write the submission file.
holdout_ids = test["Id"]
holdout_predictions = regr.predict(holdout_X)

submission_df = dict(Id=holdout_ids, SalePrice=holdout_predictions)
submission = pd.DataFrame(submission_df)
submission.to_csv("submission_nn.csv", index=False)
# 0.18200  (presumably the score obtained for this submission — TODO confirm)

# Model Comparison

In [31]:
# Collect the RMSE computed for each model into one table, one row per model.
# NOTE(review): each of these RMSE values was computed from predictions on the
# training rows, so the table compares in-sample error.
models = ['Linear Regression', 'Ridge', 'Random Forest', 'Neural Networks']
RMSE = [lr_rmse, ridge_rmse, rf_rmse, nn_rmse]
col = {'Root Mean Square Error': RMSE}
dfplot = pd.DataFrame(col, index=models)
dfplot

Unnamed: 0,Root Mean Square Error
Linear Regression,26517.316173
Ridge,27135.539799
Random Forest,10826.338594
Neural Networks,30221.289188


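Random Forest is the best model: of the four models it has the lowest RMSE on the training data and the highest cross-validated R-squared (about 0.87).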