In [None]:
# import the libraries
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split

# sklearn :: models
from sklearn.linear_model import LinearRegression

# sklearn :: evaluation metrics
from sklearn.metrics import mean_absolute_error

# use seaborn's 'whitegrid' style for the plots drawn in this notebook
sns.set_style('whitegrid')

# Problem definition

Apply regression models to predict house prices.

# Load the data

In [None]:
# Read the raw house-pricing table, list its columns, and preview the first rows.
csv_path = 'data/house_pricing.csv'
df_pricing = pd.read_csv(csv_path)
print(df_pricing.columns)
df_pricing.head()

# Feature Engineering 

In [None]:
# One-hot encode the zipcode: one indicator column per distinct zipcode value.
# The 'zipcode' prefix forces every generated column name to be a string.
# Without it the columns are named after the raw zipcode values; if those are
# read as integers, the feature matrix ends up with mixed int/str column names,
# which recent scikit-learn versions refuse to fit on.
df_zipcode = pd.get_dummies(df_pricing['zipcode'], prefix='zipcode')

# Append the indicator columns to the original table.
# NOTE(review): fillna(0.0) replaces missing values in *every* column
# (including the target) with 0 - confirm this is intended.
df = pd.concat([df_pricing, df_zipcode], axis=1).fillna(0.0)
print(list(df.columns))
df.head()

In [None]:
# Choose the model inputs and the target.
# Inputs: a fixed set of house attributes plus every zipcode indicator column.
base_features = [
    'bedrooms',
    'bathrooms',
    'grade',
    'condition',
    'waterfront',
    'sqft_living15',
    'sqft_lot15',
]
X_columns = [*base_features, *df_zipcode.columns]
# Target is kept as a one-element list, so selecting it yields a DataFrame.
y_column = ['price']

# Model Training

In [None]:
# split the data manually: shuffle the rows, then cut at the 80% mark

# Fixed random_state so the shuffle produces the same row order on every
# Restart & Run All (an unseeded shuffle gives a different split each run).
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

threshold = 0.8  # fraction of rows used for training
absolute_threshold = int(len(df) * threshold)  # index of the first test row

train_part = df.iloc[:absolute_threshold]
test_part = df.iloc[absolute_threshold:]

X_train = train_part[X_columns]
y_train = train_part[y_column]

X_test = test_part[X_columns]
y_test = test_part[y_column]

print('X_train', X_train.shape)
print('y_train', y_train.shape)
print('X_test', X_test.shape)
print('y_test', y_test.shape)

In [None]:
# split the data using sklearn
# (this replaces X_train / X_test / y_train / y_test from any earlier split)

threshold = 0.8  # fraction of rows used for training
X = df[X_columns]
y = df[y_column]

# Fixed random_state so the shuffled split - and therefore the metrics computed
# from it - is the same on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1.0 - threshold,
    shuffle=True,
    random_state=42,
)

print('X_train', X_train.shape)
print('y_train', y_train.shape)
print('X_test', X_test.shape)
print('y_test', y_test.shape)

In [None]:
# Fit an ordinary least-squares model on the training split
# (fit returns the estimator itself, so construction and fitting can be chained),
# then predict the target for the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Model Evaluation

In [None]:
# Mean absolute error of the predictions on the held-out split.
mae = mean_absolute_error(y_test, y_pred)
print('MAE', round(mae, 2))

# Predicted vs. true values. Points on the dashed diagonal are perfect
# predictions; values beyond axis_max fall outside the visible area.
axis_max = 5000000  # upper limit of both axes

plt.scatter(y_test, y_pred, alpha=0.3)
# A straight line only needs its two end points (no need for 50,000 of them).
plt.plot([0, axis_max], [0, axis_max], '--r', alpha=0.3, label='Perfect prediction')
plt.title('Linear Regression')
plt.xlabel('True Value')
plt.ylabel('Predict Value')
plt.xlim([0, axis_max])
plt.ylim([0, axis_max])
# Render the legend so the diagonal's label is actually shown.
plt.legend()
plt.show()

# Cross Validation

In [None]:
# NOTE: this cell is disabled (everything below is commented out).
# It sketches a k-fold cross-validation (k = 10) that fits several regressors
# on X / y and collects the MAE and RMSE of every fold in `results`.
# Before re-enabling it, import the names it uses that the import cell does
# not provide: RandomForestRegressor, KNeighborsRegressor,
# DecisionTreeRegressor, KFold and mean_squared_error.
# It also reassigns X_train / X_test / y_train / y_test / model / y_pred / mae.
# models = [
#     ('LinearRegression', LinearRegression()),
#     ('RandomForestRegressor10', RandomForestRegressor(n_estimators=10)),
#     ('RandomForestRegressor100', RandomForestRegressor(n_estimators=100, n_jobs=4)),
#     ('KNeighborsRegressor', KNeighborsRegressor()),
#     ('DecisionTreeRegressor', DecisionTreeRegressor())
# ]

# k = 10
# results = {}
# for m in models:
#     print('MODEL', m[0])
#     results[m[0]] = {'mae':[], 'rmse':[]}
#     kf = KFold(n_splits=k)
#     for train_index, test_index in kf.split(X):
#         X_train, X_test = X.values[train_index], X.values[test_index]
#         y_train, y_test = y.values[train_index], y.values[test_index]
#         model = m[1]
#         model.fit(X_train, y_train.ravel())
#         y_pred = model.predict(X_test)
#         mae = mean_absolute_error(y_test, y_pred)
#         rmse = np.sqrt(mean_squared_error(y_test, y_pred))
#         results[m[0]]['mae'].append(mae)
#         results[m[0]]['rmse'].append(rmse)

In [None]:
# NOTE: this cell is disabled (everything below is commented out).
# For each metric ('mae', 'rmse') it draws one box plot with one box per model,
# reading the per-fold scores from the `results` dict. That dict is only
# created by the (also disabled) cross-validation cell, so that cell must be
# re-enabled and run first.
# for metric in ['mae', 'rmse']:
#     values = []
#     labels = []
#     for model, result_values in results.items():
#         for m, v in result_values.items():
#             if m == metric:
#                 labels.append(model)
#                 values.append(v)
#     plt.figure(figsize=(12,6))
#     plt.title(metric)
#     plt.boxplot(values)
#     plt.xticks(range(1, len(labels)+1), labels, rotation='horizontal')
#     plt.show()