# Load Data

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

train_df = pd.read_csv('https://raw.githubusercontent.com/wlifferth/build-an-ml-web-app/main/cleaned_data.csv', index_col='id')
train_df.head()

Unnamed: 0_level_0,city,state,bathrooms,bedrooms,livingArea,lotArea,lotUnit,price,homeType_CONDO,homeType_MULTI_FAMILY,homeType_SINGLE_FAMILY,homeType_TOWNHOUSE,median_income
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,Justin,TX,3.0,4.0,3078.0,7492.32,sqft,358100.0,0,0,1,0,46658.0
1,Litchfield Park,AZ,5.0,5.0,3722.0,6577.56,sqft,660000.0,0,0,1,0,57188.0
2,Anchorage,AK,2.0,3.0,1388.0,0.0,sqft,232500.0,1,0,0,0,66044.0
3,Berwick,ME,4.0,3.0,2972.0,79714.8,acres,551900.0,0,0,1,0,46016.0
4,Thornton,CO,3.0,2.0,1926.0,0.0,sqft,435000.0,0,0,0,1,65217.0


# Feature Selection

I did standardize the data, however I did not notice an improvement in model performance, and to keep things simple with the Kaggle Submission below, I decided to leave the data unstandardized.

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data = pd.get_dummies(train_df.loc[:, 
                                   ['bedrooms', 'bathrooms', 'livingArea', 
                                    'price', 'homeType_CONDO', 
                                    'homeType_MULTI_FAMILY', 
                                    'homeType_SINGLE_FAMILY', 
                                    'homeType_TOWNHOUSE', 'median_income', 
                                    'state']], 
                                    columns=["state"])
std_data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)

Observe the correlation between the selected features from the dataset with housing price. These features were selected based on the data variable in the above cell. The goal here was to try to select features that had a higher correlation with housing price.

In [7]:
#data.corr().loc["price",:]

# Model Evaluation

I decided to go with Extreme Gradient Boosting (XGBoost). From observing Will's models, I noticed that the linear regression model performed suprisingly well compared to his Neural Network model (see linear regression model in the below cell). There's this principle in machine learning known as Occam's razor. To summarize, it states to pick the simplest model possible. Neural Networks are powerful but these models tend to be complicated. I then decided that I wanted a model with some nonlinearity in it. I started out with LASSO regression, but I was getting a similar performance to the linear regression model. This is why I selected XGBoost. It allows for regularization similar to LASSO regression, and it also provides several other hyperparameters which can be used to adjust decision tree ensembles when constructing our model.

Will's Linear Regression Model

In [10]:
# What if we one-hot encoded state?
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
X = pd.get_dummies(train_df.drop(['city', 'lotUnit', 'price'], axis=1), columns=['state'])
y = train_df['price']

errors = []
for i in range(4):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=i)
    model = LinearRegression()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    error = mean_absolute_error(predictions, y_test)
    print(error)
    errors.append(error)
print(f'Mean Error {np.mean(errors)}')

# Nice!

60397.58345067544
60902.298618907844
61131.92083270111
61237.019125373896
Mean Error 60917.20550691457


## Evaluating the XGBoost Model with the selected Features

In [11]:
X = data.drop(['price'], axis=1)
y = data['price']

from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

A 5 fold cross-validation will be used for the model. To summarize, during model training, the data will be split into 5 groups and these groups will be shuffled in a random order during training.

In [12]:
from sklearn.model_selection import KFold
cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [13]:
from sklearn.model_selection import GridSearchCV
model = XGBRegressor(objective='reg:squarederror')
grid_search = GridSearchCV(model,
                        {'n_estimators': [500], # default: 100  
                        'learning_rate': [0.2], # default: 0.3
                        'max_depth': [6], # default: 6
                        'gamma': [6], # default: 0
                        'min_child_weight': [6], # default: 6
                        'subsample': [1], # default: 1
                        'colsample_bytree': [1], # default: 1
                        'colsample_bylevel': [0.8], # default: 1
                        'colsample_bynode': [1], # default: 1
                        'scale_pos_weight': [1], # default: 1
                        'max_delta_step': [0], # default: 0
                        'lambda': [3], # default: 1
                        'alpha': [0], # default: 0
                        },
                        cv = cv,
                        scoring = ('neg_mean_absolute_error'),
                        return_train_score = True,
                        n_jobs = -2,
                        verbose=3)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END alpha=0, colsample_bylevel=0.8, colsample_bynode=1, colsample_bytree=1, gamma=6, lambda=3, learning_rate=0.2, max_delta_step=0, max_depth=6, min_child_weight=6, n_estimators=500, scale_pos_weight=1, subsample=1;, score=(train=-33728.069, test=-57672.075) total time=  11.3s
[CV 2/5] END alpha=0, colsample_bylevel=0.8, colsample_bynode=1, colsample_bytree=1, gamma=6, lambda=3, learning_rate=0.2, max_delta_step=0, max_depth=6, min_child_weight=6, n_estimators=500, scale_pos_weight=1, subsample=1;, score=(train=-34242.280, test=-55965.784) total time=  11.2s
[CV 3/5] END alpha=0, colsample_bylevel=0.8, colsample_bynode=1, colsample_bytree=1, gamma=6, lambda=3, learning_rate=0.2, max_delta_step=0, max_depth=6, min_child_weight=6, n_estimators=500, scale_pos_weight=1, subsample=1;, score=(train=-33618.347, test=-56672.116) total time=  11.1s
[CV 4/5] END alpha=0, colsample_bylevel=0.8, colsample_bynode=1, colsample_bytre

GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=XGBRegressor(objective='reg:squarederror'), n_jobs=-2,
             param_grid={'alpha': [0], 'colsample_bylevel': [0.8],
                         'colsample_bynode': [1], 'colsample_bytree': [1],
                         'gamma': [6], 'lambda': [3], 'learning_rate': [0.2],
                         'max_delta_step': [0], 'max_depth': [6],
                         'min_child_weight': [6], 'n_estimators': [500],
                         'scale_pos_weight': [1], 'subsample': [1]},
             return_train_score=True, scoring='neg_mean_absolute_error',
             verbose=3)

In [14]:
grid_search.best_estimator_

XGBRegressor(alpha=0, colsample_bylevel=0.8, gamma=6, lambda=3,
             learning_rate=0.2, max_depth=6, min_child_weight=6,
             n_estimators=500, objective='reg:squarederror')

This is the final model selected from grid search. It's not necessary to do the below cells, but I wanted to view the model's performance similar to how Will did in his sessions.

In [15]:
model = XGBRegressor(objective='reg:squarederror',
                        n_estimators= 500, 
                        learning_rate= 0.2, 
                        max_depth= 6, 
                        gamma= 6, 
                        min_child_weight= 6, 
                        subsample= 1,
                        colsample_bytree= 1,
                        colsample_bylevel= 0.8, 
                        colsample_bynode= 1, 
                        scale_pos_weight= 1,
                        max_delta_step= 0,
                        reg_lambda= 3, 
                        alpha= 0
                     )
model.fit(X_train, y_train)
prediction = model.predict(X_test)
error = mean_absolute_error(prediction, y_test)
print(error)

55446.05348656473


In [16]:
# Scores from each of the folds in a 5-fold cross validation of the model
from sklearn.model_selection import cross_val_score
np.abs(cross_val_score(model, X, y, scoring='neg_mean_absolute_error'))

array([54464.65054014, 55162.65687897, 55182.90758185, 54998.76381587,
       54635.10173876])

# Submission

In [17]:
# Finally, lets cover how to submit to the kaggle competition

final_model = XGBRegressor(objective='reg:squarederror',
                        n_estimators= 500, 
                        learning_rate= 0.2, 
                        max_depth= 6, 
                        gamma= 6, 
                        min_child_weight= 6, 
                        subsample= 1,
                        colsample_bytree= 1,
                        colsample_bylevel= 0.8, 
                        colsample_bynode= 1, 
                        scale_pos_weight= 1,
                        max_delta_step= 0,
                        reg_lambda= 3, 
                        alpha= 0
                     )

final_training_input = pd.get_dummies(train_df.loc[:, ['bedrooms', 'bathrooms', 'livingArea', 'homeType_CONDO', 'homeType_MULTI_FAMILY', 'homeType_SINGLE_FAMILY', 'homeType_TOWNHOUSE', 'median_income', 'state']], columns=["state"])

X = final_training_input
y = train_df['price']
final_model.fit(X, y)

XGBRegressor(alpha=0, colsample_bylevel=0.8, gamma=6, learning_rate=0.2,
             max_depth=6, min_child_weight=6, n_estimators=500,
             objective='reg:squarederror', reg_lambda=3)

In [18]:
# First we have to do all the preprocessing we did on our training dataset on our testing dataset:

test = pd.read_csv('https://raw.githubusercontent.com/wlifferth/build-an-ml-web-app/main/test.csv', index_col='id')

test.drop(['homeStatus', 'dateSold', 'address'], axis=1, inplace=True)

def convert_lot_area(row):
    if row['lotUnit'] == 'acres':
        return row['lotArea'] * 43560
    else:
        return row['lotArea']

test['lotArea'] = test.apply(convert_lot_area, axis=1)

test.drop(['lotUnit'], inplace=True, axis=1)

test = pd.get_dummies(test, columns=['homeType'])

print(test.head())

zip_code_df = pd.read_csv('https://raw.githubusercontent.com/wlifferth/build-an-ml-web-app/main/median_income_by_zip_code.csv')

zip_code_df['median_income']

test = pd.merge(test, zip_code_df, how='left', left_on='zipcode', right_on='zip_code').set_index(test.index)

test['median_income'].fillna(test['median_income'].mean(), inplace=True)

test.drop(['zipcode', 'zip_code'], axis=1, inplace=True)

test.head()

       zipcode           city  ... homeType_SINGLE_FAMILY  homeType_TOWNHOUSE
id                             ...                                           
22805  19970.0     Ocean View  ...                      1                   0
18392  27704.0         Durham  ...                      1                   0
15215  20733.0      Churchton  ...                      1                   0
20235  33064.0  Pompano Beach  ...                      1                   0
13810   6239.0      Killingly  ...                      1                   0

[5 rows x 11 columns]


Unnamed: 0_level_0,city,state,bathrooms,bedrooms,livingArea,lotArea,homeType_CONDO,homeType_MULTI_FAMILY,homeType_SINGLE_FAMILY,homeType_TOWNHOUSE,median_income
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
22805,Ocean View,DE,4.0,5.0,4100.0,10454.4,0,0,1,0,49786.0
18392,Durham,NC,3.0,4.0,2055.0,12632.4,0,0,1,0,37513.0
15215,Churchton,MD,2.0,3.0,1530.0,4791.6,0,0,1,0,41087.0
20235,Pompano Beach,FL,1.0,3.0,744.0,6120.0,0,0,1,0,37070.0
13810,Killingly,CT,3.0,6.0,4166.0,37.0,0,0,1,0,33438.0


In [19]:
final_input = pd.get_dummies(test.loc[:, ['bedrooms', 'bathrooms', 'livingArea', 'homeType_CONDO', 'homeType_MULTI_FAMILY', 'homeType_SINGLE_FAMILY', 'homeType_TOWNHOUSE', 'median_income', 'state']], columns=["state"])

In [20]:
test['price'] = final_model.predict(final_input)

In [21]:
test.head()

Unnamed: 0_level_0,city,state,bathrooms,bedrooms,livingArea,lotArea,homeType_CONDO,homeType_MULTI_FAMILY,homeType_SINGLE_FAMILY,homeType_TOWNHOUSE,median_income,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
22805,Ocean View,DE,4.0,5.0,4100.0,10454.4,0,0,1,0,49786.0,759542.75
18392,Durham,NC,3.0,4.0,2055.0,12632.4,0,0,1,0,37513.0,390065.21875
15215,Churchton,MD,2.0,3.0,1530.0,4791.6,0,0,1,0,41087.0,272682.6875
20235,Pompano Beach,FL,1.0,3.0,744.0,6120.0,0,0,1,0,37070.0,209613.1875
13810,Killingly,CT,3.0,6.0,4166.0,37.0,0,0,1,0,33438.0,582188.5625


In [22]:
# Export model to the web app

'''
import pickle

with open('my_model.pkl', mode='wb+') as model_file:
  pickle.dump(final_model, model_file)
'''

"\nimport pickle\n\nwith open('my_model.pkl', mode='wb+') as model_file:\n  pickle.dump(final_model, model_file)\n"

In [23]:
#test['price'].to_csv('2022-01-15-submission.csv', index_label='id')