In [15]:
# read the monthly housing-in-london variables CSV into a DataFrame.
import pandas as pd


df = pd.read_csv(filepath_or_buffer='lib/data/housing_in_london_monthly_variables.csv')

In [16]:
# the purpose of this notebook is to create a model for the REST API, so only limited preprocessing will be done.
# 'date' will be reduced to the year to limit the number of unique categories.
df

Unnamed: 0,date,area,average_price,code,houses_sold,no_of_crimes,borough_flag
0,1995-01-01,city of london,91449,E09000001,17.0,,1
1,1995-02-01,city of london,82203,E09000001,7.0,,1
2,1995-03-01,city of london,79121,E09000001,14.0,,1
3,1995-04-01,city of london,77101,E09000001,7.0,,1
4,1995-05-01,city of london,84409,E09000001,10.0,,1
...,...,...,...,...,...,...,...
13544,2019-09-01,england,249942,E92000001,64605.0,,0
13545,2019-10-01,england,249376,E92000001,68677.0,,0
13546,2019-11-01,england,248515,E92000001,67814.0,,0
13547,2019-12-01,england,250410,E92000001,,,0


In [17]:
# brief overview (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,average_price,houses_sold,no_of_crimes,borough_flag
count,13549.0,13455.0,7439.0,13549.0
mean,263519.7,3893.994129,2158.352063,0.733338
std,187617.5,12114.402476,902.087742,0.44223
min,40722.0,2.0,0.0,0.0
25%,132380.0,247.0,1623.0,0.0
50%,222919.0,371.0,2132.0,1.0
75%,336843.0,3146.0,2582.0,1.0
max,1463378.0,132163.0,7461.0,1.0


In [18]:
# nearly half of the no_of_crimes values are NaN (6110 of 13549 rows); houses_sold has a few as well.
# catboost can handle NaN values automatically by handling them as minimum values, so there's no need for further preprocessing.
df.isna().sum()

date                0
area                0
average_price       0
code                0
houses_sold        94
no_of_crimes     6110
borough_flag        0
dtype: int64

In [19]:
# converting date values to years only.
# guarded so that re-running this cell is a no-op: once 'date' already holds integer years,
# parsing it again with the '%Y-%m-%d' format would no longer match the data.
if not pd.api.types.is_integer_dtype(df['date']):
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d').dt.year

In [20]:
from sklearn.model_selection import train_test_split
import numpy as np


# creating train and test sets.
# categorical feature indices need to be predefined to benefit from catboost's categorical features support.
# 'average_price' is the target and 'code' is dropped from the feature set.
X = df.drop(columns=['average_price', 'code'])
y = df.average_price
# every column that is not float64 is passed to catboost as a categorical feature.
# np.float64 is used because the old `np.float` alias (deprecated in NumPy 1.20) no longer exists in
# current NumPy releases; the comparison result is the same as before.
categorical_features_indices = np.where(X.dtypes != np.float64)[0]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features_indices = np.where(X.dtypes != np.float)[0]


In [21]:
# show the dtype of each training feature column.
print(X_train.dtypes)

date              int64
area             object
houses_sold     float64
no_of_crimes    float64
borough_flag      int64
dtype: object


In [22]:
# quick check of the area names for typos, which may cause problems when handling user queries later.
pd.unique(df.area)

array(['city of london', 'barking and dagenham', 'barnet', 'bexley',
       'brent', 'bromley', 'camden', 'croydon', 'ealing', 'enfield',
       'tower hamlets', 'greenwich', 'hackney', 'south east',
       'hammersmith and fulham', 'haringey', 'harrow', 'havering',
       'hillingdon', 'hounslow', 'islington', 'kensington and chelsea',
       'kingston upon thames', 'lambeth', 'lewisham', 'merton', 'newham',
       'redbridge', 'richmond upon thames', 'southwark', 'sutton',
       'waltham forest', 'wandsworth', 'westminster', 'inner london',
       'outer london', 'north east', 'north west', 'yorks and the humber',
       'east midlands', 'west midlands', 'east of england', 'london',
       'south west', 'england'], dtype=object)

In [23]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error


# in an actual ml project, I tend to spend much more time on exploratory data analysis and data cleaning.
# I want to focus on building the REST API for this project, so I've chosen catboost, which is fast and requires minimal preprocessing of categorical features.
model = CatBoostRegressor(random_seed=42)
model.fit(X_train, y_train, cat_features=categorical_features_indices)
preds = model.predict(X_test)
# take the square root of the MSE explicitly instead of passing `squared=False`:
# that argument is deprecated in scikit-learn and removed in newer releases, while this form
# yields the same RMSE on every version.
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(rmse)


Learning rate set to 0.058971
0:	learn: 180756.0806400	total: 192ms	remaining: 3m 11s
1:	learn: 172234.4050560	total: 228ms	remaining: 1m 53s
2:	learn: 164758.8865939	total: 266ms	remaining: 1m 28s
3:	learn: 157397.8905413	total: 308ms	remaining: 1m 16s
4:	learn: 150452.9006857	total: 343ms	remaining: 1m 8s
5:	learn: 144055.2461103	total: 378ms	remaining: 1m 2s
6:	learn: 137986.9902759	total: 413ms	remaining: 58.6s
7:	learn: 132687.7909302	total: 449ms	remaining: 55.7s
8:	learn: 127681.0495617	total: 484ms	remaining: 53.3s
9:	learn: 122724.5339570	total: 545ms	remaining: 54s
10:	learn: 118051.8489151	total: 579ms	remaining: 52.1s
11:	learn: 113730.8052315	total: 617ms	remaining: 50.8s
12:	learn: 109720.7017016	total: 658ms	remaining: 50s
13:	learn: 106024.1458517	total: 694ms	remaining: 48.9s
14:	learn: 102474.9847958	total: 733ms	remaining: 48.1s
15:	learn: 99160.1080850	total: 766ms	remaining: 47.1s
16:	learn: 96247.8051454	total: 799ms	remaining: 46.2s
17:	learn: 93303.3943883	total

In [24]:
# the rmse comes to roughly 7% of the mean of the average_price column (see the value printed below).
# that is not a great result; more careful feature engineering and selection would likely improve it.
# still, the model has been trained and is ready to be deployed behind the REST API.

mean_avg_prc = df['average_price'].mean()
print('% of rmse over mean of average_price column')
print(rmse / mean_avg_prc * 100)

% of rmse over mean of average_price column
6.910689559936787


In [25]:
# persist the trained model to disk in JSON format.
model.save_model('lib/models/model.json', format='json')

In [26]:
# reload the saved model from disk and run a prediction on one hand-written sample record.
model = CatBoostRegressor()
model_path = 'lib/models/model.json'
model.load_model(model_path, format='json')
sample = [{'date': 2005, 'area': 'islington', 'houses_sold': 13455, 'no_of_crimes': 1623, 'borough_flag': 0}]
# keep the sample under its own name: rebinding `df` here would replace the training DataFrame
# with a one-row Series and break any earlier cell that is re-run afterwards.
sample_row = pd.DataFrame.from_dict(sample).iloc[0, :]
print(sample_row)
print(sample_row.dtypes)
prediction = model.predict(sample_row)
print(prediction)

date                 2005
area            islington
houses_sold         13455
no_of_crimes         1623
borough_flag            0
Name: 0, dtype: object
object
290172.30014485377
