# Price prediction of short-term flat rentals in London

* [Data preparation](#Data-preparation)
* [Data pipeline](#Data-pipeline)
* [Predictive modeling](#Predictive-modeling)
* [Cross validation](#Cross-validation)
* [Grid search](#Grid-search)

In [1]:
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

In [2]:
pd.options.display.max_rows = None

In [3]:
home_dir = Path.home()
inside_airbnb_data_dir = home_dir / 'Programming/data/inside-airbnb/london'
inside_airbnb_work_dir = home_dir / 'Programming/Python/machine-learning-exercises/short-term-rents-in-london'

In [4]:
plots_dir = inside_airbnb_work_dir / 'plots'
plots_dir.mkdir(parents=True, exist_ok=True)
hist_dir = plots_dir / 'histograms'
hist_dir.mkdir(parents=True, exist_ok=True)

## Data preparation

In [5]:
inside_airbnb_data_file = inside_airbnb_data_dir / 'selected_short_term_rentals_with_distances.csv'
inside_airbnb_df = pd.read_csv(inside_airbnb_data_file, keep_default_na=False, thousands=',')

In [6]:
inside_airbnb_df.drop(['room_type', 'nearest_station'], axis=1, inplace=True)
inside_airbnb_df['borough'] = inside_airbnb_df['borough'].replace({r'\s': r'_'}, regex=True)

In [7]:
inside_airbnb_df = inside_airbnb_df.loc[inside_airbnb_df['borough'] != 'Sutton']

In [8]:
inside_airbnb_df[['amenity_1', 'amenity_2', 'amenity_3']] = inside_airbnb_df['amenities'].str.split(',', expand=True)
inside_airbnb_df = inside_airbnb_df.drop('amenities', axis=1)

In [9]:
inside_airbnb_df['log_price'] = np.log1p(inside_airbnb_df['price'])
inside_airbnb_df = inside_airbnb_df.drop('price', axis=1)

In [10]:
inside_airbnb_df = inside_airbnb_df.drop(['latitude', 'longitude'], axis=1)

In [11]:
inside_airbnb_df.head()

Unnamed: 0,borough,property_type,bathrooms,bedrooms,minimum_nights,crime_rate,distance_to_station,amenity_1,amenity_2,amenity_3,log_price
0,Lambeth,Entire rental unit,1.0,1.0,30,137.98,0.520193,,,,5.01728
1,Kensington_and_Chelsea,Entire rental unit,2.0,2.0,91,118.02,0.956708,Nightlife,Restaurant,,5.278115
2,Brent,Entire rental unit,1.5,2.0,30,117.59,0.481797,Grocery Store,,,4.430817
3,Kensington_and_Chelsea,Entire rental unit,1.0,2.0,91,118.02,0.398479,Restaurant,,,5.666427
4,Westminster,Entire rental unit,3.0,4.0,186,132.94,0.200253,Restaurant,Retail,,6.804615


In [12]:
df_full_train, df_test = train_test_split(inside_airbnb_df, test_size=0.2, random_state=33, stratify=inside_airbnb_df['borough'])
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=33, stratify=df_full_train['borough'])

In [13]:
X_train = df_train.drop(['log_price'], axis=1)
y_train = df_train['log_price'].copy()
X_val = df_val.drop(['log_price'], axis=1)
y_val = df_val['log_price'].copy()
X_test = df_test.drop(['log_price'], axis=1)
y_test = df_test['log_price'].copy()

In [14]:
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

## Data pipeline

In [15]:
cat_attribs = ['borough', 'property_type', 'amenity_1', 'amenity_2', 'amenity_3']
num_attribs = ['bathrooms', 'bedrooms', 'minimum_nights', 'crime_rate', 'distance_to_station']

In [16]:
num_pipeline = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
)

In [17]:
cat_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
)

In [18]:
preprocessing = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', cat_pipeline, cat_attribs),
])

In [19]:
X_train_prepared = preprocessing.fit_transform(X_train)  # fitting only occurs here!
X_val_prepared = preprocessing.transform(X_val)          # using 'preprocessing' object to transform data frame
X_test_prepared = preprocessing.transform(X_test)        # using 'preprocessing' object to transform data frame

In [20]:
X_train_prepared_df = pd.DataFrame(
    data=X_train_prepared,
    columns=preprocessing.get_feature_names_out(),
    index=X_train.index,
)

In [21]:
X_val_prepared_df = pd.DataFrame(
    data=X_val_prepared,
    columns=preprocessing.get_feature_names_out(),
    index=X_val.index,
)

In [22]:
X_test_prepared_df = pd.DataFrame(
    data=X_test_prepared,
    columns=preprocessing.get_feature_names_out(),
    index=X_test.index,
)

In [23]:
print(f'Training size: {round(len(X_train_prepared_df)/len(inside_airbnb_df), 5):>10}')
print(f'Validation size: {round(len(X_val_prepared_df)/len(inside_airbnb_df), 5):>8}')
print(f'Testing size: {round(len(X_test_prepared_df)/len(inside_airbnb_df), 5):>11}')

Training size:    0.59946
Validation size:  0.20027
Testing size:     0.20027


## Predictive modeling

### Linear regression

In [24]:
lr = LinearRegression()
lr.fit(X_train_prepared_df, y_train)

In [25]:
y_pred_lr = lr.predict(X_val_prepared_df)
lr_rmse = root_mean_squared_error(y_val, y_pred_lr)
print(f'RMSE for linear regression: {round(lr_rmse, 5)}')

RMSE for linear regression: 0.55536


In [26]:
lr_r2_score = r2_score(y_val, y_pred_lr)
print(f'R2 for linear regression: {round(lr_r2_score, 5)}')

R2 for linear regression: 0.20774


### Decision trees

In [27]:
dtr = DecisionTreeRegressor(random_state=42)
dtr.fit(X_train_prepared_df, y_train)

In [28]:
y_pred_dtr = dtr.predict(X_val_prepared_df)
dtr_rmse = root_mean_squared_error(y_val, y_pred_dtr)
print(f'RMSE for decision tree: {round(dtr_rmse, 5)}')

RMSE for decision tree: 0.6699


In [29]:
dtr_r2_score = r2_score(y_val, y_pred_dtr)
print(f'R2 for linear regression: {round(dtr_r2_score, 5)}')

R2 for linear regression: -0.1528


### Random forests

In [30]:
rfr = RandomForestRegressor(random_state=42)
rfr.fit(X_train_prepared_df, y_train)

In [31]:
y_pred_rfr = rfr.predict(X_val_prepared_df)
rfr_rmse = root_mean_squared_error(y_val, y_pred_rfr)
print(f'RMSE for random forest regressor: {round(rfr_rmse, 5)}')

RMSE for random forest regressor: 0.51865


In [32]:
rfr_r2_score = r2_score(y_val, y_pred_rfr)
print(f'R2 for random forest regressor: {round(rfr_r2_score, 5)}')

R2 for random forest regressor: 0.30899


### Stocastic gradient descent regressor

In [33]:
sgdr = SGDRegressor(random_state=42)
sgdr.fit(X_train_prepared_df, y_train)

In [34]:
y_pred_sgdr = sgdr.predict(X_val_prepared_df)
sgdr_rmse = root_mean_squared_error(y_val, y_pred_sgdr)
print(f'RMSE for stocastic gradient descent: {round(sgdr_rmse, 5)}')

RMSE for stocastic gradient descent: 0.5723


In [35]:
sgdr_r2_score = r2_score(y_val, y_pred_sgdr)
print(f'R2 for stocastic gradient descent: {round(sgdr_r2_score, 5)}')

R2 for stocastic gradient descent: 0.15866


### Support vector regressor

In [36]:
svr = SVR()
svr.fit(X_train_prepared_df, y_train)

In [37]:
y_pred_svr = rfr.predict(X_val_prepared_df)
svr_rmse = root_mean_squared_error(y_val, y_pred_svr)
print(f'RMSE for support vector regressor: {round(svr_rmse, 5)}')

RMSE for support vector regressor: 0.51865


In [38]:
svr_r2_score = r2_score(y_val, y_pred_svr)
print(f'R2 for support vector regressor: {round(svr_r2_score, 5)}')

R2 for support vector regressor: 0.30899


## Cross validation

In [39]:
lr_rmses = -cross_val_score(lr, X_train_prepared_df, y_train, scoring='neg_root_mean_squared_error', cv=10)

In [40]:
print(f"Mean and std dev: {pd.Series(lr_rmses).describe().loc['mean']:.5f} ± {pd.Series(lr_rmses).describe().loc['std']:.5f}")

Mean and std dev: 0.65274 ± 0.34644


In [41]:
dtr_rmses = -cross_val_score(dtr, X_train_prepared_df, y_train, scoring='neg_root_mean_squared_error', cv=10)

In [42]:
print(f"Mean and std dev: {pd.Series(dtr_rmses).describe().loc['mean']:.5f} ± {pd.Series(dtr_rmses).describe().loc['std']:.5f}")

Mean and std dev: 0.62616 ± 0.10326


In [43]:
rfr_rmses = -cross_val_score(rfr, X_train_prepared_df, y_train, scoring='neg_root_mean_squared_error', cv=10)

In [44]:
print(f"Mean and std dev: {pd.Series(rfr_rmses).describe().loc['mean']:.5f} ± {pd.Series(rfr_rmses).describe().loc['std']:.5f}")

Mean and std dev: 0.51247 ± 0.10872


In [45]:
sgdr_rmses = -cross_val_score(sgdr, X_train_prepared_df, y_train, scoring='neg_root_mean_squared_error', cv=10)

In [46]:
print(f"Mean and std dev: {pd.Series(sgdr_rmses).describe().loc['mean']:.5f} ± {pd.Series(sgdr_rmses).describe().loc['std']:.5f}")

Mean and std dev: 0.63752 ± 0.31528


In [47]:
svr_rmses = -cross_val_score(svr, X_train_prepared_df, y_train, scoring='neg_root_mean_squared_error', cv=10)

In [48]:
print(f"Mean and std dev: {pd.Series(svr_rmses).describe().loc['mean']:.5f} ± {pd.Series(svr_rmses).describe().loc['std']:.5f}")

Mean and std dev: 0.48424 ± 0.10824


## Grid search