# Price prediction of short-term flat rentals in London

* [Data preparation](#Data-preparation)
* [Data pipeline](#Data-pipeline)
* [Predictive modeling](#Predictive-modeling)

In [1]:
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Lift the row limit so displayed DataFrames/Series are never truncated.
pd.set_option('display.max_rows', None)

In [3]:
# Data and working directories, both resolved relative to the current user's home.
home_dir = Path.home()
inside_airbnb_data_dir = home_dir.joinpath('Programming/data/inside-airbnb/london')
inside_airbnb_work_dir = home_dir.joinpath('Programming/Python/machine-learning-exercises/short-term-rents-in-london')

In [4]:
# Output folders for figures: <work dir>/plots and <work dir>/plots/histograms.
plots_dir = inside_airbnb_work_dir.joinpath('plots')
hist_dir = plots_dir.joinpath('histograms')

# Create both folders (and any missing parents); existing folders are left as they are.
plots_dir.mkdir(parents=True, exist_ok=True)
hist_dir.mkdir(parents=True, exist_ok=True)

## Data preparation

In [5]:
# Read the listings CSV.
#   keep_default_na=False : pandas' default NaN markers (empty fields, 'NA', ...) are
#                           not converted to NaN.
#   thousands=','         : numbers written with a comma thousands separator are
#                           parsed as numeric.
inside_airbnb_data_file = inside_airbnb_data_dir.joinpath('selected_short_term_rentals_with_distances.csv')
inside_airbnb_df = pd.read_csv(
    inside_airbnb_data_file,
    thousands=',',
    keep_default_na=False,
)

In [6]:
# Remove the 'room_type' and 'nearest_station' columns. The frame is rebound to the
# result of drop() rather than mutated with inplace=True, which matches how the later
# cells drop 'amenities' and 'price'.
inside_airbnb_df = inside_airbnb_df.drop(columns=['room_type', 'nearest_station'])

# Replace every whitespace character in the borough names with an underscore, so the
# values look like 'Kensington_and_Chelsea'.
inside_airbnb_df['borough'] = inside_airbnb_df['borough'].replace({r'\s': r'_'}, regex=True)

In [7]:
# Keep only the rows whose borough is not 'Sutton'.
inside_airbnb_df = inside_airbnb_df.loc[inside_airbnb_df['borough'].ne('Sutton')]

In [8]:
# Split the comma-separated 'amenities' string into one column per entry, then drop
# the combined column. The assignment only succeeds when the split produces exactly
# three columns.
amenity_columns = ['amenity_1', 'amenity_2', 'amenity_3']
inside_airbnb_df[amenity_columns] = inside_airbnb_df['amenities'].str.split(',', expand=True)
inside_airbnb_df = inside_airbnb_df.drop(columns='amenities')

In [9]:
# Build the target on a log scale, log_price = log(1 + price), appended as the last
# column, and remove the raw 'price' column.
inside_airbnb_df = (
    inside_airbnb_df
    .assign(log_price=np.log1p(inside_airbnb_df['price']))
    .drop(columns='price')
)

In [10]:
# Preview the first five rows of the prepared frame.
inside_airbnb_df.head(n=5)

Unnamed: 0,borough,latitude,longitude,property_type,bathrooms,bedrooms,minimum_nights,crime_rate,distance_to_station,amenity_1,amenity_2,amenity_3,log_price
0,Lambeth,51.491476,-0.111514,Entire rental unit,1.0,1.0,30,137.98,0.520193,,,,5.01728
1,Kensington_and_Chelsea,51.48566,-0.18415,Entire rental unit,2.0,2.0,91,118.02,0.956708,Nightlife,Restaurant,,5.278115
2,Brent,51.53899,-0.19744,Entire rental unit,1.5,2.0,30,117.59,0.481797,Grocery Store,,,4.430817
3,Kensington_and_Chelsea,51.51732,-0.2005,Entire rental unit,1.0,2.0,91,118.02,0.398479,Restaurant,,,5.666427
4,Westminster,51.49695,-0.13888,Entire rental unit,3.0,4.0,186,132.94,0.200253,Restaurant,Retail,,6.804615


In [11]:
# Two-stage split, stratified by borough at each stage:
#   1. hold out 20% of all rows as the test set;
#   2. take 25% of the remainder (20% of the total) as the validation set,
#      which leaves 60% of the rows for training.
df_full_train, df_test = train_test_split(
    inside_airbnb_df,
    test_size=0.2,
    stratify=inside_airbnb_df['borough'],
    random_state=33,
)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    stratify=df_full_train['borough'],
    random_state=33,
)

In [12]:
# For each split: features are every column except 'log_price'; the target is a copy
# of the 'log_price' column.
X_train, y_train = df_train.drop(columns='log_price'), df_train['log_price'].copy()
X_val, y_val = df_val.drop(columns='log_price'), df_val['log_price'].copy()
X_test, y_test = df_test.drop(columns='log_price'), df_test['log_price'].copy()

In [13]:
# Discard the row labels carried over from the full frame so that every split is
# indexed 0..n-1 and each X/y pair stays aligned.
X_train, y_train = X_train.reset_index(drop=True), y_train.reset_index(drop=True)
X_val, y_val = X_val.reset_index(drop=True), y_val.reset_index(drop=True)
X_test, y_test = X_test.reset_index(drop=True), y_test.reset_index(drop=True)

## Data pipeline

In [14]:
# Columns handled by the categorical branch of the preprocessing.
cat_attribs = [
    'borough',
    'property_type',
    'amenity_1',
    'amenity_2',
    'amenity_3',
]

# Columns handled by the numeric branch of the preprocessing.
num_attribs = [
    'latitude',
    'longitude',
    'bathrooms',
    'bedrooms',
    'minimum_nights',
    'crime_rate',
    'distance_to_station',
]

In [15]:
# Numeric branch: fill missing values with the column median, then standardize.
num_pipeline = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())

In [16]:
# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode into a dense array. Categories not seen during fitting are ignored
# (handle_unknown='ignore') instead of raising.
cat_imputer = SimpleImputer(strategy='most_frequent')
cat_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_pipeline = make_pipeline(cat_imputer, cat_encoder)

In [17]:
# Route the numeric and categorical column lists through their respective pipelines.
preprocessing = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_attribs),
        ('cat', cat_pipeline, cat_attribs),
    ],
)

In [18]:
# Fit the preprocessing on the training split only, then reuse the fitted transformer
# unchanged on the validation and test splits.
X_train_prepared = preprocessing.fit_transform(X_train)
X_val_prepared, X_test_prepared = (
    preprocessing.transform(features) for features in (X_val, X_test)
)

In [19]:
# Wrap the transformed training array in a DataFrame labelled with the transformer's
# output feature names and the training index.
X_train_prepared_df = pd.DataFrame(
    X_train_prepared,
    index=X_train.index,
    columns=preprocessing.get_feature_names_out(),
)

In [20]:
# Wrap the transformed validation array in a DataFrame labelled with the transformer's
# output feature names and the validation index.
X_val_prepared_df = pd.DataFrame(
    X_val_prepared,
    index=X_val.index,
    columns=preprocessing.get_feature_names_out(),
)

In [21]:
# Wrap the transformed test array in a DataFrame labelled with the transformer's
# output feature names and the test index.
X_test_prepared_df = pd.DataFrame(
    X_test_prepared,
    index=X_test.index,
    columns=preprocessing.get_feature_names_out(),
)

In [22]:
# Print each split's share of the full dataset as a fraction rounded to five decimals.
total_rows = len(inside_airbnb_df)
print(f'Training size: {round(len(X_train_prepared_df)/total_rows, 5):>10}')
print(f'Validation size: {round(len(X_val_prepared_df)/total_rows, 5):>8}')
print(f'Testing size: {round(len(X_test_prepared_df)/total_rows, 5):>11}')

Training size:    0.59946
Validation size:  0.20027
Testing size:     0.20027


## Predictive modeling