# Price prediction of short-term flat rentals in London

* [Data preparation](#Data-preparation)
* [Predictive modeling](#Predictive-modeling)

In [1]:
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Lift pandas' row-display limit so that frames are rendered in full.
pd.set_option('display.max_rows', None)

In [3]:
# Every location below is resolved relative to the current user's home directory.
home_dir = Path.home()

# Directory the input data is read from.
inside_airbnb_data_dir = home_dir.joinpath('Programming/data/inside-airbnb/london')

# Project working tree; plots live in plots/, histograms in plots/histograms/.
inside_airbnb_work_dir = home_dir.joinpath(
    'Programming/Python/machine-learning-exercises/short-term-rents-in-london'
)
plots_dir = inside_airbnb_work_dir.joinpath('plots')
hist_dir = plots_dir.joinpath('histograms')

# Create the histogram directory together with any missing parents;
# this is a no-op when the directory already exists.
hist_dir.mkdir(exist_ok=True, parents=True)

In [4]:
# Read the listings table from CSV.
#   thousands=','         -> numbers written like "1,200" are parsed as numeric values
#   keep_default_na=False -> pandas' default NaN markers (empty string, "NA", ...) are
#                            NOT converted to NaN
inside_airbnb_data_file = inside_airbnb_data_dir.joinpath('selected_short_term_rentals_with_distances.csv')
inside_airbnb_df = pd.read_csv(
    inside_airbnb_data_file,
    thousands=',',
    keep_default_na=False,
)

## Data preparation

In [5]:
# Remove the room_type and nearest_station columns from the frame (in place).
inside_airbnb_df.drop(columns=['room_type', 'nearest_station'], inplace=True)

# Replace every whitespace character in the borough names with an underscore.
inside_airbnb_df['borough'] = inside_airbnb_df['borough'].replace(
    to_replace=r'\s',
    value=r'_',
    regex=True,
)

In [6]:
# Activate the Matplotlib style sheet stored in the project working directory.
mplstyle_file = inside_airbnb_work_dir.joinpath('barplot-style.mplstyle')
plt.style.use(mplstyle_file)

## Predictive modeling

In [7]:
# Keep every listing except those whose borough is 'Sutton'.
# NOTE(review): the reason for excluding Sutton is not stated in the notebook — document it.
is_not_sutton = inside_airbnb_df['borough'].ne('Sutton')
inside_airbnb_df = inside_airbnb_df.loc[is_not_sutton]

In [8]:
# Split the comma-separated 'amenities' string into one column per amenity,
# then discard the original combined column.
amenity_columns = ['amenity_1', 'amenity_2', 'amenity_3']
amenity_parts = inside_airbnb_df['amenities'].str.split(',', expand=True)
inside_airbnb_df[amenity_columns] = amenity_parts
inside_airbnb_df = inside_airbnb_df.drop(columns='amenities')

In [9]:
# Preview the first five rows after the amenity columns were added.
inside_airbnb_df.head(n=5)

Unnamed: 0,borough,latitude,longitude,property_type,bathrooms,bedrooms,price,minimum_nights,crime_rate,distance_to_station,amenity_1,amenity_2,amenity_3
0,Lambeth,51.491476,-0.111514,Entire rental unit,1.0,1.0,150.0,30,137.98,0.520193,,,
1,Kensington_and_Chelsea,51.48566,-0.18415,Entire rental unit,2.0,2.0,195.0,91,118.02,0.956708,Nightlife,Restaurant,
2,Brent,51.53899,-0.19744,Entire rental unit,1.5,2.0,83.0,30,117.59,0.481797,Grocery Store,,
3,Kensington_and_Chelsea,51.51732,-0.2005,Entire rental unit,1.0,2.0,288.0,91,118.02,0.398479,Restaurant,,
4,Westminster,51.49695,-0.13888,Entire rental unit,3.0,4.0,901.0,186,132.94,0.200253,Restaurant,Retail,


In [10]:
# Columns treated as categorical features.
cat_attribs = [
    'borough',
    'property_type',
    'amenity_1',
    'amenity_2',
    'amenity_3',
]

# Columns treated as numeric features.
num_attribs = [
    'latitude',
    'longitude',
    'bathrooms',
    'bedrooms',
    'minimum_nights',
    'crime_rate',
    'distance_to_station',
]

In [11]:
# Numeric branch: fill missing values with the column median, then standardise.
num_pipeline = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())

In [12]:
# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode into a dense array. Categories not seen during fit are ignored
# at transform time instead of raising.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
)

In [13]:
# Route each group of columns through its own pipeline. Columns listed in neither
# group are not passed to any transformer.
preprocessing = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_attribs),
        ('cat', cat_pipeline, cat_attribs),
    ],
)

In [14]:
# Fit the preprocessing on the whole table and transform it in one pass.
# NOTE(review): this runs before the train/validation/test split, so the fitted
# imputation and scaling statistics include validation and test rows — consider
# fitting on the training subset only.
inside_airbnb_prepared_arr = preprocessing.fit_transform(X=inside_airbnb_df)

In [15]:
# Wrap the transformed array in a DataFrame that keeps the generated feature names
# and the row index of the source frame.
inside_airbnb_prepared_df = pd.DataFrame(
    data=inside_airbnb_prepared_arr,
    columns=preprocessing.get_feature_names_out(),
    index=inside_airbnb_df.index,
)

# `preprocessing` only selects num_attribs and cat_attribs, and ColumnTransformer
# drops every other column by default, so the target is missing from the transformed
# output. Re-attach the untransformed 'price' here; the cells that build X_* / y_*
# look it up by that name. Rows are in the same order as inside_airbnb_df, so a
# positional assignment is safe.
inside_airbnb_prepared_df['price'] = inside_airbnb_df['price'].to_numpy()

In [None]:
# Stratified 60/20/20 split into train / validation / test.
#
# The prepared frame has no 'borough' column any more (it was one-hot encoded into
# several indicator columns), so the stratification labels are taken from the
# original frame and matched to the rows being split via the shared index.
borough_labels = inside_airbnb_df['borough']

# Hold out 20% of all rows as the test set.
df_full_train, df_test = train_test_split(
    inside_airbnb_prepared_df,
    test_size=0.2,
    random_state=33,
    stratify=borough_labels.loc[inside_airbnb_prepared_df.index],
)

# 25% of the remaining 80% -> 20% of all rows for validation.
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=33,
    stratify=borough_labels.loc[df_full_train.index],
)

In [None]:
# Show the (rows, columns) shape of each subset: train, validation, test.
tuple(subset.shape for subset in (df_train, df_val, df_test))

In [None]:
# Give each subset a fresh 0..n-1 index, discarding the original row labels.
df_train, df_val, df_test = (
    subset.reset_index(drop=True) for subset in (df_train, df_val, df_test)
)

In [None]:
# Feature matrices: every column of each subset except the target 'price'.
X_train, X_val, X_test = (
    subset.drop(columns='price') for subset in (df_train, df_val, df_test)
)

In [None]:
# Target vectors: independent copies of the 'price' column of each subset.
y_train, y_val, y_test = (
    subset['price'].copy() for subset in (df_train, df_val, df_test)
)

In [None]:
# Inspect the last five rows of the training feature matrix.
X_train.tail(n=5)