# 1. Data collection

In [72]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import pickle as pk

In [38]:
data = pd.read_csv("./data/rent_apartments.csv")

data

Unnamed: 0,address,area,construction_year,rooms,bedrooms,bathrooms,balcony,storage,parking,furnished,garage,garden,energy,facilities,zip,neighborhood,rent
0,1071 HN Amsterdam (Cornelis Schuytbuurt),167.0,1870,3,2,2,yes,no,no,yes,no,Not present,D,Roof terrace,1071 HN,Cornelis Schuytbuurt,4500
1,1071 HK Amsterdam (Concertgebouwbuurt),150.0,1890,3,2,2,yes,no,yes,yes,no,Not present,A,"Cable TV, Internet connection, Fireplace, Bath...",1071 HK,Concertgebouwbuurt,3450
2,1071 HK Amsterdam (Concertgebouwbuurt),150.0,1890,3,2,2,yes,no,yes,yes,no,Not present,A,"Cable TV, Internet connection, Fireplace, Bath...",1071 HK,Concertgebouwbuurt,3450
3,1071 WV Amsterdam (Hondecoeterbuurt),90.0,1923,3,2,1,yes,no,no,yes,no,Not present,,"Shower, Toilet",1071 WV,Hondecoeterbuurt,2000
4,1071 WV Amsterdam (Hondecoeterbuurt),104.0,1923,3,2,1,no,no,no,no,no,Present (47 m²),D,"Shower, Bath, Toilet",1071 WV,Hondecoeterbuurt,3250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1723,1033 DL Amsterdam (Terrasdorp),75.0,1990,3,2,1,no,no,no,yes,no,Not present,C,,1033 DL,Terrasdorp,1450
1724,1033 DZ Amsterdam (Terrasdorp),75.0,1990,3,2,1,yes,no,no,yes,no,Not present,C,Shower,1033 DZ,Terrasdorp,1500
1725,1021 NX Amsterdam (IJplein e.o.),74.0,1986,2,1,1,no,no,no,yes,no,Not present,,,1021 NX,IJplein e.o.,1400
1726,1021 EC Amsterdam (Vogelbuurt Zuid),118.0,1920,5,4,1,yes,yes,yes,yes,no,Not present,G,"Storage space, Shower, Toilet",1021 EC,Vogelbuurt Zuid,2650


# 2. Data preparation

In [39]:
data.dtypes

address               object
area                 float64
construction_year      int64
rooms                  int64
bedrooms               int64
bathrooms              int64
balcony               object
storage               object
parking               object
furnished             object
garage                object
garden                object
energy                object
facilities            object
zip                   object
neighborhood          object
rent                   int64
dtype: object

In [40]:
#Encoding some of the categorical variables
data_encoded = pd.get_dummies(data, columns = ['balcony', 'parking', 'furnished', 'garage', 'storage'], drop_first=True)

data_encoded

Unnamed: 0,address,area,construction_year,rooms,bedrooms,bathrooms,garden,energy,facilities,zip,neighborhood,rent,balcony_yes,parking_yes,furnished_yes,garage_yes,storage_yes
0,1071 HN Amsterdam (Cornelis Schuytbuurt),167.0,1870,3,2,2,Not present,D,Roof terrace,1071 HN,Cornelis Schuytbuurt,4500,True,False,True,False,False
1,1071 HK Amsterdam (Concertgebouwbuurt),150.0,1890,3,2,2,Not present,A,"Cable TV, Internet connection, Fireplace, Bath...",1071 HK,Concertgebouwbuurt,3450,True,True,True,False,False
2,1071 HK Amsterdam (Concertgebouwbuurt),150.0,1890,3,2,2,Not present,A,"Cable TV, Internet connection, Fireplace, Bath...",1071 HK,Concertgebouwbuurt,3450,True,True,True,False,False
3,1071 WV Amsterdam (Hondecoeterbuurt),90.0,1923,3,2,1,Not present,,"Shower, Toilet",1071 WV,Hondecoeterbuurt,2000,True,False,True,False,False
4,1071 WV Amsterdam (Hondecoeterbuurt),104.0,1923,3,2,1,Present (47 m²),D,"Shower, Bath, Toilet",1071 WV,Hondecoeterbuurt,3250,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1723,1033 DL Amsterdam (Terrasdorp),75.0,1990,3,2,1,Not present,C,,1033 DL,Terrasdorp,1450,False,False,True,False,False
1724,1033 DZ Amsterdam (Terrasdorp),75.0,1990,3,2,1,Not present,C,Shower,1033 DZ,Terrasdorp,1500,True,False,True,False,False
1725,1021 NX Amsterdam (IJplein e.o.),74.0,1986,2,1,1,Not present,,,1021 NX,IJplein e.o.,1400,False,False,True,False,False
1726,1021 EC Amsterdam (Vogelbuurt Zuid),118.0,1920,5,4,1,Not present,G,"Storage space, Shower, Toilet",1021 EC,Vogelbuurt Zuid,2650,True,True,True,False,True


In [41]:
data_encoded.garden.unique()

array(['Not present', 'Present (47 m²)', 'Present (29 m²)',
       'Present (75 m²)', 'Present (40 m², located on the north)',
       'Present (50 m²)', 'Present (20 m², located on the south)',
       'Present (1 m²)', 'Present (15 m²)', 'Present (25 m²)',
       'Present (12 m²)', 'Present (45 m², located on the south)',
       'Present (26 m², located on the south-east)',
       'Present (20 m², located on the north-east)',
       'Present (42 m², located on the west)', 'Present (46 m²)',
       'Present (45 m², located on the south-west)',
       'Present (60 m², located on the south-west)',
       'Present (50 m², located on the south)',
       'Present (40 m², located on the north-east)', 'Present (16 m²)',
       'Present (60 m²)', 'Present (65 m², located on the south)',
       'Present (90 m²)', 'Present (85 m²)',
       'Present (85 m², located on the south-west)',
       'Present (500 m², located on the west)',
       'Present (45 m², located on the west)',
       'Present (1

We have to extract numeric value from the garden columns, this can be done by using regex

In [42]:
data_encoded.garden[4]

'Present (47 m²)'

In [43]:
int(re.findall(r'\d+', data_encoded.garden[4])[0])

47

In [44]:
for i in range(len(data_encoded)):
    if data_encoded.garden[i] == "Not present":
        data_encoded.garden[i] = 0
    else:
        data_encoded.garden[i] = int(re.findall(r'\d+', data_encoded.garden[i])[0])

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  data_encoded.garden[i] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_encoded.garden[i] = 0
You are s

In [45]:
data_encoded.garden.unique()

array([0, 47, 29, 75, 40, 50, 20, 1, 15, 25, 12, 45, 26, 42, 46, 60, 16,
       65, 90, 85, 500, 30, 49, 51, 80, 27, 56, 9, 200, 32, 100, 34],
      dtype=object)

# 3. Model Building

## 3.1. Defining X and y

In [49]:
X = data_encoded[['area', 'construction_year', 'bedrooms', 'garden', 'balcony_yes', 'parking_yes', 'furnished_yes', 'garage_yes', 'storage_yes']]

In [50]:
y = data_encoded.rent

In [51]:
X

Unnamed: 0,area,construction_year,bedrooms,garden,balcony_yes,parking_yes,furnished_yes,garage_yes,storage_yes
0,167.0,1870,2,0,True,False,True,False,False
1,150.0,1890,2,0,True,True,True,False,False
2,150.0,1890,2,0,True,True,True,False,False
3,90.0,1923,2,0,True,False,True,False,False
4,104.0,1923,2,47,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
1723,75.0,1990,2,0,False,False,True,False,False
1724,75.0,1990,2,0,True,False,True,False,False
1725,74.0,1986,1,0,False,False,True,False,False
1726,118.0,1920,4,0,True,True,True,False,True


In [52]:
y

0       4500
1       3450
2       3450
3       2000
4       3250
        ... 
1723    1450
1724    1500
1725    1400
1726    2650
1727    2600
Name: rent, Length: 1728, dtype: int64

## 3.2. Split the dataset

In [57]:
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.2)

## 3.3. Model Building

In [59]:
rf = RandomForestRegressor()

In [60]:
rf.fit(X_train, y_train)

In [61]:
rf.score(X_test, y_test)

0.6239163882801557

## 3.4. Predicting

In [62]:
X

Unnamed: 0,area,construction_year,bedrooms,garden,balcony_yes,parking_yes,furnished_yes,garage_yes,storage_yes
0,167.0,1870,2,0,True,False,True,False,False
1,150.0,1890,2,0,True,True,True,False,False
2,150.0,1890,2,0,True,True,True,False,False
3,90.0,1923,2,0,True,False,True,False,False
4,104.0,1923,2,47,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
1723,75.0,1990,2,0,False,False,True,False,False
1724,75.0,1990,2,0,True,False,True,False,False
1725,74.0,1986,1,0,False,False,True,False,False
1726,118.0,1920,4,0,True,True,True,False,True


In [63]:
rf.predict([[85, 2015, 2, 20, 1, 1, 0, 0,1]])



array([2325.51166667])

## 3.5. Tuning Hyperparameters

In [68]:
grid_space = {'n_estimators': [100, 200, 300], 'max_depth': [3, 6, 9, 12]}

In [69]:
grid = GridSearchCV(RandomForestRegressor(), param_grid=grid_space, cv=5, scoring='r2')

In [71]:
model_grid = grid.fit(X_train, y_train)

print(f"Best hyperparameters are {model_grid.best_params_}, score = {model_grid.best_score_}")

Best hyperparameters are {'max_depth': 9, 'n_estimators': 300}, score = 0.6933351220385212


# 4. Model Management

In [74]:
pk.dump(rf, open('models/rf_v1', 'wb'))

In [77]:
rf_v1 = pk.load(open('models/rf_v1', 'rb'))