In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_excel("houses_for_rent_madrid.xlsx")

In [3]:
data.dtypes

Id                int64
District         object
Address          object
Number           object
Area             object
Rent              int64
Bedrooms        float64
Sq.Mt             int64
Floor           float64
Outer           float64
Elevator        float64
Penthouse         int64
Cottage           int64
Duplex            int64
Semidetached      int64
dtype: object

In [4]:
# Data preparation: Remove Number, Address, Id, drop rows with missing values,
# convert, get one-hot (dummy) encoding for the categoricals.
# Split to 80%-20% train-test.
from sklearn.model_selection import train_test_split
data_prepared = data.drop(columns=["Number", "Address", "Id"])
data_prepared = data_prepared.dropna(axis=0)
data_prepared["District"] = data["District"].astype("category")
data_prepared["Area"] = data["Area"].astype("category")
data_prepared = pd.get_dummies(data_prepared)
data_train, data_test = train_test_split(data_prepared, train_size = 0.8)

In [5]:
# How many features do we have now?
data_train.shape

(1452, 170)

In [6]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

## Your job
Train a regressor for "Rent" using decision trees.  Start with no restriction on the tree growth, and then try to fiddle with the "max_depth" parameter
(to control the depth of the tree) and the "min_samples_split", which prevents splitting nodes that have too little training data in them.  Use a real number, for example, 0.05 means that the algorithm won't split nodes with less than 0.05*n training samples, where n is the size of the training set.

Either using a loop or manually, find the best choice of min_samples_split and max_depth on the test set (which is here used as a validation set only).

In [10]:
# Documentation for DecisionTreeRegressor:
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor

clf = DecisionTreeRegressor(max_depth=5, min_samples_split=0.05)
clf.fit(data_train.drop(columns=["Rent"]), data_train["Rent"])
predictions = clf.predict(data_test.drop(columns=["Rent"]))
r2_score(data_test["Rent"], predictions)

0.6615475655472474

In [11]:

best_r2 = 0
best_max_depth = 0
best_min_samples_split = 0

for max_depth in range(1, 20):
    for min_samples_split in np.linspace(0.01, 0.5, 50):
        clf = DecisionTreeRegressor(max_depth=max_depth, min_samples_split=min_samples_split)
        clf.fit(data_train.drop(columns=["Rent"]), data_train["Rent"])
        predictions = clf.predict(data_test.drop(columns=["Rent"]))
        r2 = r2_score(data_test["Rent"], predictions)
        if r2 > best_r2:
            best_r2 = r2
            best_max_depth = max_depth
            best_min_samples_split = min_samples_split
            print("Best r2 so far: ", best_r2, " with max_depth: ", best_max_depth, " and min_samples_split: ", best_min_samples_split)

Best r2 so far:  0.45521438356165234  with max_depth:  1  and min_samples_split:  0.01
Best r2 so far:  0.5928605596613451  with max_depth:  2  and min_samples_split:  0.01
Best r2 so far:  0.5972988738165249  with max_depth:  2  and min_samples_split:  0.12
Best r2 so far:  0.648559698180802  with max_depth:  3  and min_samples_split:  0.01
Best r2 so far:  0.6514954220316264  with max_depth:  4  and min_samples_split:  0.01
Best r2 so far:  0.6521962164803873  with max_depth:  4  and min_samples_split:  0.02
Best r2 so far:  0.6615591659395506  with max_depth:  4  and min_samples_split:  0.03
Best r2 so far:  0.6622599603883115  with max_depth:  4  and min_samples_split:  0.04
Best r2 so far:  0.6625230691584973  with max_depth:  5  and min_samples_split:  0.03
Best r2 so far:  0.6627612566657555  with max_depth:  5  and min_samples_split:  0.04
Best r2 so far:  0.666361892325138  with max_depth:  5  and min_samples_split:  0.08
Best r2 so far:  0.6679921813634806  with max_depth:  7