In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_excel("houses_for_rent_madrid.xlsx")

In [3]:
data.dtypes

Unnamed: 0,0
Id,int64
District,object
Address,object
Number,object
Area,object
Rent,int64
Bedrooms,float64
Sq.Mt,int64
Floor,float64
Outer,float64


In [4]:
# Data preparation: Remove Number, Address, Id, drop rows with missing values,
# convert, get one-hot (dummy) encoding for the categoricals.
# Split to 80%-20% train-test.
from sklearn.model_selection import train_test_split
data_prepared = data.drop(columns=["Number", "Address", "Id"])
data_prepared = data_prepared.dropna(axis=0)
data_prepared["District"] = data["District"].astype("category")
data_prepared["Area"] = data["Area"].astype("category")
data_prepared = pd.get_dummies(data_prepared)
data_train, data_test = train_test_split(data_prepared, train_size = 0.8)

In [5]:
# How many features do we have now?
data_train.shape

(1452, 170)

In [6]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

## Your job
Train a regressor for "Rent" using decision trees.  Start with no restriction on the tree growth, and then try to fiddle with the "max_depth" parameter
(to control the depth of the tree) and the "min_samples_split", which prevents splitting nodes that have too little training data in them.  Use a real number, for example, 0.05 means that the algorithm won't split nodes with less than 0.05*n training samples, where n is the size of the training set.

Either using a loop or manually, find the best choice of min_samples_split and max_depth on the test set (which is here used as a validation set only).

In [7]:
# Documentation for DecisionTreeRegressor:
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor
clf = DecisionTreeRegressor()
X_train = data_train.drop(columns=["Rent"])
y_train = data_train["Rent"]
X_test = data_test.drop(columns=["Rent"])
y_test = data_test["Rent"]
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
r2 = r2_score(y_test, y_pred)
print("Initial R2 score:", r2)

# Experiment with different hyperparameters
best_r2 = r2
best_depth = None
best_min_samples = None
for depth in range(1, 20):
  for min_samples in np.linspace(0.01, 0.5, 10):
    clf = DecisionTreeRegressor(max_depth=depth, min_samples_split=min_samples)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    current_r2 = r2_score(y_test, y_pred)

    if current_r2 > best_r2:
      best_r2 = current_r2
      best_depth = depth
      best_min_samples = min_samples
      print(f"New best R2 score: {best_r2:.4f}, max_depth: {best_depth}, min_samples_split: {best_min_samples:.4f}")

print(f"\nBest hyperparameters: max_depth = {best_depth}, min_samples_split = {best_min_samples:.4f}")
print(f"Best R2 score: {best_r2:.4f}")

Initial R2 score: 0.6673161351958294
New best R2 score: 0.6795, max_depth: 5, min_samples_split: 0.0100
New best R2 score: 0.6970, max_depth: 7, min_samples_split: 0.0100
New best R2 score: 0.7224, max_depth: 10, min_samples_split: 0.0100

Best hyperparameters: max_depth = 10, min_samples_split = 0.0100
Best R2 score: 0.7224
