# Predict Apartment Prices with Random Forest Regression

GOAL: Beat the best linear model trained previously
- MAE: ~18600
- RMSE: ~23400

In [2]:
# Data Loading
import pickle

# Data Manipulation
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# RandomForest Regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import math

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

In [8]:
# Load the pickled DataFrame of apartment listings and display its (rows, columns) shape.
# NOTE(review): pickle.load can execute arbitrary code while unpickling —
# only load files that come from a trusted source.
with open('data/cluj_prices_lm.pkl','rb') as f:
    df = pickle.load(f)

df.shape

(161, 11)

In [9]:
# Preview the first rows of the loaded frame; the text columns are still strings here.
df.head()

Unnamed: 0,price_euro,rooms,size,bathrooms,terrace,pass_through,finished,height_pos,neighbourhood,year_built,zone
0,133425,2,53.0,1,1,False,not finished,0.17,Intre Lacuri,2024,other
1,168675,3,67.0,2,2,False,not finished,0.17,Intre Lacuri,2024,other
2,177825,3,69.0,2,2,False,not finished,0.17,Intre Lacuri,2024,other
3,168675,3,67.0,2,2,False,not finished,0.17,Gheorgheni,2024,other
4,133725,2,53.0,1,0,True,not finished,0.17,Intre Lacuri,2024,other


In [12]:
# Inspect column dtypes and non-null counts before transforming anything.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 161 entries, 0 to 314
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price_euro     161 non-null    int64  
 1   rooms          161 non-null    int64  
 2   size           161 non-null    float64
 3   bathrooms      161 non-null    int64  
 4   terrace        161 non-null    int64  
 5   pass_through   161 non-null    bool   
 6   finished       161 non-null    object 
 7   height_pos     161 non-null    float64
 8   neighbourhood  161 non-null    object 
 9   year_built     161 non-null    int32  
 10  zone           161 non-null    object 
dtypes: bool(1), float64(2), int32(1), int64(4), object(3)
memory usage: 13.4+ KB


In [13]:
# Normalise the text columns "neighbourhood", "finished" and "zone" to lower case
# so that values differing only in capitalisation become identical.
for text_col in ('neighbourhood', 'finished', 'zone'):
    df[text_col] = df[text_col].str.lower()

In [16]:
# Label-encode each text column into integer codes.
# A separate LabelEncoder is fitted and stored per column: re-fitting a single
# shared encoder overwrites its `classes_`, so only the mapping of the last
# column would survive and the other columns' codes could not be mapped back
# to their original labels (e.g. via `inverse_transform`).
label_encoders = {}
for col in ('neighbourhood', 'finished', 'zone'):
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
# After the loop `le` is bound to the encoder fitted on "zone".

In [17]:
# Preview again: finished, neighbourhood and zone now hold integer codes.
df.head()

Unnamed: 0,price_euro,rooms,size,bathrooms,terrace,pass_through,finished,height_pos,neighbourhood,year_built,zone
0,133425,2,53.0,1,1,False,3,0.17,8,2024,1
1,168675,3,67.0,2,2,False,3,0.17,8,2024,1
2,177825,3,69.0,2,2,False,3,0.17,8,2024,1
3,168675,3,67.0,2,2,False,3,0.17,5,2024,1
4,133725,2,53.0,1,0,True,3,0.17,8,2024,1


In [18]:
# The label-encoded columns are plain integer columns at this stage.
df.dtypes

price_euro         int64
rooms              int64
size             float64
bathrooms          int64
terrace            int64
pass_through        bool
finished           int64
height_pos       float64
neighbourhood      int64
year_built         int32
zone               int64
dtype: object

In [19]:
# Mark the integer-coded columns as pandas "category" dtype so they are not
# mistaken for ordinary numeric quantities.
for encoded_col in ('neighbourhood', 'finished', 'zone'):
    df[encoded_col] = df[encoded_col].astype('category')

In [21]:
# Verify that finished, neighbourhood and zone are now of "category" dtype.
df.dtypes

price_euro          int64
rooms               int64
size              float64
bathrooms           int64
terrace             int64
pass_through         bool
finished         category
height_pos        float64
neighbourhood    category
year_built          int32
zone             category
dtype: object