# Using AutoGluon (from AWS) to predict rent prices (`price`) in Canada based on city, province, latitude, longitude, lease_term, type, beds, baths, sq_feet, furnishing, availability_date, smoking, and whether cats and dogs are allowed







In [None]:
!pip install autogluon

Collecting autogluon
  Downloading autogluon-1.1.1-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.core==1.1.1 (from autogluon.core[all]==1.1.1->autogluon)
  Downloading autogluon.core-1.1.1-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.features==1.1.1 (from autogluon)
  Downloading autogluon.features-1.1.1-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.tabular==1.1.1 (from autogluon.tabular[all]==1.1.1->autogluon)
  Downloading autogluon.tabular-1.1.1-py3-none-any.whl.metadata (13 kB)
Collecting autogluon.multimodal==1.1.1 (from autogluon)
  Downloading autogluon.multimodal-1.1.1-py3-none-any.whl.metadata (12 kB)
Collecting autogluon.timeseries==1.1.1 (from autogluon.timeseries[all]==1.1.1->autogluon)
  Downloading autogluon.timeseries-1.1.1-py3-none-any.whl.metadata (12 kB)
Collecting scikit-learn<1.4.1,>=1.3.0 (from autogluon.core==1.1.1->autogluon.core[all]==1.1.1->autogluon)
  Downloading scikit_learn-1.4.0-1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw rental listings; the CSV is expected in the working directory.
df = pd.read_csv(filepath_or_buffer="rentfaster.csv")

In [None]:
# Preview the first rows of the raw listings (identifier, address and link
# columns are still present at this point).
df.head()

Unnamed: 0,rentfaster_id,city,province,address,latitude,longitude,lease_term,type,price,beds,baths,sq_feet,link,furnishing,availability_date,smoking,cats,dogs
0,468622,Airdrie,Alberta,69 Gateway Dr NE,51.305962,-114.012515,Long Term,Townhouse,2495.0,2 Beds,2.5,1403,/ab/airdrie/rentals/townhouse/2-bedrooms/pet-f...,Unfurnished,Immediate,Non-Smoking,True,True
1,468622,Airdrie,Alberta,69 Gateway Dr NE,51.305962,-114.012515,Long Term,Townhouse,2695.0,3 Beds,2.5,1496,/ab/airdrie/rentals/townhouse/2-bedrooms/pet-f...,Unfurnished,Immediate,Non-Smoking,True,True
2,468622,Airdrie,Alberta,69 Gateway Dr NE,51.305962,-114.012515,Long Term,Townhouse,2295.0,2 Beds,2.5,1180,/ab/airdrie/rentals/townhouse/2-bedrooms/pet-f...,Unfurnished,Immediate,Non-Smoking,True,True
3,468622,Airdrie,Alberta,69 Gateway Dr NE,51.305962,-114.012515,Long Term,Townhouse,2095.0,2 Beds,2.5,1403,/ab/airdrie/rentals/townhouse/2-bedrooms/pet-f...,Unfurnished,November 18,Non-Smoking,True,True
4,468622,Airdrie,Alberta,69 Gateway Dr NE,51.305962,-114.012515,Long Term,Townhouse,2495.0,2 Beds,2.5,1403,/ab/airdrie/rentals/townhouse/2-bedrooms/pet-f...,Unfurnished,Immediate,Non-Smoking,True,True


In [None]:
# List the column names of the working frame.
# NOTE(review): the saved output already lacks rentfaster_id/address/link, so
# this cell appears to have been executed after the drop cell below — re-run
# the notebook top to bottom to confirm the outputs match the cell order.
df.columns

Index(['city', 'province', 'latitude', 'longitude', 'lease_term', 'type',
       'price', 'beds', 'baths', 'sq_feet', 'furnishing', 'availability_date',
       'smoking', 'cats', 'dogs'],
      dtype='object')

In [None]:
# Remove unnecessary columns (listing id, street address, listing URL).
# errors="ignore" keeps the cell idempotent: because it reassigns `df`, a second
# run would otherwise raise KeyError for columns that are already gone.
# `columns=` already selects the axis, so the redundant `axis=1` is dropped.
df = df.drop(columns=["address", "link", "rentfaster_id"], errors="ignore")

In [None]:
# Preview the frame again to confirm the address, link and rentfaster_id
# columns have been removed.
df.head()

Unnamed: 0,city,province,latitude,longitude,lease_term,type,price,beds,baths,sq_feet,furnishing,availability_date,smoking,cats,dogs
0,Airdrie,Alberta,51.305962,-114.012515,Long Term,Townhouse,2495.0,2 Beds,2.5,1403,Unfurnished,Immediate,Non-Smoking,True,True
1,Airdrie,Alberta,51.305962,-114.012515,Long Term,Townhouse,2695.0,3 Beds,2.5,1496,Unfurnished,Immediate,Non-Smoking,True,True
2,Airdrie,Alberta,51.305962,-114.012515,Long Term,Townhouse,2295.0,2 Beds,2.5,1180,Unfurnished,Immediate,Non-Smoking,True,True
3,Airdrie,Alberta,51.305962,-114.012515,Long Term,Townhouse,2095.0,2 Beds,2.5,1403,Unfurnished,November 18,Non-Smoking,True,True
4,Airdrie,Alberta,51.305962,-114.012515,Long Term,Townhouse,2495.0,2 Beds,2.5,1403,Unfurnished,Immediate,Non-Smoking,True,True


In [None]:
from autogluon.tabular import TabularDataset, TabularPredictor

In [None]:
# Name of the label column: the models below regress on the listing price.
target = "price"

In [None]:
# Wrap the full frame in an AutoGluon TabularDataset; at this point it still
# holds every row — the following cells cut it down to the training sample.
train_data = TabularDataset(df)

In [None]:
# Number of rows in the training split: 70% of the dataset, truncated to an int.
subsample_size = int(len(df) * 0.7)

In [None]:
# Draw the training rows from the full frame rather than from `train_data`
# itself, so re-running this cell always produces the same split instead of
# re-sampling a frame that has already been sampled. random_state fixes the
# draw; the original row index is kept so the hold-out rows can be found later.
train_data = TabularDataset(df).sample(n=subsample_size, random_state=0)
train_data.head()

Unnamed: 0,city,province,latitude,longitude,lease_term,type,price,beds,baths,sq_feet,furnishing,availability_date,smoking,cats,dogs
12512,Calgary,Alberta,51.032613,-114.06219,Long Term,Condo Unit,2150.0,1 Bed,1,763,Unfurnished,July 01,Non-Smoking,False,False
24216,Montréal,Quebec,45.505681,-73.563915,Long Term,Apartment,3390.0,2 Beds,1,1058,Unfurnished,Immediate,Non-Smoking,True,True
13161,Calgary,Alberta,50.859881,-114.07801,Long Term,Basement,1300.0,1 Bed,1,700,Unfurnished,July 01,Non-Smoking,True,True
2415,Calgary,Alberta,51.134793,-113.949708,Long Term,Condo Unit,2150.0,2 Beds,2,980,Unfurnished,Immediate,Non-Smoking,False,False
7519,Edmonton,Alberta,53.544671,-113.577309,12 months,House,1700.0,2 Beds,2,800,Unfurnished,July 01,Non-Smoking,True,True


In [None]:
# Baseline fit with AutoGluon's defaults (no preset, no time limit); the
# recorded run took roughly 967 s. Models are written to "agModels-predictprice".
predictor_price = (
    TabularPredictor(label=target, path="agModels-predictprice")
    .fit(train_data)
)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun 27 21:05:47 UTC 2024
CPU Count:          2
Memory Avail:       11.12 GB / 12.67 GB (87.7%)
Disk Space Avail:   64.18 GB / 107.72 GB (59.6%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluon will save models to "agModels-predictpri

[1000]	valid_set's rmse: 423.81
[2000]	valid_set's rmse: 415.834
[3000]	valid_set's rmse: 410.2
[4000]	valid_set's rmse: 407.48
[5000]	valid_set's rmse: 404.523
[6000]	valid_set's rmse: 402.728
[7000]	valid_set's rmse: 400.988
[8000]	valid_set's rmse: 399.787
[9000]	valid_set's rmse: 399.434
[10000]	valid_set's rmse: 398.293


	-398.293	 = Validation score   (-root_mean_squared_error)
	28.69s	 = Training   runtime
	6.34s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's rmse: 378.305
[2000]	valid_set's rmse: 365.213
[3000]	valid_set's rmse: 360.246
[4000]	valid_set's rmse: 357.389
[5000]	valid_set's rmse: 355.923
[6000]	valid_set's rmse: 354.684
[7000]	valid_set's rmse: 353.801
[8000]	valid_set's rmse: 353.769
[9000]	valid_set's rmse: 353.433
[10000]	valid_set's rmse: 352.957


	-352.8283	 = Validation score   (-root_mean_squared_error)
	26.76s	 = Training   runtime
	3.41s	 = Validation runtime
Fitting model: RandomForestMSE ...
	-375.0482	 = Validation score   (-root_mean_squared_error)
	25.15s	 = Training   runtime
	0.25s	 = Validation runtime
Fitting model: CatBoost ...
	-359.9289	 = Validation score   (-root_mean_squared_error)
	691.99s	 = Training   runtime
	0.21s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-367.7976	 = Validation score   (-root_mean_squared_error)
	11.08s	 = Training   runtime
	0.23s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-421.5468	 = Validation score   (-root_mean_squared_error)
	34.19s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: XGBoost ...
	-365.5962	 = Validation score   (-root_mean_squared_error)
	72.47s	 = Training   runtime
	0.96s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	-523.5231	 = Validation score   (-root_mean_squared_error)
	31.13s	 = Training   runtime
	0.

[1000]	valid_set's rmse: 341.631
[2000]	valid_set's rmse: 336.496
[3000]	valid_set's rmse: 336.635


	-336.3645	 = Validation score   (-root_mean_squared_error)
	18.61s	 = Training   runtime
	1.76s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	Ensemble Weights: {'LightGBMLarge': 0.429, 'RandomForestMSE': 0.143, 'CatBoost': 0.143, 'KNeighborsDist': 0.095, 'NeuralNetFastAI': 0.095, 'XGBoost': 0.095}
	-312.9316	 = Validation score   (-root_mean_squared_error)
	0.04s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 966.98s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 555.2 rows/s (1804 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("agModels-predictprice")


In [None]:
# Hold-out set: every row of df whose index label was not drawn into train_data.
# The label column stays in test_data and is also kept separately as y_test.
test_data = TabularDataset(df.drop(index=train_data.index))
y_test = test_data.loc[:, target]

In [None]:
# A predictor saved by an earlier session can be restored with
# TabularPredictor.load("agModels-predictprice") instead of refitting.

# Predict rent for the hold-out rows, then score the predictions against the
# true prices (auxiliary_metrics=True adds the extra regression metrics).
y_pred = predictor_price.predict(test_data)
print("Predictions:  \n", y_pred)
perf = predictor_price.evaluate_predictions(
    y_true=y_test,
    y_pred=y_pred,
    auxiliary_metrics=True,
)

Predictions:  
 10       2514.037109
11       2052.797852
13       2523.323730
19       2492.110840
24       1985.207031
            ...     
25748    1298.104858
25755    1064.368164
25762     930.289490
25763    1141.226807
25764     930.289490
Name: price, Length: 7732, dtype: float32


In [None]:
# Looking at the hold-out performance of the baseline predictor.
# Error metrics (RMSE, MSE, MAE, median AE) are shown with a negative sign, the
# same convention as the "-root_mean_squared_error" validation scores in the
# training log; read their magnitude as the error.
perf

{'root_mean_squared_error': -379.06860452004804,
 'mean_squared_error': -143693.0069327766,
 'mean_absolute_error': -169.37559000941798,
 'r2': 0.8486421928585806,
 'pearsonr': 0.9228787141770844,
 'median_absolute_error': -92.62921142578125}

In [None]:
# Retrain with the best_quality preset, a one-hour budget (3600 s) and MAE as
# the metric being optimised.
# NOTE(review): this reuses the same output path as the first predictor, so the
# earlier artefacts in "agModels-predictprice" may be overwritten — confirm that
# is intended before relying on the first model again.
predictor_price = (
    TabularPredictor(
        label=target,
        path="agModels-predictprice",
        eval_metric="mean_absolute_error",
    )
    .fit(train_data, presets="best_quality", time_limit=3600)
)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun 27 21:05:47 UTC 2024
CPU Count:          2
Memory Avail:       10.03 GB / 12.67 GB (79.1%)
Disk Space Avail:   63.39 GB / 107.72 GB (58.9%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfitting.
	Running DyStack for up to 900s of t

[36m(_ray_fit pid=13425)[0m [1000]	valid_set's l1: 250.924
[36m(_ray_fit pid=13425)[0m [3000]	valid_set's l1: 235.046[32m [repeated 4x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m
[36m(_ray_fit pid=13425)[0m [4000]	valid_set's l1: 230.145[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=13425)[0m [6000]	valid_set's l1: 222.024[32m [repeated 4x across cluster][0m
[36m(_ray_fit pid=13425)[0m [8000]	valid_set's l1: 216.982[32m [repeated 4x across cluster][0m
[36m(_ray_fit pid=13425)[0m [10000]	valid_set's l1: 212.979[32m [repeated 4x across cluster][0m
[36m(_ray_fit pid=13746)[0m [1000]	valid_set's l1: 263.006[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=13746)[0m [3000]	valid_set's l1: 245.195[32m [repeated 4x across cluster][0m
[36m(_ray_fit pid=13746)[0m [5000]	valid_set's l

[36m(_dystack pid=13261)[0m 	-226.9382	 = Validation score   (-mean_absolute_error)
[36m(_dystack pid=13261)[0m 	257.2s	 = Training   runtime
[36m(_dystack pid=13261)[0m 	107.51s	 = Validation runtime
[36m(_dystack pid=13261)[0m Fitting model: LightGBM_BAG_L1 ... Training model for up to 309.8s of the 606.2s of remaining time.
[36m(_dystack pid=13261)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (2 workers, per: cpus=1, gpus=0, memory=0.07%)


[36m(_ray_fit pid=14758)[0m [1000]	valid_set's l1: 215.471[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=14758)[0m [2000]	valid_set's l1: 199.903[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=14758)[0m [4000]	valid_set's l1: 189.511[32m [repeated 4x across cluster][0m
[36m(_ray_fit pid=14759)[0m [6000]	valid_set's l1: 197.689[32m [repeated 4x across cluster][0m
[36m(_ray_fit pid=14759)[0m [8000]	valid_set's l1: 195.636[32m [repeated 4x across cluster][0m
[36m(_ray_fit pid=14759)[0m [10000]	valid_set's l1: 193.95[32m [repeated 4x across cluster][0m
[36m(_ray_fit pid=15086)[0m [1000]	valid_set's l1: 228.107[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=15086)[0m [3000]	valid_set's l1: 209.37[32m [repeated 4x across cluster][0m
[36m(_ray_fit pid=15087)[0m [4000]	valid_set's l1: 207.314[32m [repeated 3x across cluster][0m
[36m(_ray_fit pid=15086)[0m [6000]	valid_set's l1: 201.205[32m [repeated 3x across cluster][0m
[36m(_ray_

[36m(_dystack pid=13261)[0m 	-193.1719	 = Validation score   (-mean_absolute_error)
[36m(_dystack pid=13261)[0m 	264.15s	 = Training   runtime
[36m(_dystack pid=13261)[0m 	100.44s	 = Validation runtime
[36m(_dystack pid=13261)[0m Fitting model: RandomForestMSE_BAG_L1 ... Training model for up to 26.32s of the 322.72s of remaining time.
[36m(_dystack pid=13261)[0m 	-186.8216	 = Validation score   (-mean_absolute_error)
[36m(_dystack pid=13261)[0m 	27.71s	 = Training   runtime
[36m(_dystack pid=13261)[0m 	0.91s	 = Validation runtime
[36m(_dystack pid=13261)[0m Fitting model: WeightedEnsemble_L2 ... Training model for up to 360.0s of the 292.16s of remaining time.
[36m(_dystack pid=13261)[0m 	Ensemble Weights: {'RandomForestMSE_BAG_L1': 0.529, 'LightGBM_BAG_L1': 0.412, 'KNeighborsDist_BAG_L1': 0.059}
[36m(_dystack pid=13261)[0m 	-173.3011	 = Validation score   (-mean_absolute_error)
[36m(_dystack pid=13261)[0m 	0.08s	 = Training   runtime
[36m(_dystack pid=13261)[0

[36m(_ray_fit pid=16241)[0m [1000]	valid_set's l1: 185.608[32m [repeated 3x across cluster][0m
[36m(_ray_fit pid=16241)[0m [3000]	valid_set's l1: 183.47[32m [repeated 4x across cluster][0m
[36m(_ray_fit pid=16241)[0m [4000]	valid_set's l1: 182.164[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=16241)[0m [6000]	valid_set's l1: 181.542[32m [repeated 4x across cluster][0m
[36m(_ray_fit pid=16241)[0m [7000]	valid_set's l1: 181.557[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=16241)[0m [9000]	valid_set's l1: 180.699[32m [repeated 4x across cluster][0m
[36m(_ray_fit pid=16584)[0m [1000]	valid_set's l1: 185.308[32m [repeated 4x across cluster][0m
[36m(_ray_fit pid=16584)[0m [2000]	valid_set's l1: 183.635[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=16584)[0m [4000]	valid_set's l1: 181.667[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=16584)[0m [5000]	valid_set's l1: 180.901
[36m(_ray_fit pid=16584)[0m [6000]	valid_set's 

[36m(_dystack pid=13261)[0m 	-179.6073	 = Validation score   (-mean_absolute_error)
[36m(_dystack pid=13261)[0m 	156.92s	 = Training   runtime
[36m(_dystack pid=13261)[0m 	41.49s	 = Validation runtime
[36m(_dystack pid=13261)[0m Fitting model: LightGBM_BAG_L2 ... Training model for up to 123.09s of the 123.06s of remaining time.
[36m(_dystack pid=13261)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (2 workers, per: cpus=1, gpus=0, memory=0.10%)
[36m(_dystack pid=13261)[0m 	-175.881	 = Validation score   (-mean_absolute_error)
[36m(_dystack pid=13261)[0m 	36.0s	 = Training   runtime
[36m(_dystack pid=13261)[0m 	0.69s	 = Validation runtime
[36m(_dystack pid=13261)[0m Fitting model: RandomForestMSE_BAG_L2 ... Training model for up to 83.55s of the 83.52s of remaining time.
[36m(_dystack pid=13261)[0m 	-161.1856	 = Validation score   (-mean_absolute_error)
[36m(_dystack pid=13261)[0m 	70.86s	 = Training   runtime
[36m(_dysta

In [None]:
# Rebuild the hold-out set (rows not sampled into train_data) and score the
# retrained predictor on it. A saved predictor could instead be restored with
# TabularPredictor.load("agModels-predictprice").
test_data = TabularDataset(df.drop(index=train_data.index))
y_test = test_data.loc[:, target]

y_pred = predictor_price.predict(test_data)
print("Predictions:  \n", y_pred)
perf = predictor_price.evaluate_predictions(
    y_true=y_test,
    y_pred=y_pred,
    auxiliary_metrics=True,
)

Predictions:  
 10       2625.232422
11       1909.255127
13       2749.968750
19       2288.382324
24       2052.271484
            ...     
25748    1304.355591
25755    1043.405640
25762     905.823364
25763    1110.603394
25764     905.823364
Name: price, Length: 7732, dtype: float32


In [None]:
# Take a look at the hold-out performance of the retrained (best_quality, MAE)
# model. As before, error metrics are displayed with a negative sign; compare
# magnitudes with the baseline results above.
perf

{'mean_absolute_error': -153.4067275796594,
 'root_mean_squared_error': -380.5164302788771,
 'mean_squared_error': -144792.75371217958,
 'r2': 0.8474837839387982,
 'pearsonr': 0.9212355570349564,
 'median_absolute_error': -72.9281005859375}

In [None]:
# Side-by-side view of the true price and the predicted price per hold-out row.
pd.DataFrame(dict(y_test=y_test, y_pred=y_pred))

Unnamed: 0,y_test,y_pred
10,1930.0,2625.232422
11,1700.0,1909.255127
13,3150.0,2749.968750
19,2300.0,2288.382324
24,1910.0,2052.271484
...,...,...
25748,1305.0,1304.355591
25755,1085.0,1043.405640
25762,945.0,905.823364
25763,1025.0,1110.603394


In [None]:
import shutil

from google.colab import files

# files.download() sends a single file to the browser, but the path below is a
# model *directory*, so it is zipped first and the archive is downloaded.
# The archive is written next to the predictor folder (not inside it) and
# contains the WeightedEnsemble_L3 folder itself.
# NOTE(review): presumably the rest of "agModels-predictprice" is also needed to
# reload the predictor with TabularPredictor.load — confirm before relying on
# this archive alone.
archive_path = shutil.make_archive(
    '/content/WeightedEnsemble_L3',
    'zip',
    root_dir='/content/agModels-predictprice/models',
    base_dir='WeightedEnsemble_L3',
)
files.download(archive_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>