In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import mean_squared_error

In [None]:
!pip3 install -U pip
# !pip3 install -U setuptools wheel
!pip3 install autogluon

In [None]:
# Mount Google Drive at /content/drive (Colab only); the CSV read later and the
# model save path both live under drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the dataset from the mounted Drive (the path is relative to the working directory).
data=pd.read_csv('drive/MyDrive/immo_data_clean.csv')
# All preprocessing happens on a copy, so the frame as loaded stays available in `data`.
handy_data=data.copy()

In [None]:
def get_rid_outl(df, list_num_var, percentile):
    """
    Drop the rows that lie above a per-column percentile cut-off.

    INPUT:
    df: DataFrame to filter (the passed frame itself is not modified)
    list_num_var: list of numeric column names whose upper outliers are removed
    percentile: float between 0 and 100; for each listed column, rows whose
        value exceeds this percentile (computed ignoring NaN) are discarded

    OUTPUT:
    DataFrame containing only the rows that are at or below the cut-off in
    every listed column. Rows holding NaN in a listed column are kept.
    """
    # Every cut-off is computed on the unfiltered frame first, so trimming one
    # column cannot shift the threshold that is applied to the next column.
    thresholds = {name: np.nanpercentile(df[name], percentile) for name in list_num_var}

    for name, limit in thresholds.items():
        within_limit = df[name] <= limit
        is_missing = df[name].isnull()
        # NaN never satisfies "<=", so the two masks are mutually exclusive and
        # the XOR simply selects "within the limit OR missing".
        df = df.loc[within_limit ^ is_missing]

    return df

In [None]:
# Remove the regionLevel3 column from the working frame.
handy_data = handy_data.drop(columns=['regionLevel3'])

In [None]:
# Turn yearConstructed into a numeric column in which unknown years are NaN.
# Both the placeholder string "NO_INFORMATION" and the value 0 are treated as unknown.
# Masking is used instead of casting to int and then assigning None: writing a
# missing value into an integer column relies on an implicit upcast, and the
# int cast itself fails if the cell is re-run once the column already holds NaN.
year_raw = handy_data['yearConstructed']
year_numeric = pd.to_numeric(year_raw.where(year_raw != 'NO_INFORMATION')).astype(float)
handy_data['yearConstructed'] = year_numeric.mask(year_numeric == 0)

In [None]:
# Fill the missing construction years with the column median
# (Series.median skips NaN by default).
year_median = handy_data['yearConstructed'].median()
handy_data['yearConstructed'] = handy_data['yearConstructed'].fillna(year_median)

In [None]:
# Disabled copy of the "NO_INFORMATION"/0 -> missing conversion of yearConstructed.
# It is commented out and never executed; kept for reference only.
# handy_data.loc[handy_data.yearConstructed == "NO_INFORMATION", "yearConstructed"] = '0'
# handy_data['yearConstructed']=handy_data['yearConstructed'].astype(int)
# handy_data.loc[handy_data.yearConstructed == 0, "yearConstructed"] = None

In [None]:
# remove the other rent/cost columns before modelling
# (presumably because they overlap with the target -- TODO confirm)
handy_data = handy_data.drop(columns=['totalRent',
                                      'serviceCharge',
                                      'heatingCosts'])

# rename the response variable to rent
handy_data = handy_data.rename(columns={'rent_incl_hc': 'rent'})

# remove all rows where living space is 0 (they would divide by zero below);
# .copy() makes the filtered frame independent, so adding a column to it no
# longer triggers pandas' SettingWithCopyWarning
handy_data = handy_data[handy_data['livingSpace'] != 0].copy()

# create a new variable for the rent/livingSpace (rent/m2)
handy_data['rent_m2'] = handy_data['rent'] / handy_data['livingSpace']

# get rid of outliers: keep the rows whose rent_m2 is at or below the 99.8th
# percentile (rows with a missing rent_m2 are kept as well)
handy_data = get_rid_outl(handy_data, ['rent_m2'], 99.8)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [None]:
# Label missing typeOfFlat / interiorQuality values explicitly as 'NO_INFORMATION'.
# The columns are rebuilt with assign() rather than calling replace(..., inplace=True)
# on a Series pulled out of the frame: that in-place form is chained assignment
# and is not guaranteed to write back to handy_data.
handy_data = handy_data.assign(
    typeOfFlat=handy_data['typeOfFlat'].fillna('NO_INFORMATION'),
    interiorQuality=handy_data['interiorQuality'].fillna('NO_INFORMATION'),
)

In [None]:
# first transform all categorical (object-dtype) values into dummies
list_cat_vars = handy_data.select_dtypes(include = ['object']).columns

# A single get_dummies call replaces each listed column by its dummy columns
# (prefixed with "<column>_", first level dropped) and appends them after the
# untouched columns, without rebuilding the whole frame once per variable.
handy_data = pd.get_dummies(handy_data, columns=list(list_cat_vars), prefix_sep='_', drop_first=True)

handy_data.describe()

Unnamed: 0.1,Unnamed: 0,yearConstructed,noParkSpaces,livingSpace,postcode,noRooms,rent,rent_m2,regionLevel1_Bayern,regionLevel1_Berlin,...,lastRefurbish_2012,lastRefurbish_2013,lastRefurbish_2014,lastRefurbish_2015,lastRefurbish_2016,lastRefurbish_2017,lastRefurbish_2018,lastRefurbish_2019,lastRefurbish_2020,lastRefurbish_NO_INFORMATION
count,216043.0,216043.0,216043.0,216043.0,216043.0,216043.0,216043.0,216043.0,216043.0,216043.0,...,216043.0,216043.0,216043.0,216043.0,216043.0,216043.0,216043.0,216043.0,216043.0,216043.0
mean,133560.294895,1966.829951,0.353383,70.739011,36494.551043,2.567847,774.304525,11.049989,0.079674,0.038631,...,0.009105,0.009628,0.013812,0.018182,0.022778,0.028499,0.051161,0.077887,0.014692,0.682582
std,77625.115921,42.516913,0.546004,26.509316,27943.225308,0.899165,424.253415,4.570212,0.270788,0.192715,...,0.094983,0.097648,0.116711,0.133608,0.149195,0.166394,0.220327,0.267995,0.120315,0.465473
min,0.0,1000.0,0.0,1.0,852.0,1.0,1.0,0.009455,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,66290.0,1957.0,0.0,53.63,9119.0,2.0,474.0,8.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,133074.0,1972.0,0.0,66.22,37603.0,2.5,645.0,9.796944,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,200680.0,1991.0,1.0,84.835,55543.0,3.0,955.005,12.686179,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,268848.0,2020.0,4.0,180.0,99998.0,5.0,2980.0,44.444444,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# Keep every column except 'Unnamed: 0' (presumably a row index saved with the CSV -- confirm).
kept_features = [feat for feat in handy_data.columns.to_list() if feat != 'Unnamed: 0']
handy_data_new = handy_data[kept_features]

In [None]:
from autogluon.tabular import TabularDataset, TabularPredictor

# 'rent' is the label; 'rent_m2' is excluded from the features as well.
feature_columns = [col for col in handy_data_new.columns if col not in ('rent', 'rent_m2')]
X = handy_data_new[feature_columns]
y = handy_data_new['rent']

# The predictor takes one table that contains the label column, so features and
# label are joined again before the 80/20 split (fixed seed for repeatability).
train, test = train_test_split(
    pd.concat([X, y], axis=1),
    test_size=0.2,
    random_state=42,
)
train_data = TabularDataset(train)
test_data = TabularDataset(test)

# Fit with the default settings (no time limit, default save path), then
# predict the held-out rows.
predictor = TabularPredictor(label='rent')
predictor = predictor.fit(train_data=train_data)
predictions = predictor.predict(test_data)


No path specified. Models will be saved in: "AutogluonModels/ag-20220424_125519/"
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220424_125519/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    172834
Train Data Columns: 585
Label Column: rent
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (2980.0, 1.0, 774.73374, 424.29511)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generator

[1000]	valid_set's rmse: 130.459
[2000]	valid_set's rmse: 126.853
[3000]	valid_set's rmse: 125.707
[4000]	valid_set's rmse: 124.633
[5000]	valid_set's rmse: 124.186
[6000]	valid_set's rmse: 123.86
[7000]	valid_set's rmse: 123.552
[8000]	valid_set's rmse: 123.466
[9000]	valid_set's rmse: 123.43
[10000]	valid_set's rmse: 123.565


	-123.3673	 = Validation score   (root_mean_squared_error)
	63.25s	 = Training   runtime
	0.56s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's rmse: 130.82
[2000]	valid_set's rmse: 127.045
[3000]	valid_set's rmse: 125.31
[4000]	valid_set's rmse: 124.225
[5000]	valid_set's rmse: 123.662
[6000]	valid_set's rmse: 123.308
[7000]	valid_set's rmse: 122.712
[8000]	valid_set's rmse: 122.332
[9000]	valid_set's rmse: 122.154
[10000]	valid_set's rmse: 122.257


	-122.1138	 = Validation score   (root_mean_squared_error)
	59.96s	 = Training   runtime
	0.58s	 = Validation runtime
Fitting model: RandomForestMSE ...
	-135.4726	 = Validation score   (root_mean_squared_error)
	1062.6s	 = Training   runtime
	0.31s	 = Validation runtime
Fitting model: CatBoost ...
	-122.6609	 = Validation score   (root_mean_squared_error)
	209.23s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-139.1403	 = Validation score   (root_mean_squared_error)
	1292.18s	 = Training   runtime
	0.31s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-166.7265	 = Validation score   (root_mean_squared_error)
	315.96s	 = Training   runtime
	0.42s	 = Validation runtime
Fitting model: XGBoost ...
	-122.0132	 = Validation score   (root_mean_squared_error)
	90.7s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	-128.4376	 = Validation score   (root_mean_squared_error)
	278.67s	 = Training   runtime
	0.04s	

[1000]	valid_set's rmse: 126.082
[2000]	valid_set's rmse: 124.238
[3000]	valid_set's rmse: 123.357
[4000]	valid_set's rmse: 122.702
[5000]	valid_set's rmse: 122.206
[6000]	valid_set's rmse: 122.163
[7000]	valid_set's rmse: 122.161


	-122.094	 = Validation score   (root_mean_squared_error)
	69.76s	 = Training   runtime
	0.62s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	-114.9388	 = Validation score   (root_mean_squared_error)
	0.46s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 3524.99s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20220424_125519/")


In [None]:
# Display the predicted rent for the rows of the test split.
predictions

4884       444.293488
104038     306.753876
120243     847.210449
187574     593.799866
151814     773.240845
             ...     
178738     400.591339
3959       427.984039
178586     565.712891
57720      432.221649
125319    1805.605347
Name: rent, Length: 43209, dtype: float32

In [None]:
# Score every fitted model on the test split and list it next to its validation score.
predictor.leaderboard(test_data) 


                  model   score_test   score_val  pred_time_test  pred_time_val     fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0         LightGBMLarge  -122.141506 -122.093997       10.636951       0.622695    69.757763                10.636951                0.622695          69.757763            1       True         11
1              LightGBM  -123.672004 -122.113771       10.483294       0.582208    59.963564                10.483294                0.582208          59.963564            1       True          4
2            LightGBMXT  -124.378071 -123.367349        9.989046       0.561195    63.245750                 9.989046                0.561195          63.245750            1       True          3
3              CatBoost  -124.904606 -122.660932        0.412551       0.035692   209.227426                 0.412551                0.035692         209.227426            1       True          6
4               XGBo

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBMLarge,-122.141506,-122.093997,10.636951,0.622695,69.757763,10.636951,0.622695,69.757763,1,True,11
1,LightGBM,-123.672004,-122.113771,10.483294,0.582208,59.963564,10.483294,0.582208,59.963564,1,True,4
2,LightGBMXT,-124.378071,-123.367349,9.989046,0.561195,63.24575,9.989046,0.561195,63.24575,1,True,3
3,CatBoost,-124.904606,-122.660932,0.412551,0.035692,209.227426,0.412551,0.035692,209.227426,1,True,6
4,XGBoost,-125.26603,-122.013174,7.148272,0.197046,90.69863,7.148272,0.197046,90.69863,1,True,9
5,NeuralNetTorch,-133.909082,-128.437638,0.688438,0.041048,278.666774,0.688438,0.041048,278.666774,1,True,10
6,RandomForestMSE,-137.324433,-135.472569,6.178284,0.307883,1062.599535,6.178284,0.307883,1062.599535,1,True,5
7,ExtraTreesMSE,-138.630671,-139.140273,18.102433,0.308351,1292.181758,18.102433,0.308351,1292.181758,1,True,7
8,KNeighborsDist,-148.392818,-148.787251,2.223497,0.105962,0.208704,2.223497,0.105962,0.208704,1,True,2
9,KNeighborsUnif,-152.177927,-152.218864,0.391479,0.104403,0.203919,0.391479,0.104403,0.203919,1,True,1


In [None]:
from autogluon.tabular import TabularDataset, TabularPredictor

# 'rent' is the label; 'rent_m2' is excluded from the features as well.
feature_columns = [col for col in handy_data_new.columns if col not in ('rent', 'rent_m2')]
X = handy_data_new[feature_columns]
y = handy_data_new['rent']

# Features and label are joined into one table before the 80/20 split
# (fixed seed for repeatability).
train, test = train_test_split(
    pd.concat([X, y], axis=1),
    test_size=0.2,
    random_state=42,
)
train_data = TabularDataset(train)
test_data = TabularDataset(test)

# This run optimises r2, requests one GPU for model fitting and stores the
# fitted models under save_path on the mounted Drive.
save_path='drive/MyDrive/final_outs_agModels-predictClass'
predictor = TabularPredictor(label='rent', eval_metric='r2', path=save_path)
predictor = predictor.fit(train_data=train_data, ag_args_fit={'num_gpus': 1})
predictions = predictor.predict(test_data)

# auxiliary_metrics=True reports additional metrics next to r2, e.g.
# root_mean_squared_error, mean_squared_error, mean_absolute_error,
# median_absolute_error. In the recorded output the error metrics are
# printed with a negative sign.
perf = predictor.evaluate_predictions(
    y_true=test_data['rent'],
    y_pred=predictions,
    auxiliary_metrics=True,
)
perf

	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "drive/MyDrive/final_outs_agModels-predictClass/"
AutoGluon Version:  0.4.0
Python Version:     3.7.13
Operating System:   Linux
Train Data Rows:    172834
Train Data Columns: 585
Label Column: rent
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (2980.0, 1.0, 774.73374, 424.29511)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...

[1000]	valid_set's l2: 16953.8	valid_set's r2: 0.899929
[2000]	valid_set's l2: 16163	valid_set's r2: 0.9046
[3000]	valid_set's l2: 15852.1	valid_set's r2: 0.906437
[4000]	valid_set's l2: 15696.5	valid_set's r2: 0.907339
[5000]	valid_set's l2: 15611.7	valid_set's r2: 0.907841
[6000]	valid_set's l2: 15485.3	valid_set's r2: 0.908579
[7000]	valid_set's l2: 15400.1	valid_set's r2: 0.909096
[8000]	valid_set's l2: 15384.3	valid_set's r2: 0.909197
[9000]	valid_set's l2: 15423.9	valid_set's r2: 0.908969
[10000]	valid_set's l2: 15460.5	valid_set's r2: 0.908747


	0.9093	 = Validation score   (r2)
	70.06s	 = Training   runtime
	0.57s	 = Validation runtime
Fitting model: LightGBM ...
	Training LightGBM with GPU, note that this may negatively impact model quality compared to CPU training.


[1000]	valid_set's l2: 17113.9	valid_set's r2: 0.898977
[2000]	valid_set's l2: 16140.4	valid_set's r2: 0.904745
[3000]	valid_set's l2: 15702.6	valid_set's r2: 0.907324
[4000]	valid_set's l2: 15431.8	valid_set's r2: 0.908914
[5000]	valid_set's l2: 15292.4	valid_set's r2: 0.909747
[6000]	valid_set's l2: 15204.9	valid_set's r2: 0.910248
[7000]	valid_set's l2: 15058.2	valid_set's r2: 0.911112
[8000]	valid_set's l2: 14965.1	valid_set's r2: 0.911676
[9000]	valid_set's l2: 14921.6	valid_set's r2: 0.911924
[10000]	valid_set's l2: 14946.7	valid_set's r2: 0.911773


	0.912	 = Validation score   (r2)
	65.64s	 = Training   runtime
	0.63s	 = Validation runtime
Fitting model: RandomForestMSE ...
	0.8917	 = Validation score   (r2)
	1170.41s	 = Training   runtime
	0.31s	 = Validation runtime
Fitting model: CatBoost ...
	Training CatBoost with GPU, note that this may negatively impact model quality compared to CPU training.
Metric R2 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
	0.9002	 = Validation score   (r2)
	121.52s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	0.8857	 = Validation score   (r2)
	1362.84s	 = Training   runtime
	0.31s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	0.813	 = Validation score   (r2)
	305.75s	 = Training   runtime
	0.4s	 = Validation runtime
Fitting model: XGBoost ...
	0.9128	 = Validation score   (r2)
	107.72s	 = Training   runtime
	0.2s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	0.9017	 = Vali

[1000]	valid_set's l2: 15896.5	valid_set's r2: 0.906164
[2000]	valid_set's l2: 15435.1	valid_set's r2: 0.908894
[3000]	valid_set's l2: 15217	valid_set's r2: 0.910175
[4000]	valid_set's l2: 15055.8	valid_set's r2: 0.91113
[5000]	valid_set's l2: 14934.2	valid_set's r2: 0.91183
[6000]	valid_set's l2: 14923.7	valid_set's r2: 0.911914
[7000]	valid_set's l2: 14923.3	valid_set's r2: 0.911917
[8000]	valid_set's l2: 14945.3	valid_set's r2: 0.911794


	0.912	 = Validation score   (r2)
	91.4s	 = Training   runtime
	0.7s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.922	 = Validation score   (r2)
	0.52s	 = Training   runtime
	0.01s	 = Validation runtime
AutoGluon training complete, total runtime = 3680.51s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("drive/MyDrive/final_outs_agModels-predictClass/")
Evaluation: r2 on test data: 0.8940605097610239
Evaluations on test data:
{
    "r2": 0.8940605097610239,
    "root_mean_squared_error": -138.03176363054516,
    "mean_squared_error": -19052.767770959996,
    "mean_absolute_error": -76.73884599441959,
    "pearsonr": 0.9456626646453093,
    "median_absolute_error": -47.65771484375
}


{'mean_absolute_error': -76.73884599441959,
 'mean_squared_error': -19052.767770959996,
 'median_absolute_error': -47.65771484375,
 'pearsonr': 0.9456626646453093,
 'r2': 0.8940605097610239,
 'root_mean_squared_error': -138.03176363054516}

In [None]:
# Score every fitted model on the test split and list it next to its validation score.
# NOTE(review): in the recorded run WeightedEnsemble_L2 has the best validation r2
# (0.922) but a lower test r2 (0.894) than LightGBMLarge (0.917) -- worth investigating.
predictor.leaderboard(test_data) 

                  model  score_test  score_val  pred_time_test  pred_time_val     fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0         LightGBMLarge    0.917282   0.911997       12.817099       0.703684    91.398283                12.817099                0.703684          91.398283            1       True         11
1              LightGBM    0.914956   0.911982       10.995388       0.633142    65.643605                10.995388                0.633142          65.643605            1       True          4
2            LightGBMXT    0.914012   0.909258       10.526795       0.569413    70.058958                10.526795                0.569413          70.058958            1       True          3
3               XGBoost    0.913696   0.912799       13.275803       0.195845   107.715262                13.275803                0.195845         107.715262            1       True          9
4              CatBoost    0.9

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBMLarge,0.917282,0.911997,12.817099,0.703684,91.398283,12.817099,0.703684,91.398283,1,True,11
1,LightGBM,0.914956,0.911982,10.995388,0.633142,65.643605,10.995388,0.633142,65.643605,1,True,4
2,LightGBMXT,0.914012,0.909258,10.526795,0.569413,70.058958,10.526795,0.569413,70.058958,1,True,3
3,XGBoost,0.913696,0.912799,13.275803,0.195845,107.715262,13.275803,0.195845,107.715262,1,True,9
4,CatBoost,0.903054,0.90019,0.230289,0.031923,121.524704,0.230289,0.031923,121.524704,1,True,6
5,NeuralNetTorch,0.897914,0.901665,0.610455,0.041447,269.086673,0.610455,0.041447,269.086673,1,True,10
6,RandomForestMSE,0.895143,0.891671,18.467679,0.308758,1170.412494,18.467679,0.308758,1170.412494,1,True,5
7,WeightedEnsemble_L2,0.894061,0.921994,64.549582,2.401174,2010.732422,0.042377,0.008233,0.516144,2,True,12
8,ExtraTreesMSE,0.893139,0.885726,18.585378,0.308856,1362.84148,18.585378,0.308856,1362.84148,1,True,7
9,KNeighborsDist,0.877559,0.869331,4.047233,0.105168,0.205802,4.047233,0.105168,0.205802,1,True,2


## For inference, just load the weights (final_outs_agModels-predictClass_with_text drive) and uncomment the cell below

In [None]:
# Disabled inference shortcut: load a previously saved predictor and score it on test_data.
# NOTE(review): the training cell saves to 'drive/MyDrive/final_outs_agModels-predictClass';
# confirm that the path below points at the intended model folder before enabling.
# NOTE(review): the AutoGluon log suggests loading via TabularPredictor.load(<path>).
# predictor.load('final_outs_agModels-predictClass_with_text drive').leaderboard(test_data)