In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
train_set = pd.read_csv('./Datasets/train_cleaned_110523.csv')

# Create a categorical variable for stratification by dividing the 'monthly_rent' into quartiles
train_set['rent_quartile'] = pd.qcut(train_set['monthly_rent'], q=4, labels=False)

# Define the features and target variable for modeling
X = train_set.drop(['monthly_rent', 'rent_quartile'],
                   axis=1)  # Drop 'rent_quartile' here to avoid including it in the features
y = train_set['monthly_rent']

# Split the data into training and testing sets with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.80,
    test_size=0.20,
    stratify=train_set['rent_quartile']
)

# Check if the stratification is as expected
y_train.head(), y_test.head()

(26117    1800
 13197    1700
 15160    3150
 6057     2100
 53323    2800
 Name: monthly_rent, dtype: int64,
 5113     3150
 51038    2600
 37451    3900
 58678    2750
 40762    1600
 Name: monthly_rent, dtype: int64)

In [8]:
# Check the distribution of 'monthly_rent' in the full dataset
full_quartile_distribution = train_set['rent_quartile'].value_counts(normalize=True).sort_index()

# Check the distribution of 'monthly_rent' in the training set
train_quartile_distribution = y_train.to_frame().join(train_set['rent_quartile'], how='left')[
    'rent_quartile'].value_counts(normalize=True).sort_index()

# Check the distribution of 'monthly_rent' in the testing set
test_quartile_distribution = y_test.to_frame().join(train_set['rent_quartile'], how='left')[
    'rent_quartile'].value_counts(normalize=True).sort_index()

# Compile the distributions into a DataFrame for comparison
quartile_distributions_comparison = pd.DataFrame({
    'Full Data': full_quartile_distribution,
    'Training Set': train_quartile_distribution,
    'Testing Set': test_quartile_distribution
})

quartile_distributions_comparison

Unnamed: 0_level_0,Full Data,Training Set,Testing Set
rent_quartile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.317633,0.317625,0.317667
1,0.185767,0.185771,0.18575
2,0.246783,0.246792,0.24675
3,0.249817,0.249812,0.249833


In [3]:
from tpot import TPOTRegressor
# 0.75/0.25 split
# Instantiate and train a TPOT auto-ML regressor
tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2, random_state=42)
tpot.fit(X_train, y_train)

# Print the best pipeline found by TPOT
print(tpot.fitted_pipeline_)

# Evaluate the final model on the test data
print(tpot.score(X_test, y_test))

# Export the pipeline to a Python file for future use
tpot.export('best_pipeline_110523.py')

Optimization Progress:   0%|          | 0/300 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -241150.33359955723

Generation 2 - Current best internal CV score: -241150.33359955723

Generation 3 - Current best internal CV score: -240638.72273303958

Generation 4 - Current best internal CV score: -240628.86399177145

Generation 5 - Current best internal CV score: -240613.66858386635

Best pipeline: RandomForestRegressor(ZeroCount(input_matrix), bootstrap=True, max_features=0.4, min_samples_leaf=16, min_samples_split=16, n_estimators=100)
Pipeline(steps=[('zerocount', ZeroCount()),
                ('randomforestregressor',
                 RandomForestRegressor(max_features=0.4, min_samples_leaf=16,
                                       min_samples_split=16,
                                       random_state=42))])
-243593.78255556637


In [9]:
from tpot import TPOTRegressor
from numpy import sqrt

config_dict = {
    'xgboost.XGBRegressor': {
        'n_estimators': [400, 500, 600, 700, 1000],
        'max_depth': [4, 5, 6, 9],
        'learning_rate': [0.01, 0.015, 0.02, 0.025, 0.05, 0.1, 0.5, 1],
        'subsample': [0.8, 0.9, 1.0, 1.1, 1.2, 1.3,1.5],
        'min_child_weight': [4, 5, 6, 7, 8],
        'n_jobs': [1]  # TPOT is already parallelized, so set n_jobs to 1 for XGBoost
    }
}
# trying xgbbooster and 0.8/0.2 split
# Instantiate and train a TPOT auto-ML regressor
tpot = TPOTRegressor(generations=20,
                     population_size=30,
                     verbosity=2,
                     random_state=42,
                     scoring='neg_mean_squared_error',
                     config_dict=config_dict,  # Custom configuration
                     warm_start=True
                     )
tpot.fit(X_train, y_train)

# Print the best pipeline found by TPOT
print(tpot.fitted_pipeline_)

# Score on the test set using RMSE
neg_mse = tpot.score(X_test, y_test)
rmse = sqrt(-neg_mse)
print("Test RMSE: ", rmse)

# Export the pipeline to a Python file for future use
tpot.export('best_pipeline_110523_8020split.py')

Optimization Progress:   0%|          | 0/630 [00:00<?, ?pipeline/s]



TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.


RuntimeError: There was an error in the TPOT optimization process. This could be because the data was not formatted properly, or because data for a regression problem was provided to the TPOTClassifier object. Please make sure you passed the data to TPOT correctly. If you enabled PyTorch estimators, please check the data requirements in the online documentation: https://epistasislab.github.io/tpot/using/

In [12]:
import h2o
from h2o.automl import H2OAutoML

h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; OpenJDK 64-Bit Server VM Zulu21.30+15-CA (build 21.0.1+12-LTS, mixed mode, sharing)
  Starting server from C:\Users\User\.conda\envs\project-tpot-38\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\User\AppData\Local\Temp\tmpux33c3nw
  JVM stdout: C:\Users\User\AppData\Local\Temp\tmpux33c3nw\h2o_User_started_from_python.out
  JVM stderr: C:\Users\User\AppData\Local\Temp\tmpux33c3nw\h2o_User_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Asia/Singapore
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.1
H2O_cluster_version_age:,20 days
H2O_cluster_name:,H2O_from_python_User_71hfe8
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.968 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [20]:
data = h2o.import_file('./Datasets/train_cleaned_110523.csv')
train, test = data.split_frame(ratios=[0.85], seed = 1)
# drop target monthly_rent column and get all features
x = data.columns
y = 'monthly_rent'
x.remove(y)
AutoML = H2OAutoML(max_models=100, seed =10, max_runtime_secs=25000, project_name="Revenue_Forecast_for_Rental_Pricing") # 32000 9hrs # 42500 12hrs

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [21]:
AutoML.train(x=x, y=y, training_frame=train)

AutoML progress: |
16:48:30.406: AutoML: XGBoost is not available; skipping it.
███████████████████████████████████████████████████████████████| (done) 100%


key,value
Stacking strategy,cross_validation
Number of base models (used / total),24/100
# GBM base models (used / total),13/42
# DeepLearning base models (used / total),11/55
# DRF base models (used / total),0/2
# GLM base models (used / total),0/1
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5
Metalearner fold_column,

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
mae,367.21735,1.399003,365.81006,369.41855,367.65393,366.4356,366.76855
mean_residual_deviance,235217.84,1615.1431,236353.45,237208.77,234894.33,234574.08,233058.61
mse,235217.84,1615.1431,236353.45,237208.77,234894.33,234574.08,233058.61
null_deviance,5218217000.0,36084884.0,5272670000.0,5183008000.0,5207015000.0,5234761700.0,5193631000.0
r2,0.5407048,0.0048989,0.5418768,0.5354736,0.5356708,0.5457871,0.5447155
residual_deviance,2396456700.0,23987350.0,2415532300.0,2407431700.0,2417297700.0,2377642800.0,2364379600.0
rmse,484.99033,1.6653086,486.16196,487.04083,484.659,484.3285,482.76144
rmsle,0.1884683,0.0018919,0.1899653,0.1893958,0.1890322,0.1851806,0.1887674


In [34]:
# View the AutoML Leaderboard
automl_leaderboard = AutoML.leaderboard
automl_leaderboard.head(rows=automl_leaderboard.nrows)

model_id,rmse,mse,mae,rmsle,mean_residual_deviance
StackedEnsemble_AllModels_1_AutoML_1_20231105_164830,484.994,235219,367.217,0.18848,235219
StackedEnsemble_BestOfFamily_1_AutoML_1_20231105_164830,486.584,236764,368.531,0.189123,236764
GBM_grid_1_AutoML_1_20231105_164830_model_20,486.971,237141,369.279,0.189196,237141
GBM_grid_1_AutoML_1_20231105_164830_model_33,487.025,237193,369.442,0.189136,237193
GBM_grid_1_AutoML_1_20231105_164830_model_32,487.35,237510,368.991,0.18936,237510
GBM_grid_1_AutoML_1_20231105_164830_model_3,487.542,237697,369.775,0.189339,237697
GBM_5_AutoML_1_20231105_164830,487.768,237917,369.37,0.189443,237917
GBM_2_AutoML_1_20231105_164830,487.979,238123,369.164,0.189529,238123
GBM_grid_1_AutoML_1_20231105_164830_model_12,487.993,238137,369.068,0.189447,238137
GBM_grid_1_AutoML_1_20231105_164830_model_37,488.315,238451,370.68,0.189701,238451


In [35]:
best_model = AutoML.get_best_model()
print(best_model)

Model Details
H2OStackedEnsembleEstimator : Stacked Ensemble
Model Key: StackedEnsemble_AllModels_1_AutoML_1_20231105_164830


Model Summary for Stacked Ensemble: 
key                                        value
-----------------------------------------  ----------------
Stacking strategy                          cross_validation
Number of base models (used / total)       24/100
# GBM base models (used / total)           13/42
# DeepLearning base models (used / total)  11/55
# DRF base models (used / total)           0/2
# GLM base models (used / total)           0/1
Metalearner algorithm                      GLM
Metalearner fold assignment scheme         Random
Metalearner nfolds                         5
Metalearner fold_column
Custom metalearner hyperparameters         None

ModelMetricsRegressionGLM: stackedensemble
** Reported on train data. **

MSE: 205595.47205154574
RMSE: 453.42636894157994
MAE: 341.82591003370726
RMSLE: 0.17696717348938112
Mean Residual Deviance: 205595.47205

In [29]:
test_data = h2o.import_file('./Datasets/test_cleaned_110523.csv')
output = AutoML.predict(test_data)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


In [32]:
output.as_data_frame()

Unnamed: 0,predict
0,3288.541880
1,2712.107961
2,3663.413549
3,1946.566404
4,2581.616435
...,...
29995,2765.614707
29996,2843.007218
29997,2680.420784
29998,3271.409759


In [33]:
output.as_data_frame().to_csv('./Datasets/results/test_cleaned_110523_automl.csv')