# Machine Learning Analysis

In [2]:
import numpy as np
import pandas as pd
import time

from autogluon.tabular import TabularPredictor

from src.config import (
    CLEAN_DATA,
    MODELS_FOLDER,
    KAGGLE_SUBMISSION,
    )

In [3]:
# Load the cleaned dataset, which holds labeled and unlabeled rows together.
df = pd.read_csv(CLEAN_DATA)
print(df.shape)

# Rows with a SalePrice form the training set; rows without one are the test set.
has_price = df['SalePrice'].notna()
df_test = df.loc[~has_price]

# Keep the test-set identifiers for the submission file before dropping the column.
# NOTE(review): `id` shadows the Python builtin; the name is kept because the
# submission cell reads it.
id = df_test['Id']

# 'Id' is removed from both splits so it is not passed on as a feature column.
df_train = df.loc[has_price].drop(columns='Id')
df_test = df_test.drop(columns='Id')

(2919, 82)


In [4]:
# Target column and evaluation metric
label = 'SalePrice'
eval_metric = 'rmse'

# AutoGluon preset controlling the quality/time trade-off.
# Alternatives: 'medium_quality', 'good_quality', 'high_quality', 'best_quality'
presets = 'good_quality'

# Training budget in seconds (0.5 hours)
time_limit = 60 * 60 * 0.5

# Preprocessing: log-transform the target variable.
# The values are re-derived from the untouched `df` (matched by index) rather than
# from `df_train` itself, so re-running this cell never applies log1p twice.
df_train[label] = np.log1p(df.loc[df_train.index, label])

# Start the timer
start_time = time.time()

# Initialize the TabularPredictor and fit it on the training data.
# Models are written to MODELS_FOLDER; keep_only_best / save_space are passed
# to limit what is retained once fitting finishes.
predictor = TabularPredictor(
    label=label,
    eval_metric=eval_metric,
    path=MODELS_FOLDER,
    problem_type='regression'
).fit(
    train_data=df_train,
    time_limit=time_limit,
    presets=presets,
    keep_only_best=True,
    save_space=True
)

# End the timer
end_time = time.time()

# Print the time taken
elapsed_time = end_time - start_time
print(f"Time taken to run the fit method: {elapsed_time:.2f} seconds")

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.10.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Wed Dec 11 22:24:04 UTC 2024
CPU Count:          8
Memory Avail:       3.81 GB / 7.56 GB (50.4%)
Disk Space Avail:   1557.68 GB / 1832.70 GB (85.0%)
Presets specified: ['good_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Note: `save_bag_folds=False`! This will greatly reduce peak disk usage during fit (by ~8x), but runs the risk of an out-of-memory error during model refit if memory is small relative to the data size.
	You can avoid this risk by setting `save_bag_folds=True`.
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or 

[36m(_ray_fit pid=25558)[0m [1000]	valid_set's rmse: 0.126927


[36m(_dystack pid=25198)[0m 	-0.1239	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=25198)[0m 	3.91s	 = Training   runtime
[36m(_dystack pid=25198)[0m 	0.22s	 = Validation runtime
[36m(_dystack pid=25198)[0m Fitting model: LightGBM_BAG_L1 ... Training model for up to 279.75s of the 423.92s of remaining time.
[36m(_dystack pid=25198)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=1, gpus=0, memory=0.75%)


[36m(_ray_fit pid=25886)[0m [1000]	valid_set's rmse: 0.133901[32m [repeated 3x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)[0m


[36m(_dystack pid=25198)[0m 	-0.1283	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=25198)[0m 	3.76s	 = Training   runtime
[36m(_dystack pid=25198)[0m 	0.17s	 = Validation runtime
[36m(_dystack pid=25198)[0m Fitting model: RandomForestMSE_BAG_L1 ... Training model for up to 273.84s of the 418.01s of remaining time.
[36m(_dystack pid=25198)[0m 	-0.1405	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=25198)[0m 	2.84s	 = Training   runtime
[36m(_dystack pid=25198)[0m 	0.09s	 = Validation runtime
[36m(_dystack pid=25198)[0m Fitting model: CatBoost_BAG_L1 ... Training model for up to 270.88s of the 415.05s of remaining time.
[36m(_dystack pid=25198)[0m 	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=1, gpus=0, memory=1.33%)
[36m(_dystack pid=25198)[0m 	-0.1245	 = Validation score   (-root_mean_squared_error)
[36m(_dystack pid=25198)[0m 	87.45s	 = Training   runtime
[36m(

Time taken to run the fit method: 903.47 seconds


In [None]:
# To reload a previously trained predictor instead of refitting, use:
# predictor = TabularPredictor.load(MODELS_FOLDER)

In [19]:
# Display the name of the model the fitted predictor reports as its best
predictor.model_best

'WeightedEnsemble_L3_FULL'

In [7]:
# Make predictions on the test set. The predictor was fitted on log1p(SalePrice),
# so the raw predictions are on the log scale.
predictions = predictor.predict(df_test)

# Inverse transform the predictions back to the price scale
predictions = np.expm1(predictions)

# Ensure predictions are non-negative: a negative log-scale prediction would
# otherwise map to a negative price after expm1.
predictions = np.maximum(0, predictions)

# Create a submission file (Kaggle format)
submission = pd.DataFrame({'Id': id, 'SalePrice': predictions})

# Sanity checks before writing: if `id` and `predictions` did not line up on the
# same index, the frame would gain extra rows and/or contain NaN values.
assert len(submission) == len(df_test), "submission row count does not match the test set"
assert submission.notna().all().all(), "submission contains missing values"

submission.to_csv(KAGGLE_SUBMISSION, index=False)

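**Note:** AutoGluon recommends computing feature importance on held-out data that was not used for training. That is not feasible here because the test rows have no `SalePrice` labels, so the importance below is computed on the training data and should be read with that caveat in mind.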

In [9]:
# Permutation feature importance, computed on the training data and capped by
# the same time budget that was used for fitting.
feature_importance = predictor.feature_importance(
    data=df_train,
    time_limit=time_limit,
)

Computing feature importance via permutation shuffling for 80 features using 1460 rows with 10 shuffle sets... Time limit: 1800.0s...
	1525.17s	= Expected runtime (152.52s per shuffle set)
	665.9s	= Actual runtime (Completed 10 of 10 shuffle sets)


In [10]:
# Temporarily lift the pandas row-display limit so the whole table is printed.
show_all_rows = pd.option_context('display.max_rows', None)
with show_all_rows:
    print(feature_importance)

                            importance    stddev       p_value   n  p99_high  \
GrLivArea                     0.066878  0.002153  2.979412e-15  10  0.069091   
OverallQual                   0.054311  0.001678  2.059869e-15  10  0.056036   
Median_n_Closest_SalePrice    0.027059  0.000941  5.964460e-15  10  0.028026   
TotalBsmtSF                   0.021760  0.000952  4.702880e-14  10  0.022739   
1stFlrSF                      0.021413  0.000895  3.128684e-14  10  0.022334   
OverallCond                   0.020579  0.000888  4.167715e-14  10  0.021492   
BsmtFinSF1                    0.015972  0.000493  2.044923e-15  10  0.016478   
Neighborhood                  0.012654  0.000394  2.220548e-15  10  0.013060   
2ndFlrSF                      0.011216  0.000632  4.551984e-13  10  0.011866   
LotArea                       0.011131  0.000704  1.285180e-12  10  0.011855   
YearBuilt                     0.010849  0.000386  7.329332e-15  10  0.011245   
YearRemodAdd                  0.009395  