In [1]:
import numpy as np
import pandas as pd
import time

from autogluon.tabular import TabularPredictor

from src.config import (
    CLEAN_DATA,
    MODELS_FOLDER,
    KAGGLE_SUBMISSION
    )

In [2]:
# Load the cleaned dataset from the configured path and report its dimensions.
df = pd.read_csv(CLEAN_DATA)
print(df.shape)

# Rows with a SalePrice become the training set; rows without one become the test set.
has_price = df['SalePrice'].notna()
df_train = df.loc[has_price]
df_test = df.loc[~has_price]

# Keep the test-set identifiers before the column is dropped; the submission
# cell further down pairs them with the predictions.
id = df_test['Id']

df_train = df_train.drop(columns='Id')
df_test = df_test.drop(columns='Id')

(2919, 79)


In [None]:
# Identify the target variable
label = 'SalePrice'

# Preprocessing: log-transform the target variable.
# The training frame is rebuilt here from the untouched `df` (labelled rows,
# 'Id' dropped) instead of being updated in place, so re-running this cell
# cannot apply log1p to an already-transformed target.
labelled_rows = df.loc[df[label].notna()].drop(columns='Id')
df_train = labelled_rows.assign(**{label: np.log1p(labelled_rows[label])})

# Metric reported by the predictor; with the log1p target above this is an
# RMSE measured on the log scale.
eval_metric = 'rmse'

# Preset passed to fit(); alternatives tried in this notebook:
# 'medium_quality', 'good_quality', 'best_quality'.
presets = 'good_quality'

# Training budget in seconds (half an hour).
time_limit = 60 * 60 * 0.5

# Start the timer
start_time = time.time()

# Build the predictor and fit it in one expression. problem_type is not
# passed, and models are written under MODELS_FOLDER.
predictor = TabularPredictor(
    label=label,
    eval_metric=eval_metric,
    path=MODELS_FOLDER,
).fit(
    train_data=df_train,
    time_limit=time_limit,
    presets=presets,
)

# End the timer
end_time = time.time()

# Print the time taken
elapsed_time = end_time - start_time
print(f"Time taken to run the fit method: {elapsed_time:.2f} seconds")

In [4]:
# Fetch the leaderboard from the fitted predictor and display its first five rows.
leaderboard = predictor.leaderboard()
leaderboard.head()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L3,-0.115298,root_mean_squared_error,3.212488,384.397873,0.000299,0.013353,3,False,20
1,WeightedEnsemble_L2,-0.11581,root_mean_squared_error,1.717484,305.657179,0.000488,0.016258,2,False,10
2,ExtraTreesMSE_BAG_L2,-0.118121,root_mean_squared_error,2.178495,315.973169,0.103841,0.906463,2,False,15
3,CatBoost_BAG_L2,-0.118625,root_mean_squared_error,2.251878,348.626597,0.177223,33.559891,2,False,14
4,RandomForestMSE_BAG_L2,-0.119544,root_mean_squared_error,2.17548,317.239848,0.100825,2.173142,2,False,13


In [7]:
# Predict on the test rows. The predictor was fitted on log1p(SalePrice), so
# these raw outputs are on that log scale.
log_price_pred = predictor.predict(df_test)

# Undo the log1p transform, then floor at zero so no predicted price is negative.
predictions = np.maximum(0, np.expm1(log_price_pred))

# Create a submission file (Kaggle format)
submission = pd.DataFrame({'Id': id, 'SalePrice': predictions})
submission.to_csv(KAGGLE_SUBMISSION, index=False)

In [24]:
# Permutation feature importance, reusing the time budget that was given to fit().
# NOTE(review): this is computed on the training frame itself; AutoGluon's
# documentation recommends held-out data for this call - confirm this is intended.
feature_importance = predictor.feature_importance(
    df_train,
    time_limit=time_limit,
)

Computing feature importance via permutation shuffling for 77 features using 1460 rows with 10 shuffle sets... Time limit: 1800.0s...
	593.1s	= Expected runtime (59.31s per shuffle set)
	358.93s	= Actual runtime (Completed 10 of 10 shuffle sets)


In [12]:
# Temporarily lift the row cap so the printed table lists every feature
# instead of a truncated view.
show_all_rows = pd.option_context('display.max_rows', None)
with show_all_rows:
    print(feature_importance)

                            importance    stddev       p_value   n  p99_high  \
GrLivArea                     0.075621  0.001745  1.487136e-16  10  0.077414   
OverallQual                   0.063569  0.001911  1.611185e-15  10  0.065534   
TotalBsmtSF                   0.024921  0.001288  2.093243e-13  10  0.026245   
OverallCond                   0.024188  0.001336  3.811178e-13  10  0.025562   
1stFlrSF                      0.021432  0.001060  1.407991e-13  10  0.022521   
YearBuilt                     0.017512  0.000617  6.746361e-15  10  0.018146   
Neighborhood                  0.016563  0.000578  6.173498e-15  10  0.017157   
BsmtFinSF1                    0.015348  0.001085  3.472932e-12  10  0.016463   
LotArea                       0.014749  0.000950  1.508447e-12  10  0.015725   
GarageCars                    0.011120  0.000452  2.402704e-14  10  0.011584   
2ndFlrSF                      0.010885  0.000630  5.762877e-13  10  0.011532   
SaleCondition                 0.008641  