In [1]:
import numpy as np
import pandas as pd
import time

from autogluon.tabular import TabularPredictor

from src.config import (
    PLUS_LON_LAT_DATA,
    TRAIN_DATA,
    TEST_DATA,
    CLEAN_DATA
    )

In [2]:
# Load the full table; rows with a SalePrice become the training set and
# rows without one become the test set to be predicted.
df = pd.read_csv('median_150_closest_sale_price.csv')
print(df.shape)

# Boolean mask: True where the target value is present.
has_price = df['SalePrice'].notna()
df_train = df.loc[has_price]
df_test = df.loc[~has_price]

# Keep the test-row Ids (used later when building the submission) before
# the Id column is removed from both frames.
id = df_test['Id']

df_train = df_train.drop(columns='Id')
df_test = df_test.drop(columns='Id')

(2919, 79)


In [3]:
# Display the column labels of the full loaded frame (training and test rows together).
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Alley',
       'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood',
       'Condition1', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'Exterior1st', 'Exterior2nd',
       'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', 

In [None]:
# Preprocessing: log-transform the target variable.
# The transformed values are recomputed from the raw SalePrice held in `df`
# (matched on df_train's index) instead of from df_train itself, so running
# this cell more than once does not apply log1p repeatedly.
df_train['SalePrice'] = np.log1p(df.loc[df_train.index, 'SalePrice'])  # Log transform target

# Identify the target variable
label = 'SalePrice'

# Because the target is log1p-transformed above, this RMSE is measured on log prices.
eval_metric = 'rmse'

presets = 'medium_quality'
#'medium_quality'
#'good_quality'
#'best_quality'

# Training time budget in seconds (half an hour).
time_limit = 60 * 60 * 0.5


# Start the timer
start_time = time.time()

# Initialize the TabularPredictor and fit it on the training frame
predictor = TabularPredictor(
    label=label, 
    eval_metric=eval_metric, 
    #problem_type='regression'
).fit(
    train_data=df_train, 
    time_limit=time_limit, 
    presets=presets,
    num_cpus=1 
)

# End the timer
end_time = time.time()

# Print the time taken
elapsed_time = end_time - start_time
print(f"Time taken to run the fit method: {elapsed_time:.2f} seconds")

In [5]:
# Display the first five rows of the fitted predictor's model leaderboard.
predictor.leaderboard().head(5)

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-0.127272,root_mean_squared_error,0.076836,17.836869,0.000368,0.011302,2,True,11
1,LightGBMXT,-0.133216,root_mean_squared_error,0.029184,3.985477,0.029184,3.985477,1,True,3
2,NeuralNetTorch,-0.134965,root_mean_squared_error,0.043051,13.828775,0.043051,13.828775,1,True,9
3,CatBoost,-0.140833,root_mean_squared_error,0.013876,26.283865,0.013876,26.283865,1,True,6
4,LightGBM,-0.142804,root_mean_squared_error,0.019547,0.802744,0.019547,0.802744,1,True,4


In [6]:
# Make predictions on the test set (the predictor was fitted on log1p(SalePrice))
predictions = predictor.predict(df_test)

# Inverse transform the predictions back to the original price scale
predictions = np.expm1(predictions)

# Ensure predictions are non-negative (expm1 can return values between -1 and 0)
predictions = np.maximum(0, predictions)

# Create a submission file (Kaggle format)
# NOTE(review): `id` and `predictions` are combined by index; presumably both
# carry df_test's index — confirm against predictor.predict's return value.
submission = pd.DataFrame({'Id': id, 'SalePrice': predictions})
submission.to_csv('pred_150.csv', index=False)
submission

Unnamed: 0,Id,SalePrice
0,2217,73308.812500
1,2905,90000.476562
3,2581,129865.398438
5,1503,281948.250000
6,2718,214939.031250
...,...,...
2907,2095,146810.812500
2910,1811,95666.296875
2911,2091,120604.390625
2914,2101,120075.320312


In [7]:
#feature_importance = predictor.feature_importance(df_train, time_limit = time_limit)

In [8]:
# with pd.option_context('display.max_rows', None):
#     print(feature_importance)


In [9]:
# feature_importance.to_csv('feat_imp_3_nan_treated_plus_neighborhoodgroup_season_outliers_removed.csv')

# Deploy

In [None]:
# study_name = str(DATA_FOLDER/"logistic_regression_round1")
# storage_name = f"sqlite:///{study_name}.db"
# study = optuna.create_study(study_name=study_name, storage=storage_name, load_if_exists=True)

In [None]:
# X_train = X.iloc[train_indices]
# y_train = y[train_indices]

# preprocessor = preprocessor_ohe_quantile

# best_trial = study.best_trial
# best_params = best_trial.params
# model = LogisticRegression(penalty='elasticnet', solver='saga', max_iter=1000, **best_params)

# # Create pipeline
# pipeline_model = Pipeline([("preprocessor", preprocessor), ("model", model)])

# pipeline_model.fit(X_train,y_train)

# joblib.dump(pipeline_model, DEPLOYED_MODEL)