In [14]:
import pandas as pd
from autoML import AutoML
from sklearn.metrics import (
    r2_score,
    mean_absolute_error,
    mean_squared_error,
    median_absolute_error,
    mean_squared_log_error,
    explained_variance_score
)
import math

def evaluate_regression(y_true, y_pred, dataset_name="Dataset"):
    """Print a report of common regression metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned element-wise with ``y_true``.
    dataset_name : str, default "Dataset"
        Label inserted into the report header so several reports can be
        told apart.

    Returns
    -------
    None
        The metrics are written to stdout only.
    """
    print(f"\nEvaluation for {dataset_name}:")
    print(f"R2 Score: {r2_score(y_true, y_pred):.4f}")
    print(f"Mean Absolute Error (MAE): {mean_absolute_error(y_true, y_pred):.4f}")

    # MSE feeds both the MSE and the RMSE line; compute it a single time.
    mse = mean_squared_error(y_true, y_pred)
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Root Mean Squared Error (RMSE): {math.sqrt(mse):.4f}")
    print(f"Median Absolute Error (MedAE): {median_absolute_error(y_true, y_pred):.4f}")

    # Only the metric call is guarded: a ValueError here is reported as
    # "not defined" instead of aborting the rest of the report.
    try:
        msle = mean_squared_log_error(y_true, y_pred)
    except ValueError:
        print("Mean Squared Log Error (MSLE): Not defined for negative values.")
    else:
        print(f"Mean Squared Log Error (MSLE): {msle:.4f}")

    print(f"Explained Variance Score: {explained_variance_score(y_true, y_pred):.4f}")


# NOTE(review): absolute, machine-specific location; the notebook only runs
# where this exact path exists. Consider a configurable data directory.
data_path = '/data/ephemeral/home/Dongjin/level4-cv-finalproject-hackathon-cv-02-lv3/autoML/melb_split.csv'
# Columns removed before modelling (each label listed once).
drop_tables = ['Address', 'BuildingArea', 'YearBuilt',
               'Suburb', 'Type', 'Method', 'SellerG', 'Date', 'CouncilArea', 'Regionname']

# Load the dataframe, remove the unused columns, then drop rows with any missing value
df = pd.read_csv(data_path)
df = df.drop(columns=drop_tables)
df = df.dropna(axis=0)

# Split the dataset using the precomputed 'Split' column
train_data = df[df['Split'] == 'Train']
test_data = df[df['Split'] == 'Test']

# Separate the target ('Price') from the features; 'Split' is bookkeeping, not a feature
y_train = train_data['Price']
X_train = train_data.drop(['Price', 'Split'], axis=1)
y_test = test_data['Price']
X_test = test_data.drop(['Price', 'Split'], axis=1)

# Check the resulting shapes
print("X_train.shape, y_train.shape, X_test.shape, y_test.shape: ", X_train.shape, y_train.shape, X_test.shape, y_test.shape)

# Missing-value counts for each split (expected to be zero after dropna above)
print("X_train, y_train, X_test, y_test null")
print(X_train.isnull().sum())
print(y_train.isnull().sum())
print(X_test.isnull().sum())
print(y_test.isnull().sum())

X_train.shape, y_train.shape, X_test.shape, y_test.shape:  (10812, 10) (10812,) (2706, 10) (2706,)
X_train, y_train, X_test, y_test null
Rooms            0
Distance         0
Postcode         0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
Lattitude        0
Longtitude       0
Propertycount    0
dtype: int64
0
Rooms            0
Distance         0
Postcode         0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
Lattitude        0
Longtitude       0
Propertycount    0
dtype: int64
0


In [2]:
# Constructor settings for the pipeline search. The parameter names suggest an
# evolutionary search (population / generations / parents / mutation) --
# NOTE(review): confirm against the autoML module.
search_settings = {
    "n_population": 20,
    "n_generation": 1,
    "n_parent": 5,
    "prob_mutation": 0.1,
}
autoML = AutoML(**search_settings)

# timeout=3 is forwarded to fit(); the recorded log shows many candidates ending
# with "Timeout: pipeline.fit did not complete in given time."
# NOTE(review): the unit is presumably seconds -- confirm.
autoML.fit(X_train, y_train, timeout=3)

1 structure - r2: 0.5908075319344291
Timeout: pipeline.fit did not complete in given time.
3 structure - r2: 0.5740106308828845


  f = msb / msw


Timeout: pipeline.fit did not complete in given time.
Timeout: pipeline.fit did not complete in given time.


  f = msb / msw


Timeout: pipeline.fit did not complete in given time.
Timeout: pipeline.fit did not complete in given time.
8 structure - r2: 0.501730287579911
Timeout: pipeline.fit did not complete in given time.
10 structure - r2: 0.35602712684145843
11 structure - r2: 0.5561603643494661
Timeout: pipeline.fit did not complete in given time.
13 structure - r2: 0.5840725509416411


  f = msb / msw


Timeout: pipeline.fit did not complete in given time.
15 structure - r2: 0.26339389273101477


  f = msb / msw


Timeout: pipeline.fit did not complete in given time.
17 structure - r2: 0.26339553341085364


  f = msb / msw


Timeout: pipeline.fit did not complete in given time.
Timeout: pipeline.fit did not complete in given time.
20 structure - r2: 0.6192286075859508
[2025-01-22 14:11:08] 1 - best R2: 0.619
[2025-01-22 14:11:08] PolynomialFeatures() - SelectPercentile() - KNeighborsRegressor()


  f = msb / msw


In [16]:
# Predict on both splits with the fitted AutoML object
y_train_pred = autoML.predict(X_train)
y_test_pred = autoML.predict(X_test)

# Label each report; with the default name both headers read
# "Evaluation for Dataset:" and the train/test blocks cannot be told apart.
evaluate_regression(y_train, y_train_pred, dataset_name="Train")
evaluate_regression(y_test, y_test_pred, dataset_name="Test")


Evaluation for Dataset:
R2 Score: 0.7034
Mean Absolute Error (MAE): 215716.3726
Mean Squared Error (MSE): 122284308775.0166
Root Mean Squared Error (RMSE): 349691.7339
Median Absolute Error (MedAE): 134850.0000
Mean Squared Log Error (MSLE): 0.0701
Explained Variance Score: 0.7034

Evaluation for Dataset:
R2 Score: 0.6266
Mean Absolute Error (MAE): 248893.9728
Mean Squared Error (MSE): 148616186567.2364
Root Mean Squared Error (RMSE): 385507.6998
Median Absolute Error (MedAE): 161000.0000
Mean Squared Log Error (MSLE): 0.0932
Explained Variance Score: 0.6270


In [17]:
# Show the selected pipeline. In the recorded run this is a dict with the keys
# 'preprocessor', 'feature_selection', 'model', 'pipeline', 'valid_metric'
# and 'train_metric'.
print(autoML.best_structure)

{'preprocessor': PolynomialFeatures(), 'feature_selection': SelectPercentile(), 'model': KNeighborsRegressor(), 'pipeline': Pipeline(steps=[('preprocessor', PolynomialFeatures()),
                ('feature_selection', SelectPercentile()),
                ('model', KNeighborsRegressor())]), 'valid_metric': {'r2': 0.6192286075859508, 'RMSE': 403705.2690898375}, 'train_metric': {'r2': 0.725398947059162, 'RMSE': 334824.388505672}}


In [8]:
# Display the best score found by the search. In the recorded run it equals the
# validation r2 reported inside best_structure (0.6192...).
autoML.best_score

0.6192286075859508