## Data

In [1]:
from libs.paths import PATHS
from libs.pre_processing.training_data import read_data, train_test_split
from libs.pre_processing.pipeline import make_pipeline

Current directory: c:\Users\Alvaro Lima\My Files\Projects\automl-tcc\src
Source path found in current directory.


In [2]:
# Oversampling
from imblearn.over_sampling import SMOTE

# SMOTE oversampler shared by the notebook: sampling_strategy=1 with
# 5 nearest neighbours, seeded (random_state=42) so resampling is repeatable.
smote = SMOTE(sampling_strategy=1, k_neighbors=5, random_state=42)

In [3]:
# Load the dataset registered under PATHS["ADULT_INCOME"]; "income" is passed
# as the second argument (presumably the target column name -- confirm in
# libs.pre_processing.training_data.read_data).
X, y = read_data(PATHS["ADULT_INCOME"], "income")

# Encode the two income labels as a 0/1 target.
y = y.map({"<=50K": 0, ">50K": 1})

# 70% train split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    stratify=y,
    random_state=42,
)

# Fit the pre-processing pipeline on the training split only, then apply the
# already-fitted pipeline to the test split.
pipeline = make_pipeline(X_train)
X_transformed = pipeline.fit_transform(X_train)
X_test_transformed = pipeline.transform(X_test)

# Oversample the transformed training data only; the test split is untouched.
X_balanced, y_balanced = smote.fit_resample(X_transformed, y_train)

In [4]:
import numpy as np

# Show the training split dimensions and the per-class sample counts.
train_summaries = (
    X_train.shape,
    y_train.shape,
    np.unique(y_train, return_counts=True),
)
for train_summary in train_summaries:
    print(train_summary)

(34189, 14)
(34189,)
(array([0, 1], dtype=int64), array([26008,  8181], dtype=int64))


## TPOT

### Results

In [None]:
from tpot import TPOTEstimator

# Keyword arguments forwarded to TPOTEstimator.
#
# Available search spaces:
# - "linear": a linear pipeline
# - "linear-light": a linear pipeline with no inner classifiers
# - "graph": a graph-based pipeline
# - "graph-light": a graph-based pipeline with no inner classifiers
# - "mdr": a multifactor dimensionality reduction pipeline (used for genetic data)
args = dict(
    search_space="linear-light",
    generations=10,        # number of generations to run
    population_size=30,    # individuals per generation (genetic algorithm population size)
    max_time_mins=None,
    scorers=["roc_auc"],
    scorers_weights=[1],
    classification=True,
    cv=5,                  # number of cross-validation folds
    random_state=42,
    verbose=3,
    n_jobs=4,
)


In [6]:
from sklearn.metrics import classification_report, roc_auc_score

class ModelResults:
    """Plain container for the outcome of one AutoML run.

    Attributes:
        automl: The estimator that was fitted for this run.
        report: Text produced by ``classification_report`` for the run.
        roc_auc: ROC AUC score of the run.
        y_pred_proba: Predicted probabilities; callers index it as
            ``y_pred_proba[:, 1]``, so it must be a 2-D array, not a list.
    """

    # Annotations are quoted so that defining this class does not require the
    # TPOT import cell (or numpy) to have been executed beforehand.
    automl: "TPOTEstimator"
    report: str
    roc_auc: float
    y_pred_proba: "np.ndarray"

    def __init__(self, automl, report, roc_auc, y_pred_proba):
        """Store the four result fields unchanged."""
        self.automl = automl
        self.report = report
        self.roc_auc = roc_auc
        self.y_pred_proba = y_pred_proba

def run_automl(X, y, X_test, y_true=None, automl_args=None):
    """Fit a TPOT search on ``(X, y)`` and evaluate it on a held-out set.

    Args:
        X: Training features passed to ``TPOTEstimator.fit``.
        y: Training labels passed to ``TPOTEstimator.fit``.
        X_test: Held-out features to predict on.
        y_true: Labels matching ``X_test``. When omitted, the notebook-level
            ``y_test`` is used, so existing three-argument calls keep working.
        automl_args: Keyword arguments for ``TPOTEstimator``. When omitted,
            the notebook-level ``args`` dict is used.

    Returns:
        ModelResults holding the fitted estimator, the classification report,
        the ROC AUC computed from column 1 of the predicted probabilities,
        and the full predicted-probability array.
    """
    # Fall back to the notebook globals only when the caller gives nothing,
    # so the evaluation labels and configuration can be passed explicitly.
    if y_true is None:
        y_true = y_test
    if automl_args is None:
        automl_args = args

    automl = TPOTEstimator(**automl_args)
    automl.fit(X, y)

    y_pred = automl.predict(X_test)
    y_pred_proba = automl.predict_proba(X_test)

    report = classification_report(y_true, y_pred)
    # Column 1 is used as the score for the label encoded as 1.
    roc_auc = roc_auc_score(y_true, y_pred_proba[:, 1])

    return ModelResults(automl, report, roc_auc, y_pred_proba)

In [21]:
# Run the AutoML search on the pre-processed (not resampled) training data
# and evaluate it on the transformed test split.
pp_results = run_automl(
    X=X_transformed,
    y=y_train,
    X_test=X_test_transformed,
)

Generation:  10%|█         | 1/10 [14:21<2:09:13, 861.46s/it]

Generation:  1
Best roc_auc_score score: 0.8943152856470888


Generation:  20%|██        | 2/10 [25:09<1:38:06, 735.76s/it]

Generation:  2
Best roc_auc_score score: 0.907914745892269


Generation:  30%|███       | 3/10 [36:59<1:24:27, 723.90s/it]

Generation:  3
Best roc_auc_score score: 0.9083487018374017


Generation:  40%|████      | 4/10 [53:28<1:22:52, 828.72s/it]

Generation:  4
Best roc_auc_score score: 0.9083487018374017


Generation:  50%|█████     | 5/10 [1:13:54<1:20:59, 971.92s/it]

Generation:  5
Best roc_auc_score score: 0.908365128859512


Generation:  60%|██████    | 6/10 [1:33:49<1:09:50, 1047.69s/it]

Generation:  6
Best roc_auc_score score: 0.908365128859512


Generation:  70%|███████   | 7/10 [1:57:28<58:27, 1169.10s/it]  

Generation:  7
Best roc_auc_score score: 0.908365128859512


Generation:  80%|████████  | 8/10 [2:20:24<41:10, 1235.17s/it]

Generation:  8
Best roc_auc_score score: 0.908365128859512


Generation:  90%|█████████ | 9/10 [2:40:04<20:17, 1217.75s/it]

Generation:  9
Best roc_auc_score score: 0.908365128859512


Generation: 100%|██████████| 10/10 [3:00:26<00:00, 1082.61s/it]

Generation:  10
Best roc_auc_score score: 0.908365128859512





In [22]:
# Report for the run trained on the pre-processed training data. The heading
# names that run explicitly so it is not confused with the balanced run.
print("Pre-processed Data Results:")
print(pp_results.report)
print("Best pipeline:")
print(pp_results.automl.fitted_pipeline_)
print(f"ROC AUC: {pp_results.roc_auc:.4f}")

Raw Data Results:
              precision    recall  f1-score   support

           0       0.94      0.80      0.87     11147
           1       0.57      0.85      0.68      3506

    accuracy                           0.81     14653
   macro avg       0.76      0.82      0.77     14653
weighted avg       0.85      0.81      0.82     14653

Best pipeline:
Pipeline(steps=[('normalizer', Normalizer(norm='l1')),
                ('passthrough', Passthrough()),
                ('featureunion-1',
                 FeatureUnion(transformer_list=[('skiptransformer',
                                                 SkipTransformer()),
                                                ('passthrough',
                                                 Passthrough())])),
                ('featureunion-2',
                 FeatureUnion(transformer_list=[('skiptransformer',
                                                 SkipTransformer()),
                                                ('passthrough

In [7]:
# Run the AutoML search on the SMOTE-resampled training data and evaluate it
# on the same transformed test split.
balanced_results = run_automl(
    X=X_balanced,
    y=y_balanced,
    X_test=X_test_transformed,
)

Generation:  10%|█         | 1/10 [19:00<2:51:08, 1140.93s/it]

Generation:  1
Best roc_auc_score score: 0.88474222948451


Generation:  20%|██        | 2/10 [33:03<2:08:41, 965.17s/it] 

Generation:  2
Best roc_auc_score score: 0.9283360416013515


Generation:  30%|███       | 3/10 [47:06<1:46:07, 909.66s/it]

Generation:  3
Best roc_auc_score score: 0.9283360416013515


Generation:  40%|████      | 4/10 [1:17:34<2:07:13, 1272.23s/it]

Generation:  4
Best roc_auc_score score: 0.9289301703449203


Generation:  50%|█████     | 5/10 [1:42:03<1:51:55, 1343.08s/it]

Generation:  5
Best roc_auc_score score: 0.9289301703449203


Generation:  60%|██████    | 6/10 [2:04:46<1:29:59, 1349.94s/it]

Generation:  6
Best roc_auc_score score: 0.9289301703449203


Generation:  70%|███████   | 7/10 [2:28:49<1:09:00, 1380.15s/it]

Generation:  7
Best roc_auc_score score: 0.9343944708266718


Generation:  80%|████████  | 8/10 [2:44:26<41:18, 1239.16s/it]  

Generation:  8
Best roc_auc_score score: 0.9343944708266718


Generation:  90%|█████████ | 9/10 [3:01:07<19:24, 1164.74s/it]

Generation:  9
Best roc_auc_score score: 0.9440661315642437


Generation: 100%|██████████| 10/10 [3:29:30<00:00, 1257.02s/it]

Generation:  10
Best roc_auc_score score: 0.9440661315642437





In [8]:
# Report for the run trained on the SMOTE-balanced training data. The heading
# names that run explicitly so it is not confused with the pre-processed run.
print("Balanced Data Results:")
print(balanced_results.report)
print("Best pipeline:")
print(balanced_results.automl.fitted_pipeline_)
print(f"ROC AUC: {balanced_results.roc_auc:.4f}")

Raw Data Results:
              precision    recall  f1-score   support

           0       0.92      0.82      0.87     11147
           1       0.57      0.77      0.66      3506

    accuracy                           0.81     14653
   macro avg       0.75      0.79      0.76     14653
weighted avg       0.84      0.81      0.82     14653

Best pipeline:
Pipeline(steps=[('normalizer', Normalizer(norm='l1')),
                ('variancethreshold',
                 VarianceThreshold(threshold=0.0003889506475)),
                ('featureunion-1',
                 FeatureUnion(transformer_list=[('skiptransformer',
                                                 SkipTransformer()),
                                                ('passthrough',
                                                 Passthrough())])),
                ('featureunion-2',
                 FeatureUnion(transformer_list=[('skiptransformer',
                                                 SkipTransformer()),
       

### Curves

In [9]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# Plot the ROC curves of both runs on shared axes for comparison.
# NOTE: pp_results and balanced_results must exist in the current kernel;
# run both AutoML cells above before this one, otherwise it raises NameError.
roc_runs = {
    "pre-processed data": pp_results,
    "balanced data": balanced_results,
}

fig, ax = plt.subplots()
for run_label, run in roc_runs.items():
    # Only the false/true positive rates are plotted; thresholds are dropped.
    fpr, tpr = roc_curve(y_test, run.y_pred_proba[:, 1])[:2]
    ax.plot(fpr, tpr, label=f'ROC ({run_label}) (area = {run.roc_auc:.2f})')

# Dashed diagonal as a visual reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend(loc="lower right")
plt.show()

NameError: name 'pp_results' is not defined