In [12]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
import json
import multiprocessing
import sys
import time
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

# The project packages are imported relative to the parent directory,
# so it has to be on sys.path before the imports below run.
sys.path.append('..')

from helpers import utils, pipelines, models
from data_augmentaion.data_augmentator import DataAugmentor

### Define research parameters

In [14]:
# Experiment configuration; the cells below read these names directly.
regression_task = False  # forwarded to DataAugmentor(regression=...)
metric = 'f1'            # scoring name handed to utils.fit_and_evaluate
continuous_features = ['age', 'fnlwgt', 'hours-per-week']

test_size_proportion = 0.2  # test_size used for both train/test splits
sample_frac = 0.2           # fraction of rows drawn for the sampled dataset
augment_sample = 0.5        # `size` passed to augment() in the unbalanced runs

search_pipelines = pipelines.get_classification_pipelines()
search_parameters = models.parameters

# (proximity_weight, diversity_weight) pairs tried with the 'cf_genetic' method;
# each pair is combined with every sparsity weight listed below.
_genetic_weight_pairs = [(0.2, 5), (5, 0.2), (1, 1)]
_sparsity_weights = [0.2, 1]

# Methods that take no extra keyword arguments come first.
settings = [{'method': name} for name in ('random', 'smote', 'cf_random')]

# Key order inside kw_args is kept as proximity, diversity, sparsity because the
# dict is serialised into result column labels later on.
settings += [
    {'method': 'cf_genetic',
     'kw_args': {'proximity_weight': prox, 'diversity_weight': div, 'sparsity_weight': spars}}
    for prox, div in _genetic_weight_pairs
    for spars in _sparsity_weights
]
settings += [
    {'method': 'cf_kdtree', 'kw_args': {'sparsity_weight': spars}}
    for spars in _sparsity_weights
]

AttributeError: module 'helpers.pipelines' has no attribute 'get_adult_pipelines'

### Read and preprocess data

In [None]:
# Label column; the split cell separates it from the feature columns.
target = 'income'

# Read the raw CSV, then hand it to the project's adult-specific preprocessing.
raw_data = pd.read_csv("../datasets/adult.csv")
df = utils.preprocess_adult(raw_data)

#### Split data

In [None]:
# Both splits share the same test proportion and the same fixed seed.
split_options = dict(test_size=test_size_proportion, random_state=42)

# Full dataset: separate features from the label, then split train / test.
X, y = df.drop(columns=target), df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Sampled dataset: a seeded random subset of the rows, split the same way.
df_sample = df.sample(frac=sample_frac, random_state=42)
X_sample, y_sample = df_sample.drop(columns=target), df_sample[target]
X_sample_train, X_sample_test, y_sample_train, y_sample_test = train_test_split(
    X_sample, y_sample, **split_options)

In [None]:
# Label counts of the training targets: full data first, then the sample.
for train_labels in (y_train, y_sample_train):
    display(train_labels.value_counts())

# Last expression of the cell: (rows, columns) of the sampled test features.
X_sample_test.shape

0    27131
1     9009
Name: income, dtype: int64

0    5486
1    1742
Name: income, dtype: int64

(1807, 11)

In [None]:
# Start from an empty frame; later cells concatenate score columns onto it.
full_results_df = pd.DataFrame()

### Whole dataset scores

In [None]:
print(f'whole adult dataset {metric} scores:')

# Fit on the full training split and evaluate on the full test split.
# Only the scores are kept; the first return value is discarded.
_, whole_scores = utils.fit_and_evaluate(
    X_train, y_train, X_test, y_test,
    search_estimators=search_pipelines,
    search_params=search_parameters,
    scoring=metric,
)

# One row per key of `whole_scores`, with a single score column.
whole_score_column = f'full data {metric} score'
whole_dataset_result_df = pd.DataFrame.from_dict(
    whole_scores, orient='index', columns=[whole_score_column])

# Transposed for display so the single score row reads left to right.
display(whole_dataset_result_df.transpose())

whole adult dataset f1 scores:


Unnamed: 0,lg,rf,xgb
full data f1 score,0.61143,0.557895,0.633659


### Sampled adult dataset scores

In [None]:
print(f'{sample_frac*100}% sampled adult dataset {metric} scores:')

# Same evaluation as the full-data baseline, but fitted and scored on the
# sampled train / test split. The first return value is discarded.
_, sampled_scores = utils.fit_and_evaluate(
    X_sample_train, y_sample_train, X_sample_test, y_sample_test,
    search_estimators=search_pipelines,
    search_params=search_parameters,
    scoring=metric,
)

# One row per key of `sampled_scores`, with a single score column.
sampled_score_column = f'sampled data {metric} score'
sampled_dataset_result_df = pd.DataFrame.from_dict(
    sampled_scores, orient='index', columns=[sampled_score_column])

# Last expression of the cell: the transposed table is rendered as its output.
sampled_dataset_result_df.transpose()

20.0% sampled adult dataset f1 scores:


Unnamed: 0,lg,rf,xgb
sampled data f1 score,0.63615,0.527066,0.627635


In [None]:
# Append the two baseline score tables as new columns of the results frame.
baseline_tables = [whole_dataset_result_df, sampled_dataset_result_df]
full_results_df = pd.concat([full_results_df, *baseline_tables], axis='columns')

### Running experiments

In [None]:
# Run every augmentation setting twice on the sampled training split:
#   * augment(balance=True)                       -> stored under key f'{i}_balanced'
#   * augment(balance=False, size=augment_sample) -> stored under key f'{i}'
# Each augmented training set is fitted and then scored on the unchanged
# sampled test split.
best_estimators = {}
best_scores = {}

# Score tables are gathered here and joined onto full_results_df once, instead
# of re-concatenating the growing frame on every iteration.
experiment_frames = []

try:
    for i, s in enumerate(settings):
        # perf_counter is the clock meant for measuring elapsed time.
        start = time.perf_counter()
        print(f'{i} / {len(settings)}, {s}', end=' ')
        augmentor = DataAugmentor(X_sample_train, y_sample_train, X_sample_test, y_sample_test,
                                 method=s['method'], regression=regression_task,
                                 continuous_feats=continuous_features,
                                 kw_args=s.get('kw_args', {})
                                 )

        # Balanced augmentation.
        X_train_augmented_balanced, y_train_augmented_balanced = augmentor.augment(balance=True)
        best_estimators[f'{i}_balanced'], best_scores[f'{i}_balanced'] = \
            utils.fit_and_evaluate(X_train_augmented_balanced, y_train_augmented_balanced,
                                   X_sample_test, y_sample_test,
                                   search_estimators=search_pipelines, search_params=search_parameters,
                                   scoring=metric)
        # The column label embeds the setting's values so each column is self-describing.
        result_df_balanced = pd.DataFrame.from_dict(best_scores[f'{i}_balanced'],
                                                    orient='index',
                                                    columns=[f'{json.dumps((list(s.values())))} balanced {metric} score'])
        experiment_frames.append(result_df_balanced)

        # Unbalanced augmentation with the configured size.
        X_train_augmented, y_train_augmented = augmentor.augment(balance=False, size=augment_sample)
        best_estimators[f'{i}'], best_scores[f'{i}'] = \
            utils.fit_and_evaluate(X_train_augmented, y_train_augmented, X_sample_test, y_sample_test,
                                   search_estimators=search_pipelines, search_params=search_parameters,
                                   scoring=metric)
        result_df = pd.DataFrame.from_dict(best_scores[f'{i}'],
                                           orient='index',
                                           columns=[f'{json.dumps((list(s.values())))} {metric} score'])
        experiment_frames.append(result_df)

        print(f'{time.perf_counter() - start} seconds for settings {i}')
finally:
    # Executed on normal completion and on interruption alike, so the results of
    # every finished run are kept even if the loop is stopped part-way through.
    if experiment_frames:
        produced_columns = [column for frame in experiment_frames for column in frame.columns]
        # Dropping same-named columns first makes a re-run of this cell replace
        # earlier experiment columns instead of appending duplicates.
        full_results_df = pd.concat(
            [full_results_df.drop(columns=produced_columns, errors='ignore'), *experiment_frames],
            axis=1)


0 / 11, {'method': 'random'} categorical_feats: ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country']
35.999945402145386 seconds for settings 0
1 / 11, {'method': 'smote'} categorical_feats: ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country']
46.89471411705017 seconds for settings 1
2 / 11, {'method': 'cf_random'} categorical_feats: ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country']


KeyboardInterrupt: 

In [None]:
# Render the accumulated score table as the cell output.
full_results_df

In [None]:
# Persist the score table. The file name records the metric, the test split
# proportion and the augmentation size used for this run.
results_path = Path('log') / (
    f'experiment_classification_adult_{metric}'
    f'_testsize{test_size_proportion}_augmentsample{augment_sample}.csv'
)
# to_csv does not create missing directories; make sure log/ exists so the
# results of a long run are not lost to a missing-folder error.
results_path.parent.mkdir(parents=True, exist_ok=True)
full_results_df.to_csv(results_path)