In [10]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.utils import resample

from supervised.automl import AutoML

import warnings
warnings.filterwarnings("ignore")

In [11]:
df_init = pd.read_csv('data_v3.csv', sep=';')
df_init.head()

Unnamed: 0,Id,Number,Result,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_106,...,Feature_99,Feature_100,Feature_111,Feature_1122,Feature_112,Feature_113,Feature_114,Feature_115,Feature_116,Feature_117
0,1,10359/C2020,2,2,56,12,1,7.0,4,2,...,0.0,0.0,1,0,1,1,1.0,1,1,0
1,2,10346/C2020,2,2,69,19,1,6.0,4,2,...,0.0,0.0,1,0,1,1,1.0,0,1,1
2,3,10311/C2020,2,1,66,8,1,4.0,4,2,...,0.0,0.0,1,0,1,1,1.0,0,1,0
3,4,10292/C2020,2,2,62,16,1,,3,2,...,0.0,0.0,1,0,1,1,0.0,0,1,1
4,5,10283/C2020,2,2,67,30,1,,4,2,...,0.0,0.0,1,0,1,1,1.0,0,1,0


In [12]:
target_feature = 'Result'

useless_features = ['Id', 'Number', 'Feature_3', 'Feature_4', 'Feature_5', 'Feature_6', 'Feature_17', 'Feature_18', 'Feature_23', 
                    'Feature_29', 'Feature_30', 'Feature_32','Feature_33', 'Feature_34',
                    'Feature_35', 'Feature_54', 'Feature_65', 'Feature_94', 'Feature_101', 'Feature_108','Feature_111', 
                    'Feature_112','Feature_113', 'Feature_114','Feature_115', 'Feature_116', 'Feature_117']
features_to_drop = [target_feature] + useless_features
features = df_init.columns.drop(features_to_drop).tolist()

# features = ['Feature_47', 'Feature_106', 'Feature_41', 'Feature_2', 'Feature_40', 'Feature_9', 'Feature_73', 'Feature_51', 'Feature_31', 'Feature_48', 'Feature_71', 'Feature_70', 'Feature_59', 'Feature_39', 'Feature_84']

df_ = df_init[[target_feature] + features]

num_features = ['Feature_2', 'Feature_3', 'Feature_33', 'Feature_34', 'Feature_36', 'Feature_37', 'Feature_38', 'Feature_39', 'Feature_40', 
                'Feature_41', 'Feature_42', 'Feature_43', 'Feature_44', 'Feature_45', 'Feature_46', 'Feature_47', 
                'Feature_48', 'Feature_49', 'Feature_50', 'Feature_51', 'Feature_53', 'Feature_55', 'Feature_57', 
                'Feature_58', 'Feature_59', 'Feature_64', 'Feature_70', 'Feature_71', 'Feature_72', 'Feature_73']

special_features = ['Feature_17', 'Feature_18', 'Feature_23', 'Feature_74', 'Feature_75', 'Feature_76', 'Feature_77', 
                    'Feature_78', 'Feature_79', 'Feature_80', 'Feature_81', 'Feature_82', 'Feature_83', 'Feature_84', 
                    'Feature_85', 'Feature_86', 'Feature_87', 'Feature_88', 'Feature_89', 'Feature_90', 'Feature_91', 
                    'Feature_92', 'Feature_93', 'Feature_94', 'Feature_95', 'Feature_96', 'Feature_97', 'Feature_98', 
                    'Feature_99', 'Feature_100']

for col in special_features:
    if col in df_.columns:
        df_[col].fillna(0, inplace=True)

cat_features = []
for col in df_.drop(target_feature, axis=1).columns:
    if col in num_features:
        df_[col].fillna(df_[col].median(), inplace=True)
        df_[col] = df_[col].astype('float64')
    else:
        cat_features.append(col)
        df_[col].fillna(-1, inplace=True)
        df_[col] = df_[col].astype('int64')

# Result=0 - alive, Result=1 died
df_.loc[df_[target_feature] == 1, target_feature] = 0
df_.loc[df_[target_feature] == 2, target_feature] = 1

df_train, df_test = train_test_split(
    df_,
    shuffle=True,
    test_size=0.25,
    random_state=0,
    stratify=df_[target_feature],
)

# Separate majority and minority classes
df_majority = df_train[df_train[target_feature] == 1]
df_minority = df_train[df_train[target_feature] == 0]

# Upsample majority class
df_majority_upsampled = resample(
    df_majority,
    replace=True,
    n_samples=100,
    random_state=0,
)
# Upsample minority class
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=100,
    random_state=0,
)
# Combine minority class with downsampled majority class
df_train_sampled = pd.concat([df_majority_upsampled, df_minority_upsampled])

X_train = df_train_sampled[features]
y_train = df_train_sampled[target_feature]

X_test = df_test[features]
y_test = df_test[target_feature]

automl = AutoML(
    results_path='reports/automl-ver_2/',
    mode='Perform',
    ml_task='binary_classification',
    algorithms=['Xgboost'],
)
automl.fit(X_train, y_train)

AutoML directory: reports/automl-ver_2/
The task is binary_classification with evaluation metric logloss
AutoML will use algorithms: ['Xgboost']
AutoML will ensemble availabe models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 1 model
1_Default_Xgboost logloss 0.332851 trained in 25.04 seconds (1-sample predict time 0.204 seconds)
* Step not_so_random will try to check up to 4 models
2_Xgboost logloss 0.448854 trained in 40.06 seconds (1-sample predict time 0.402 seconds)
3_Xgboost logloss 0.331957 trained in 34.99 seconds (1-sample predict time 0.492 seconds)
4_Xgboost logloss 0.693147 trained in 21.81 seconds (1-sample predict time 0.38 seconds)
5_Xgboost logloss 0.693147 trained in 22.17 seconds (1-sample predict time 0.3231 secon

AutoML(algorithms=['Xgboost'], ml_task='binary_classification', mode='Perform',
       results_path='reports/automl-ver_2/')

In [13]:
from sklearn.metrics import classification_report, f1_score, accuracy_score

y_test_pred = automl.predict(X_test)

print(f'Accuracy: {round(accuracy_score(y_test, y_test_pred), 2)}')
print(f'F1 Score: {round(f1_score(y_test, y_test_pred), 2)}')
print(f'\nClassification report: \n{classification_report(y_test, y_test_pred)}')

Accuracy: 0.7
F1 Score: 0.69

Classification report: 
              precision    recall  f1-score   support

           0       0.70      0.74      0.72        31
           1       0.71      0.67      0.69        30

    accuracy                           0.70        61
   macro avg       0.71      0.70      0.70        61
weighted avg       0.71      0.70      0.70        61

