In [None]:
import pandas as pd
import numpy as np
import warnings

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE

from src.classes_tgan import TGAN
from src.utils import one_hot_encoding, plotROCCurves

warnings.filterwarnings("ignore")

# Functions

In [None]:
def _fit_and_score(classifier_cls, X_fit, y_fit, X_test, y_test):
    """Fit a fresh ``classifier_cls(random_state=0)`` and score it on the test split.

    Ground truth and predictions are both cast to ``str`` before scoring so
    that they are compared as the same label type.

    Returns
    -------
    tuple
        ``(fitted_model, macro_f1)``.
    """
    model = classifier_cls(random_state=0)
    model.fit(X_fit, y_fit)
    y_pred = model.predict(X_test)
    score = f1_score(y_test.astype(str), y_pred.astype(str), average="macro")
    return model, score


def get_results_tgan(df, categorical, target):
    """Compare TGAN augmentation, SMOTE and no augmentation on one dataset.

    The frame is split 75/25 (``random_state=0``). Three training sets are
    built from the training part (TGAN-augmented, SMOTE-resampled, untouched)
    and RandomForest, LogisticRegression and XGBoost are each fitted on all
    three. Every model is scored with macro-F1 on the same held-out test part.
    ROC curves are plotted for the TGAN and SMOTE variants only.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, including the ``target`` column.
    categorical : list
        Column names forwarded to ``one_hot_encoding`` and to ``TGAN``.
    target : str
        Name of the label column.

    Returns
    -------
    tuple
        ``(X_train_augmented, X_train_baseline, results)`` where
        ``X_train_augmented`` is whatever ``TGAN.augment()`` returned (it is
        expected to still contain ``target``), ``X_train_baseline`` is the
        one-hot encoded, non-augmented training features, and ``results`` is a
        DataFrame with one column per training variant and one row per
        classifier.
    """
    # Train Test Split
    X_train, X_test = train_test_split(df, random_state=0, test_size=0.25)
    X_test, y_test = X_test.drop(columns=[target]), X_test[target]
    X_test = one_hot_encoding(X_test, categorical)

    # Initialize data augmentations
    algo = TGAN(X_train, categorical, target)
    algo_augment_name = type(algo).__name__

    # Initialize classification
    algos_classify = [RandomForestClassifier, LogisticRegression, XGBClassifier]

    # Baseline NO AUGMENTATION (also the input to SMOTE, so encode only once)
    X_train_baseline = one_hot_encoding(X_train.drop(columns=[target]), categorical)
    y_train_baseline = X_train[target]

    # Baseline SMOTE -- seeded like every other stochastic step so that the
    # comparison is reproducible across runs.
    X_train_smote, y_train_smote = SMOTE(k_neighbors=4, random_state=0).fit_resample(
        X_train_baseline, y_train_baseline
    )

    # Augment
    res = dict()
    algo.fit()
    X_train_augmented = algo.augment()
    X_train_bis = one_hot_encoding(X_train_augmented.drop(columns=[target]), categorical)
    y_train_bis = X_train_augmented[target]

    # Classify
    for algo_classify in algos_classify:
        algo_classify_name = algo_classify.__name__

        ### With augmentation
        model, perf = _fit_and_score(algo_classify, X_train_bis, y_train_bis, X_test, y_test)
        res.setdefault(algo_augment_name, dict())[algo_classify_name] = perf
        plotROCCurves(y_test, X_test, model, algo_augment_name)

        ### With SMOTE
        model, perf = _fit_and_score(algo_classify, X_train_smote, y_train_smote, X_test, y_test)
        res.setdefault("SMOTE", dict())[algo_classify_name] = perf
        plotROCCurves(y_test, X_test, model, "SMOTE")

        ### BASELINE (no ROC plot, matching the original behaviour)
        _, perf = _fit_and_score(algo_classify, X_train_baseline, y_train_baseline, X_test, y_test)
        print("baseline perf", perf)
        res.setdefault("BASELINE", dict())[algo_classify_name] = perf

    return X_train_augmented, X_train_baseline, pd.DataFrame.from_dict(res)

# CMC

In [None]:
# The UCI `cmc.data` file ships without a header row. Reading it with the
# default `header=0` would consume the first record as column labels (which
# were then overwritten), silently dropping one sample.
# NOTE(review): confirm the local copy is the unmodified, header-less UCI file.
df_cmc = pd.read_csv(
    "datasets/low_dimension/cmc.data",
    header=None,
    names=["wife_age", "wife_education", "husband_education", "children",
           "wife_religion", "wife_working", "husband_occupation", "living_index", "media", "contraceptive"],
)
# Columns passed to the encoder / TGAN as categorical (includes the target).
categorical = ["wife_education", "husband_education", "wife_religion", "wife_working", "husband_occupation",
              "living_index", "contraceptive"]

In [None]:
# Run the TGAN / SMOTE / baseline comparison on the CMC data.
X_aug, X_base, res_cmc = get_results_tgan(
    df=df_cmc,
    categorical=categorical,
    target="contraceptive",
)

In [None]:
# Macro-F1 table for CMC: one column per training variant, one row per classifier.
res_cmc

# Yeast

In [None]:
# `yeast.data` is whitespace-delimited and has no header row. Parsing it
# directly avoids both the row-by-row split loop and the loss of the first
# record, which the default `header=0` would have turned into a column label.
# NOTE(review): confirm the local copy is the unmodified, header-less UCI file.
df_yeast = pd.read_csv(
    "datasets/low_dimension/yeast.data",
    sep=r"\s+",
    header=None,
    names=["sequence_name", "mcg", "gvh", "alm", "mit", "erl", "pox", "vac", "nuc", "target"],
)
df_yeast = df_yeast.drop(columns=["sequence_name"])
# Guarantee float features regardless of how individual columns were inferred.
df_yeast = df_yeast.astype({col: float for col in df_yeast.columns.drop("target")})
categorical = []
#Remove ERL
df_yeast = df_yeast[df_yeast["target"] != "ERL"].copy()

In [None]:
# Run the TGAN / SMOTE / baseline comparison on the yeast data.
X_aug, X_base, res_yeast = get_results_tgan(
    df=df_yeast,
    categorical=categorical,
    target="target",
)

In [None]:
# Macro-F1 table for yeast: one column per training variant, one row per classifier.
res_yeast

# Arrhythmia

In [None]:
# '?' marks missing values in the file. Letting read_csv treat it as NaN
# yields float columns directly, which replaces the `np.NaN` substitution
# (that alias was removed in NumPy 2.0) and the iloc slice assignment, which
# does not reliably change a column's dtype.
df_ary = pd.read_csv("datasets/high_dimension/arrhythmia.data", header=None, na_values="?")
# The file has no header; the last column (index 279) holds the class label.
df_ary = df_ary.rename(columns={279: "target"})
categorical = ["target"]
# Impute missing values with the per-column mean.
df_ary = df_ary.fillna(df_ary.mean())
# Drop the listed classes.
# NOTE(review): presumably these are too rare for SMOTE(k_neighbors=4) -- confirm.
df_ary = df_ary[~df_ary["target"].isin([7, 8, 9, 14, 15])].copy()

In [None]:
# Shell escape: delete the `output` directory (relative to the notebook's
# working directory) before the next TGAN run.
# NOTE(review): presumably `output` is where TGAN writes its artefacts -- confirm.
!rm -rf output
# NOTE(review): `tf` is not imported in any cell visible in this notebook, so
# this raises NameError on a fresh kernel unless `import tensorflow as tf` is
# added to the import cell. `reset_default_graph` is also a TF 1.x top-level
# API (`tf.compat.v1.reset_default_graph` under TF 2.x) -- confirm the version.
tf.reset_default_graph()

In [None]:
X_aug, X_base, res_yeast = get_results_tgan(df_ary, categorical, target="target")

In [None]:
res_yeast

# Covertype

In [None]:
# The target column is the only one declared categorical for this dataset.
categorical = ["class"]

df_cov = pd.read_csv("datasets/high_dimension/covertype_csv.csv")
# Keep only a class-stratified 10 % sample: the "test" side of the split is
# retained and the remaining 90 % is discarded.
_, df_cov = train_test_split(
    df_cov,
    test_size=0.10,
    random_state=0,
    stratify=df_cov["class"],
)

In [None]:
# Shell escape: delete the `output` directory (relative to the notebook's
# working directory) before the next TGAN run.
# NOTE(review): presumably `output` is where TGAN writes its artefacts -- confirm.
!rm -rf output
# NOTE(review): `tf` is not imported in any cell visible in this notebook, so
# this raises NameError on a fresh kernel unless `import tensorflow as tf` is
# added to the import cell. `reset_default_graph` is also a TF 1.x top-level
# API (`tf.compat.v1.reset_default_graph` under TF 2.x) -- confirm the version.
tf.reset_default_graph()

In [None]:
# Run the TGAN / SMOTE / baseline comparison on the covertype sample.
X_aug, X_base, res_cov = get_results_tgan(
    df=df_cov,
    categorical=categorical,
    target="class",
)

In [None]:
# Macro-F1 table for covertype: one column per training variant, one row per classifier.
res_cov