# Test synthetic data and choose best model

## Reading and preparing data

In [1]:
#general
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, plot_roc_curve, auc
import matplotlib.pyplot as plt
#preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
#models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
def model_name_from_file(file_name: str) -> str:
    """
    returns model name parset from file name
    file_name: str. Template is df_synth_XXX.csv, where XXX - model name    
    """
    return file_name.\
                split('.')[0].\
                split('_')[-1]

In [3]:
def read_csv_to_dict(file_list: list) -> dict:
    """
    reads csv files to dict. Model name is key, dataframe is value.        
    """
    data = dict()
    for file_name in file_list:
        model_name = model_name_from_file(file_name)
        df = pd.read_csv(file_name, index_col=0)
        data[model_name] = df
    return data   

In [4]:
# Read synthetic data
list_df = ['df_synth_CopulaGAN.csv', 'df_synth_CTGAN.csv', 'df_synth_GaussianCopula.csv', 'df_synth_TVAE.csv']
df_synth_dict = read_csv_to_dict(list_df)

In [5]:
# Read real data
df_test = pd.read_csv('test_data.csv', index_col=0)

## Preprocessing

### Convert data to categorical

In [6]:
# categorical columns
cat_cols = ['GROUP', 'CATEGORY', 'PM NAME', 'SPONSOR NAME']

# Converting categorical columns to type 'category'
for name, df in df_synth_dict.items():
    for c in cat_cols:
        df[c] = df[c].astype('category')
    
# the same for validation data
for c in cat_cols:
    df_test[c] = df_test[c].astype('category')

In [7]:
# Preprocessing. OneHotEncoder for categorical columns
ohe = OneHotEncoder(sparse = False, handle_unknown = "ignore")

preprocessor = ColumnTransformer(transformers = [('cat', ohe, cat_cols)])

## Fit models to data

In [8]:
# For model estimation we will use crossvalidation and roc-auc metric
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)    

In [9]:
# Models

# Logistic regression
lr = LogisticRegression(penalty='l2', C=0.1)
clf_lr = Pipeline(steps=[('preprocessor', preprocessor), 
                    ('classifier', lr)])
    
# Random forest
rf = RandomForestClassifier(n_estimators=100)
clf_rf = Pipeline(steps=[('preprocessor', preprocessor), 
                    ('classifier', rf)])
    
#XGBoost
xgb = XGBClassifier(max_depth=2, gamma=2, eta=0.8, reg_alpha=0.5, reg_lambda=0.5)
clf_xgb = Pipeline(steps=[('preprocessor', preprocessor), 
                    ('classifier', xgb)])

In [10]:
# Dictionary with models
models_dict = {'Logistic Regression': clf_lr,
               'Random Forest': clf_rf,
               'XGBoost': clf_xgb}

In [11]:
def get_metric(models: dict, X_train: pd.DataFrame, y_train: pd.DataFrame) -> pd.DataFrame:
    results = []
    names = []
    for name, model in models.items():
        scores = cross_val_score(model, X_train, y_train, cv=skf, scoring='roc_auc')
        names.append(name)
        results.append(scores.mean())
    
    return pd.DataFrame([results], columns=names)


In [12]:
# Evaluate synthetic data, obtained from different models, by different metrics
def exp_run(models: dict, synth_data: dict) -> pd.DataFrame:
    names = []
    results = []
    for name, sdata in synth_data.items():
        # Split dataset into matrix X with features and vector y with target
        X_train, y_train = sdata.drop(['AUTHORIZATION'], axis=1), sdata['AUTHORIZATION']
        
        # Evaluate model  
        results.append(get_metric(models, X_train, y_train))
        names.append(name)
                
    return pd.concat(results).set_axis(names)

In [13]:
exp_run(models_dict, df_synth_dict)

Unnamed: 0,Logistic Regression,Random Forest,XGBoost
CopulaGAN,0.562147,0.518634,0.547211
CTGAN,0.515435,0.49439,0.513854
GaussianCopula,0.609563,0.576024,0.586107
TVAE,0.967677,0.968398,0.971965


The best synthetic model is TVAE. Let's validate this model on real data.

The best ML model is XGBoost

## Model validation

In [14]:
# Split dataset into matrix X with features and vector y with target

# test data
X_train, y_train = df_synth_dict['TVAE'].drop(['AUTHORIZATION'], axis=1), df_synth_dict['TVAE']['AUTHORIZATION']

# validation data
X_test, y_test = df_test.drop(['AUTHORIZATION'], axis = 1), df_test['AUTHORIZATION']


In [15]:
clf_xgb.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  ['GROUP', 'CATEGORY',
                                                   'PM NAME',
                                                   'SPONSOR NAME'])])),
                ('classifier',
                 XGBClassifier(eta=0.8, gamma=2, max_depth=2, reg_alpha=0.5,
                               reg_lambda=0.5))])

In [17]:
pred_holdout = clf_xgb.predict(X_test)

In [18]:
pred_holdout

array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [19]:
roc_auc_score(y_test, pred_holdout)

0.75