### Import Bulian library and dependencies

In [2]:
# Make the local Bulian checkout importable.
# `sys` is imported here because this cell runs before the main import cell;
# without it a fresh kernel fails with NameError on Restart & Run All.
import sys

BULIAN_PATH = r'F:\Users\Kaggle\bulian'

# Guard against piling up duplicate entries when the cell is re-run.
if BULIAN_PATH not in sys.path:
    sys.path.insert(1, BULIAN_PATH)

In [3]:
import os,sys,torch
import pandas as pd
from bulian.Tabular.synthesizers import TwinSynthesizer,PrivateTwinSynthesizer
from bulian.metrics import *
from bulian.metrics.reports import *
from bulian.metrics import compute_metrics
from bulian.metrics.single_table import SingleTableMetric
from bulian.metrics.single_table import *

In [4]:
import warnings
warnings.filterwarnings('ignore')

### Read raw CSV data

In [5]:
# Location of the raw churn CSV inside the local Bulian checkout.
churn_csv_path = r"F:\Users\Kaggle\bulian\examples\csv\Churn.csv"

# Load the raw table; every later cell reads from `churn`.
churn = pd.read_csv(churn_csv_path)

In [6]:
# Preview the first rows of the loaded frame to sanity-check columns and dtypes.
churn.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


### Define column types for ingestion by Bulian models

In [7]:
# Label column used by the report and by the classifiers further down.
target = 'Exited'

# Columns handed to the synthesizer as discrete (categorical / binary flags).
discrete_columns = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember', target]

# Every remaining column of `churn` is treated as a numeric feature.
numeric_features = [column for column in churn.columns if column not in discrete_columns]

### Check if GPU is available

In [8]:
# Report CUDA availability. The device index and name are only queried when
# CUDA is actually present: torch.cuda.current_device() / get_device_name()
# raise on a machine without a usable GPU, which would abort the notebook.
cuda_available = torch.cuda.is_available()

if cuda_available:
    device_index = torch.cuda.current_device()
    print(cuda_available, device_index, torch.cuda.get_device_name(device_index))
else:
    print(cuda_available, 'no CUDA device detected')

True 0 NVIDIA GeForce RTX 3090


In [9]:
# Shell out to nvidia-smi to show the driver/CUDA versions and current GPU memory and utilisation.
!nvidia-smi

Tue Jun 21 16:37:44 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 511.79       Driver Version: 511.79       CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0  On |                  N/A |
|  0%   49C    P8    23W / 370W |    658MiB / 24576MiB |      4%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

###### 

### Check data statistics

In [10]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
churn.describe()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [11]:
# Share of each class in the target column; shows how imbalanced the label is.
churn['Exited'].value_counts(normalize=True)

0    0.7963
1    0.2037
Name: Exited, dtype: float64

In [12]:
# (rows, columns) of the real dataset.
churn.shape

(10000, 11)

### Build a Bulian.ai synthetic data model to reduce the class imbalance in the churn dataset


In [13]:
# Instantiate the Bulian synthesizer with a batch size of 200.
synth = TwinSynthesizer(batch_size=200)

In [14]:
# Train on the full real dataset for 100 epochs, declaring which columns are discrete.
# NOTE(review): the synthesizer sees every real row, including rows later held out for testing.
synth.fit(data=churn,epochs=100,discrete_columns=discrete_columns)

### Sample 5000 new synthetic rows 

In [None]:
# Draw 5000 synthetic rows from the fitted synthesizer.
sample = synth.sample(5000)

### Check metrics for newly created synthetic data

In [None]:
# Collect every metric returned by SingleTableMetric.get_subclasses() ...
metrics = SingleTableMetric.get_subclasses()
# ... and evaluate them with the real frame first and the synthetic sample second.
compute_metrics(metrics,churn, sample)

#### Report with ML efficacy metrics, as that is the key consideration in this case

In [None]:
# Full real-vs-synthetic report, keyed on the same label column used everywhere else.
get_full_report(
    churn,
    sample,
    discrete_columns,
    numeric_features,
    target=target,
)

### Test churn ML Performance on standalone real data vs real+synthetic data

In [None]:
# Keep only the synthetic rows labelled as churned (Exited == 1) and renumber them from 0.
sample = sample.loc[sample['Exited'].eq(1)].reset_index(drop=True)

In [None]:
# Work on a copy: the label-encoding cell assigns into `real_data`, and a plain
# alias would silently overwrite the string columns of `churn` as well.
real_data = churn.copy()
synthetic_data = sample

# Stack real rows first, synthetic rows after, with a fresh 0..n-1 index.
# `axis` is passed by keyword; the positional form is deprecated and rejected by pandas 2.x.
real_with_synthetic = pd.concat([real_data, synthetic_data], axis=0, ignore_index=True)

### Random Forest model on real vs real+synthetic data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

In [None]:
# List the string-typed (object dtype) columns that need encoding before modelling.
churn.select_dtypes('object').columns

In [None]:
### Convert string cols to integers with Label Encoders

# Read the string columns off the frame that is actually encoded, so re-running
# this cell after the columns have become integers is a no-op.
O_cols = real_data.select_dtypes('object').columns

for col in O_cols:
    encoder = LabelEncoder()
    # Fit on the combined column (a superset of the real values) so a category that
    # only appears in synthetic rows cannot raise "previously unseen labels".
    encoder.fit(real_with_synthetic[col])
    # The same fitted encoder is applied to both frames, keeping their integer codes aligned.
    real_data[col] = encoder.transform(real_data[col])
    real_with_synthetic[col] = encoder.transform(real_with_synthetic[col])

In [None]:
# Hold out a stratified slice of REAL customers only. Both models are scored on this
# same slice: if synthetic rows leak into the test set, the two AUCs are measured on
# different populations and cannot be compared.
SPLIT_SEED = 42  # fixed so the split is reproducible across runs

real_X_train, real_X_test, real_Y_train, real_Y_test = train_test_split(
    real_data.drop(columns=target),  # keyword form; positional axis is rejected by pandas 2.x
    real_data[target],
    stratify=real_data[target],
    random_state=SPLIT_SEED,
)

# `real_with_synthetic` holds the real rows first and the synthetic rows after them
# (index reset), so everything past len(real_data) is synthetic.
synthetic_rows = real_with_synthetic.iloc[len(real_data):]

# Augment only the TRAINING portion with the synthetic rows.
combined_X_train = pd.concat(
    [real_X_train, synthetic_rows.drop(columns=target)], axis=0, ignore_index=True
)
combined_Y_train = pd.concat(
    [real_Y_train, synthetic_rows[target]], axis=0, ignore_index=True
)

# The augmented model is evaluated on the untouched real hold-out.
combined_X_test, combined_Y_test = real_X_test, real_Y_test

In [None]:
# Baseline: random forest trained on real data only.
# The seed is fixed so a difference between the two AUC cells reflects the training
# data rather than run-to-run randomness of the forest.
rf_real = RandomForestClassifier(random_state=42)
rf_real.fit(real_X_train,real_Y_train)

# Probability of the positive class (column 1) for each held-out row.
probs = rf_real.predict_proba(real_X_test)[:,1]

print(f'AUC Score with real only data:{roc_auc_score(real_Y_test,probs)}')

In [None]:
# Random forest trained on real + synthetic data.
# It gets its own name so it no longer overwrites the real-only model `rf_real`,
# and a fixed seed so the comparison with the baseline is reproducible.
rf_combined = RandomForestClassifier(random_state=42)
rf_combined.fit(combined_X_train,combined_Y_train)

# Probability of the positive class (column 1) for each test row.
probs = rf_combined.predict_proba(combined_X_test)[:,1]

print(f'AUC Score with combined data:{roc_auc_score(combined_Y_test,probs)}')

### Fin ###