# Data Balancing

## 1. Setup

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
from summarytools import dfSummary
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from collections import Counter
from sklearn.model_selection import train_test_split

from env_setup import *
from functions.t_test import *

# Lift the column-count limit so wide frames are displayed without truncation.
pd.options.display.max_columns = None

In [2]:
# Load data
# `dataout` and `dataset` are not defined in this notebook; presumably they
# come from the `env_setup` star-import -- confirm.
df_raw = pd.read_csv(fr"{dataout}//{dataset}_FE.csv")

# Work on a copy so the frame read from disk stays untouched.
df_ori = df_raw.copy()

# The preview has to be the last expression of the cell: Jupyter only renders
# the final expression, so a `.head()` in the middle of the cell is discarded.
df_raw.head()

In [3]:
# Partition the column names by dtype in a single pass: object-typed columns
# are treated as categorical, every other column except the 'Churned' target
# is treated as numeric.
l_cols_cat = []
l_cols_num = []
for col_name, col_dtype in df_ori.dtypes.items():
    if col_dtype == object:
        l_cols_cat.append(col_name)
    elif col_name != 'Churned':
        l_cols_num.append(col_name)

print(f"Categorical columns: {', '.join(l_cols_cat)}")
print(f"Numeric columns: {', '.join(l_cols_num)}")

Categorical columns: Gender, Location
Numeric columns: Age, Income, Tenure, NumSupportCalls, NumComplaints, Purchase, Refund, Subscription Renewal, Support Fee, Upgrade, txn_mean, txn_count, Age_norm, Income_norm, Tenure_norm, NumSupportCalls_norm, NumComplaints_norm, Purchase_norm, Refund_norm, Subscription Renewal_norm, Support Fee_norm, Upgrade_norm, txn_mean_norm, txn_count_norm, Gender_Female, Gender_Male, Location_Rural, Location_Suburban, Location_Urban


## 2. Train-Test Split

In [4]:
# Remove categorical columns.
# Rebinding the result (instead of `inplace=True`) combined with
# `errors="ignore"` keeps this cell idempotent: running it again once the
# columns are already gone no longer raises a KeyError.
df_ori = df_ori.drop(columns=l_cols_cat, errors="ignore")

In [5]:
# Separate the 'Churned' target from the features, then hold out 20% of the
# rows as a test set. Stratifying on the target keeps the class ratio the same
# in both splits; the fixed seed makes the split repeatable.
y = df_ori['Churned']
x = df_ori.drop(columns=['Churned'])
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=12345,
    stratify=y,
)

## 3. Balancing

### 3.1. Synthetic minority over-sampling technique (SMOTE)

In [7]:
# Class distribution of the training target before resampling.
counter = Counter(y_train)
print('Before', counter)

# Oversample the training split using SMOTE (the test split is left alone).
# SMOTE generates synthetic rows using random draws, so the seed is pinned to
# make the resampled set -- and the CSV exported from it -- reproducible.
# 12345 is the same seed this notebook passes to train_test_split.
smt = SMOTE(random_state=12345)
X_train_sm, y_train_sm = smt.fit_resample(X_train, y_train)

# Class distribution after resampling.
counter = Counter(y_train_sm)
print('After', counter)

Before Counter({0: 57538, 1: 22462})
After Counter({0: 57538, 1: 57538})


### 3.2. SMOTE + ENN

In [8]:
# Class distribution of the training target before resampling.
counter = Counter(y_train)
print('Before',counter)

# Resample the training split using SMOTE + ENN (oversampling followed by
# Edited Nearest Neighbours cleaning). The SMOTE step is stochastic, so the
# seed is pinned to make the resampled set reproducible between runs.
# 12345 is the same seed this notebook passes to train_test_split.
smenn = SMOTEENN(random_state=12345)
X_train_smenn, y_train_smenn = smenn.fit_resample(X_train, y_train)

# Class distribution after resampling.
counter = Counter(y_train_smenn)
print('After',counter)

Before Counter({0: 57538, 1: 22462})
After Counter({1: 36938, 0: 17395})


## 4. Export

In [9]:
# Write the held-out split to disk with its target re-attached as 'Churned'.
# `assign` returns a new frame, so X_test keeps holding features only; writing
# the label into X_test in place would leak the target into anything that later
# uses X_test as a feature matrix. The CSV content is unchanged.
X_test.assign(Churned=y_test).to_csv(fr"{dataout}//{dataset}_test.csv", index=False)

In [10]:
# Export the three training variants (original, SMOTE, SMOTE + ENN), each with
# its target re-attached as a 'Churned' column.
# `assign` returns a new frame, so the feature matrices are not mutated and the
# label cannot leak into code that later reuses them as features. The written
# files and their contents are the same as before.
for suffix, features, target in (
    ("train", X_train, y_train),
    ("SMOTE_train", X_train_sm, y_train_sm),
    ("SMOTEENN_train", X_train_smenn, y_train_smenn),
):
    features.assign(Churned=target).to_csv(fr"{dataout}//{dataset}_{suffix}.csv", index=False)

In [15]:
# Random under-sampling: keep every row with Churned == 1 and draw an equally
# sized, seeded random sample from the rows with Churned == 0.
# NOTE(review): this balances the full dataset before it is split, so the test
# set derived from `df_ori_us` is balanced as well -- confirm this is intended.
df_churned = df_ori[df_ori['Churned'] == 1]
df_retained = df_ori[df_ori['Churned'] == 0]
df_ori_us = pd.concat([
    df_churned,
    df_retained.sample(n=len(df_churned), random_state=12345),
])

# Show the resulting class counts.
df_ori_us.groupby('Churned')['Churned'].count()


Churned
0    28078
1    28078
Name: Churned, dtype: int64

In [16]:
# Stratified 80/20 split of the under-sampled frame, seeded for repeatability.
# Note: this rebinds x, y, X_train, X_test, y_train and y_test, replacing the
# splits of the full dataset created earlier in the notebook.
y = df_ori_us['Churned']
x = df_ori_us.drop(columns=['Churned'])
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=12345,
    stratify=y,
)

In [22]:
# Export the under-sampled test and train splits, each with its target
# re-attached as a 'Churned' column.
# `assign` returns a new frame, so X_test / X_train keep holding features only
# instead of having the label written into them. The CSV content is unchanged.
X_test.assign(Churned=y_test).to_csv(fr"{dataout}//{dataset}_us_test.csv", index=False)

X_train.assign(Churned=y_train).to_csv(fr"{dataout}//{dataset}_us_train.csv", index=False)