# Data Balancing

## 1. Setup

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
from summarytools import dfSummary
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from collections import Counter
from sklearn.model_selection import train_test_split

from env_setup import *
from functions.t_test import *

# Show every column when a DataFrame is rendered (None disables the column cap).
pd.options.display.max_columns = None

In [3]:
# Load data
# `dataout` and `dataset` are not defined in this notebook; presumably they
# come from the `env_setup` star import -- TODO confirm.
df_raw = pd.read_csv(fr"{dataout}//{dataset}_FE.csv")

# Work on a copy so the frame exactly as loaded stays available in `df_raw`.
df_ori = df_raw.copy()

# Preview the first rows. Jupyter only renders a cell's last expression, so
# this has to come last; as a mid-cell statement its result was discarded.
df_raw.head()

In [6]:
# Split categorical and numeric
# Object-dtype columns are treated as categorical; every other column except
# the 'Churned' target is treated as numeric. Column order is preserved.
l_cols_cat, l_cols_num = [], []
for col_name, col_dtype in df_ori.dtypes.items():
    if col_dtype == object:
        l_cols_cat.append(col_name)
    elif col_name != 'Churned':
        l_cols_num.append(col_name)

print(f"Categorical columns: {', '.join(l_cols_cat)}")
print(f"Numeric columns: {', '.join(l_cols_num)}")

Categorical columns: Gender, Location, TransactionType, Age_bin, Income_bin, Tenure_bin, TransactionDate_bin, TransactionAmount_bin
Numeric columns: Age, Income, Tenure, TransactionDate, TransactionAmount, NumSupportCalls, NumComplaints, Age_norm, Income_norm, Tenure_norm, TransactionDate_norm, TransactionAmount_norm, NumSupportCalls_norm, NumComplaints_norm, Gender_Female, Gender_Male, Location_Rural, Location_Suburban, Location_Urban, TransactionType_Purchase, TransactionType_Refund, TransactionType_Subscription Renewal, TransactionType_Support Fee, TransactionType_Upgrade, Age_bin_(0, 10], Age_bin_(10, 20], Age_bin_(20, 30], Age_bin_(30, 40], Age_bin_(40, 50], Age_bin_(50, 60], Age_bin_(60, 70], Income_bin_(0, 10000], Income_bin_(10000, 20000], Income_bin_(20000, 30000], Income_bin_(30000, 40000], Income_bin_(40000, 50000], Income_bin_(50000, 60000], Income_bin_(60000, 70000], Income_bin_(70000, 80000], Income_bin_(80000, 90000], Income_bin_(90000, 100000], Income_bin_(100000, 11000

## 2. Train-Test Split

In [7]:
# Remove categorical columns
# Rebinding `df_ori` (instead of `inplace=True`) together with errors='ignore'
# keeps this cell re-runnable: running it a second time is a no-op rather than
# a KeyError on the columns that were already dropped.
df_ori = df_ori.drop(columns=l_cols_cat, errors='ignore')

In [8]:
# Splitting the data into train and test
# 'Churned' is the target; every remaining column is a feature. 80% of the
# rows go to the training split, stratified on the target, with a fixed seed
# so the partition is repeatable.
y = df_ori['Churned']
x = df_ori.drop(columns='Churned')
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    stratify=y,
    random_state=12345,
)

## 3. Balancing

### 3.1. Synthetic minority over-sampling technique (SMOTE)

In [9]:
# Class counts in the training split before resampling
counter = Counter(y_train)
print('Before', counter)

# oversampling the train dataset using SMOTE
# Only the training split is resampled; X_test / y_test are not touched here.
# random_state is pinned (same seed as the train/test split) so the synthetic
# samples are reproducible after a kernel restart; a bare SMOTE() draws
# different samples on every run.
smt = SMOTE(random_state=12345)
X_train_sm, y_train_sm = smt.fit_resample(X_train, y_train)

# Class counts after resampling
counter = Counter(y_train_sm)
print('After', counter)

Before Counter({0: 574149, 1: 223955})
After Counter({0: 574149, 1: 574149})


### 3.2. SMOTE + ENN

In [10]:
# Class counts in the training split before resampling
counter = Counter(y_train)
print('Before',counter)

# resampling the train dataset using SMOTE + ENN
# (SMOTE over-sampling followed by Edited Nearest Neighbours cleaning).
# random_state is pinned (same seed as the train/test split) so the resampled
# set is reproducible after a kernel restart; a bare SMOTEENN() gives
# different results on every run.
smenn = SMOTEENN(random_state=12345)
X_train_smenn, y_train_smenn = smenn.fit_resample(X_train, y_train)

# Class counts after resampling
counter = Counter(y_train_smenn)
print('After',counter)

Before Counter({0: 574149, 1: 223955})
After Counter({1: 388751, 0: 256798})


## 4. Export

In [16]:
# Export the test split with the target re-attached as a trailing 'Churned'
# column. `assign` builds a new frame just for the export, so X_test itself
# stays a pure feature matrix (no target column to leak into later use) and no
# column is written into the frame returned by train_test_split.
X_test.assign(Churned=y_test).to_csv(fr"{dataout}//{dataset}_test.csv", index=False)

In [17]:
# Export both resampled training sets, each with the target re-attached as a
# trailing 'Churned' column. `assign` returns a new frame for the export, so
# the resampled feature matrices are not mutated and never carry the target.

# SMOTE-balanced training set
X_train_sm.assign(Churned=y_train_sm).to_csv(fr"{dataout}//{dataset}_SMOTE_train.csv", index=False)

# SMOTE + ENN training set
X_train_smenn.assign(Churned=y_train_smenn).to_csv(fr"{dataout}//{dataset}_SMOTEENN_train.csv", index=False)