In [22]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# SMOTETomek and SMOTEENN are combined over-/under-samplers and live in
# imblearn.combine; importing them from imblearn.over_sampling raises ImportError.
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import ADASYN, SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, classification_report, precision_recall_curve
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Location of the raw churn dataset (CSV served from the project's GitHub repository).
DATA_URL = "https://github.com/Swastik-25/Imbalanced-Data-with-SMOTE-Techniques/raw/main/churn_prediction.csv"

df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,current_balance,...,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn,last_transaction
0,1,2101,66,Male,0.0,self_employed,187.0,2,755,1458.71,...,1458.71,1449.07,0.2,0.2,0.2,0.2,1458.71,1458.71,0,2019-05-21
1,2,2348,35,Male,0.0,self_employed,,2,3214,5390.37,...,7799.26,12419.41,0.56,0.56,5486.27,100.56,6496.78,8787.61,0,2019-11-01
2,4,2194,31,Male,0.0,salaried,146.0,2,41,3913.16,...,4910.17,2815.94,0.61,0.61,6046.73,259.23,5006.28,5070.14,0,NaT
3,5,2329,90,,,self_employed,1020.0,2,582,2291.91,...,2084.54,1006.54,0.47,0.47,0.47,2143.33,2291.91,1669.79,1,2019-08-06
4,6,1579,42,Male,2.0,self_employed,1494.0,3,388,927.72,...,1643.31,1871.12,0.33,714.61,588.62,1538.06,1157.15,1677.16,1,2019-11-03


In [4]:
# (rows, columns) of the loaded frame
df.shape

(28382, 21)

In [5]:
# Number of rows per churn label, to see how imbalanced the target is
df['churn'].value_counts()

0    23122
1     5260
Name: churn, dtype: int64

## Data Preprocessing



1.   Missing Data Handling
2.   Dummy Variables (One-Hot Encoding)
3.   Train/Test Split
4.   Feature Scaling

In [6]:
# Missing-value count per column
df.isna().sum()

customer_id                          0
vintage                              0
age                                  0
gender                             525
dependents                        2463
occupation                          80
city                               803
customer_nw_category                 0
branch_code                          0
current_balance                      0
previous_month_end_balance           0
average_monthly_balance_prevQ        0
average_monthly_balance_prevQ2       0
current_month_credit                 0
previous_month_credit                0
current_month_debit                  0
previous_month_debit                 0
current_month_balance                0
previous_month_balance               0
churn                                0
last_transaction                     0
dtype: int64

In [8]:
# Encode gender numerically; rows with no recorded gender get their own code (-1)
# instead of being folded into one of the two known categories.
dict_gender = {'Male': 1, 'Female': 0}
df['gender'] = df['gender'].replace(dict_gender).fillna(-1)

# Fill the remaining gaps with fixed constants, chosen by the author as each
# column's most frequently occurring value.
df = df.fillna({
    'dependents': 0,
    'occupation': 'self_employed',
    'city': 1020,
})

In [9]:
# One-hot encode occupation into occupation_<value> indicator columns.
# The raw 'occupation' column is kept here; note that re-running this cell
# appends the indicator columns a second time.
occupation_dummies = pd.get_dummies(df['occupation'], prefix='occupation', prefix_sep='_')
df = pd.concat([df, occupation_dummies], axis=1)
df.head()

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,current_balance,...,previous_month_debit,current_month_balance,previous_month_balance,churn,last_transaction,occupation_company,occupation_retired,occupation_salaried,occupation_self_employed,occupation_student
0,1,2101,66,1.0,0.0,self_employed,187.0,2,755,1458.71,...,0.2,1458.71,1458.71,0,2019-05-21,0,0,0,1,0
1,2,2348,35,1.0,0.0,self_employed,1020.0,2,3214,5390.37,...,100.56,6496.78,8787.61,0,2019-11-01,0,0,0,1,0
2,4,2194,31,1.0,0.0,salaried,146.0,2,41,3913.16,...,259.23,5006.28,5070.14,0,NaT,0,0,1,0,0
3,5,2329,90,-1.0,0.0,self_employed,1020.0,2,582,2291.91,...,2143.33,2291.91,1669.79,1,2019-08-06,0,0,0,1,0
4,6,1579,42,1.0,2.0,self_employed,1494.0,3,388,927.72,...,1538.06,1157.15,1677.16,1,2019-11-03,0,0,0,1,0


In [10]:
# Target label
y = df['churn']

# Feature matrix: everything except the target, the customer_id column, the raw
# 'occupation' text (already one-hot encoded) and the last_transaction date column.
x = df.drop(columns=['churn', 'customer_id', 'occupation', 'last_transaction'])

# 80/20 split, stratified on the target so both parts keep the same churn ratio
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    stratify=y,
    random_state=100,
)

In [12]:
# Sizes of the label vectors for the train and test splits
y_train.shape, y_test.shape

((22705,), (5677,))

In [13]:
# Share of each churn label in the training split
y_train.value_counts().div(len(y_train))

0    0.814666
1    0.185334
Name: churn, dtype: float64

In [14]:
# Share of each churn label in the test split
y_test.value_counts().div(len(y_test))

0    0.814691
1    0.185309
Name: churn, dtype: float64

In [15]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply that same
# transformation to both splits so nothing from the test set influences the fit.
Scaler_X = StandardScaler().fit(X_train)
X_train = Scaler_X.transform(X_train)
X_test = Scaler_X.transform(X_test)

## SMOTE Technique


In [18]:
def oversample_with_smote(X_train, y_train, random_state=None):
    """
    Oversample the training dataset using SMOTE.

    Parameters:
    - X_train: Features of the training dataset
    - y_train: Labels of the training dataset
    - random_state: Seed forwarded to SMOTE for reproducibility. The default
      (None) leaves SMOTE unseeded, so the synthetic samples differ between runs.

    Returns:
    - X_train_sm: Oversampled features
    - y_train_sm: Oversampled labels
    """
    # Forward the seed so callers can make the synthetic samples repeatable,
    # in line with the ADASYN and SMOTE + Tomek helpers in this notebook.
    smt = SMOTE(random_state=random_state)
    X_train_sm, y_train_sm = smt.fit_resample(X_train, y_train)
    return X_train_sm, y_train_sm



# Example usage: report the class counts on either side of the SMOTE resampling.
counter_before = Counter(y_train)
print('Before', counter_before)

X_train_sm, y_train_sm = oversample_with_smote(X_train=X_train, y_train=y_train)

counter_after = Counter(y_train_sm)
print('After', counter_after)

Before Counter({0: 18497, 1: 4208})
After Counter({0: 18497, 1: 18497})


## ADASYN Technique

In [19]:
def adasyn_oversampling(X_train, y_train, random_state=130):
    """
    Oversample the training dataset using ADASYN.

    Prints the class counts before and after resampling.

    Parameters:
    - X_train: Features of the training dataset
    - y_train: Labels of the training dataset
    - random_state: Seed passed to ADASYN (default is 130)

    Returns:
    - X_train_ada: Oversampled features
    - y_train_ada: Oversampled labels
    """
    # Class counts going in
    print('Before', Counter(y_train))

    resampled_features, resampled_labels = ADASYN(random_state=random_state).fit_resample(X_train, y_train)

    # Class counts coming out
    print('After', Counter(resampled_labels))

    return resampled_features, resampled_labels



# Example usage: resample the training split with ADASYN (default seed)
X_train_ada, y_train_ada = adasyn_oversampling(X_train=X_train, y_train=y_train)

Before Counter({0: 18497, 1: 4208})
After Counter({0: 18497, 1: 17388})


## SMOTE + Tomek Links

In [20]:
def oversample_with_smotetomek(X, y, random_state=42):
    """
    Resample the dataset with imblearn's SMOTETomek sampler.

    Prints the class counts before and after resampling.

    Parameters:
    - X: Features of the dataset
    - y: Labels of the dataset
    - random_state: Seed passed to SMOTETomek (default is 42)

    Returns:
    - X_resampled: Resampled features
    - y_resampled: Resampled labels
    """
    # Class counts going in
    print('Before', Counter(y))

    combined_sampler = SMOTETomek(random_state=random_state)
    features_out, labels_out = combined_sampler.fit_resample(X, y)

    # Class counts coming out
    print('After', Counter(labels_out))

    return features_out, labels_out




# Example usage: resample the training split with SMOTE + Tomek links, seeded explicitly
X_train_smtom, y_train_smtom = oversample_with_smotetomek(
    X_train,
    y_train,
    random_state=139,
)


Before Counter({0: 18497, 1: 4208})
After Counter({0: 18090, 1: 18090})


## SMOTE + ENN

In [23]:
def oversample_with_smoteenn(X, y, random_state=None):
    """
    Resample the dataset using SMOTE + ENN.

    Prints the class counts before and after resampling. The counts after
    resampling are not necessarily equal between classes, because samples are
    also removed during the ENN cleaning step.

    Parameters:
    - X: Features of the dataset
    - y: Labels of the dataset
    - random_state: Seed forwarded to SMOTEENN for reproducibility. The default
      (None) leaves the sampler unseeded, so results differ between runs.

    Returns:
    - X_resampled: Resampled features
    - y_resampled: Resampled labels
    """

    # Print class distribution before resampling
    counter_before = Counter(y)
    print('Before', counter_before)

    # Forward the seed so callers can make the resampling repeatable,
    # in line with the ADASYN and SMOTE + Tomek helpers in this notebook.
    smenn = SMOTEENN(random_state=random_state)
    X_resampled, y_resampled = smenn.fit_resample(X, y)

    # Print class distribution after resampling
    counter_after = Counter(y_resampled)
    print('After', counter_after)

    return X_resampled, y_resampled




# Example usage: resample the training split with SMOTE + ENN
X_train_smenn, y_train_smenn = oversample_with_smoteenn(X=X_train, y=y_train)

Before Counter({0: 18497, 1: 4208})
After Counter({1: 14754, 0: 8995})


## The Big Function: One Wrapper for All Four Samplers

In [24]:
def oversample_data(X, y, sampler, **sampler_params):
    """
    Resample the dataset with one of the supported imblearn samplers.

    Prints the class counts before and after resampling.

    Parameters:
    - X: Features of the dataset
    - y: Labels of the dataset
    - sampler: Short name of the technique: 'smote', 'adasyn', 'smtom' or 'smenn'
    - **sampler_params: Keyword arguments passed straight to the sampler's constructor

    Returns:
    - X_resampled: Resampled features
    - y_resampled: Resampled labels

    Raises:
    - ValueError: if `sampler` is not one of the supported names
    """
    # Registry of supported short names and the sampler class each one selects
    available_samplers = dict(
        smote=SMOTE,
        adasyn=ADASYN,
        smtom=SMOTETomek,
        smenn=SMOTEENN,
    )

    # Reject unknown names before doing any work
    chosen_class = available_samplers.get(sampler)
    if chosen_class is None:
        raise ValueError("Invalid oversampling technique. Choose from 'smote', 'adasyn', 'smtom', 'smenn'.")

    # Class counts going in
    print('Before', Counter(y))

    X_resampled, y_resampled = chosen_class(**sampler_params).fit_resample(X, y)

    # Class counts coming out
    print('After', Counter(y_resampled))

    return X_resampled, y_resampled



# Example usage: pick the technique by name; any extra keyword arguments
# are passed on to the selected sampler.
X_train_resampled, y_train_resampled = oversample_data(X=X_train, y=y_train, sampler='smote')


Before Counter({0: 18497, 1: 4208})
After Counter({0: 18497, 1: 18497})


[Ref](https://github.com/Swastik-25/Imbalanced-Data-with-SMOTE-Techniques/blob/main/Imbalanced_Data.ipynb)