In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from keras.layers import Dense, Input, Dropout, Activation, Conv1D,Flatten
from keras.layers import MaxPooling1D,GlobalMaxPool1D
from keras.models import Sequential,Model

In [None]:
# Read the raw customer table from the working directory; the trailing
# expression displays its (rows, columns) shape.
csv_path = "BankChurners.csv"
df = pd.read_csv(csv_path)
df.shape

In [None]:
# Binary-encode the target and the gender column.
df['Attrition_Flag'] = df['Attrition_Flag'].replace({'Attrited Customer': 1, 'Existing Customer': 0})
df['Gender'] = df['Gender'].replace({'F': 1, 'M': 0})

# One-hot encode each categorical column and append the indicator columns,
# leaving out one level per column (the level named on the right).
for column, omitted_level in (
    ('Education_Level', 'Unknown'),
    ('Income_Category', 'Unknown'),
    ('Marital_Status', 'Unknown'),
    ('Card_Category', 'Platinum'),
):
    indicators = pd.get_dummies(df[column]).drop(columns=[omitted_level])
    df = pd.concat([df, indicators], axis=1)

# The original categorical columns and the client identifier are dropped
# now that the indicator columns exist.
# NOTE: this cell rebinds `df`, so it can only be run once per load of the CSV.
df = df.drop(columns=['Education_Level', 'Income_Category', 'Marital_Status', 'Card_Category', 'CLIENTNUM'])
df

In [None]:
# --- Experiment switches (read by the cells below) ---
# Encoder: which encoding is applied to the feature columns. Only the
# "Label Encoder" value is handled by the visible cells; the alternative
# below is left disabled.
#Encoder = "OneHotEncoder"
Encoder = "Label Encoder"
# OverSamplingTecnique: which class-rebalancing step runs before training.
# Each value is matched by string equality in its own cell further down;
# the empty string disables rebalancing.
# OverSamplingTecnique = ""
OverSamplingTecnique = "SMOTE-Enn"
#OverSamplingTecnique = "SMOTE-Tomek"
#OverSamplingTecnique = "SMOTE"

In [None]:
# Always start from a copy of the preprocessed frame. Previously df_final was
# only created inside the "Label Encoder" branch, so any other Encoder value
# made the X / Y construction below fail with a NameError.
df_final = df.copy()

if Encoder == "Label Encoder":
    print("Applying Label Encoder")
    le = LabelEncoder()
    # Despite the variable name these are the numeric columns. LabelEncoder
    # replaces every value with the position of that value among the column's
    # sorted distinct values, so each column becomes an integer rank code.
    text_data_features = ['Customer_Age', 'Dependent_count', 'Months_on_book', 'Total_Relationship_Count',
                          'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
                          'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt', 'Total_Trans_Ct',
                          'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']

    print('Label Encoder Transformation')
    for i in text_data_features:
        # The encoder is re-fitted per column; the printout pairs each code
        # with the original value it stands for.
        df_final[i] = le.fit_transform(df_final[i])
        print(i,' : ',df_final[i].unique(),' = ',le.inverse_transform(df_final[i].unique()))


# Feature matrix: everything except the target. Target: churn flag as int.
X = df_final.drop(['Attrition_Flag'], axis=1).copy()
Y = df_final['Attrition_Flag'].copy().astype(int)

In [None]:
# Standardise every feature column and rebind X to the scaled array; the same
# array is also kept under the name X_resampled_scaled. Y is left untouched.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before any
# cross-validation split is made.
scaler = StandardScaler()
X = X_resampled_scaled = scaler.fit_transform(X)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')

# Shared figure styling (bold, large fonts).
font_size = 20
plt.rcParams['font.weight'] = 'bold'
plt.rcParams['axes.linewidth'] = 1
plt.rcParams['axes.labelsize'] = font_size
plt.rcParams['axes.titlesize'] = font_size + 2
plt.rcParams['xtick.labelsize'] = font_size - 2
plt.rcParams['ytick.labelsize'] = font_size - 2
plt.rcParams['legend.fontsize'] = font_size - 2

# One colour per class, in bar order: label 0 first, label 1 second.
colors = ['#00A5E0', '#DD403A']

fig, ax = plt.subplots(figsize=(5, 4))

# Class distribution of the target before any resampling.
sns.countplot(x='Attrition_Flag', data=df_final, palette=colors, ax=ax)

# The bars are drawn in ascending label order, whereas value_counts() sorts by
# frequency. Sorting the counts by label keeps every percentage above its own
# bar even when class 1 is the larger class.
class_counts = df_final['Attrition_Flag'].value_counts().sort_index()
n_samples = df_final['Attrition_Flag'].shape[0]
for index, value in enumerate(class_counts):
    label = '{}%'.format(round((value / n_samples) * 100, 2))
    ax.annotate(label,
                xy=(index, value + 250),  # slightly above the top of the bar
                ha='center',
                va='center',
                color=colors[index], weight='bold',
                size=font_size + 4)

ax.set_xticklabels(['Retained', 'Churned'], fontweight='bold')
ax.set_xlabel('Status', fontweight='bold')
ax.set_ylabel('Count', fontweight='bold')
ax.set_ylim([0, 10000]);

In [None]:
if OverSamplingTecnique == "SMOTE":
    print("Applying SMOTE")
    # Seeded so the synthetic samples, and therefore every downstream score,
    # are reproducible from run to run (42 matches the random_state constant
    # defined in the k-fold cell).
    smote = SMOTE(random_state=42)

    X_resampled, y_resampled = smote.fit_resample(X, Y)
    # Re-standardise because resampling changes the per-column mean/variance.
    scaler = StandardScaler()
    X_resampled_scaled = scaler.fit_transform(X_resampled)
    # From here on the notebook-wide X / Y refer to the rebalanced data.
    # NOTE(review): resampling happens before the cross-validation split, so
    # synthetic points derived from a sample can land in a different fold.
    X = X_resampled_scaled
    Y = y_resampled

In [None]:
if OverSamplingTecnique == "SMOTE-Tomek":
    print("Applying SMOTE-Tomek")

    # Seeded so the synthetic samples, and therefore every downstream score,
    # are reproducible from run to run (42 matches the random_state constant
    # defined in the k-fold cell).
    smote_tomek = SMOTETomek(random_state=42)
    X_resampled, y_resampled = smote_tomek.fit_resample(X, Y)
    # Re-standardise because resampling changes the per-column mean/variance.
    scaler = StandardScaler()
    X_resampled_scaled = scaler.fit_transform(X_resampled)
    # From here on the notebook-wide X / Y refer to the rebalanced data.
    # NOTE(review): resampling happens before the cross-validation split, so
    # synthetic points derived from a sample can land in a different fold.
    X = X_resampled_scaled
    Y = y_resampled

In [None]:
if OverSamplingTecnique == "SMOTE-Enn":
    print("Applying SMOTE-Enn")

    # Seeded so the synthetic samples, and therefore every downstream score,
    # are reproducible from run to run (42 matches the random_state constant
    # defined in the k-fold cell).
    smote_enn = SMOTEENN(random_state=42)
    X_resampled, y_resampled = smote_enn.fit_resample(X, Y)
    # Re-standardise because resampling changes the per-column mean/variance.
    scaler = StandardScaler()
    X_resampled_scaled = scaler.fit_transform(X_resampled)
    # From here on the notebook-wide X / Y refer to the rebalanced data.
    # NOTE(review): resampling happens before the cross-validation split, so
    # synthetic points derived from a sample can land in a different fold.
    X = X_resampled_scaled
    Y = y_resampled

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')

# Shared figure styling (bold, large fonts).
font_size = 20
plt.rcParams['font.weight'] = 'bold'
plt.rcParams['axes.linewidth'] = 1
plt.rcParams['axes.labelsize'] = font_size
plt.rcParams['axes.titlesize'] = font_size + 2
plt.rcParams['xtick.labelsize'] = font_size - 2
plt.rcParams['ytick.labelsize'] = font_size - 2
plt.rcParams['legend.fontsize'] = font_size - 2

# One colour per class, in bar order: label 0 first, label 1 second.
colors = ['#00A5E0', '#DD403A']

fig, ax = plt.subplots(figsize=(5, 4))

# Plot whatever X / Y currently hold. After a resampling cell has run they are
# the very objects previously read here as X_resampled_scaled / y_resampled;
# when resampling is disabled, y_resampled does not exist and the old code
# raised a NameError. Converting Y to a plain array also avoids any pandas
# index alignment when it is attached to the frame.
current_labels = np.asarray(Y)
X_resampled_scaled_df = pd.DataFrame(X)
X_resampled_scaled_df['Attrition_Flag'] = current_labels

sns.countplot(x='Attrition_Flag', data=X_resampled_scaled_df, palette=colors, ax=ax)

# The bars are drawn in ascending label order, whereas value_counts() sorts by
# frequency. After resampling either class can be the larger one, so sort the
# counts by label to keep every percentage above its own bar.
class_counts = pd.Series(current_labels).value_counts().sort_index()
n_samples = current_labels.shape[0]
for index, value in enumerate(class_counts):
    label = '{}%'.format(round((value / n_samples) * 100, 2))
    ax.annotate(label,
                xy=(index, value + 250),  # slightly above the top of the bar
                ha='center',
                va='center',
                color=colors[index], weight='bold',
                size=font_size + 4)

ax.set_xticklabels(['Retained', 'Churned'], fontweight='bold')
ax.set_xlabel('Status', fontweight='bold')
ax.set_ylabel('Count', fontweight='bold')
ax.set_ylim([0, 10000]);

In [None]:
def feature_extractor_from_dl_model(model, data_seq, layer_num):
    """
    Re-encode samples as the activations of one layer of a DL model.

    A sub-model running from the input of the model's first layer to the
    output of layer ``layer_num`` is built, the samples are passed through it,
    and each sample's activations are flattened into a single row.

    Parameters
    ----------
    @param model: DL model whose layers are reused
    @param data_seq: input samples; the first axis indexes the samples
    @param layer_num: index into model.layers; the output of that layer is used as feature representation
    @return: 2-D numpy array holding one flattened activation row per sample
    """
    # Sub-model sharing the original layers, cut off after the chosen layer.
    truncated = tf.keras.models.Model(
        inputs=model.layers[0].input,
        outputs=model.layers[layer_num].output,
    )
    activations = np.array(truncated.predict(data_seq))
    # Collapse every non-sample axis so each sample becomes one feature row.
    return activations.reshape(data_seq.shape[0], -1)

In [None]:
# Seed passed to the RandomForestClassifier built inside kfold() below.
random_state = 42
# Additional imports for the cross-validation cell.
# NOTE(review): in the visible active code only RandomForestClassifier and LSTM
# from this group are referenced; the remaining names appear unused or are
# mentioned only in commented-out code - confirm before pruning.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D,Flatten,GRU,SimpleRNN,Concatenate

def _build_cnn_classifier(n_features):
    """Build and compile the Conv1D binary classifier shared by model variants 1 and 2.

    @param n_features: number of input features; the network input shape is (n_features, 1)
    @return: compiled, untrained Sequential model ending in a single sigmoid unit
    """
    model = Sequential()
    model.add(Input(shape=(n_features, 1)))
    model.add(Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
    model.add(GlobalMaxPool1D())
    model.add(Dense(64, activation='tanh'))
    model.add(Dense(32, activation='tanh'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.00001)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model


def _build_lstm_classifier(n_features):
    """Build and compile the two-layer LSTM binary classifier used by model variant 3.

    @param n_features: number of input features; the network input shape is (n_features, 1)
    @return: compiled, untrained Sequential model ending in a single sigmoid unit
    """
    model = Sequential()
    model.add(Input(shape=(n_features, 1)))
    model.add(LSTM(128, return_sequences=True))
    model.add(LSTM(16))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='ADAM', loss='binary_crossentropy', metrics=['accuracy'])
    return model


def kfold(Model_No, num_folds=2):
    """Run stratified k-fold cross-validation of one model variant on the notebook-level X / Y.

    Per fold the chosen model is trained on the training part, evaluated on the
    held-out part, and accuracy / precision / recall / F1 are printed. The
    averages over all folds are printed at the end (accuracy, precision,
    recall, F1 - one value per line).

    Model variants:
      1 - Conv1D network trained for 30 epochs.
      2 - the same Conv1D network trained for 3 epochs, used as a feature
          extractor (output of layer -6) for a RandomForestClassifier.
      3 - two-layer LSTM network trained for 10 epochs.

    NOTE(review): X and Y are read from the notebook globals, which earlier
    cells have already scaled and (optionally) resampled on the full dataset,
    so the test folds are not independent of the training data and the scores
    are likely optimistic.
    NOTE(review): Keras takes validation_split from the tail of the arrays it
    is given; if the resampler appends synthetic samples at the end, the
    validation slice may be dominated by one class - confirm.

    @param Model_No: 1, 2 or 3, selecting the model variant described above
    @param num_folds: number of stratified folds (defaults to the previous hard-coded 2)
    @raise ValueError: if Model_No is not 1, 2 or 3
    """
    # Fail fast instead of reaching model.predict with an undefined model.
    if Model_No not in (1, 2, 3):
        raise ValueError(f"Unknown Model_No {Model_No!r}; expected 1, 2 or 3")

    print("Applying K-fold")

    # Plain arrays make the fold indexing positional; indexing a pandas Series
    # with an integer array is label-based and only works with a default index.
    features = np.asarray(X)
    labels = np.asarray(Y)
    # The networks take (samples, n_features, 1); add the channel axis
    # explicitly rather than relying on Keras to expand the rank implicitly.
    inputs = features[..., np.newaxis] if features.ndim == 2 else features

    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []

    # Seeded shuffle so the fold assignment is the same on every run.
    skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=random_state)
    for fold_number, (train_index, test_index) in enumerate(skf.split(features, labels), start=1):
        print(f"Fold {fold_number}/{num_folds}:")
        X_train, X_test = inputs[train_index], inputs[test_index]
        Y_train, Y_test = labels[train_index], labels[test_index]
        n_features = X_train.shape[1]

        if Model_No == 1:
            model = _build_cnn_classifier(n_features)
            model.fit(X_train, Y_train, epochs=30, batch_size=32, verbose=1, validation_split=0.2)
        elif Model_No == 2:
            cnn = _build_cnn_classifier(n_features)
            cnn.fit(X_train, Y_train, epochs=3, batch_size=32, verbose=1, validation_split=0.2)
            # Layer -6 is the last Conv1D layer; its flattened activations
            # become the inputs of the random forest.
            model = RandomForestClassifier(random_state=random_state)
            model.fit(feature_extractor_from_dl_model(cnn, X_train, -6), Y_train)
            X_test = feature_extractor_from_dl_model(cnn, X_test, -6)
        else:
            model = _build_lstm_classifier(n_features)
            model.fit(X_train, Y_train, epochs=10, batch_size=32, verbose=1, validation_split=0.2)

        # Networks return sigmoid probabilities, the forest returns labels;
        # rounding maps both onto 0 / 1.
        Y_pred = model.predict(X_test)
        Y_pred_binary = np.round(Y_pred).flatten()

        accuracy = accuracy_score(Y_test, Y_pred_binary)
        precision = precision_score(Y_test, Y_pred_binary)
        recall = recall_score(Y_test, Y_pred_binary)
        f1 = f1_score(Y_test, Y_pred_binary)

        print(' Test--------- | accuracy:{:<6.4f} | precision:{:<6.4f} |recall:{:<6.4f} | F1:{:<6.4f}'.format( accuracy, precision, recall, f1))
        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)

    # Averages over all folds.
    avg_accuracy = np.mean(accuracy_scores)
    avg_precision = np.mean(precision_scores)
    avg_recall = np.mean(recall_scores)
    avg_f1 = np.mean(f1_scores)

    print(avg_accuracy)
    print(avg_precision)
    print(avg_recall)
    print(avg_f1)
  

In [None]:
# Cross-validate model variant 1 (the Conv1D network built in kfold's Model_No==1 branch).
kfold(1)