In [5]:
# Display the column names of the DataFrame
for column_name in df.columns:
    print(column_name)

CLIENTNUM
Attrition_Flag
Customer_Age
Gender
Dependent_count
Education_Level
Marital_Status
Income_Category
Card_Category
Months_on_book
Total_Relationship_Count
Months_Inactive_12_mon
Contacts_Count_12_mon
Credit_Limit
Total_Revolving_Bal
Avg_Open_To_Buy
Total_Amt_Chng_Q4_Q1
Total_Trans_Amt
Total_Trans_Ct
Total_Ct_Chng_Q4_Q1
Avg_Utilization_Ratio
Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1
Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2


In [11]:
df.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,9.3e-05,0.99991
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,5.7e-05,0.99994
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,2.1e-05,0.99998
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,3313.0,2517,796.0,1.405,1171,20,2.333,0.76,0.000134,0.99987
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,4716.0,0,4716.0,2.175,816,28,2.5,0.0,2.2e-05,0.99998


#### Legend

| Column Name                                            | Explanation                                                                   |
|-------------------------------------------------------|-------------------------------------------------------------------------------|
| CLIENTNUM                                             | Client number. Unique identifier for the customer holding the account         |
| Attrition_Flag                                        | Internal event (customer activity) variable - if the account is closed then 1 else 0 |
| Customer_Age                                          | Demographic variable - Customer's Age in Years                                |
| Gender                                                | Demographic variable - M=Male, F=Female                                       |
| Dependent_count                                       | Demographic variable - Number of dependents                                    |
| Education_Level                                       | Demographic variable - Educational Qualification of the account holder (example: high school, college graduate, etc.) |
| Marital_Status                                        | Demographic variable - Married, Single, Divorced, Unknown                     |
| Income_Category                                       | Demographic variable - Annual Income Category of the account holder (< $40K, $40K - 60K, $60K - $80K, $80K-$120K, > $120K, Unknown) |
| Card_Category                                         | Product Variable - Type of Card (Blue, Silver, Gold, Platinum)                 |
| Months_on_book                                       | Period of relationship with bank                                              |
| Total_Relationship_Count                             | Total no. of products held by the customer                                    |
| Months_Inactive_12_mon                               | No. of months inactive in the last 12 months                                  |
| Contacts_Count_12_mon                                | No. of Contacts in the last 12 months                                         |
| Credit_Limit                                         | Credit Limit on the Credit Card                                               |
| Total_Revolving_Bal                                  | Total Revolving Balance on the Credit Card                                    |
| Avg_Open_To_Buy                                      | Open to Buy Credit Line (Average of last 12 months)                            |
| Total_Amt_Chng_Q4_Q1                                 | Change in Transaction Amount (Q4 over Q1)                                      |
| Total_Trans_Amt                                      | Total Transaction Amount (Last 12 months)                                      |
| Total_Trans_Ct                                       | Total Transaction Count (Last 12 months)                                      |
| Total_Ct_Chng_Q4_Q1                                  | Change in Transaction Count (Q4 over Q1)                                      |
| Avg_Utilization_Ratio                                | Average Card Utilization Ratio                                                |
| Naive_Bayes_Classifier_(...)_mon_1 | Naive Bayes |
| Naive_Bayes_Classifier_(...)_mon_2 | Naive Bayes |


### 1. Data preprocessing

#### 1.0. Prepare data
Prepare the dataset for further analysis by reading the data into a DataFrame.

In [1]:
import pandas as pd

file_path = '../data/BankChurners.csv'
df = pd.read_csv(file_path)

#### 1.1. Data cleaning
Check for missing values, duplicates, and inconsistent data. Remove or impute missing values as needed.

In [7]:
# Remove rows with missing values
df.dropna(inplace=True)

# Remove duplicate rows
df.drop_duplicates(inplace=True)

Number of Duplicate Rows: 0


#### 1.2. Feature selection
Identify relevant features for classification. Drop columns that are not useful or redundant for the analysis.

In [None]:
columns_to_drop = [
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'
]

df = df.drop(columns=columns_to_drop)

#### 1.3. Encoding categorical variables
Convert categorical variables (e.g., Gender, Education_Level, Marital_Status, Income_Category, Card_Category) into numerical format using one-hot encoding.

First the nominal data.

In [None]:
from sklearn.preprocessing import OneHotEncoder

nominal_columns = ['Attrition_Flag', 'Gender', 'Education_Level', 'Marital_Status']

data_to_encode = df[nominal_columns]
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_data = encoder.fit_transform(data_to_encode)

encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(input_features=nominal_columns))

# Drop the original categorical columns from the original DataFrame
df = df.drop(columns=nominal_columns)

# Concatenate the encoded DataFrame with the original DataFrame
df = pd.concat([df, encoded_df], axis=1)

Then the ordinal data.

In [16]:
from sklearn.preprocessing import LabelEncoder

ordinal_columns = ['Income_Category', 'Card_Category']

label_encoders = {}

for column in ordinal_columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

#### 1.4. Scaling features
Normalize or standardize numerical features to ensure they have similar scales. 

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

numerical_columns = [
    'Customer_Age', 'Dependent_count', 'Months_on_book', 'Credit_Limit',
    'Total_Revolving_Bal', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1',
    'Total_Trans_Amt', 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1',
    'Avg_Utilization_Ratio'
]

# Choose the scaling method: 'standardization' or 'normalization'
scaling_method = 'standardization'  # Change this to 'normalization' if needed

if scaling_method == 'standardization':
    scaler = StandardScaler()
elif scaling_method == 'normalization':
    scaler = MinMaxScaler()
else:
    raise ValueError("Invalid scaling method. Use 'standardization' or 'normalization'.")

# Fit and transform the scaler on the numerical columns
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

#### 1.5. Splitting data
Divide the dataset into training and testing sets.

In [None]:
from sklearn.model_selection import train_test_split

# Specify the features (X) and the target variable (y)
X = df.drop(columns=['Attrition_Flag_Existing Customer'])  
y = df['Attrition_Flag_Existing Customer']  

# Split the data into 80% training and 20% testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


#### 2. Churn Rate Analysis

In [22]:
# Calculate the number of churned customers
churned_count = df['Attrition_Flag_Existing Customer'].value_counts()[0]  # Assuming 0 corresponds to churned

# Calculate the total number of customers
total_customers = len(df)

# Calculate the churn rate
churn_rate = (churned_count / total_customers) * 100
print(f"Current Churn Rate: {churn_rate:.2f}%")

Current Churn Rate: 16.07%


#### 2.1 Implement multiple classification algorithms and compare their performance

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score

# Define a list of classification models to try
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Support Vector Machine': SVC(),
}

# Loop over the models, train them, and evaluate their performance
for model_name, model in models.items():
    print(f"Training {model_name}...")
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on the test data
    y_pred = model.predict(X_test)
    
    # Calculate classification report
    report = classification_report(y_test, y_pred, zero_division='warn')
    
    # Calculate ROC AUC score
    roc_auc = roc_auc_score(y_test, y_pred)
    
    # Print the evaluation metrics
    print(f"{model_name} Evaluation:")
    print("Classification Report:")
    print(report)
    print(f"ROC AUC Score: {roc_auc:.2f}")
    print("="*40)


Training Logistic Regression...
Logistic Regression Evaluation:
Classification Report:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       327
         1.0       0.84      1.00      0.91      1699

    accuracy                           0.84      2026
   macro avg       0.42      0.50      0.46      2026
weighted avg       0.70      0.84      0.76      2026

ROC AUC Score: 0.50
Training Decision Tree...
Decision Tree Evaluation:
Classification Report:
              precision    recall  f1-score   support

         0.0       0.78      0.77      0.78       327
         1.0       0.96      0.96      0.96      1699

    accuracy                           0.93      2026
   macro avg       0.87      0.87      0.87      2026
weighted avg       0.93      0.93      0.93      2026

ROC AUC Score: 0.87
Training Random Forest...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Forest Evaluation:
Classification Report:
              precision    recall  f1-score   support

         0.0       0.93      0.78      0.85       327
         1.0       0.96      0.99      0.97      1699

    accuracy                           0.95      2026
   macro avg       0.94      0.88      0.91      2026
weighted avg       0.95      0.95      0.95      2026

ROC AUC Score: 0.88
Training Gradient Boosting...
Gradient Boosting Evaluation:
Classification Report:
              precision    recall  f1-score   support

         0.0       0.93      0.84      0.88       327
         1.0       0.97      0.99      0.98      1699

    accuracy                           0.96      2026
   macro avg       0.95      0.91      0.93      2026
weighted avg       0.96      0.96      0.96      2026

ROC AUC Score: 0.91
Training Support Vector Machine...
Support Vector Machine Evaluation:
Classification Report:
              precision    recall  f1-score   support

         0.0       0.00   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
