In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw customer churn table; the relative path is resolved against
# the current working directory.
df = pd.read_csv(filepath_or_buffer='bank_customer_churn.csv')

In [3]:
# Preview the first five rows of the raw table.
df.head(n=5)

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
# Input columns selected for modelling, in the order they are pulled from df.
features = [
    'country',
    'gender',
    'age',
    'balance',
    'products_number',
    'active_member',
]

In [5]:
# Keep every row, but only the columns listed in `features`.
model_df = df.loc[:, features]

In [6]:
# Preview the first five rows of the selected feature columns.
model_df.head(n=5)

Unnamed: 0,country,gender,age,balance,products_number,active_member
0,France,Female,42,0.0,1,1
1,Spain,Female,41,83807.86,1,1
2,France,Female,42,159660.8,3,0
3,France,Female,39,0.0,2,0
4,Spain,Female,43,125510.82,1,1


In [7]:
# One-hot encode the categorical predictors.
# The encoding is built from df[features] instead of from model_df itself so
# that this cell is idempotent: re-running it no longer raises a KeyError
# because 'country' / 'gender' / 'products_number' were already replaced by
# their dummy columns on a previous run.
# drop_first is not passed, so every level keeps its own 0/1 column.
model_df = pd.get_dummies(
    df[features],
    columns=['country', 'gender', 'products_number'],
    dtype=int,
)

In [8]:
# Preview the first five rows of the encoded feature frame.
model_df.head(n=5)

Unnamed: 0,age,balance,active_member,country_France,country_Germany,country_Spain,gender_Female,gender_Male,products_number_1,products_number_2,products_number_3,products_number_4
0,42,0.0,1,1,0,0,1,0,1,0,0,0
1,41,83807.86,1,0,0,1,1,0,1,0,0,0
2,42,159660.8,0,1,0,0,1,0,0,0,1,0
3,39,0.0,0,1,0,0,1,0,0,1,0,0
4,43,125510.82,1,0,0,1,1,0,1,0,0,0


In [12]:
from sklearn.model_selection import train_test_split

# Feature matrix is the prepared model_df; the target is the churn column of df.
X, y = model_df, df['churn']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Preview the first five rows of the feature matrix.
X.head(n=5)

Unnamed: 0,age,balance,active_member,country_France,country_Germany,country_Spain,gender_Female,gender_Male,products_number_1,products_number_2,products_number_3,products_number_4
0,42,0.0,1,1,0,0,1,0,1,0,0,0
1,41,83807.86,1,0,0,1,1,0,1,0,0,0
2,42,159660.8,0,1,0,0,1,0,0,0,1,0
3,39,0.0,0,1,0,0,1,0,0,1,0,0
4,43,125510.82,1,0,0,1,1,0,1,0,0,0


In [14]:
# Preview the first five target values.
y.head(n=5)

0    1
1    0
2    1
3    0
4    0
Name: churn, dtype: int64

In [28]:
# Logistic regression baseline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# fit() returns the estimator itself, so construction and training are chained.
# NOTE(review): the inputs mix raw balance values with 0/1 indicator columns and
# are not scaled here — confirm whether max_iter=2000 is compensating for that.
model = LogisticRegression(max_iter=2000).fit(X_train, y_train)

# Class predictions for the held-out rows.
y_pred = model.predict(X_test)

# Share of held-out rows predicted correctly.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1; heading and table are separated by a newline.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

# Confusion matrix (rows: true class, columns: predicted class).
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Accuracy: 0.8425
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.96      0.91      1607
           1       0.70      0.35      0.47       393

    accuracy                           0.84      2000
   macro avg       0.78      0.66      0.69      2000
weighted avg       0.83      0.84      0.82      2000

Confusion Matrix:
[[1547   60]
 [ 255  138]]


In [27]:
# Decision Tree model
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so the metrics below are reproducible: scikit-learn randomly
# permutes the candidate features at each split, so an unseeded tree can differ
# from run to run. 42 matches the seed used for the train/test split and for
# the ensemble models in this notebook.
# Note: `model`, `y_pred` and `accuracy` are also assigned by the logistic
# regression cell, so they hold the results of whichever cell ran last.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Class predictions for the held-out rows.
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.813
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.89      0.88      1607
           1       0.53      0.49      0.51       393

    accuracy                           0.81      2000
   macro avg       0.70      0.69      0.70      2000
weighted avg       0.81      0.81      0.81      2000

Confusion Matrix:
[[1433  174]
 [ 200  193]]


In [24]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


def _report_metrics(label, y_true, y_hat):
    """Print accuracy, classification report and confusion matrix for one model.

    Parameters
    ----------
    label : str
        Model name used as the prefix of every printed heading.
    y_true : array-like
        Ground-truth class labels.
    y_hat : array-like
        Predicted class labels, aligned with ``y_true``.

    Returns
    -------
    The value returned by ``accuracy_score`` for these labels, so the caller
    can keep it in a variable.
    """
    acc = accuracy_score(y_true, y_hat)
    print(f"{label} Accuracy:", acc)
    print(f"{label} Classification Report:")
    print(classification_report(y_true, y_hat))
    print(f"{label} Confusion Matrix:")
    print(confusion_matrix(y_true, y_hat))
    return acc


# Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)

# Gradient Boosting Classifier
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)
gb_y_pred = gb_model.predict(X_test)

# Evaluate Random Forest Classifier
rf_accuracy = _report_metrics("Random Forest", y_test, rf_y_pred)

# Evaluate Gradient Boosting Classifier
print()  # blank separator line between the two reports
gb_accuracy = _report_metrics("Gradient Boosting", y_test, gb_y_pred)


Random Forest Accuracy: 0.8425
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.93      0.90      1607
           1       0.63      0.49      0.55       393

    accuracy                           0.84      2000
   macro avg       0.75      0.71      0.73      2000
weighted avg       0.83      0.84      0.83      2000

Random Forest Confusion Matrix:
[[1493  114]
 [ 201  192]]

Gradient Boosting Accuracy: 0.8665
Gradient Boosting Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.96      0.92      1607
           1       0.74      0.49      0.59       393

    accuracy                           0.87      2000
   macro avg       0.81      0.72      0.76      2000
weighted avg       0.86      0.87      0.86      2000

Gradient Boosting Confusion Matrix:
[[1540   67]
 [ 200  193]]
