In [4]:
%pip install imblearn
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import ADASYN, SMOTE

In [28]:
data = pd.read_csv('Churn_Modelling.csv')
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42.0,2,0.0,1,1.0,1.0,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41.0,1,83807.86,1,0.0,1.0,112542.58,0
2,3,15619304,Onio,502,France,Female,42.0,8,159660.8,3,1.0,0.0,113931.57,1
3,4,15701354,Boni,699,France,Female,39.0,1,0.0,2,0.0,0.0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43.0,2,125510.82,1,,1.0,79084.1,0


In [29]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10002 entries, 0 to 10001
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10002 non-null  int64  
 1   CustomerId       10002 non-null  int64  
 2   Surname          10002 non-null  object 
 3   CreditScore      10002 non-null  int64  
 4   Geography        10001 non-null  object 
 5   Gender           10002 non-null  object 
 6   Age              10001 non-null  float64
 7   Tenure           10002 non-null  int64  
 8   Balance          10002 non-null  float64
 9   NumOfProducts    10002 non-null  int64  
 10  HasCrCard        10001 non-null  float64
 11  IsActiveMember   10001 non-null  float64
 12  EstimatedSalary  10002 non-null  float64
 13  Exited           10002 non-null  int64  
dtypes: float64(5), int64(6), object(3)
memory usage: 976.8+ KB


In [30]:
# Dropping unnecessary columns
data.drop('CustomerId', axis=1, inplace=True)
data.drop('Surname', axis=1, inplace=True)
data.drop('RowNumber', axis=1, inplace=True)

In [31]:
# Check missing values
missing_values = data.isnull().sum()
print(missing_values)

CreditScore        0
Geography          1
Gender             0
Age                1
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          1
IsActiveMember     1
EstimatedSalary    0
Exited             0
dtype: int64


In [32]:
# Drop rows with any missing values as the count is very less
data.dropna(inplace=True)

In [33]:
# Verify missing values in each column
missing_values = data.isnull().sum()
print(missing_values)

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64


In [34]:
# Encode categorical variables

# Fit the encoders on the training data
geography_encoder = LabelEncoder()
data['Geography'] = geography_encoder.fit_transform(data['Geography'])

gender_encoder = LabelEncoder()
data['Gender'] = gender_encoder.fit_transform(data['Gender'])

In [35]:
# Feature scaling

numerical_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

scaler = StandardScaler()

data[numerical_features] = scaler.fit_transform(data[numerical_features])

In [36]:
# Separating Features and Target
X = data.drop('Exited', axis=1)
y = data['Exited']

In [37]:
# Balancing dataset
smote = SMOTE(k_neighbors=5)
# Resample the data
X_resampled, y_resampled = smote.fit_resample(X, y)

In [38]:
# Split data 
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [39]:
print("Training Data Size: ", X_train.shape)
print("Test Data Size: ", X_test.shape)

Training Data Size:  (12736, 10)
Test Data Size:  (3184, 10)


In [40]:
# XGBoost Model Training
import xgboost as xgb

model = xgb.XGBClassifier()
model.fit(X_train, y_train)

In [41]:
# XGBoost Model evaluation
y_pred = model.predict(X_test)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("F1-Score:", f1)

Confusion Matrix:
 [[1492  131]
 [ 196 1365]]
Accuracy: 0.8972989949748744
F1-Score: 0.8930323846908734


In [42]:
# RandomForest Model Training
model = RandomForestClassifier()
model.fit(X_train, y_train)

In [43]:
# RandomForest Model evaluation
y_pred = model.predict(X_test)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("F1-Score:", f1)

Confusion Matrix:
 [[1463  160]
 [ 164 1397]]
Accuracy: 0.8982412060301508
F1-Score: 0.8960872354073125


In [44]:
# Adding new data for test
new_user_data = {
    "CreditScore": 645,  
    "Geography": ["Spain"],  
    "Gender": ["Male"],  
    "Age": 44,  
    "Tenure": 2,  
    "Balance": 132202.88,  
    "NumOfProducts": 2,  
    "HasCrCard": 0,  
    "IsActiveMember": 0,  
    "EstimatedSalary": 79084.1,  
}

In [45]:
# Create a DataFrame from the dictionary
new_user_df = pd.DataFrame(new_user_data, index=[0])  # Single row DataFrame

# Encode categorical features (assuming same encoding as in training)
geography_encoder = LabelEncoder()
new_user_df['Geography'] = geography_encoder.fit_transform(new_user_df['Geography'])

gender_encoder = LabelEncoder()
new_user_df['Gender'] = gender_encoder.fit_transform(new_user_df['Gender'])

# Feature scaling (assuming same scaling as in training)
scaler = StandardScaler()
new_user_df[numerical_features] = scaler.fit_transform(new_user_df[numerical_features])

In [46]:
# Make the prediction
prediction = model.predict(new_user_df)[0]

# Interpret the prediction (0: not churn, 1: churn)
if prediction == 0:
    print("Prediction: The user is unlikely to churn (leave).")
else:
    print("Prediction: The user has a higher chance of churning (leaving).")

Prediction: The user is unlikely to churn (leave).


In [47]:
# The percentage of user staying or leaving
# Make the prediction using the trained model
prediction = model.predict_proba(new_user_df)[0]

# Extract the probability of leaving 
churn_probability = prediction[1]  

# Format the probability as a percentage with two decimal places
probability_formatted = f"{churn_probability * 100:.2f}%"

# Print the prediction in terms of churn probability
print(f"Prediction: The user has a {probability_formatted} chance of churning (leaving).")

Prediction: The user has a 31.00% chance of churning (leaving).
