In [146]:


import pandas as pd 
import numpy as np

In [147]:

df = pd.read_csv('Churn_Modelling.csv')

print("✅ Dataset loaded successfully!")
print(f"📏 Dataset size: {df.shape[0]} customers with {df.shape[1]} features")
print("🎯 Goal: Predict which customers will leave the bank (churn)")


✅ Dataset loaded successfully!
📏 Dataset size: 10000 customers with 14 features
🎯 Goal: Predict which customers will leave the bank (churn)


In [148]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [149]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [150]:
df.duplicated().sum()

np.int64(0)

In [151]:
df['Exited'].value_counts()

Exited
0    7963
1    2037
Name: count, dtype: int64

In [152]:
# 2037 people have exited the bank, which is 20.37% of the total dataset.

In [153]:
df['Gender'].value_counts()

Gender
Male      5457
Female    4543
Name: count, dtype: int64

In [154]:

df.drop(columns=['RowNumber', 'CustomerId', 'Surname'], inplace=True)

In [155]:
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [156]:

df = pd.get_dummies(df, columns=['Geography','Gender'], drop_first=True)

In [157]:
df.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,2,0.0,1,1,1,101348.88,1,False,False,False
1,608,41,1,83807.86,1,0,1,112542.58,0,False,True,False
2,502,42,8,159660.8,3,1,0,113931.57,1,False,False,False
3,699,39,1,0.0,2,0,0,93826.63,0,False,False,False
4,850,43,2,125510.82,1,1,1,79084.1,0,False,True,False


In [158]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('Exited', axis=1),  # All columns except 'Exited' 
    df['Exited'],               # Only the 'Exited' column (our target)
    test_size=0.2,              # 20% for testing
    random_state=1              # For reproducible results
)

In [159]:
X_train

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Germany,Geography_Spain,Gender_Male
2694,628,29,3,113146.98,2,0,1,124749.08,True,False,True
5140,626,29,4,105767.28,2,0,0,41104.82,False,False,False
2568,612,47,6,130024.87,1,1,1,45750.21,True,False,False
3671,646,52,6,111739.40,2,0,1,68367.18,True,False,False
7427,714,33,8,122017.19,1,0,0,162515.17,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...
2895,621,47,7,107363.29,1,1,1,66799.28,True,False,True
7813,684,63,3,81245.79,1,1,0,69643.31,True,False,False
905,672,45,9,0.00,1,1,1,92027.69,False,False,False
5192,663,39,8,0.00,2,1,1,101168.90,False,False,False


In [160]:
X_test

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_Germany,Geography_Spain,Gender_Male
9953,550,47,2,0.00,2,1,1,97057.28,False,False,True
3850,680,34,3,143292.95,1,1,0,66526.01,False,False,True
4962,531,42,2,0.00,2,0,1,90537.47,False,False,False
3886,710,34,8,147833.30,2,0,1,1561.58,True,False,True
5437,543,30,6,73481.05,1,1,1,176692.65,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...
3919,763,39,7,0.00,2,1,0,19458.75,False,True,False
162,800,49,7,108007.36,1,0,0,47125.11,False,False,False
7903,567,34,10,0.00,2,0,1,161571.79,False,True,True
2242,621,30,2,101014.08,2,1,1,165257.31,True,False,False


In [161]:
y_train

2694    0
5140    0
2568    1
3671    0
7427    0
       ..
2895    0
7813    1
905     1
5192    0
235     1
Name: Exited, Length: 8000, dtype: int64

In [162]:
y_test

9953    0
3850    0
4962    0
3886    0
5437    0
       ..
3919    0
162     0
7903    0
2242    0
2745    0
Name: Exited, Length: 2000, dtype: int64

In [163]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8000, 11), (2000, 11), (8000,), (2000,))

In [164]:

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Learn scaling from training data
X_test_scaled = scaler.transform(X_test)        # Apply same scaling to test data


In [165]:
X_train_scaled

array([[-0.23082038, -0.94449979, -0.70174202, ...,  1.71490137,
        -0.57273139,  0.91509065],
       [-0.25150912, -0.94449979, -0.35520275, ..., -0.58312392,
        -0.57273139, -1.09278791],
       [-0.3963303 ,  0.77498705,  0.33787579, ...,  1.71490137,
        -0.57273139, -1.09278791],
       ...,
       [ 0.22433188,  0.58393295,  1.3774936 , ..., -0.58312392,
        -0.57273139, -1.09278791],
       [ 0.13123255,  0.01077067,  1.03095433, ..., -0.58312392,
        -0.57273139, -1.09278791],
       [ 1.1656695 ,  0.29735181,  0.33787579, ...,  1.71490137,
        -0.57273139,  0.91509065]])

In [166]:
# 🧠 STEP 8: Import Deep Learning Tools
# TensorFlow/Keras: Google's library for building neural networks (like brain-inspired AI)

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential     
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential


In [167]:
model = Sequential()


In [168]:

model.add(Dense(3, activation='sigmoid', input_dim=11))



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [169]:

model.add(Dense(1, activation='sigmoid'))

In [170]:
model.summary()

In [171]:

model.compile(
    loss='binary_crossentropy',  # How to measure mistakes
    optimizer='adam',            # How to improve from mistakes  
    metrics=['accuracy']         # What to track during training
)

In [172]:

model.fit(X_train_scaled, y_train, epochs=100)


Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.7949 - loss: 0.5275
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.7949 - loss: 0.5275
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8034 - loss: 0.4771
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8034 - loss: 0.4771
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7916 - loss: 0.4743
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7916 - loss: 0.4743
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7994 - loss: 0.4523
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7994 - loss: 0.4523
Epoch 5/100
[1m250/250[0m [32

<keras.src.callbacks.history.History at 0x1f998202850>

In [173]:
model.layers[0].get_weights()

[array([[ 0.04133958, -0.01401854,  0.02500717],
        [ 0.60813874,  3.3471503 , -0.06815195],
        [ 0.17973347,  0.05643424,  0.01703084],
        [ 1.1627493 ,  0.30596462, -0.8866696 ],
        [ 3.5818326 ,  0.77719176, -1.2095958 ],
        [ 0.04901799, -0.02073488, -0.00552606],
        [ 0.62983096,  0.41613737,  1.0667982 ],
        [-0.1045814 ,  0.10289695,  0.13407211],
        [-0.97805303, -0.2059337 , -0.2051201 ],
        [ 0.11120638, -0.15802579, -0.30739057],
        [ 0.07474242, -0.10451389,  0.40781057]], dtype=float32),
 array([ 1.8232484 , -0.50410163,  1.209701  ], dtype=float32)]

In [174]:
y_log = model.predict(X_test_scaled)


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [175]:
y_pred = np.where(y_log > 0.5, 1, 0) 


In [176]:
print("👁️ Looking at actual customer predictions...")
print("="*60)

# Get first 10 customers from test set
sample_size = 10
sample_indices = range(sample_size)

# Create a results dataframe for easy viewing
results_df = pd.DataFrame({
    'Customer_Index': sample_indices,
    'Actual_Churn': y_test.iloc[sample_indices].values,
    'Predicted_Probability': y_log[sample_indices].flatten(),
    'Predicted_Churn': y_pred[sample_indices].flatten()
})

# Add interpretation
results_df['Interpretation'] = results_df.apply(lambda row: 
    f"{'✅ Correct' if row['Actual_Churn'] == row['Predicted_Churn'] else '❌ Wrong'} - "
    f"{'High Risk' if row['Predicted_Probability'] > 0.5 else 'Low Risk'}", axis=1)

print("📊 Sample Predictions (First 10 customers):")
print("-" * 60)
for idx, row in results_df.iterrows():
    actual = "Will Churn" if row['Actual_Churn'] == 1 else "Will Stay"
    predicted = "Will Churn" if row['Predicted_Churn'] == 1 else "Will Stay"
    prob = row['Predicted_Probability']
    
    print(f"Customer {idx+1}:")
    print(f"  🎯 Actual: {actual}")
    print(f"  🤖 AI Predicted: {predicted} (Probability: {prob:.3f})")
    print(f"  📊 {row['Interpretation']}")
    print()

# Summary statistics
correct_predictions = (results_df['Actual_Churn'] == results_df['Predicted_Churn']).sum()
print(f"📈 In this sample: {correct_predictions}/{sample_size} predictions were correct!")
print(f"💡 Sample accuracy: {correct_predictions/sample_size*100:.1f}%")

👁️ Looking at actual customer predictions...
📊 Sample Predictions (First 10 customers):
------------------------------------------------------------
Customer 1:
  🎯 Actual: Will Stay
  🤖 AI Predicted: Will Stay (Probability: 0.139)
  📊 ✅ Correct - Low Risk

Customer 2:
  🎯 Actual: Will Stay
  🤖 AI Predicted: Will Stay (Probability: 0.110)
  📊 ✅ Correct - Low Risk

Customer 3:
  🎯 Actual: Will Stay
  🤖 AI Predicted: Will Stay (Probability: 0.111)
  📊 ✅ Correct - Low Risk

Customer 4:
  🎯 Actual: Will Stay
  🤖 AI Predicted: Will Stay (Probability: 0.044)
  📊 ✅ Correct - Low Risk

Customer 5:
  🎯 Actual: Will Stay
  🤖 AI Predicted: Will Stay (Probability: 0.129)
  📊 ✅ Correct - Low Risk

Customer 6:
  🎯 Actual: Will Stay
  🤖 AI Predicted: Will Stay (Probability: 0.012)
  📊 ✅ Correct - Low Risk

Customer 7:
  🎯 Actual: Will Stay
  🤖 AI Predicted: Will Stay (Probability: 0.258)
  📊 ✅ Correct - Low Risk

Customer 8:
  🎯 Actual: Will Stay
  🤖 AI Predicted: Will Stay (Probability: 0.051)
  📊 ✅

In [177]:
print("🔍 Looking for customers who actually left the bank...")
print("="*60)

# Find indices where customers actually churned (y_test = 1)
churned_indices = y_test[y_test == 1].index[:10]  # Get first 10 who churned

print("📊 Customers Who Actually Churned:")
print("-" * 60)

for i, idx in enumerate(churned_indices):
    # Get the position in the test arrays
    test_position = list(y_test.index).index(idx)
    
    actual = y_test.iloc[test_position]
    predicted_prob = y_log[test_position][0]
    predicted = y_pred[test_position][0]
    
    predicted_text = "Will Churn" if predicted == 1 else "Will Stay"
    result = "✅ Caught!" if predicted == 1 else "❌ Missed"
    risk_level = "High Risk" if predicted_prob > 0.5 else "Low Risk"
    
    print(f"Churned Customer {i+1}:")
    print(f"  🎯 Actual: Will Churn (they left!)")
    print(f"  🤖 AI Predicted: {predicted_text} (Probability: {predicted_prob:.3f})")
    print(f"  📊 {result} - {risk_level}")
    print()

# Calculate how many churners we caught
caught_churners = sum(1 for idx in churned_indices 
                     if y_pred[list(y_test.index).index(idx)][0] == 1)

print(f"📈 Churner Detection: Caught {caught_churners}/{len(churned_indices)} actual churners")
print(f"💡 Churn detection rate in this sample: {caught_churners/len(churned_indices)*100:.1f}%")
print(f"🎯 This shows why recall (catching churners) is important!")

🔍 Looking for customers who actually left the bank...
📊 Customers Who Actually Churned:
------------------------------------------------------------
Churned Customer 1:
  🎯 Actual: Will Churn (they left!)
  🤖 AI Predicted: Will Stay (Probability: 0.175)
  📊 ❌ Missed - Low Risk

Churned Customer 2:
  🎯 Actual: Will Churn (they left!)
  🤖 AI Predicted: Will Churn (Probability: 0.580)
  📊 ✅ Caught! - High Risk

Churned Customer 3:
  🎯 Actual: Will Churn (they left!)
  🤖 AI Predicted: Will Stay (Probability: 0.279)
  📊 ❌ Missed - Low Risk

Churned Customer 4:
  🎯 Actual: Will Churn (they left!)
  🤖 AI Predicted: Will Churn (Probability: 0.770)
  📊 ✅ Caught! - High Risk

Churned Customer 5:
  🎯 Actual: Will Churn (they left!)
  🤖 AI Predicted: Will Stay (Probability: 0.110)
  📊 ❌ Missed - Low Risk

Churned Customer 6:
  🎯 Actual: Will Churn (they left!)
  🤖 AI Predicted: Will Churn (Probability: 0.569)
  📊 ✅ Caught! - High Risk

Churned Customer 7:
  🎯 Actual: Will Churn (they left!)
  🤖 AI

In [178]:

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

print(f"🎯 Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

🎯 Accuracy: 0.8505 (85.05%)


In [179]:
print("💾 Saving our trained model and scaler for the web app...")

# Save the trained model
model.save('churn_model.h5')
print("✅ Model saved as 'churn_model.h5'")

# Save the scaler for preprocessing new data
import pickle
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
print("✅ Scaler saved as 'scaler.pkl'")

print("\n🎉 Ready for Streamlit!")
print("📱 Your model is now ready to be used in the web app!")
print("🚀 Run: streamlit run streamlit_app.py")



💾 Saving our trained model and scaler for the web app...
✅ Model saved as 'churn_model.h5'
✅ Scaler saved as 'scaler.pkl'

🎉 Ready for Streamlit!
📱 Your model is now ready to be used in the web app!
🚀 Run: streamlit run streamlit_app.py
