<a href="https://colab.research.google.com/github/bubu2003/customerClassification/blob/main/CustomerClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so every run produces the same synthetic customers.
np.random.seed(42)

# Size of the synthetic customer base.
n_customers = 500

# Draw one array per feature. The draws consume the global RNG in this exact
# order, so reordering these lines would change the generated data.
ages = np.random.randint(low=18, high=70, size=n_customers)       # upper bound is exclusive
genders = np.random.randint(low=0, high=2, size=n_customers)      # 0: Female, 1: Male
income = np.round(np.random.normal(loc=50, scale=15, size=n_customers), 2)  # mean 50k, std 15k
purchase_freq = np.random.poisson(lam=3, size=n_customers)        # avg 3 purchases/month

# Churn label: 1 only when income is below 40 AND fewer than 3 purchases.
low_income = income < 40
infrequent_buyer = purchase_freq < 3
churn = (low_income & infrequent_buyer).astype(int)

# Assemble the table, one column per feature plus the target.
df_customers = pd.DataFrame(
    dict(
        Age=ages,
        Gender=genders,
        Income=income,
        PurchaseFrequency=purchase_freq,
        Churn=churn,
    )
)

# Persist without the index column so the CSV holds only the five fields.
df_customers.to_csv("customers.csv", index=False)

print("Customer data created and saved to 'customers.csv'")
print(df_customers.head())


In [None]:



import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load the customer table from the CSV file.
df = pd.read_csv("customers.csv")

# Show top rows. A bare `df.head()` is only rendered when it is the last
# expression in a cell; in the middle of a cell its result is discarded,
# so print it explicitly.
print(df.head())

# Churn distribution: number of customers in each Churn class.
sns.countplot(data=df, x='Churn')
plt.title("Customer Churn Distribution")
plt.show()

# Correlation matrix of all columns, annotated with the coefficient values.
sns.heatmap(df.corr(), annot=True, cmap="coolwarm")
plt.title("Correlation Matrix")
plt.show()


In [None]:
# Build the feature matrix and the target from the loaded customer table.
X = df.drop(columns=['Churn'])
y = df['Churn']

# Train/test split. The seed makes the split reproducible, and stratifying on
# the target keeps the churn ratio the same in both parts. These names are
# also used by the tree-based models trained later in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the classifier here so that `model` exists before it is used for the
# prediction below (required for Restart & Run All to succeed).
# max_iter is raised above the default because the features are not scaled.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# New sample customer; columns must match the training features by name.
new_customer = pd.DataFrame({
    'Age': [28],
    'Gender': [1],
    'Income': [37],
    'PurchaseFrequency': [2]
})

# Predict churn (predict returns an array; take the single element)
prediction = model.predict(new_customer)[0]

# Display prediction result with label
if prediction == 1:
    print("⚠ This customer is likely to CHURN. Consider offering retention benefits.")
else:
    print("This customer is likely to STAY. Continue engagement as usual.")
# Logistic Regression: the math behind the prediction
#
# Logistic regression maps a weighted sum of the features to a probability:
#     P(y=1) = 1 / (1 + e^-(b0 + b1*x1 + b2*x2 + ... + bn*xn))

# Worked example for a single illustrative customer.
# Inputs: age (x1), monthly balance (x2), number of transactions (x3).
age, balance, transactions = 45, 25000, 6

# Example coefficients, hard-coded here purely for the explanation.
b0 = -4        # intercept
b1 = 0.05      # weight on age
b2 = 0.0001    # weight on balance
b3 = 0.3       # weight on transactions

# Step 1: linear score z = intercept + sum of (weight * feature value).
weighted_terms = (b1 * age, b2 * balance, b3 * transactions)
z = b0 + weighted_terms[0] + weighted_terms[1] + weighted_terms[2]
print("Linear score (z):", round(z, 4))





In [None]:
# Step 2: pass the linear score z through the sigmoid, 1 / (1 + e^-z),
# to turn it into a probability.
exp_neg_z = np.exp(-z)
probability = 1 / (1 + exp_neg_z)
print("Predicted probability of churn:", round(probability, 4))

# Step 3: classify with a 0.5 threshold (strictly greater means churn).
verdict = (
    " This customer is likely to CHURN."
    if probability > 0.5
    else " This customer is likely to STAY."
)
print(verdict)


from sklearn.tree import DecisionTreeClassifier, plot_tree

# Depth-limited decision tree using entropy as the split criterion;
# the fixed random_state keeps the fitted tree reproducible.
clf = DecisionTreeClassifier(max_depth=6, random_state=42, criterion="entropy")
clf.fit(X_train, y_train)

# plot_tree validates its arguments, and some scikit-learn releases accept
# only a plain list (not a pandas Index) for feature_names, so convert first.
tree_feature_names = list(X.columns)

# Plot the Decision Tree
plt.figure(figsize=(20, 10))
plot_tree(
    clf,
    feature_names=tree_feature_names,
    class_names=['No Churn', 'Churn'],
    filled=True,
    rounded=True,
    fontsize=12
)
plt.title("Decision Tree for Customer Churn Prediction", fontsize=16)
plt.show()

# Bar chart of the fitted tree's feature importances, labelled by column name.
feature_importance = pd.Series(clf.feature_importances_, index=X.columns)
sns.barplot(x=feature_importance.values, y=feature_importance.index)
plt.title("Feature Importance in Decision Tree")
plt.show()

from sklearn.ensemble import RandomForestClassifier

# Train a random forest with 4 trees on the training split; fit() returns the
# estimator itself, so construction and fitting can be chained.
model = RandomForestClassifier(n_estimators=4, random_state=42).fit(X_train, y_train)

# Predict labels for the test split (X_test).
y_pred = model.predict(X_test)

# Evaluate: overall accuracy, per-class report, then the confusion matrix.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Horizontal bar chart of the forest's feature importances, sorted ascending.
importances = pd.Series(model.feature_importances_, index=X.columns)
ax = importances.sort_values().plot.barh(figsize=(10, 6))
ax.set_title("Feature Importances in Customer Classification")
ax.set_xlabel("Importance Score")
ax.set_ylabel("Features")
plt.show()