<a href="https://colab.research.google.com/github/arunak451/project/blob/main/customer_churn_prdiction_using_python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE

2. Load and Explore the Data

In [None]:
# Load the dataset (path is relative to the notebook's working directory)
data = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Display basic info.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
data.info()
print(data.head())

# Check for missing values (count of NaN per column)
print(data.isnull().sum())

# Basic statistics for the numeric columns
print(data.describe())

3. Data Preprocessing

In [None]:
# Drop customer ID as it's not useful for prediction
data = data.drop(columns='customerID')

# Convert TotalCharges to numeric (it's loaded as object because of empty strings);
# errors='coerce' turns every unparsable entry into NaN.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

# Fill the NaNs created above with the column median.
# The result is assigned back to the frame instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form acts
# on a temporary object, raises a warning on pandas 2.x and does not update
# `data` at all once Copy-on-Write is active, which would leave NaNs in the
# features.
total_charges_median = data['TotalCharges'].median()
data['TotalCharges'] = data['TotalCharges'].fillna(total_charges_median)

# Convert Churn to binary: 1 for 'Yes', 0 for every other value.
# int64 is requested explicitly so the column is picked up (and then excluded)
# by the int64/float64 selection below on every platform.
data['Churn'] = (data['Churn'] == 'Yes').astype('int64')

# Separate categorical and numerical columns (the target is not a feature)
categorical_cols = data.select_dtypes(include=['object']).columns
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns.drop('Churn')

# One-hot encode categorical variables; drop_first removes one level per
# variable to avoid perfectly collinear dummy columns.
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

# Check class distribution (as proportions)
print(data['Churn'].value_counts(normalize=True))

# Visualize class imbalance
sns.countplot(x='Churn', data=data)
plt.title('Class Distribution')
plt.show()

4. Feature Engineering and Splitting Data

In [None]:
# Separate features and target
X = data.drop(columns='Churn')
y = data['Churn']

# Split data into train and test sets (70/30, stratified on the target so both
# parts keep the same churn ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# The split returns row selections of X. Take explicit copies before writing the
# scaled values into them so the assignment below is unambiguous and does not
# trigger SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Standardize numerical features.
# The scaler is fitted on the training rows only and then re-used for the test
# rows, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# Handle class imbalance using SMOTE (training data only; the test set keeps
# its original class distribution)
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

5. Model Building and Evaluation

In [None]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and report its test-set performance.

    Prints accuracy, precision, recall, F1, ROC AUC and the classification
    report, and shows the confusion matrix as a heatmap.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict`` plus either ``predict_proba``
        or ``decision_function``. It is fitted in place.
    X_train, y_train
        Features and labels passed to ``model.fit``.
    X_test, y_test
        Held-out features and labels used for every reported metric.

    Returns
    -------
    estimator
        The same ``model`` object after fitting, so callers can write
        ``model = evaluate_model(model, ...)`` and keep using the estimator.
    """
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)
    if hasattr(model, "predict_proba"):
        # Column 1 holds the score of the positive class.
        y_score = model.predict_proba(X_test)[:, 1]
    else:
        # Estimators without probability output can still be ranked for ROC AUC
        # through their decision scores.
        y_score = model.decision_function(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_score)

    # Confusion matrix and per-class text report
    cm = confusion_matrix(y_test, y_pred)
    cr = classification_report(y_test, y_pred)

    # Plot confusion matrix
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Not Churn', 'Churn'],
                yticklabels=['Not Churn', 'Churn'])
    plt.title('Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show()

    # Print metrics
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    print("\nClassification Report:\n", cr)

    # Hand the fitted estimator back: callers rebind their variable to this
    # value and then read attributes from it or persist it.
    return model



Logistic Regression

In [None]:
# Logistic Regression
print("Logistic Regression:")
lr = LogisticRegression(random_state=42, max_iter=1000)
# evaluate_model fits `lr` in place, so the estimator variable is not rebound to
# the call's result; `lr` stays the fitted model whatever the helper returns.
# The trailing semicolon keeps the notebook from echoing a return value.
evaluate_model(lr, X_train_smote, y_train_smote, X_test, y_test);

Random Forest

In [None]:
# Random Forest
print("\nRandom Forest:")
rf = RandomForestClassifier(random_state=42)
# evaluate_model fits `rf` in place. Do not rebind `rf` to the call's result:
# the feature-importance lookup below needs the fitted estimator itself.
evaluate_model(rf, X_train_smote, y_train_smote, X_test, y_test)

# Feature importance: pair each feature column with the forest's importance
# score and rank from most to least important.
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf.feature_importances_
}).sort_values('Importance', ascending=False)

# Plot the ten highest-ranked features
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance.head(10))
plt.title('Top 10 Important Features')
plt.show()

Gradient Boosting

In [None]:
# Gradient Boosting
print("\nGradient Boosting:")
gb = GradientBoostingClassifier(random_state=42)
# evaluate_model fits `gb` in place, so the estimator variable is not rebound to
# the call's result; `gb` stays the fitted model whatever the helper returns.
# The trailing semicolon keeps the notebook from echoing a return value.
evaluate_model(gb, X_train_smote, y_train_smote, X_test, y_test);

6. Hyperparameter Tuning

In [None]:
# Define parameter grid: 3 * 4 * 3 * 3 = 108 candidate settings, each fitted on
# 5 folds, so this cell is the slow one in the notebook.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Grid search over the random forest, ranked by cross-validated ROC AUC
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring='roc_auc'
)

# NOTE(review): the folds are cut from data that was already resampled with
# SMOTE, so synthetic points derived from a validation row can end up in the
# training folds and the cross-validated scores are likely optimistic. The
# held-out test metrics printed below are the trustworthy ones. Consider
# resampling inside each fold (e.g. an imblearn pipeline) instead.
grid_search.fit(X_train_smote, y_train_smote)

# Best parameters
print("Best parameters:", grid_search.best_params_)

# Evaluate best model.
# evaluate_model fits the estimator in place, so `best_rf` is not rebound to the
# call's result and remains the tuned estimator that the next cell persists.
best_rf = grid_search.best_estimator_
print("\nTuned Random Forest:")
evaluate_model(best_rf, X_train_smote, y_train_smote, X_test, y_test);

7. Final Model Selection and Deployment

In [None]:
# Save the final model
import joblib

# Refuse to persist anything that cannot predict (for example a None left over
# from an earlier cell); a pickle of a non-estimator would only fail later, at
# load time, far away from the cause.
if not hasattr(best_rf, "predict"):
    raise TypeError(
        f"best_rf must be a fitted estimator, got {type(best_rf).__name__}"
    )

# Save model
joblib.dump(best_rf, 'churn_prediction_model.pkl')

# Save scaler.
# It was fitted on the `numerical_cols` columns only, so at inference time it
# must be applied to those same columns, and the input must be one-hot encoded
# to the same feature columns the model was trained on.
joblib.dump(scaler, 'scaler.pkl')

# To load and use the model later (joblib files are pickles: only load files
# you created yourself or otherwise trust):
# model = joblib.load('churn_prediction_model.pkl')
# scaler = joblib.load('scaler.pkl')