# Data Loading and Initial Exploration
Load the customer data and perform an initial exploratory data analysis (EDA) to understand the available features, their missing values, and their distributions.

In [None]:
# Data Loading and Initial Exploration

# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the customer data
data = pd.read_csv('customer_data.csv')

# Only the last expression of a notebook cell is rendered automatically, so
# intermediate tables are printed explicitly to make sure they are shown.
print(data.head())

# Column dtypes and non-null counts (info() prints its report itself)
data.info()

# Report only the columns that actually contain missing values
missing_values = data.isnull().sum()
print(missing_values[missing_values > 0])

# Summary statistics of the dataset
print(data.describe())

# Plot distributions of numerical features.
# The grid is sized from the number of columns: a fixed layout raises a
# ValueError as soon as there are more columns than grid cells.
numerical_features = data.select_dtypes(include=['int64', 'float64']).columns
if len(numerical_features) > 0:
    hist_cols = 3
    hist_rows = -(-len(numerical_features) // hist_cols)  # ceiling division
    data[numerical_features].hist(
        bins=15,
        figsize=(15, max(10, 2 * hist_rows)),
        layout=(hist_rows, hist_cols),
    )
    plt.tight_layout()
    plt.show()

# Plot distributions of categorical features.
# The column is passed as `x=` by keyword: recent seaborn releases interpret
# the first positional argument of countplot as `data`, not as the x variable.
categorical_features = data.select_dtypes(include=['object']).columns
for feature in categorical_features:
    plt.figure(figsize=(10, 5))
    sns.countplot(x=feature, data=data)
    plt.title(f'Distribution of {feature}')
    plt.xticks(rotation=45)
    plt.show()

# Define Churn
Create a churn definition based on customer behavior patterns and implement it as a binary target variable.

In [None]:
# Define Churn

# Create a churn definition based on customer behavior patterns.
# Churn is defined here as: the customer's last transaction is more than
# 6 months older than the most recent transaction date found in the dataset.

# Convert the 'last_transaction_date' to datetime
data['last_transaction_date'] = pd.to_datetime(data['last_transaction_date'])

# Define the churn threshold date (6 months before the most recent transaction date in the dataset)
churn_threshold_date = data['last_transaction_date'].max() - pd.DateOffset(months=6)

# Create the churn target variable (1 = churned, 0 = not churned).
# NOTE(review): rows with a missing date (NaT) compare False and are therefore
# labelled 0 (not churned) - confirm this is the intended treatment.
data['churn'] = (data['last_transaction_date'] < churn_threshold_date).astype(int)

# Show the first few rows to verify the churn definition (printed explicitly,
# because a bare expression in the middle of a cell is not rendered)
print(data[['customer_id', 'last_transaction_date', 'churn']].head())

# Plot the distribution of the churn variable.
# `order` pins bar 0 to class 0 and bar 1 to class 1 so the tick labels below
# stay correct even if one of the classes is absent from the data.
plt.figure(figsize=(8, 5))
sns.countplot(x='churn', data=data, order=[0, 1])
plt.title('Distribution of Churn')
plt.xticks([0, 1], ['Not Churned', 'Churned'])
plt.show()

# Feature Engineering
Create relevant features from the raw data, including customer behavior metrics, transaction patterns, and temporal features.

In [None]:
# Feature Engineering

# Create relevant features from the raw data.
# Every groupby(...).transform(...) below broadcasts a per-customer aggregate
# back onto each of that customer's rows, so the frame can be reduced to one
# row per customer at the end.

new_features = [
    'total_transactions',
    'total_transaction_amount',
    'average_transaction_amount',
    'days_since_last_transaction',
    'unique_transaction_types',
    'transactions_last_month',
]

amounts_by_customer = data.groupby('customer_id')['transaction_amount']

# Feature 1: Total number of transactions (non-null amounts)
data['total_transactions'] = amounts_by_customer.transform('count')

# Feature 2: Total transaction amount
data['total_transaction_amount'] = amounts_by_customer.transform('sum')

# Feature 3: Average transaction amount
data['average_transaction_amount'] = amounts_by_customer.transform('mean')

# Feature 4: Number of days between this row's date and the latest date in the dataset
# NOTE(review): this is computed per row; if a customer's rows carry different
# dates, the value kept after de-duplication is that of the first row - verify.
data['days_since_last_transaction'] = (data['last_transaction_date'].max() - data['last_transaction_date']).dt.days

# Feature 5: Number of unique transaction types
data['unique_transaction_types'] = data.groupby('customer_id')['transaction_type'].transform('nunique')

# Feature 6: Number of transactions in the last month.
# Counting is done with a boolean mask summed per customer over ALL rows.
# Aggregating only the filtered subset and aligning it back leaves NaN (then 0)
# on a customer's out-of-window rows, and the de-duplication below could keep
# exactly such a row.
last_month_date = data['last_transaction_date'].max() - pd.DateOffset(months=1)
counted_in_window = (data['last_transaction_date'] >= last_month_date) & data['transaction_amount'].notna()
data['transactions_last_month'] = counted_in_window.groupby(data['customer_id']).transform('sum')

# Fill any remaining NaN with 0. Assign the result back instead of using
# inplace=True on a column selection, which is deprecated chained assignment
# and has no effect under pandas Copy-on-Write.
data['transactions_last_month'] = data['transactions_last_month'].fillna(0)

# Drop duplicate rows to keep one row per customer (the first occurrence is kept)
data = data.drop_duplicates(subset='customer_id')

# Show the first few rows to verify the new features
print(data[['customer_id'] + new_features].head())

# Plot the distribution of the new features
data[new_features].hist(bins=15, figsize=(15, 10), layout=(3, 2))
plt.tight_layout()
plt.show()

# Outlier Detection
Implement unsupervised learning methods to detect outliers that might require separate handling in the prediction system.

In [None]:
# Outlier Detection

# Import necessary libraries for outlier detection
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Select features for outlier detection
features_for_outlier_detection = [
    'total_transactions',
    'total_transaction_amount',
    'average_transaction_amount',
    'days_since_last_transaction',
    'unique_transaction_types',
    'transactions_last_month',
]

# Standardize the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data[features_for_outlier_detection])

# Initialize the Isolation Forest model (5% of the rows are flagged as outliers)
iso_forest = IsolationForest(contamination=0.05, random_state=42)

# Fit the model and predict outliers.
# fit_predict returns a label per row: 1 for inliers and -1 for outliers.
outlier_predictions = iso_forest.fit_predict(scaled_features)

# Add the outlier predictions to the dataset.
# Despite its name, this column holds the -1 / 1 labels, not a continuous score.
data['outlier_score'] = outlier_predictions

# Plot the distribution of outlier labels.
# Bars are ordered -1 then 1, so position 0 is the outlier bar and position 1
# the inlier bar; the tick labels must follow that order.
plt.figure(figsize=(8, 5))
sns.countplot(x='outlier_score', data=data, order=[-1, 1])
plt.title('Distribution of Outlier Scores')
plt.xticks([0, 1], ['Outliers', 'Inliers'])
plt.show()

# Show the first few rows to verify the outlier labels
print(data[['customer_id', 'outlier_score']].head())

# Model Training Pipeline
Build a training pipeline using a single model architecture (e.g., XGBoost) with proper cross-validation and hyperparameter tuning.

In [None]:
# Model Training Pipeline

# Import necessary libraries for model training
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Select features and target variable.
# NOTE(review): the 'churn' target is derived from 'last_transaction_date', and
# 'days_since_last_transaction' is derived from that same column, so this
# feature encodes the label (target leakage). Scores reported below will be
# over-optimistic until the features are computed from a window that ends
# before the churn observation period.
features = [
    'total_transactions',
    'total_transaction_amount',
    'average_transaction_amount',
    'days_since_last_transaction',
    'unique_transaction_types',
    'transactions_last_month',
]
target = 'churn'

X = data[features]
y = data[target]

# Split the data into training and testing sets.
# Stratifying on the target keeps the churn rate the same in both splits,
# which matters when the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Initialize the XGBoost classifier
xgb_model = XGBClassifier(random_state=42)

# Define the parameter grid for hyperparameter tuning
# (3 * 3 * 3 * 2 * 2 = 108 candidates, each fitted on 5 folds)
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}

# Initialize GridSearchCV for hyperparameter tuning (5-fold CV, selected by ROC AUC)
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=2,
)

# Fit the model to the training data
grid_search.fit(X_train, y_train)

# Get the best model from grid search (refitted on the full training split)
best_model = grid_search.best_estimator_

# Make predictions on the test set: hard labels and churn probabilities
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("ROC AUC Score:")
print(roc_auc_score(y_test, y_pred_proba))

# Plot the feature importance
plt.figure(figsize=(10, 6))
plt.barh(features, best_model.feature_importances_)
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Feature Importance of the XGBoost Model')
plt.show()

# Rule-Based System
Develop business rules for clear-cut churn cases that don't require model prediction.

In [None]:
# Rule-Based System

# Define business rules for clear-cut churn cases.
# Each rule produces a 0/1 flag; a customer is rule-based churned when any flag is 1.

# Customers whose last transaction is at most 365 days old
active_within_year = data['days_since_last_transaction'] <= 365

# Rule 1: Customers with no transactions in the last 12 months are considered churned
data['rule_no_transactions_12_months'] = (data['days_since_last_transaction'] > 365).astype(int)

# Rule 2: Customers active within the last 12 months whose 'total_transaction_amount'
# is below $100 are considered churned.
# NOTE(review): the amount used here is the 'total_transaction_amount' column as is;
# this cell does not restrict it to a 12-month window.
data['rule_low_transaction_amount'] = ((data['total_transaction_amount'] < 100) & active_within_year).astype(int)

# Rule 3: Customers active within the last 12 months who have exactly one
# transaction type are considered churned.
# NOTE(review): 'unique_transaction_types' is likewise not windowed in this cell.
data['rule_single_transaction_type'] = ((data['unique_transaction_types'] == 1) & active_within_year).astype(int)

# Combine the rules to create a final rule-based churn prediction (logical OR of the flags)
rule_columns = ['rule_no_transactions_12_months', 'rule_low_transaction_amount', 'rule_single_transaction_type']
data['rule_based_churn'] = data[rule_columns].max(axis=1)

# Show the first few rows to verify the rule-based churn predictions
print(data[['customer_id'] + rule_columns + ['rule_based_churn']].head())

# Plot the distribution of the rule-based churn variable.
# `order` keeps bar 0 = class 0 and bar 1 = class 1 so the tick labels stay
# correct even when one class is absent.
plt.figure(figsize=(8, 5))
sns.countplot(x='rule_based_churn', data=data, order=[0, 1])
plt.title('Distribution of Rule-Based Churn')
plt.xticks([0, 1], ['Not Churned', 'Churned'])
plt.show()

# Model Orchestration
Create a system that routes each customer to either the rule-based prediction or the model prediction, depending on the customer's outlier score.

In [None]:
# Model Orchestration

# Orchestrate between rule-based predictions and model predictions based on outlier scores
def orchestrate_predictions(row, model, threshold=-0.5):
    """Return the final churn prediction for a single customer row.

    Parameters:
        row: a row of the dataset; must provide 'outlier_score',
            'rule_based_churn' and every column listed in `features`.
        model: fitted classifier exposing `predict`.
        threshold: rows whose 'outlier_score' is >= this value take the
            rule-based prediction; all other rows take the model prediction.

    Returns:
        The 0/1 churn prediction for the row.
    """
    if row['outlier_score'] >= threshold:
        return row['rule_based_churn']
    return model.predict(pd.DataFrame([row[features]]))[0]


def _orchestrate_predictions_batch(frame, model, threshold=-0.5):
    """Apply the same routing as `orchestrate_predictions` to a whole frame.

    The model is called once for the entire frame rather than once per row,
    which avoids the per-call overhead of row-wise `apply`.

    Returns:
        A Series of 0/1 predictions aligned on `frame.index`.
    """
    model_predictions = pd.Series(model.predict(frame[features]), index=frame.index)
    use_rules = frame['outlier_score'] >= threshold
    # Keep the rule-based value where use_rules is True, otherwise the model's
    return frame['rule_based_churn'].where(use_rules, model_predictions)


# Apply the orchestration to the dataset.
# NOTE(review): IsolationForest labels inliers 1 and outliers -1, so with the
# default threshold of -0.5 inliers take the rule-based prediction and only
# outliers are sent to the model - confirm this routing direction is intended.
data['final_churn_prediction'] = _orchestrate_predictions_batch(data, best_model)

# Show the first few rows to verify the final churn predictions
print(data[['customer_id', 'rule_based_churn', 'outlier_score', 'final_churn_prediction']].head())

# Plot the distribution of the final churn predictions.
# `order` keeps bar 0 = class 0 and bar 1 = class 1 so the tick labels stay
# correct even when one class is absent.
plt.figure(figsize=(8, 5))
sns.countplot(x='final_churn_prediction', data=data, order=[0, 1])
plt.title('Distribution of Final Churn Predictions')
plt.xticks([0, 1], ['Not Churned', 'Churned'])
plt.show()

# Model Evaluation
Evaluate the complete system using appropriate metrics and analyze feature importance for explainability.

In [None]:
# Model Evaluation

# Import necessary libraries for evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc

# Evaluate the final churn predictions.
# Only the held-out customers (the rows of X_test) are scored: the model was
# fitted on the remaining rows, so including them would inflate every metric.
holdout = data.loc[X_test.index]
y_true = holdout[target]
y_pred_final = holdout['final_churn_prediction']

# Calculate evaluation metrics.
# zero_division=0 reports 0 instead of emitting an undefined-metric warning
# when a class is never predicted or never present.
accuracy = accuracy_score(y_true, y_pred_final)
precision = precision_score(y_true, y_pred_final, zero_division=0)
recall = recall_score(y_true, y_pred_final, zero_division=0)
f1 = f1_score(y_true, y_pred_final, zero_division=0)

# Print evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Calculate ROC curve and AUC.
# The orchestrated system outputs hard 0/1 labels, not probabilities, so the
# curve consists of a single operating point joined to (0, 0) and (1, 1).
fpr, tpr, _ = roc_curve(y_true, y_pred_final)
roc_auc = auc(fpr, tpr)

# Plot ROC curve against the chance diagonal
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

# Analyze feature importance for explainability.
# These importances describe the XGBoost model only; customers routed to the
# rule-based path are not explained by them.
plt.figure(figsize=(10, 6))
plt.barh(features, best_model.feature_importances_)
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Feature Importance of the XGBoost Model')
plt.show()