In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

# Step 1: Generate Synthetic Data
np.random.seed(42)

n_customers = 30000  # Number of customers
data = pd.DataFrame({
    'total_rech_amt_6': np.random.randint(50, 500, size=n_customers),
    'total_rech_amt_7': np.random.randint(50, 500, size=n_customers),
    'total_ic_mou_9': np.random.choice([0, np.random.randint(50, 300)], size=n_customers, p=[0.1, 0.9]),
    'total_og_mou_9': np.random.choice([0, np.random.randint(50, 300)], size=n_customers, p=[0.1, 0.9]),
    'vol_2g_mb_9': np.random.choice([0, np.random.randint(10, 100)], size=n_customers, p=[0.1, 0.9]),
    'vol_3g_mb_9': np.random.choice([0, np.random.randint(10, 100)], size=n_customers, p=[0.1, 0.9])
})

# Step 2: Define High-Value Customers
data['avg_rech_amt_6_7'] = (data['total_rech_amt_6'] + data['total_rech_amt_7']) / 2
high_value_threshold = data['avg_rech_amt_6_7'].quantile(0.7)
high_value_customers = data[data['avg_rech_amt_6_7'] >= high_value_threshold]

# Step 3: Tag Churners
high_value_customers['churn'] = np.where(
    (high_value_customers['total_ic_mou_9'] == 0) &
    (high_value_customers['total_og_mou_9'] == 0) &
    (high_value_customers['vol_2g_mb_9'] == 0) &
    (high_value_customers['vol_3g_mb_9'] == 0), 1, 0
)

# Remove attributes of churn phase (Month 9)
columns_to_remove = ['total_ic_mou_9', 'total_og_mou_9', 'vol_2g_mb_9', 'vol_3g_mb_9']
data_for_modeling = high_value_customers.drop(columns=columns_to_remove)

# Step 4: Prepare Data for Modeling
X = data_for_modeling.drop(columns=['churn'])
y = data_for_modeling['churn']

# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

# Step 5: Build and Evaluate Models
# Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)

# Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Evaluate Models
print("Logistic Regression Evaluation")
print(classification_report(y_test, y_pred_logreg))
print("ROC AUC:", roc_auc_score(y_test, logreg.predict_proba(X_test)[:, 1]))

print("\nRandom Forest Evaluation")
print(classification_report(y_test, y_pred_rf))
print("ROC AUC:", roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1]))

# Feature Importance from Random Forest
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("\nTop Features Based on Random Forest:")
print(feature_importance.head(10))

# Plot Feature Importance
plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'][:10], feature_importance['Importance'][:10])
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Top 10 Features by Importance')
plt.gca().invert_yaxis()
plt.show()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high_value_customers['churn'] = np.where(


ValueError: The target 'y' needs to have more than 1 class. Got 1 class instead