Data cleaning

In [None]:
import pandas as pd
# Read the Telco customer-churn CSV (path is relative to the working directory) into `df`.
df = pd.read_csv(filepath_or_buffer='WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [None]:
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN
# (errors='coerce') instead of raising.
df['TotalCharges'] = df['TotalCharges'].pipe(pd.to_numeric, errors='coerce')

In [None]:
# Impute missing TotalCharges values with the column median.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the selected Series: that chained in-place form
# emits a FutureWarning on pandas 2.x and leaves `df` unchanged when
# Copy-on-Write is enabled.
total_charges_median = df['TotalCharges'].median()
df['TotalCharges'] = df['TotalCharges'].fillna(total_charges_median)

EDA

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

Data preprocessing

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# The Churn column is the target; every remaining column is a candidate feature.
y = df['Churn']
x = df.drop(columns=['Churn'])

In [None]:
# Keep the customer identifiers aside, then remove them from the feature matrix.
customer_ids = x['customerID']
x = x.drop(columns=['customerID'])

In [None]:
# Encode the target as integers: 'Yes' -> 1, 'No' -> 0.
# Series.map turns any value missing from the mapping into NaN.
churn_codes = {'Yes': 1, 'No': 0}
y = y.map(churn_codes)

In [None]:
# Yes/No feature columns, re-encoded as 1/0.
bc = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
yes_no_codes = {'Yes': 1, 'No': 0}
x = x.assign(**{name: x[name].map(yes_no_codes) for name in bc})

In [None]:
# Categorical columns to one-hot encode. drop_first=True omits the first
# level of each column from the generated indicator columns.
cc = [
    'gender',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaymentMethod',
]
x = pd.get_dummies(data=x, columns=cc, drop_first=True)

In [None]:
# Standardize the numeric columns.
nc = ['tenure', 'MonthlyCharges', 'TotalCharges']
scaler = StandardScaler()

# Fit the scaler on the training rows only, so the mean/std it learns carry no
# information from the held-out test rows (fitting on all of `x` leaks test
# statistics into preprocessing).
# train_test_split chooses rows from the sample count and random_state alone,
# so repeating the test_size / random_state used by the train/test split cell
# that follows selects exactly the same training rows. Keep the two calls in
# sync if either argument changes.
train_rows = train_test_split(x.index.to_numpy(), test_size=0.2, random_state=42)[0]
scaler.fit(x.loc[train_rows, nc])

# Apply the training-set statistics to every row (train and test alike).
x[nc] = scaler.transform(x[nc])

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

Modeling

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, roc_curve, roc_auc_score

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split with a seeded SMOTE; the test split is left untouched.
smote = SMOTE(random_state=42)
x_train_resampled, y_train_resampled = smote.fit_resample(X=x_train, y=y_train)

In [None]:
from xgboost import XGBClassifier

In [None]:
# Baseline XGBoost classifier: library defaults apart from a fixed seed and
# log-loss as the evaluation metric.
# NOTE(review): use_label_encoder appears to be deprecated in recent XGBoost
# releases - confirm against the installed version before removing it.
xgb_model = XGBClassifier(
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
)

In [None]:
# Train on the SMOTE-balanced training set produced by smote.fit_resample.
# Fitting on the raw x_train / y_train left the resampled data unused, so the
# oversampling step had no effect on the model. x_test / y_test remain the
# untouched hold-out data used for evaluation.
xgb_model.fit(x_train_resampled, y_train_resampled)

In [None]:
# Class predictions of the baseline model on the held-out test features.
y_pred = xgb_model.predict(X=x_test)

In [None]:
# Baseline model: overall accuracy plus the per-class precision/recall/F1 report.
baseline_accuracy = accuracy_score(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred)
print("Accuracy:", baseline_accuracy)
print("\nClassification Report:\n", baseline_report)

Evaluation

In [None]:
# Predicted probabilities on the test set; keep only the second column
# (index 1) of the predict_proba output.
y_pred_proba = xgb_model.predict_proba(X=x_test)[:, 1]

In [None]:
# Area under the ROC curve for the baseline model's test-set probabilities.
baseline_roc_auc = roc_auc_score(y_test, y_pred_proba)
print("ROC-AUC Score:", baseline_roc_auc)

In [None]:
# Confusion matrix of the baseline predictions, drawn as an annotated heatmap
# using the explicit figure/axes interface.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
# ROC curve of the baseline model, with the AUC in the legend and a dashed
# diagonal from (0, 0) to (1, 1) for reference.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
auc_value = roc_auc_score(y_test, y_pred_proba)

fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(fpr, tpr, label=f"ROC curve (AUC = {auc_value:.2f})")
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend()
plt.show()

Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Unfitted estimator handed to the grid search; the searched hyperparameters
# are layered on top of these fixed settings.
# NOTE(review): use_label_encoder appears to be deprecated in recent XGBoost
# releases - confirm against the installed version before removing it.
xgb_base = XGBClassifier(eval_metric='logloss', use_label_encoder=False, random_state=42)

In [None]:
# Ratio of class-0 to class-1 training labels, offered to the search as an
# alternative to the neutral scale_pos_weight of 1.
train_class_counts = y_train.value_counts()
neg_to_pos_ratio = train_class_counts[0] / train_class_counts[1]

# Hyperparameter values explored by the grid search.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.05, 0.1],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    scale_pos_weight=[1, neg_to_pos_ratio],
)

In [None]:
# Exhaustive search over param_grid with 3-fold cross-validation, using all
# available cores (n_jobs=-1) and printing progress (verbose=1).
grid_search = GridSearchCV(
    xgb_base,
    param_grid,
    cv=3,
    scoring='recall',  # focus on recall for the positive class (customers who churned)
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the cross-validated search on the training split.
grid_search.fit(X=x_train, y=y_train)

In [None]:
# Report the winning parameter combination and its mean cross-validated score.
best_params = grid_search.best_params_
best_recall = grid_search.best_score_
print("best params", best_params)
print("best Recall Score:", best_recall)

In [None]:
# GridSearchCV was created with its default refit=True, so after the search it
# has already retrained a clone of xgb_base with best_params_ on the whole of
# x_train / y_train. Reuse that fitted model rather than building and
# fitting an identical classifier a second time.
xgb_best = grid_search.best_estimator_
xgb_best

In [None]:
# Evaluate the tuned model on the held-out test set: class predictions and the
# second predict_proba column (index 1).
y_pred_best = xgb_best.predict(X=x_test)
y_pred_best_proba = xgb_best.predict_proba(X=x_test)[:, 1]

tuned_accuracy = accuracy_score(y_test, y_pred_best)
tuned_report = classification_report(y_test, y_pred_best)
tuned_roc_auc = roc_auc_score(y_test, y_pred_best_proba)

print("Accuracy:", tuned_accuracy)
print("\nClassification Report:\n", tuned_report)
print("ROC-AUC Score:", tuned_roc_auc)