In [4]:
import pandas as pd
# Read the raw Telco customer-churn CSV. `df` stays as loaded; the
# preprocessing cells below operate on the separate deep copy `df_model`.
df = pd.read_csv(
    "../data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv",
)
df_model = df.copy(deep=True)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [8]:
# Remove the customerID column so it does not end up in the feature matrix.
df_model = df_model.drop(columns=["customerID"])

In [9]:
# Parse TotalCharges as numeric; entries that cannot be parsed become NaN
# (errors="coerce") and are then filled with the median of the parsed column.
# NOTE(review): the median is computed over the whole frame, i.e. before the
# train/test split further down — consider imputing from training rows only.
total_charges = pd.to_numeric(df_model["TotalCharges"], errors="coerce")
df_model["TotalCharges"] = total_charges.fillna(total_charges.median())

In [10]:
# Encode the target: "Yes" -> 1, "No" -> 0 (any other value maps to NaN).
churn_codes = {"Yes": 1, "No": 0}
df_model["Churn"] = df_model["Churn"].map(churn_codes)

In [11]:
# Group column names by dtype: object columns are treated as categorical,
# int64/float64 columns as numeric.
# NOTE(review): num_cols is reassigned to an explicit list in the scaling
# cell before it is used; as computed here it would presumably also contain
# the integer-encoded Churn target — confirm before relying on it.
categorical_frame = df_model.select_dtypes(include=["object"])
numeric_frame = df_model.select_dtypes(include=["int64", "float64"])
cat_cols = categorical_frame.columns
num_cols = numeric_frame.columns

In [12]:
# One-hot encode every column listed in cat_cols; drop_first=True omits the
# first level of each encoded column.
df_model_encoded = pd.get_dummies(
    data=df_model,
    columns=cat_cols,
    drop_first=True,
)

In [13]:
# Separate the encoded frame into the target vector (Churn) and the
# feature matrix (every other column).
y = df_model_encoded["Churn"]
X = df_model_encoded.drop(columns="Churn")

In [14]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. The split is stratified on y and
# uses a fixed random_state so it is repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

In [15]:
from sklearn.preprocessing import StandardScaler
# Standardise the continuous features. The scaler is fitted on the training
# rows only; the fitted statistics are then reused to transform the test rows.
#
# X_train / X_test are selections of X produced by train_test_split. Take
# explicit copies first so the column assignments below write to independent
# frames and do not raise pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

scaler = StandardScaler()
num_cols = ["tenure", "MonthlyCharges", "TotalCharges"]
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Fit a 300-tree random forest with balanced class weights and a fixed seed,
# using all available cores (n_jobs=-1).
rf = RandomForestClassifier(
    n_estimators=300,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
rf.fit(X_train, y_train)

# Predicted labels, plus the second predict_proba column, which serves as the
# score for roc_auc_score here and for the threshold cells below.
y_pred = rf.predict(X_test)
y_proba = rf.predict_proba(X_test)[:, 1]

# Report test-set metrics.
baseline_cm = confusion_matrix(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred)
baseline_auc = roc_auc_score(y_test, y_proba)
print("Confusion Matrix:\n", baseline_cm)
print("\nClassification Report:\n", baseline_report)
print("\nROC-AUC:", baseline_auc)

Confusion Matrix:
 [[924 111]
 [187 187]]

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.89      0.86      1035
           1       0.63      0.50      0.56       374

    accuracy                           0.79      1409
   macro avg       0.73      0.70      0.71      1409
weighted avg       0.78      0.79      0.78      1409


ROC-AUC: 0.8237567490764421


In [19]:
# Re-label the test rows with a 0.40 cut-off on the predicted churn
# probability (strictly greater than the cut-off -> 1) and report the metrics.
threshold = 0.40
above_cutoff = y_proba > threshold
y_pred_thr = above_cutoff.astype(int)

thr_cm = confusion_matrix(y_test, y_pred_thr)
thr_report = classification_report(y_test, y_pred_thr)
print("Threshold:", threshold)
print("Confusion Matrix:\n", thr_cm)
print("\nClassification Report:\n", thr_report)

Threshold: 0.4
Confusion Matrix:
 [[869 166]
 [148 226]]

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.84      0.85      1035
           1       0.58      0.60      0.59       374

    accuracy                           0.78      1409
   macro avg       0.72      0.72      0.72      1409
weighted avg       0.78      0.78      0.78      1409



In [18]:
# Re-label the test rows with a 0.35 cut-off on the predicted churn
# probability (strictly greater than the cut-off -> 1) and report the metrics.
threshold = 0.35
above_cutoff = y_proba > threshold
y_pred_thr = above_cutoff.astype(int)

thr_cm = confusion_matrix(y_test, y_pred_thr)
thr_report = classification_report(y_test, y_pred_thr)
print("Threshold:", threshold)
print("Confusion Matrix:\n", thr_cm)
print("\nClassification Report:\n", thr_report)

Threshold: 0.35
Confusion Matrix:
 [[827 208]
 [121 253]]

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.80      0.83      1035
           1       0.55      0.68      0.61       374

    accuracy                           0.77      1409
   macro avg       0.71      0.74      0.72      1409
weighted avg       0.79      0.77      0.77      1409



**Threshold Tuning Sonuçları (Random Forest):**

- Varsayılan eşik (0.50) churn sınıfı için %50 recall sağlamıştır.
- Eşik 0.40’a düşürüldüğünde churn recall %60’a yükselmiştir.
- Eşik 0.35’e düşürüldüğünde churn recall %68 seviyesine ulaşmıştır.

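İş problemi gereği churn müşterilerini kaçırmanın maliyeti,
yanlış pozitif tahmin maliyetinden daha yüksek olduğundan,
final model için threshold = 0.35 seçilmiştir.

Not: Eşikler test kümesi üzerinde karşılaştırıldığı için yukarıdaki metrikler iyimser olabilir;
seçilen eşik ayrı bir doğrulama (validation) kümesi üzerinde teyit edilmelidir.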