In [3]:
from base64 import standard_b64decode

import pandas as pd
import numpy as np

# Load the raw Telco customer-churn CSV and preview its first rows.
raw_csv_path = "../data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## 1. Data Preparation

Bu bölümde modelleme öncesinde gerekli veri temizleme ve hedef değişken dönüşümleri yapılacaktır.

In [4]:
# Work on a deep copy so the raw frame `df` stays untouched by later steps.
df_model = df.copy(deep=True)
df_model.shape

(7043, 21)

In [17]:
# Coerce TotalCharges to a numeric dtype; entries that cannot be parsed become NaN.
# The last line reports how many NaN values the column currently holds.
df_model["TotalCharges"] = pd.to_numeric(df_model["TotalCharges"], errors="coerce")
df_model["TotalCharges"].isnull().sum()

np.int64(0)

In [6]:
# Replace missing TotalCharges values with the column median, then preview the result.
total_charges_median = df_model["TotalCharges"].median()
df_model["TotalCharges"] = df_model["TotalCharges"].fillna(total_charges_median)
df_model["TotalCharges"].head(10)

0      29.85
1    1889.50
2     108.15
3    1840.75
4     151.65
5     820.50
6    1949.40
7     301.90
8    3046.05
9    3487.95
Name: TotalCharges, dtype: float64

In [7]:
# Encode the Churn target as 0/1 ("Yes" -> 1, "No" -> 0).
#
# The mapping reads from the raw frame `df` instead of from df_model itself so
# that this cell can be re-run safely: mapping an already-encoded 0/1 column
# would turn every value into NaN, because 0 and 1 are not keys of the mapping.
# This relies on df_model sharing df's index (df_model was created as df.copy()).
df_model["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})
df_model["Churn"].value_counts()

Churn
0    5174
1    1869
Name: count, dtype: int64

In [8]:
# Count NaN entries in the encoded Churn column (labels missing from the mapping become NaN).
df_model["Churn"].isnull().sum()

np.int64(0)

In [19]:
# Names of the object-dtype (string) columns.
# NOTE(review): the following cell repeats this exact selection; one of the two can be removed.
cat_cols = df_model.select_dtypes(include=["object"]).columns

In [13]:
# Split the column names by dtype: object columns are treated as categorical,
# int64/float64 columns as numeric. The trailing tuple displays both indexes.
cat_cols = df_model.select_dtypes(include=["object"]).columns
num_cols = df_model.select_dtypes(include=["int64", "float64"]).columns
(cat_cols, num_cols)

(Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
        'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
        'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
        'PaperlessBilling', 'PaymentMethod'],
       dtype='object'),
 Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'Churn'], dtype='object'))

In [20]:
# One-hot encode the categorical (string) columns, dropping the first level of
# each to avoid redundant dummy columns.
#
# customerID is a per-row identifier, not a feature. It is excluded here
# explicitly: no visible cell drops it, so on a fresh kernel it would still be
# an object column in cat_cols and would be expanded into one dummy column per
# customer. errors="ignore" and the filter below make this a no-op when the
# column has already been removed.
df_model_encoded = pd.get_dummies(
    df_model.drop(columns=["customerID"], errors="ignore"),
    columns=[col for col in cat_cols if col != "customerID"],
    drop_first=True,
)
df_model_encoded.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,0,False,True,False,False,True,...,False,False,False,False,False,False,True,False,True,False
1,0,34,56.95,1889.5,0,True,False,False,True,False,...,False,False,False,False,True,False,False,False,False,True
2,0,2,53.85,108.15,1,True,False,False,True,False,...,False,False,False,False,False,False,True,False,False,True
3,0,45,42.3,1840.75,0,True,False,False,False,True,...,False,False,False,False,True,False,False,False,False,False
4,0,2,70.7,151.65,1,False,False,False,True,False,...,False,False,False,False,False,False,True,False,True,False


In [21]:
# Separate the target column (Churn) from the feature matrix and show both shapes.
y = df_model_encoded["Churn"]
X = df_model_encoded.drop(columns="Churn")
(X.shape, y.shape)

((7043, 30), (7043,))

In [22]:
# Hold out 20% of the rows as a test set; the split is stratified on y and
# seeded (random_state=42) so it is reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [24]:
# Standardize the continuous features. The scaler is fitted on the training
# rows only and then applied to both splits, so no test-set statistics are used.
from sklearn.preprocessing import StandardScaler

num_cols = ["tenure", "MonthlyCharges", "TotalCharges"]
scaler = StandardScaler().fit(X_train[num_cols])

for features in (X_train, X_test):
    features[num_cols] = scaler.transform(features[num_cols])