In [1]:
import pandas as pd
import warnings
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import numpy as np

warnings.filterwarnings('ignore')

# Task №1

In [2]:
def bank_customer():
    bank_df = pd.read_csv("Bank Customer Churn Prediction.csv")
    return bank_df
data = bank_customer()
print(data.shape)
data.head(5)

(10000, 12)


Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
bank_df = data.copy()
bank_df.isna().sum()

customer_id         0
credit_score        0
country             0
gender              0
age                 0
tenure              0
balance             0
products_number     0
credit_card         0
active_member       0
estimated_salary    0
churn               0
dtype: int64

In [4]:
bank_df.country.value_counts()
bank_df.gender.value_counts()

gender
Male      5457
Female    4543
Name: count, dtype: int64

In [5]:
bank_df = data.copy()
bank_df.isna().sum()

customer_id         0
credit_score        0
country             0
gender              0
age                 0
tenure              0
balance             0
products_number     0
credit_card         0
active_member       0
estimated_salary    0
churn               0
dtype: int64

In [6]:
# country = pd.get_dummies(bank_df.country, prefix='country')
# for i in country:
#     country[i] = country[i].map({True: 1, False: 0})
# bank_data = pd.concat((bank_df, country), axis=1)
# bank_data = bank_data.drop(['country'], axis=1)

# bank_data.gender = (bank_data.gender=='Male').astype(int)

In [7]:
bank_df.head(3)

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [8]:
X = bank_df.drop(['churn'], axis=1)
y = bank_df['churn']

In [9]:
corr_table = bank_df.drop(['country', 'gender'], axis=1).corr()
corr_table.churn.abs().sort_values(ascending=False)

churn               1.000000
age                 0.285323
active_member       0.156128
balance             0.118533
products_number     0.047820
credit_score        0.027094
tenure              0.014001
estimated_salary    0.012097
credit_card         0.007138
customer_id         0.006248
Name: churn, dtype: float64

In [10]:
bank_df.churn.value_counts(normalize=True)*100

churn
0    79.63
1    20.37
Name: proportion, dtype: float64

# Task №2

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
cat_features = ['country', 'gender', 'credit_card', 'active_member']

train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)

In [13]:
model = CatBoostClassifier(random_state=123, 
                           depth=5,
                           iterations=100, 
                           eval_metric='AUC', 
                           verbose=0,
                           cat_features=cat_features)

model.fit(train_pool, eval_set=test_pool, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x1899923ed20>

In [14]:
from sklearn.model_selection import cross_val_score

hoo_scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')

In [15]:
y_pred_proba_test = model.predict_proba(test_pool)[:, 1]
auc_test = roc_auc_score(y_test, y_pred_proba_test)
auc_test

0.8777802584431027

# Task №3

In [16]:
train_pool = Pool(data = X, label = y, cat_features = cat_features)

In [17]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)

model = CatBoostClassifier(random_state=123, 
                           depth=5,
                           iterations=100, 
                           eval_metric='AUC', 
                           verbose=0, 
                           cat_features=cat_features)

kf_scores = cross_val_score(model, X, y, cv=kf, scoring='roc_auc')

In [18]:
kf_scores.mean()

0.8705817583518284

# Task №4

In [19]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5)
kfs_scores = cross_val_score(model, X, y, cv=skf, scoring='roc_auc')

In [20]:
kfs_scores.mean()

0.8696879727384161

In [21]:
res = pd.DataFrame({
    'id': range(1, len(kf_scores)+1),
    'k-fold': kf_scores,
    'Stratified k-fold': kfs_scores
})
print(res) 
kf_scores.mean(), kfs_scores.mean()

   id    k-fold  Stratified k-fold
0   1  0.874493           0.869833
1   2  0.864582           0.874189
2   3  0.875892           0.866526
3   4  0.876988           0.872833
4   5  0.860954           0.865059


(0.8705817583518284, 0.8696879727384161)

стратифицированный подход применяется при дисбалансе классов и гарантирует что распределение классов в каждом фолде будет примерно таким же как и в исходном датасете. Это помогает избежать смещения которое может возникнуть если модель будет обучаться и тестироваться на фолдах с сильно отличающимся распределением классов.

# Task №5

In [28]:
from sklearn.model_selection import LeaveOneOut

loo = LeaveOneOut()

loo_scores = cross_val_score(model, X, y, cv=loo, scoring='roc_auc')
loo_scores.mean(), loo_scores.std()

(nan, nan)

результат дает nan, а если указать n_splits=5, то выводит ошибку говоря что LeaveOneOut не принимает никаких аргументов (TypeError: LeaveOneOut() takes no arguments).

P.S: возможно я получаю nan из за того что мой ноут слабый => процесс очень долгий

# Task №6

In [27]:
result = pd.DataFrame({
    'Method': ['Hold-Out CV', 'K-Fold CV', 'Stratified K-Fold CV', 'Leave-One-Out CV'],
    'Mean': [hoo_scores.mean(), kf_scores.mean(), kfs_scores.mean(), loo_scores.mean()],
    'Std': [hoo_scores.std(), kf_scores.std(), kfs_scores.std(), loo_scores.std()]
})
result

Unnamed: 0,Method,Mean,Std
0,Hold-Out CV,0.869688,0.00351
1,K-Fold CV,0.870582,0.00653
2,Stratified K-Fold CV,0.869688,0.00351
3,Leave-One-Out CV,,


# Task №7

In [31]:
from sklearn.metrics import confusion_matrix, precision_score, f1_score
y_proba = y_pred_proba_test

thresholds = [0.05, 0.1, 0.15, 0.25, 0.3]

results_list = []

for threshold in thresholds:
    y_pred = (y_proba >= threshold).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    results_list.append({
        'Threshold': threshold,
        'TPR': tpr,
        'FPR': fpr,
        'Precision': precision,
        'F1 Score': f1
    })

results_df = pd.DataFrame(results_list)
results_df

Unnamed: 0,Threshold,TPR,FPR,Precision,F1 Score
0,0.05,0.964377,0.670193,0.260302,0.409951
1,0.1,0.916031,0.397013,0.360721,0.517613
2,0.15,0.83715,0.275047,0.426719,0.565292
3,0.25,0.750636,0.132545,0.580709,0.654828
4,0.3,0.687023,0.107032,0.61086,0.646707
