In [1]:
# !pip install arff
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from scipy.io import arff
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from collections import Counter
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, average_precision_score
from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
                
data = arff.loadarff("/kaggle/input/bank-marketing/phpkIxskf (1).arff")
df = pd.DataFrame(data[0])

/kaggle/input/bank-marketing/phpkIxskf (1).arff


In [2]:
obj = df.select_dtypes(include=("object"))

decode = obj.apply(lambda x: x.str.decode('utf8'))

decode.head()

Unnamed: 0,V2,V3,V4,V5,V7,V8,V9,V11,V16,Class
0,management,married,tertiary,no,yes,no,unknown,may,unknown,1
1,technician,single,secondary,no,yes,no,unknown,may,unknown,1
2,entrepreneur,married,secondary,no,yes,yes,unknown,may,unknown,1
3,blue-collar,married,unknown,no,yes,no,unknown,may,unknown,1
4,unknown,single,unknown,no,no,no,unknown,may,unknown,1


In [3]:
df = df.drop(columns = ["V2","V3","V4","V5","V7","V8","V9","V11","V16","Class"], axis = 1)

df = pd.concat([df, decode], axis = 1)

new_order = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V12', 'V13', 'V14', 'V15', 'V16', 'Class']

df = df.reindex(columns=new_order)

df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,Class
0,58.0,management,married,tertiary,no,2143.0,yes,no,unknown,5.0,may,261.0,1.0,-1.0,0.0,unknown,1
1,44.0,technician,single,secondary,no,29.0,yes,no,unknown,5.0,may,151.0,1.0,-1.0,0.0,unknown,1
2,33.0,entrepreneur,married,secondary,no,2.0,yes,yes,unknown,5.0,may,76.0,1.0,-1.0,0.0,unknown,1
3,47.0,blue-collar,married,unknown,no,1506.0,yes,no,unknown,5.0,may,92.0,1.0,-1.0,0.0,unknown,1
4,33.0,unknown,single,unknown,no,1.0,no,no,unknown,5.0,may,198.0,1.0,-1.0,0.0,unknown,1


In [4]:
# Sütunların yeninden adlandırılması

df.rename(columns={'V1': 'yas',
                   'V2': 'is_turu',
                   'V3': 'medeni_hal',
                   'V4': 'egitim',
                   'V5': 'odeme_durumu',
                   'V6': 'bakiye',
                   'V7': 'konut_kredi_durum',
                   'V8': 'bireysel_kredi_durum',
                   'V9': 'iletisim_turu',
                   'V10': 'son_iletisim_gunu',
                   'V11': 'son_iletisim_ayi',
                   'V12': 'sure_saniye',
                   'V13': 'kampanya',
                   'V14': 'gecen_gun_sayisi',
                   'V15': 'onceki_iletisim_sayisi',
                   'V16': 'onceki_kampanya_sonucu',
                   'Class': 'abone_oldu_mu'}, inplace=True)

# İş türü değişkenindeki alt kategorilerin yeniden isimlendirilmesi

df["is_turu"] = df["is_turu"].replace("admin.", "admin")
df["is_turu"] = df["is_turu"].replace("blue-collar", "bluecollar")
df["is_turu"] = df["is_turu"].replace("self-employed", "selfemployed")

In [5]:
# Numerik ve kategorik kolonların seçilmesi

num_cols = ["yas", "bakiye", "son_iletisim_gunu", "sure_saniye",
            "kampanya", "gecen_gun_sayisi", "onceki_iletisim_sayisi"]

cat_cols = ["is_turu", "medeni_hal", "egitim", "odeme_durumu",
            "konut_kredi_durum", "bireysel_kredi_durum", "iletisim_turu",
            "son_iletisim_ayi", "onceki_kampanya_sonucu", "abone_oldu_mu"]

# Feature Engineering

In [6]:
## UNKNOWN SORUNUNUN ÇÖZÜLMESİ

# İş türü değişkenindeki unknown alt kategorilerinin mod ile doldurulması

mode_value = df[df['is_turu'] != 'unknown']['is_turu'].mode()[0]
df['is_turu'].replace('unknown', mode_value, inplace=True)

# Eğitim değişkenindeki unknown alt kategorilerinin mod ile doldurulması

mode_value = df[df['egitim'] != 'unknown']['egitim'].mode()[0]
df['egitim'].replace('unknown', mode_value, inplace=True)

# Unknown alt kategorisi yüksek olan değişkenlerin seçilmesi ve unknown alt kategorilerinin silinmesi

has_unk_cat = ["onceki_kampanya_sonucu", "iletisim_turu"]
df[has_unk_cat] = df[has_unk_cat].replace('unknown', np.nan)

# Silinen değerlerin rastgele doldurulmasıa

def fill_missing_values_randomly(df, column_name):
    non_missing_values = df[column_name].dropna().unique()
    df[column_name] = df[column_name].apply(
        lambda x: np.random.choice(non_missing_values) if pd.isnull(x) else x
    )

def fill_columns_randomly(df):
    columns_to_fill = has_unk_cat
    
    for column in columns_to_fill:
        fill_missing_values_randomly(df, column)

fill_columns_randomly(df)

## VERİ TÜRETME - EDA İÇİN

# Yaş kategori gruplama

def yas_aralik(yas):
    if 18 <= yas <= 30:
        return "Genç Yetişkin"
    elif 30 < yas <= 65:
        return "Yetişkin"
    else:
        return "Yaşlı"

df["yas_aralik"] = df["yas"].apply(yas_aralik)

# Bakiye kategori gruplama

q1 = df['bakiye'].quantile(0.25)
q2 = df['bakiye'].quantile(0.50)
q3 = df['bakiye'].quantile(0.75)

def bakiye_aralik(bakiye):
    if bakiye <= q1:
        return "Düşük Bakiye" # -8019, 72 arası
    elif q1 < bakiye <= q2: 
        return "Orta Bakiye" # 73, 448 arası
    elif q2 < bakiye <= q3:
        return "Yüksek Bakiye" # 449, 1428 arası
    elif q3 < bakiye:
        return "Çok Yüksek Bakiye" # 1429, 102127 arası

df["bakiye_aralik"] = df["bakiye"].apply(bakiye_aralik)

# Mesleğe göre çalışıyor veya çalışmıyor olarak gruplama

def calisma_durum(is_turu):
    if is_turu in ["unemployed", "student", "retired"]:
        return "Çalışmıyor"
    else:
        return "Çalışıyor"
    
df["calisma_durum"] = df["is_turu"].apply(calisma_durum)

# Gün sayısı -1 ise iletişime geçilmedi -1'den farklı ise iletişime geçildi olarak gruplama

def iletisim_durum(gecen_gun_sayisi):
    if gecen_gun_sayisi == -1:
        return "İletişime Geçilmedi"
    else:
        return "İletişime Geçildi"
    
df["iletisim_durum"] = df["gecen_gun_sayisi"].apply(iletisim_durum)

#cat_cols.extend(["yas_aralik", "bakiye_aralik", "calisma_durum", "iletisim_durum"])

df = df.drop(["yas_aralik", "bakiye_aralik", "calisma_durum", "iletisim_durum"], axis=1)

# Data Preprocessing

In [7]:
                                    # ENCODING

#--------------------------------------------------------------------------------------

label_encode = ["egitim"]

label_encoder = LabelEncoder()

for col in label_encode:
    df[col] = label_encoder.fit_transform(df[col])

#--------------------------------------------------------------------------------------

is_turu_encode = {'bluecollar':1, 'management':2, 'technician':3, 'admin':4,
                'services':5, 'retired':6, 'selfemployed':7, 'entrepreneur':8,
                'unemployed':9, 'housemaid':10, 'student':11}

df['is_turu'] = df['is_turu'].map(is_turu_encode)

#--------------------------------------------------------------------------------------

medeni_hal_encode = {'married':1, 'single':2, 'divorced':3}

df['medeni_hal'] = df['medeni_hal'].map(medeni_hal_encode)

#--------------------------------------------------------------------------------------

odeme_durumu_encode = {'no':1, 'yes':2}

df['odeme_durumu'] = df['odeme_durumu'].map(odeme_durumu_encode)

#--------------------------------------------------------------------------------------

konut_kredi_durum_encode = {'no':1, 'yes':2}

df['konut_kredi_durum'] = df['konut_kredi_durum'].map(konut_kredi_durum_encode)

#--------------------------------------------------------------------------------------

bireysel_kredi_durum_encode = {'no':1, 'yes':2}

df['bireysel_kredi_durum'] = df['bireysel_kredi_durum'].map(bireysel_kredi_durum_encode)

#--------------------------------------------------------------------------------------

iletisim_turu_encode = {'cellular':1, 'telephone':2}

df['iletisim_turu'] = df['iletisim_turu'].map(iletisim_turu_encode)

#--------------------------------------------------------------------------------------

son_iletisim_ayi_encode = {'jan':1, 'feb':2, 'mar':3, 'apr':4,
                           'may':5, 'jun':6, 'jul':7, 'aug':8,
                           'sep':9, 'oct':10, 'nov':11, 'dec':12}

df['son_iletisim_ayi'] = df['son_iletisim_ayi'].map(son_iletisim_ayi_encode)

#--------------------------------------------------------------------------------------

onceki_kampanya_sonucu_encode = {'other':1, 'failure':2, 'success':3}

df['onceki_kampanya_sonucu'] = df['onceki_kampanya_sonucu'].map(onceki_kampanya_sonucu_encode)

#--------------------------------------------------------------------------------------

#abone_oldu_mu? --> 'no':1, 'yes':2

#--------------------------------------------------------------------------------------

#yas_aralik_encode = {'Genç Yetişkin':1, 'Yetişkin':2, 'Yaşlı':3}

#df['yas_aralik'] = df['yas_aralik'].map(yas_aralik_encode)

#--------------------------------------------------------------------------------------

#calisma_durum_encode = {'Çalışmıyor':1, 'Çalışıyor':2}

#df['calisma_durum'] = df['calisma_durum'].map(calisma_durum_encode)

#--------------------------------------------------------------------------------------

#iletisim_durum_encode = {'İletişime Geçilmedi':1, 'İletişime Geçildi':2}

#df['iletisim_durum'] = df['iletisim_durum'].map(iletisim_durum_encode)

#--------------------------------------------------------------------------------------

In [8]:
# Aykırı değer probleminin çözülmesi

bounds = {}
for col in num_cols:
    q1 = df[col].quantile(0.10)
    q3 = df[col].quantile(0.90)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    bounds[col] = {'lower_bound': lower_bound, 'upper_bound': upper_bound}

# Her bir sayısal değişken için belirli aralık içindeki değerleri seçelim
#selected_values = {}
#for col in num_cols:
#    selected_values[col] = df[col][(df[col] >= bounds[col]['lower_bound']) & (df[col] <= bounds[col]['upper_bound'])]
    
# Belirli aralık dışındaki değerleri çıkaralım
for col in num_cols:
    df = df[(df[col] >= bounds[col]['lower_bound']) & (df[col] <= bounds[col]['upper_bound'])]

#print(df)

In [9]:
from sklearn.model_selection import train_test_split

y = df["abone_oldu_mu"]                  
X = df.drop(["abone_oldu_mu"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Standardization - Standart Scaler

continuous_cols = num_cols

scaler = StandardScaler()

X_train[continuous_cols] = scaler.fit_transform(X_train[continuous_cols])
X_test[continuous_cols] = scaler.transform(X_test[continuous_cols])

In [11]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(33325, 16)
(8332, 16)
(33325,)
(8332,)


# Lojistik Regresyon

In [None]:
# Lojistik Regresyon - Without Oversampling

accuracy= []
recall =[]
roc_auc= []
precision = []
aucpr = []
f1 = []

log_reg = LogisticRegression(solver='sag', random_state=42)
log_reg.fit(X_train, y_train)

y_pred = log_reg.predict(X_test)
y_pred_prob = log_reg.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))

precision.append(round(precision_score(y_test, y_pred, pos_label='2'),4))
recall.append(round(recall_score(y_test, y_pred, pos_label='2'),4))
f1.append(round(f1_score(y_test, y_pred, pos_label='2'),4))
aucpr.append(round(average_precision_score(y_test, y_pred_prob, pos_label='2'),4))
roc_auc.append(round(roc_auc_score(y_test, y_pred),4))

model_names = ['Log_Reg_WithoutOversampling']

result_model = pd.DataFrame({'Precision':precision, 'Recall':recall, 'F1-Score':f1, 'Roc_Auc':roc_auc, 'Aucpr':aucpr}, index=model_names)

result_model

In [None]:
# Lojistik Regresyon - Random Over Sampler

accuracy= []
recall =[]
roc_auc= []
precision = []
aucpr = []
f1 = []

# Oversampling

print("Before oversampling: ",Counter(y_train))
RandomOverSampler = RandomOverSampler(random_state=42)
X_train_oversampler,y_train_oversampler = RandomOverSampler.fit_resample(X_train,y_train)
print("After oversampling: ",Counter(y_train_oversampler))
print("- - - - - - - - - - - - - - - - - - - - - - - - - ")

# Model Eğitim - Tahmin

log_reg = LogisticRegression(solver='sag', random_state=42)
log_reg.fit(X_train_oversampler, y_train_oversampler)

y_pred = log_reg.predict(X_test)
y_pred_prob = log_reg.predict_proba(X_test)[:, 1]

# Model Metrikleri

print(classification_report(y_test, y_pred))

precision.append(round(precision_score(y_test, y_pred, pos_label='2'),4))
recall.append(round(recall_score(y_test, y_pred, pos_label='2'),4))
f1.append(round(f1_score(y_test, y_pred, pos_label='2'),4))
aucpr.append(round(average_precision_score(y_test, y_pred_prob, pos_label='2'),4))
roc_auc.append(round(roc_auc_score(y_test, y_pred),4))

model_names = ['Log_Reg_RandomOverSampler']

result_model = pd.DataFrame({'Precision':precision, 'Recall':recall, 'F1-Score':f1, 'Roc_Auc':roc_auc, 'Aucpr':aucpr}, index=model_names)

result_model

In [None]:
# Lojistik Regresyon - Smote

accuracy= []
recall =[]
roc_auc= []
precision = []
aucpr = []
f1 = []

# Oversampling

print("Before oversampling: ",Counter(y_train))
SMOTE = SMOTE(random_state=42)
X_train_smote,y_train_smote= SMOTE.fit_resample(X_train,y_train)
print("After oversampling: ",Counter(y_train_smote))
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - -")

# Model Eğitim - Tahmin

log_reg = LogisticRegression(solver='sag', random_state=42)
log_reg.fit(X_train_smote, y_train_smote)

y_pred = log_reg.predict(X_test)
y_pred_prob = log_reg.predict_proba(X_test)[:, 1]

# Model Metrikleri

print(classification_report(y_test, y_pred))
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - -")

precision.append(round(precision_score(y_test, y_pred, pos_label='2'),4))
recall.append(round(recall_score(y_test, y_pred, pos_label='2'),4))
f1.append(round(f1_score(y_test, y_pred, pos_label='2'),4))
aucpr.append(round(average_precision_score(y_test, y_pred_prob, pos_label='2'),4))
roc_auc.append(round(roc_auc_score(y_test, y_pred),4))

model_names = ['Log_Reg_Smote']

result_model = pd.DataFrame({'Precision':precision, 'Recall':recall, 'F1-Score':f1, 'Roc_Auc':roc_auc, 'Aucpr':aucpr}, index=model_names)

result_model

In [None]:
# Lojistik Regresyon - BorderLine Smote

accuracy= []
recall =[]
roc_auc= []
precision = []
aucpr = []
f1 = []

# Oversampling

print("Before oversampling: ",Counter(y_train))
BorderlineSMOTE = BorderlineSMOTE(random_state=42)
X_train_border,y_train_border= BorderlineSMOTE.fit_resample(X_train,y_train)
print("After oversampling: ",Counter(y_train_border))
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - -")

# Model Eğitim - Tahmin

log_reg = LogisticRegression(solver='sag', random_state=42)
log_reg.fit(X_train_border, y_train_border)

y_pred = log_reg.predict(X_test)
y_pred_prob = log_reg.predict_proba(X_test)[:, 1]

# Model Metrikleri

print(classification_report(y_test, y_pred))
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - -")

precision.append(round(precision_score(y_test, y_pred, pos_label='2'),4))
recall.append(round(recall_score(y_test, y_pred, pos_label='2'),4))
f1.append(round(f1_score(y_test, y_pred, pos_label='2'),4))
aucpr.append(round(average_precision_score(y_test, y_pred_prob, pos_label='2'),4))
roc_auc.append(round(roc_auc_score(y_test, y_pred),4))

model_names = ['Log_Reg_BorderLine_Smote']

result_model = pd.DataFrame({'Precision':precision, 'Recall':recall, 'F1-Score':f1, 'Roc_Auc':roc_auc, 'Aucpr':aucpr}, index=model_names)

result_model

In [None]:
# Lojistik Regresyon - Random Undersampling

accuracy= []
recall =[]
roc_auc= []
precision = []
aucpr = []
f1 = []

# Undersampling

print("Before undersampling: ",Counter(y_train))
RandomUnderSampler = RandomUnderSampler(random_state=42)
X_train_undersamp,y_train_undersamp= RandomUnderSampler.fit_resample(X_train,y_train)
print("After undersampling: ",Counter(y_train_undersamp))
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - -")

# Model Eğitim - Tahmin

log_reg = LogisticRegression(solver='sag', random_state=42)
log_reg.fit(X_train_undersamp, y_train_undersamp)

y_pred = log_reg.predict(X_test)
y_pred_prob = log_reg.predict_proba(X_test)[:, 1]

# Model Metrikleri

print(classification_report(y_test, y_pred))
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - -")

precision.append(round(precision_score(y_test, y_pred, pos_label='2'),4))
recall.append(round(recall_score(y_test, y_pred, pos_label='2'),4))
f1.append(round(f1_score(y_test, y_pred, pos_label='2'),4))
aucpr.append(round(average_precision_score(y_test, y_pred_prob, pos_label='2'),4))
roc_auc.append(round(roc_auc_score(y_test, y_pred),4))

model_names = ['Log_Reg_RandomOversampling']

result_model = pd.DataFrame({'Precision':precision, 'Recall':recall, 'F1-Score':f1, 'Roc_Auc':roc_auc, 'Aucpr':aucpr}, index=model_names)

result_model

In [None]:
# Lojistik Regresyon - TomekLinks

accuracy= []
recall =[]
roc_auc= []
precision = []
aucpr = []
f1 = []

# Undersampling

print("Before undersampling: ",Counter(y_train))
TomekLinks = TomekLinks()
X_train_tomek, y_train_tomek = TomekLinks.fit_resample(X_train,y_train)
print("After undersampling: ",Counter(y_train_tomek))
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - -")

# Model Eğitim - Tahmin

log_reg = LogisticRegression(solver='sag', random_state=42)
log_reg.fit(X_train_tomek, y_train_tomek)

y_pred = log_reg.predict(X_test)
y_pred_prob = log_reg.predict_proba(X_test)[:, 1]

# Model Metrikleri

print(classification_report(y_test, y_pred))
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - -")

precision.append(round(precision_score(y_test, y_pred, pos_label='2'),4))
recall.append(round(recall_score(y_test, y_pred, pos_label='2'),4))
f1.append(round(f1_score(y_test, y_pred, pos_label='2'),4))
aucpr.append(round(average_precision_score(y_test, y_pred_prob, pos_label='2'),4))
roc_auc.append(round(roc_auc_score(y_test, y_pred),4))

model_names = ['Log_Reg_TomekLinks']

result_model = pd.DataFrame({'Precision':precision, 'Recall':recall, 'F1-Score':f1, 'Roc_Auc':roc_auc, 'Aucpr':aucpr}, index=model_names)

result_model

# Destek Vektör Makinesi(SVM)

In [None]:
# Destek Vektör Makinesi - Without Sampling

accuracy= []
recall =[]
roc_auc= []
precision = []
aucpr = []
f1 = []

# Model Eğitim - Tahmin

svc = SVC(C=10, gamma='auto', random_state=42, probability=True)
svc.fit(X_train, y_train)

y_pred = svc.predict(X_test)
y_pred_prob = svc.predict_proba(X_test)[:, 1]

# Model Metrikleri

print(classification_report(y_test, y_pred))
print("- - - - - - - - - - - - - - - - - - - - - - - - - ")

precision.append(round(precision_score(y_test, y_pred, pos_label='2'),4))
recall.append(round(recall_score(y_test, y_pred, pos_label='2'),4))
f1.append(round(f1_score(y_test, y_pred, pos_label='2'),4))
aucpr.append(round(average_precision_score(y_test, y_pred_prob, pos_label='2'),4))
roc_auc.append(round(roc_auc_score(y_test, y_pred),4))

model_names = ['SVM_Without_Sampling']

result_model = pd.DataFrame({'Precision':precision, 'Recall':recall, 'F1-Score':f1, 'Roc_Auc':roc_auc, 'Aucpr':aucpr}, index=model_names)

result_model

In [None]:
# Destek Vektör Makinesi - Random Over Sampling

accuracy= []
recall =[]
roc_auc= []
precision = []
aucpr = []
f1 = []

# Oversampling

print("Before oversampling: ",Counter(y_train))
RandomOverSampler = RandomOverSampler(random_state=42)
X_train_oversampler,y_train_oversampler = RandomOverSampler.fit_resample(X_train,y_train)
print("After oversampling: ",Counter(y_train_oversampler))
print("- - - - - - - - - - - - - - - - - - - - - - - - - ")

# Model Eğitim - Tahmin

svc = SVC(C=10, gamma='auto', random_state=42, probability=True)
svc.fit(X_train_oversampler, y_train_oversampler)

y_pred = svc.predict(X_test)
y_pred_prob = svc.predict_proba(X_test)[:, 1]

# Model Metrikleri

print(classification_report(y_test, y_pred))
print("- - - - - - - - - - - - - - - - - - - - - - - - - ")

precision.append(round(precision_score(y_test, y_pred, pos_label='2'),4))
recall.append(round(recall_score(y_test, y_pred, pos_label='2'),4))
f1.append(round(f1_score(y_test, y_pred, pos_label='2'),4))
aucpr.append(round(average_precision_score(y_test, y_pred_prob, pos_label='2'),4))
roc_auc.append(round(roc_auc_score(y_test, y_pred),4))

model_names = ['SVM_Random_Over_Sampling']

result_model = pd.DataFrame({'Precision':precision, 'Recall':recall, 'F1-Score':f1, 'Roc_Auc':roc_auc, 'Aucpr':aucpr}, index=model_names)

result_model

In [None]:
# Destek Vektör Makinesi - Smote

accuracy= []
recall =[]
roc_auc= []
precision = []
aucpr = []
f1 = []

# Oversampling

print("Before oversampling: ",Counter(y_train))
SMOTE = SMOTE(random_state=42)
X_train_smote,y_train_smote= SMOTE.fit_resample(X_train,y_train)
print("After oversampling: ",Counter(y_train_smote))
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - -")

# Model Eğitim - Tahmin

svc = SVC(C=10, gamma='auto', random_state=42, probability=True)
svc.fit(X_train_smote, y_train_smote)

y_pred = svc.predict(X_test)
y_pred_prob = svc.predict_proba(X_test)[:, 1]

# Model Metrikleri

print(classification_report(y_test, y_pred))
print("- - - - - - - - - - - - - - - - - - - - - - - - - ")

precision.append(round(precision_score(y_test, y_pred, pos_label='2'),4))
recall.append(round(recall_score(y_test, y_pred, pos_label='2'),4))
f1.append(round(f1_score(y_test, y_pred, pos_label='2'),4))
aucpr.append(round(average_precision_score(y_test, y_pred_prob, pos_label='2'),4))
roc_auc.append(round(roc_auc_score(y_test, y_pred),4))

model_names = ['SVM_Smote']

result_model = pd.DataFrame({'Precision':precision, 'Recall':recall, 'F1-Score':f1, 'Roc_Auc':roc_auc, 'Aucpr':aucpr}, index=model_names)

result_model

In [None]:
# Destek Vektör Makinesi - BorderLine Smote

accuracy= []
recall =[]
roc_auc= []
precision = []
aucpr = []
f1 = []

# Oversampling

print("Before oversampling: ",Counter(y_train))
BorderlineSMOTE = BorderlineSMOTE(random_state=42)
X_train_border,y_train_border= BorderlineSMOTE.fit_resample(X_train,y_train)
print("After oversampling: ",Counter(y_train_border))
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - -")

# Model Eğitim - Tahmin

svc = SVC(C=10, gamma='auto', random_state=42, probability=True)
svc.fit(X_train_border, y_train_border)

y_pred = svc.predict(X_test)
y_pred_prob = svc.predict_proba(X_test)[:, 1]

# Model Metrikleri

print(classification_report(y_test, y_pred))
print("- - - - - - - - - - - - - - - - - - - - - - - - - ")

precision.append(round(precision_score(y_test, y_pred, pos_label='2'),4))
recall.append(round(recall_score(y_test, y_pred, pos_label='2'),4))
f1.append(round(f1_score(y_test, y_pred, pos_label='2'),4))
aucpr.append(round(average_precision_score(y_test, y_pred_prob, pos_label='2'),4))
roc_auc.append(round(roc_auc_score(y_test, y_pred),4))

model_names = ['SVM_BorderLine_Smote']

result_model = pd.DataFrame({'Precision':precision, 'Recall':recall, 'F1-Score':f1, 'Roc_Auc':roc_auc, 'Aucpr':aucpr}, index=model_names)

result_model

In [None]:
# Destek Vektör Makinesi - Random Under Sampling

accuracy= []
recall =[]
roc_auc= []
precision = []
aucpr = []
f1 = []

# Undersampling

print("Before undersampling: ",Counter(y_train))
RandomUnderSampler = RandomUnderSampler(random_state=42)
X_train_undersamp,y_train_undersamp= RandomUnderSampler.fit_resample(X_train,y_train)
print("After undersampling: ",Counter(y_train_undersamp))
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - -")

# Model Eğitim - Tahmin

svc = SVC(C=10, gamma='auto', random_state=42, probability=True)
svc.fit(X_train_undersamp, y_train_undersamp)

y_pred = svc.predict(X_test)
y_pred_prob = svc.predict_proba(X_test)[:, 1]

# Model Metrikleri

print(classification_report(y_test, y_pred))
print("- - - - - - - - - - - - - - - - - - - - - - - - - ")

precision.append(round(precision_score(y_test, y_pred, pos_label='2'),4))
recall.append(round(recall_score(y_test, y_pred, pos_label='2'),4))
f1.append(round(f1_score(y_test, y_pred, pos_label='2'),4))
aucpr.append(round(average_precision_score(y_test, y_pred_prob, pos_label='2'),4))
roc_auc.append(round(roc_auc_score(y_test, y_pred),4))

model_names = ['SVM_Random_Under_Sampling']

result_model = pd.DataFrame({'Precision':precision, 'Recall':recall, 'F1-Score':f1, 'Roc_Auc':roc_auc, 'Aucpr':aucpr}, index=model_names)

result_model

In [None]:
# Destek Vektör Makinesi - TomekLinks

accuracy= []
recall =[]
roc_auc= []
precision = []
aucpr = []
f1 = []

# Undersampling

print("Before undersampling: ",Counter(y_train))
TomekLinks = TomekLinks()
X_train_tomek, y_train_tomek = TomekLinks.fit_resample(X_train,y_train)
print("After undersampling: ",Counter(y_train_tomek))
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - -")

# Model Eğitim - Tahmin

svc = SVC(C=10, gamma='auto', random_state=42, probability=True)
svc.fit(X_train_tomek, y_train_tomek)

y_pred = svc.predict(X_test)
y_pred_prob = svc.predict_proba(X_test)[:, 1]

# Model Metrikleri

print(classification_report(y_test, y_pred))
print("- - - - - - - - - - - - - - - - - - - - - - - - - ")

precision.append(round(precision_score(y_test, y_pred, pos_label='2'),4))
recall.append(round(recall_score(y_test, y_pred, pos_label='2'),4))
f1.append(round(f1_score(y_test, y_pred, pos_label='2'),4))
aucpr.append(round(average_precision_score(y_test, y_pred_prob, pos_label='2'),4))
roc_auc.append(round(roc_auc_score(y_test, y_pred),4))

model_names = ['SVM_TomekLinks']

result_model = pd.DataFrame({'Precision':precision, 'Recall':recall, 'F1-Score':f1, 'Roc_Auc':roc_auc, 'Aucpr':aucpr}, index=model_names)

result_model

# Rastgele Orman(RF)

In [12]:
# Random Forest - Without Sampling

accuracy= []
recall =[]
roc_auc= []
precision = []
aucpr = []
f1 = []

# Model Eğitim - Tahmin

rfc = RandomForestClassifier(random_state=42, n_estimators=100)
rfc.fit(X_train, y_train)

y_pred = rfc.predict(X_test)
y_pred_prob = rfc.predict_proba(X_test)[:, 1]

# Model Metrikleri

print(classification_report(y_test, y_pred))
print("- - - - - - - - - - - - - - - - - - - - - - - - - ")

precision.append(round(precision_score(y_test, y_pred, pos_label='2'),4))
recall.append(round(recall_score(y_test, y_pred, pos_label='2'),4))
f1.append(round(f1_score(y_test, y_pred, pos_label='2'),4))
aucpr.append(round(average_precision_score(y_test, y_pred_prob, pos_label='2'),4))
roc_auc.append(round(roc_auc_score(y_test, y_pred),4))

model_names = ['RF_Without_Sampling']

result_model = pd.DataFrame({'Precision':precision, 'Recall':recall, 'F1-Score':f1, 'Roc_Auc':roc_auc, 'Aucpr':aucpr}, index=model_names)

result_model

              precision    recall  f1-score   support

           1       0.93      0.98      0.95      7448
           2       0.66      0.39      0.49       884

    accuracy                           0.91      8332
   macro avg       0.79      0.68      0.72      8332
weighted avg       0.90      0.91      0.90      8332

- - - - - - - - - - - - - - - - - - - - - - - - - 


Unnamed: 0,Precision,Recall,F1-Score,Roc_Auc,Aucpr
RF_Without_Sampling,0.6571,0.388,0.4879,0.682,0.5815


In [13]:
# Random Forest - Random Over Sampling

accuracy= []
recall =[]
roc_auc= []
precision = []
aucpr = []
f1 = []

# Oversampling

print("Before oversampling: ",Counter(y_train))
RandomOverSampler = RandomOverSampler(random_state=42)
X_train_oversampler,y_train_oversampler = RandomOverSampler.fit_resample(X_train,y_train)
print("After oversampling: ",Counter(y_train_oversampler))
print("- - - - - - - - - - - - - - - - - - - - - - - - - ")

# Model Eğitim - Tahmin

rfc = RandomForestClassifier(random_state=42, n_estimators=100)
rfc.fit(X_train_oversampler, y_train_oversampler)

y_pred = rfc.predict(X_test)
y_pred_prob = rfc.predict_proba(X_test)[:, 1]

# Model Metrikleri

print(classification_report(y_test, y_pred))
print("- - - - - - - - - - - - - - - - - - - - - - - - - ")

precision.append(round(precision_score(y_test, y_pred, pos_label='2'),4))
recall.append(round(recall_score(y_test, y_pred, pos_label='2'),4))
f1.append(round(f1_score(y_test, y_pred, pos_label='2'),4))
aucpr.append(round(average_precision_score(y_test, y_pred_prob, pos_label='2'),4))
roc_auc.append(round(roc_auc_score(y_test, y_pred),4))

model_names = ['RF_Random_Over_Sampling']

result_model = pd.DataFrame({'Precision':precision, 'Recall':recall, 'F1-Score':f1, 'Roc_Auc':roc_auc, 'Aucpr':aucpr}, index=model_names)

result_model

Before oversampling:  Counter({'1': 29689, '2': 3636})
After oversampling:  Counter({'1': 29689, '2': 29689})
- - - - - - - - - - - - - - - - - - - - - - - - - 
              precision    recall  f1-score   support

           1       0.94      0.96      0.95      7448
           2       0.59      0.49      0.53       884

    accuracy                           0.91      8332
   macro avg       0.76      0.72      0.74      8332
weighted avg       0.90      0.91      0.91      8332

- - - - - - - - - - - - - - - - - - - - - - - - - 


Unnamed: 0,Precision,Recall,F1-Score,Roc_Auc,Aucpr
RF_Random_Over_Sampling,0.589,0.4864,0.5328,0.7231,0.5719


In [14]:
# Random Forest - Smote

accuracy= []
recall =[]
roc_auc= []
precision = []
aucpr = []
f1 = []

# Oversampling

print("Before oversampling: ",Counter(y_train))
SMOTE = SMOTE(random_state=42)
X_train_smote,y_train_smote= SMOTE.fit_resample(X_train,y_train)
print("After oversampling: ",Counter(y_train_smote))
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - -")

# Model Eğitim - Tahmin

rfc = RandomForestClassifier(random_state=42, n_estimators=100)
rfc.fit(X_train_smote, y_train_smote)

y_pred = rfc.predict(X_test)
y_pred_prob = rfc.predict_proba(X_test)[:, 1]

# Model Metrikleri

print(classification_report(y_test, y_pred))
print("- - - - - - - - - - - - - - - - - - - - - - - - - ")

precision.append(round(precision_score(y_test, y_pred, pos_label='2'),4))
recall.append(round(recall_score(y_test, y_pred, pos_label='2'),4))
f1.append(round(f1_score(y_test, y_pred, pos_label='2'),4))
aucpr.append(round(average_precision_score(y_test, y_pred_prob, pos_label='2'),4))
roc_auc.append(round(roc_auc_score(y_test, y_pred),4))

model_names = ['RF_Smote']

result_model = pd.DataFrame({'Precision':precision, 'Recall':recall, 'F1-Score':f1, 'Roc_Auc':roc_auc, 'Aucpr':aucpr}, index=model_names)

result_model

Before oversampling:  Counter({'1': 29689, '2': 3636})
After oversampling:  Counter({'1': 29689, '2': 29689})
- - - - - - - - - - - - - - - - - - - - - - - - - - - -
              precision    recall  f1-score   support

           1       0.95      0.93      0.94      7448
           2       0.50      0.61      0.55       884

    accuracy                           0.89      8332
   macro avg       0.73      0.77      0.75      8332
weighted avg       0.90      0.89      0.90      8332

- - - - - - - - - - - - - - - - - - - - - - - - - 


Unnamed: 0,Precision,Recall,F1-Score,Roc_Auc,Aucpr
RF_Smote,0.5009,0.6109,0.5505,0.7693,0.5338


In [15]:
# Random Forest - BorderLine Smote

accuracy= []
recall =[]
roc_auc= []
precision = []
aucpr = []
f1 = []

# Oversampling

print("Before oversampling: ",Counter(y_train))
BorderlineSMOTE = BorderlineSMOTE(random_state=42)
X_train_border,y_train_border= BorderlineSMOTE.fit_resample(X_train,y_train)
print("After oversampling: ",Counter(y_train_border))
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - -")

# Model Eğitim - Tahmin

rfc = RandomForestClassifier(random_state=42, n_estimators=100)
rfc.fit(X_train_border, y_train_border)

y_pred = rfc.predict(X_test)
y_pred_prob = rfc.predict_proba(X_test)[:, 1]

# Model Metrikleri

print(classification_report(y_test, y_pred))
print("- - - - - - - - - - - - - - - - - - - - - - - - - ")

precision.append(round(precision_score(y_test, y_pred, pos_label='2'),4))
recall.append(round(recall_score(y_test, y_pred, pos_label='2'),4))
f1.append(round(f1_score(y_test, y_pred, pos_label='2'),4))
aucpr.append(round(average_precision_score(y_test, y_pred_prob, pos_label='2'),4))
roc_auc.append(round(roc_auc_score(y_test, y_pred),4))

model_names = ['RF_BorderLine_Smote']

result_model = pd.DataFrame({'Precision':precision, 'Recall':recall, 'F1-Score':f1, 'Roc_Auc':roc_auc, 'Aucpr':aucpr}, index=model_names)

result_model

Before oversampling:  Counter({'1': 29689, '2': 3636})
After oversampling:  Counter({'1': 29689, '2': 29689})
- - - - - - - - - - - - - - - - - - - - - - - - - - - -
              precision    recall  f1-score   support

           1       0.95      0.93      0.94      7448
           2       0.51      0.62      0.56       884

    accuracy                           0.90      8332
   macro avg       0.73      0.77      0.75      8332
weighted avg       0.91      0.90      0.90      8332

- - - - - - - - - - - - - - - - - - - - - - - - - 


Unnamed: 0,Precision,Recall,F1-Score,Roc_Auc,Aucpr
RF_BorderLine_Smote,0.5056,0.6165,0.5556,0.7725,0.5301


In [16]:
# Random Forest - Random Under Sampling

accuracy= []
recall =[]
roc_auc= []
precision = []
aucpr = []
f1 = []

# Undersampling

print("Before undersampling: ",Counter(y_train))
RandomUnderSampler = RandomUnderSampler(random_state=42)
X_train_undersamp,y_train_undersamp= RandomUnderSampler.fit_resample(X_train,y_train)
print("After undersampling: ",Counter(y_train_undersamp))
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - -")

# Model Eğitim - Tahmin

rfc = RandomForestClassifier(random_state=42, n_estimators=100)
rfc.fit(X_train_undersamp, y_train_undersamp)

y_pred = rfc.predict(X_test)
y_pred_prob = rfc.predict_proba(X_test)[:, 1]

# Model Metrikleri

print(classification_report(y_test, y_pred))
print("- - - - - - - - - - - - - - - - - - - - - - - - - ")

precision.append(round(precision_score(y_test, y_pred, pos_label='2'),4))
recall.append(round(recall_score(y_test, y_pred, pos_label='2'),4))
f1.append(round(f1_score(y_test, y_pred, pos_label='2'),4))
aucpr.append(round(average_precision_score(y_test, y_pred_prob, pos_label='2'),4))
roc_auc.append(round(roc_auc_score(y_test, y_pred),4))

model_names = ['RF_Random_Under_Sampling']

result_model = pd.DataFrame({'Precision':precision, 'Recall':recall, 'F1-Score':f1, 'Roc_Auc':roc_auc, 'Aucpr':aucpr}, index=model_names)

result_model

Before undersampling:  Counter({'1': 29689, '2': 3636})
After undersampling:  Counter({'1': 3636, '2': 3636})
- - - - - - - - - - - - - - - - - - - - - - - - - - - -
              precision    recall  f1-score   support

           1       0.98      0.82      0.89      7448
           2       0.37      0.88      0.52       884

    accuracy                           0.83      8332
   macro avg       0.67      0.85      0.71      8332
weighted avg       0.92      0.83      0.85      8332

- - - - - - - - - - - - - - - - - - - - - - - - - 


Unnamed: 0,Precision,Recall,F1-Score,Roc_Auc,Aucpr
RF_Random_Under_Sampling,0.3668,0.8801,0.5178,0.8499,0.5414


In [17]:
# Random Forest - TomekLinks

accuracy= []
recall =[]
roc_auc= []
precision = []
aucpr = []
f1 = []

# Undersampling

print("Before undersampling: ",Counter(y_train))
TomekLinks = TomekLinks()
X_train_tomek, y_train_tomek = TomekLinks.fit_resample(X_train,y_train)
print("After undersampling: ",Counter(y_train_tomek))
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - -")

# Model Eğitim - Tahmin

rfc = RandomForestClassifier(random_state=42, n_estimators=100)
rfc.fit(X_train_tomek, y_train_tomek)

y_pred = rfc.predict(X_test)
y_pred_prob = rfc.predict_proba(X_test)[:, 1]

# Model Metrikleri

print(classification_report(y_test, y_pred))
print("- - - - - - - - - - - - - - - - - - - - - - - - - ")

precision.append(round(precision_score(y_test, y_pred, pos_label='2'),4))
recall.append(round(recall_score(y_test, y_pred, pos_label='2'),4))
f1.append(round(f1_score(y_test, y_pred, pos_label='2'),4))
aucpr.append(round(average_precision_score(y_test, y_pred_prob, pos_label='2'),4))
roc_auc.append(round(roc_auc_score(y_test, y_pred),4))

model_names = ['RF_TomekLinks']

result_model = pd.DataFrame({'Precision':precision, 'Recall':recall, 'F1-Score':f1, 'Roc_Auc':roc_auc, 'Aucpr':aucpr}, index=model_names)

result_model

Before undersampling:  Counter({'1': 29689, '2': 3636})
After undersampling:  Counter({'1': 28916, '2': 3636})
- - - - - - - - - - - - - - - - - - - - - - - - - - - -
              precision    recall  f1-score   support

           1       0.94      0.97      0.95      7448
           2       0.64      0.44      0.52       884

    accuracy                           0.91      8332
   macro avg       0.79      0.71      0.74      8332
weighted avg       0.90      0.91      0.91      8332

- - - - - - - - - - - - - - - - - - - - - - - - - 


Unnamed: 0,Precision,Recall,F1-Score,Roc_Auc,Aucpr
RF_TomekLinks,0.6374,0.4434,0.523,0.7067,0.5865
