# IMPORTS

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import scipy.stats as ss
from scipy.stats import skew

import warnings
from tqdm import tqdm
from pathlib import Path
from itertools import repeat

from sklearn.metrics import f1_score
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedGroupKFold

import lightgbm as lgbm
!pip install catboost
from catboost import CatBoostClassifier, Pool
from catboost.utils import get_roc_curve, select_threshold

pd.options.display.max_rows = 250
pd.options.display.max_columns = 250
pd.options.display.max_colwidth = 25
warnings.filterwarnings("ignore")

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


# KAGGLE

In [None]:
# Install the Kaggle CLI, then download the competition archive (a single .zip file).
# NOTE(review): the download presumably needs Kaggle API credentials to be configured
# in this runtime beforehand — confirm.
! pip install -q kaggle
! kaggle competitions download -c 'anadolu-hayat-emeklilik-datathon-2024'

Downloading anadolu-hayat-emeklilik-datathon-2024.zip to /content
 98% 91.0M/92.5M [00:02<00:00, 42.5MB/s]
100% 92.5M/92.5M [00:02<00:00, 34.1MB/s]


In [None]:
!mkdir "AHE_datathon"
!cd "AHE_datathon"

mkdir: cannot create directory ‘AHE_datathon’: File exists


In [None]:
! unzip "anadolu-hayat-emeklilik-datathon-2024.zip" -d AHE_datathon

Archive:  anadolu-hayat-emeklilik-datathon-2024.zip
  inflating: AHE_datathon/Urun Ozellikleri.xlsx  
  inflating: AHE_datathon/sample_submission.csv  
  inflating: AHE_datathon/test.csv   
  inflating: AHE_datathon/train.csv  


# DATA

In [None]:
# Absolute locations of the extracted competition CSV files (Colab `/content` layout).
train_path = "/content/AHE_datathon/train.csv"
test_path = "/content/AHE_datathon/test.csv"

In [None]:
# Load both splits. low_memory=False makes pandas parse each file in one pass rather than
# in chunks, which avoids mixed-dtype inference within a column.
train_df_main = pd.read_csv(train_path, low_memory=False)
test_df = pd.read_csv(test_path, low_memory=False)

In [None]:
# The product-feature spreadsheet is not used; its loader is kept commented out.
#ozellik = pd.read_excel("/kaggle/input/anadolu-hayat-emeklilik-datathon-2024/Urun Ozellikleri.xlsx")
# Submission template; its LABEL column is overwritten with predictions in the SUB section.
submission = pd.read_csv("/content/AHE_datathon/sample_submission.csv", low_memory=False)

In [None]:
# Alias, not a copy: `train_df` and `train_df_main` refer to the same DataFrame object.
train_df = train_df_main

In [None]:
# Name of the target column for the main classification task.
label = "LABEL"

# FEATURES

In [None]:
# Raw columns treated as categorical. `preprocess_dataset` fills their missing values with
# "UNKNOWN"; the columns that survive `extract_features` are later passed to CatBoost
# through `cat_features`.
cat_cols = ['FLAG',
 'SORU_YATIRIM_KARAKTERI_CVP',
 'SORU_MEDENI_HAL_CVP',
 'SORU_EGITIM_CVP',
 'PP_CINSIYET',
 'PP_MUSTERI_SEGMENTI',
 'PP_UYRUK',
 "PP_MESLEK",
 'IL']

# Raw columns treated as numeric.
# NOTE(review): `num_cols` is not referenced by any other cell visible in this notebook;
# it appears to be kept for reference only — confirm before removing.
num_cols = ['PP_YAS',
 'SORU_YATIRIM_KARAKTERI_RG',
 'SORU_MEDENI_HAL_RG',
 'SORU_EGITIM_RG',
 'SORU_GELIR_RG',
 'SORU_GELIR_CVP',
 'SORU_COCUK_SAYISI_CVP',
 'SORU_COCUK_SAYISI_RG',
 'VADE_TUTAR_0',
 'ODEME_TUTAR_0',
 'VADE_TUTAR_1',
 'ODEME_TUTAR_1',
 'VADE_TUTAR_2',
 'ODEME_TUTAR_2',
 'VADE_TUTAR_3',
 'ODEME_TUTAR_3',
 'VADE_TUTAR_4',
 'ODEME_TUTAR_4',
 'VADE_TUTAR_5',
 'ODEME_TUTAR_5',
 'VADE_TUTAR_6',
 'ODEME_TUTAR_6',
 'VADE_TUTAR_7',
 'ODEME_TUTAR_7',
 'VADE_TUTAR_8',
 'ODEME_TUTAR_8',
 'VADE_TUTAR_9',
 'ODEME_TUTAR_9',
 'VADE_TUTAR_10',
 'ODEME_TUTAR_10',
 'VADE_TUTAR_11',
 'ODEME_TUTAR_11',
 'SON_AY_KATKI_MIKTARI',
 'SON_CEYREK_KATKI_MIKTARI',
 'SON_CEYREK_KATKI_ADET',
 'SON_SENE_KATKI_MIKTARI',
 'SON_SENE_KATKI_ADET',
 'ANAPARA',
 'GETIRI',
 'BU09',
 'HU17',
 'AKTIF_ILK_POLICE_RG',
 'BU01',
 'BU03',
 'BU05',
 'BU07',
 'BU10',
 'BU12',
 'BU14',
 'BU16',
 'BU18',
 'BU20',
 'BU22',
 'BU24',
 'HU02',
 'HU04',
 'HU06',
 'HU10',
 'HU12',
 'HU14',
 'HU16',
 'HU19',
 'BU02',
 'BU06',
 'BU11',
 'BU15',
 'BU19',
 'BU23',
 'HU03',
 'HU07',
 'HU13',
 'HU18',
 'BU04',
 'BU13',
 'BU21',
 'HU05',
 'HU15',
 'BU08',
 'HU01',
 'BU17',
 'HU11',
 'BES_AYRILMA_TALEP_ADET',
 'ODEMEME_TALEP_ADET',
 'HAYAT_AYRILMA_TALEP_ADET',
 'BILGI_TALEP_ADET',
 'SON_AY_KATKI_ADET']

# PREPROCESS

In [None]:
def preprocess_dataset(df_, cat_columns=None):
    """Return a cleaned copy of a raw train/test frame; the input is not modified.

    Steps:
      * ``PP_CINSIYET``: 1 -> "ERKEK", anything else (including missing) -> "KADIN".
      * ``SORU_GELIR_CVP``: decimal comma -> float; values <= 500 or >= 60000 become NaN.
      * ``PP_MUSTERI_SEGMENTI``, ``PP_UYRUK``, ``IL``: cast to ``str``.
      * categorical columns: missing values filled with "UNKNOWN".

    Parameters
    ----------
    df_ : pandas.DataFrame
        Raw frame containing the columns listed above and every categorical column.
    cat_columns : sequence of str, optional
        Categorical columns to fill. Defaults to the notebook-level ``cat_cols`` list.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.
    """
    if cat_columns is None:
        cat_columns = cat_cols
    cat_columns = list(cat_columns)

    df = df_.copy()

    df["PP_CINSIYET"] = np.where(df["PP_CINSIYET"] == 1, "ERKEK", "KADIN")

    # Income arrives as text with a decimal comma. Keep only the open interval (500, 60000);
    # NaN fails both comparisons and therefore stays NaN.
    income = df["SORU_GELIR_CVP"].str.replace(",", ".", regex=False).astype(float)
    df["SORU_GELIR_CVP"] = income.where((income > 500) & (income < 60000))

    # The str cast happens before the fill below, so missing values in these three
    # columns end up as the literal string "nan" rather than "UNKNOWN".
    str_cols = ["PP_MUSTERI_SEGMENTI", "PP_UYRUK", "IL"]
    df[str_cols] = df[str_cols].astype(str)

    # NOTE(review): the values are written back through `.loc`, so depending on the pandas
    # version the columns may keep their object dtype instead of becoming `category`.
    # Later cells select categorical features by `dtype == object`, so this statement is
    # deliberately left unchanged.
    df.loc[:, cat_columns] = df.loc[:, cat_columns].fillna("UNKNOWN").astype("category")
    return df

# FEATURE EXTRACTION

In [None]:
def increase_features(df, cols, groupname=""):
    """Add month-over-month increase features to ``df`` in place.

    Each column in ``cols`` is compared with its predecessor in ``cols``. Two float
    columns are added:

    * ``<groupname>_increase_month_count``: number of comparisons where the later value
      is strictly greater (0.0 when there is none).
    * ``<groupname>_increase_last_month``: 0-based position of the last such comparison
      (-1.0 when there is none).

    The result is computed and assigned by row position, so it is correct for any index
    on ``df``. The previous groupby-based version was keyed by position but assigned by
    index label, which misaligned rows on a non-default index. A NaN on either side of a
    comparison counts as "no increase".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``cols``; modified in place.
    cols : sequence of str
        Column names in chronological order.
    groupname : str
        Prefix for the two new columns.
    """
    cols = list(cols)
    increased = df[cols[1:]].to_numpy() > df[cols[:-1]].to_numpy()

    positions = np.arange(increased.shape[1])
    # initial=-1 also covers fewer than two columns (no comparison at all).
    last_increase = np.where(increased, positions, -1).max(axis=1, initial=-1)

    df[groupname + "_increase_month_count"] = increased.sum(axis=1).astype(float)
    df[groupname + "_increase_last_month"] = last_increase.astype(float)

def extract_tutar_stats(df, cols=[], groupname="None"):
    """Append row-wise summary statistics of ``df[cols]`` to ``df`` in place.

    New columns are named ``groupname + suffix`` and added in this order: max, std, var,
    mean, sum, min, q05, q95, q25, q75, skew, kurt.
    """
    amounts = df[cols]

    def row_quantile(level):
        return lambda frame: frame.quantile(level, axis=1)

    reducers = (
        ("_max", lambda frame: frame.max(axis=1)),
        ("_std", lambda frame: frame.std(axis=1)),
        ("_var", lambda frame: frame.var(axis=1)),
        ("_mean", lambda frame: frame.mean(axis=1)),
        ("_sum", lambda frame: frame.sum(axis=1)),
        ("_min", lambda frame: frame.min(axis=1)),
        ("_q05", row_quantile(0.05)),
        ("_q95", row_quantile(0.95)),
        ("_q25", row_quantile(0.25)),
        ("_q75", row_quantile(0.75)),
        ("_skew", lambda frame: frame.skew(axis=1)),
        ("_kurt", lambda frame: frame.kurt(axis=1)),
    )
    for suffix, reduce_rows in reducers:
        df[groupname + suffix] = reduce_rows(amounts)

# Flag whether any of the 12 periods had an amount due but nothing paid.
def check_disruption(row):
    """Return 0 if some period i in 0..11 has VADE_TUTAR_i > 0 and ODEME_TUTAR_i == 0, else 1.

    Note the encoding: 0 means "disrupted", 1 means "not disrupted".
    """
    def missed_payment(period):
        due = row[f'VADE_TUTAR_{period}']
        paid = row[f'ODEME_TUTAR_{period}']
        return due > 0 and paid == 0

    return 0 if any(missed_payment(period) for period in range(12)) else 1

In [None]:
def _bin_codes(values, bins):
    """Bin ``values`` with ``pd.cut`` and return the interval codes as strings.

    Missing values get the code "-1".
    """
    return pd.cut(values, bins).cat.codes.astype(str)


def _join_as_text(df, columns):
    """Join the string form of ``columns`` row-wise, separated by "_"."""
    joined = df[columns[0]].astype(str)
    for col in columns[1:]:
        joined = joined + "_" + df[col].astype(str)
    return joined


def extract_features(df_):
    """Build the model feature frame from a frame produced by ``preprocess_dataset``.

    Adds payment totals, a payment-disruption flag, binned income/principal/"_RG"
    columns, combined categorical keys, due-minus-paid statistics, contribution sums,
    an age class and ``POLICY_AGE_AT_FLAG``. It then drops the raw demographic columns
    and every column whose name contains "TUTAR", "BU1", "HU0" or "BU0".

    The input is not modified. Output values match the previous implementation:
    missing binned values stay encoded as "-1" (the former ``.fillna("UNK")`` after
    ``astype(str)`` never matched anything), and ``disrupt`` is 0 when some period has an
    amount due but no payment, 1 otherwise.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Preprocessed frame with the VADE_TUTAR_0..11 / ODEME_TUTAR_0..11 columns and the
        demographic, contribution and questionnaire columns referenced below.

    Returns
    -------
    pandas.DataFrame
        New frame with the engineered features.
    """
    df = df_.copy()

    GELIR_bins = [
        -np.inf, 1000.0, 2000.0, 3500.0, 4000.0, 5000.0, 10000.0,
        15000.0, 20000.0, 30000.0, 50000.0, 75000.0, np.inf,
    ]
    ANAPARA_bins = [
        -np.inf, 1000.0, 2000.0, 3500.0, 4000.0, 5000.0, 8000.0, 10000.0, 30000.0,
        50000.0, 75000.0, 100000.0, 200000.0, 300000.0, 400000.0, 500000.0, 600000.0,
        np.inf,
    ]
    GELIR_RG_BINS = [
        -np.inf, 1, 5, 8, 12, 20, 30, 40, 50, 60, 70, 80, 100,
        120, 150, 180, 200, 250, 300, np.inf,
    ]

    df['total_vade'] = df.filter(like='VADE_TUTAR').sum(axis=1)
    df['total_odeme'] = df.filter(like='ODEME_TUTAR').sum(axis=1)

    # Vectorized form of the per-row check (amount due > 0 with payment == 0 in any of
    # the 12 periods); avoids a Python-level loop over every row.
    due_12 = df[[f'VADE_TUTAR_{i}' for i in range(12)]].to_numpy()
    paid_12 = df[[f'ODEME_TUTAR_{i}' for i in range(12)]].to_numpy()
    disrupted = ((due_12 > 0) & (paid_12 == 0)).any(axis=1)
    # Stored as object dtype so later cells pick the flag up as a categorical feature.
    df["disrupt"] = pd.Series(np.where(disrupted, 0, 1), index=df.index).astype(object)

    # The two lists are paired by position, so they rely on the VADE/ODEME columns
    # appearing in the same period order in the input frame.
    odenen_tutar_cols = [col for col in df_.columns if "ODEME_TU" in col]
    vade_tutar_cols = [col for col in df_.columns if "VADE_TU" in col]

    # Despite the "_qcut" suffix these are fixed-edge bins (pd.cut), not quantile bins.
    df["GELIR_qcut"] = _bin_codes(df["SORU_GELIR_CVP"], GELIR_bins)
    df["ANAPARA_qcut"] = _bin_codes(df["ANAPARA"], ANAPARA_bins)
    df["GELIR_RG_qcut"] = _bin_codes(df["SORU_GELIR_RG"], GELIR_RG_BINS)
    for rg_col in (
        "SORU_YATIRIM_KARAKTERI_RG",
        "SORU_MEDENI_HAL_RG",
        "SORU_EGITIM_RG",
        "SORU_COCUK_SAYISI_RG",
    ):
        df[rg_col + "_qcut"] = _bin_codes(df[rg_col], GELIR_RG_BINS)

    # Combined categorical keys.
    df["uyruk_il"] = _join_as_text(df, ["PP_UYRUK", "IL"])
    df["meslek_medenihal_egitim"] = _join_as_text(
        df, ["PP_MESLEK", "SORU_MEDENI_HAL_CVP", "SORU_EGITIM_CVP"]
    )
    df["meslek_medenihal"] = _join_as_text(df, ["PP_MESLEK", "SORU_MEDENI_HAL_CVP"])
    df["gelir_medenihal_egitim"] = _join_as_text(
        df, ["GELIR_qcut", "SORU_MEDENI_HAL_CVP", "SORU_EGITIM_CVP"]
    )
    df["yatirimkarakteri_gelir_anapara"] = _join_as_text(
        df, ["SORU_YATIRIM_KARAKTERI_CVP", "GELIR_qcut", "ANAPARA_qcut"]
    )
    df["GELIR_qcut_GELIR_RG_RG_qcut"] = _join_as_text(
        df, ["GELIR_qcut", "GELIR_RG_qcut", "SORU_MEDENI_HAL_CVP"]
    )

    df["odeme_existence_count"] = (df[odenen_tutar_cols] > 0).astype(int).sum(axis=1)

    # Amount due minus amount paid per period, computed once and reduced four ways.
    due_minus_paid = df[vade_tutar_cols].values - df[odenen_tutar_cols].values
    df["odeme_diff_vade_max"] = due_minus_paid.max(axis=1)
    df["odeme_diff_vade_min"] = due_minus_paid.min(axis=1)
    df["odeme_diff_vade_std"] = due_minus_paid.std(axis=1)
    df["odeme_diff_vade_sum"] = due_minus_paid.sum(axis=1)

    df["sum_katkı_MİKTAR"] = df[
        ["SON_AY_KATKI_MIKTARI", "SON_CEYREK_KATKI_MIKTARI", "SON_SENE_KATKI_MIKTARI"]
    ].sum(axis=1)
    df["sum_katkı_ADET"] = df[
        ["SON_CEYREK_KATKI_ADET", "SON_SENE_KATKI_ADET", "SON_AY_KATKI_ADET"]
    ].sum(axis=1)

    # Six equal-width age classes, encoded 0..5.
    df['PP_YAS_class'] = pd.cut(df['PP_YAS'], bins=6, labels=False)

    month_mapping = {
        'OCAK': 1, 'SUBAT': 2, 'MART': 3, "NISAN": 4, 'MAYIS': 5, 'HAZIRAN': 6,
        'TEMMUZ': 7, 'AGUSTOS': 8, 'EYLUL': 9, 'EKIM': 10, 'KASIM': 11, 'ARALIK': 12,
    }
    df["FLAG"] = df["FLAG"].map(month_mapping)

    # NOTE(review): astype(int) raises if FLAG holds a value outside month_mapping
    # (e.g. the "UNKNOWN" fill value) — confirm FLAG is always populated.
    df['POLICY_AGE_AT_FLAG'] = (
        df['FLAG'].astype(int) - df['AKTIF_ILK_POLICE_RG'].fillna(0).astype(int)
    )

    to_drop = [
        "SORU_MEDENI_HAL_CVP",
        "SORU_EGITIM_CVP",
        "PP_CINSIYET",
        "PP_MUSTERI_SEGMENTI",
        "PP_UYRUK",
        "PP_MESLEK",
        "IL",
        "PP_YAS",
        "GELIR_qcut",
        "GELIR_RG_qcut",
        "SORU_MEDENI_HAL_RG",
        "SORU_YATIRIM_KARAKTERI_RG",
        "SORU_EGITIM_RG",
        "SORU_COCUK_SAYISI_RG",
    ]
    name_tags = ("TUTAR", "BU1", "HU0", "BU0")
    to_drop += [col for col in df.columns if any(tag in col for tag in name_tags)]

    return df.drop(columns=to_drop)

In [None]:
# Run the same preprocessing + feature-extraction pipeline on both splits. Each function
# works on a copy, so `train_df` and `test_df` are left untouched.
train_fe = extract_features(preprocess_dataset(train_df))
test_fe = extract_features(preprocess_dataset(test_df))

In [None]:
# Remove the raw contribution columns from the training features; their sums remain in
# `sum_katkı_MİKTAR` / `sum_katkı_ADET`. Only `train_fe` is changed — `test_fe` keeps
# these columns and is narrowed later by selecting `feat_cols`.
# errors="ignore" plus rebinding (instead of inplace=True) keeps the cell re-runnable.
train_fe = train_fe.drop(
    columns=[
        "SON_AY_KATKI_ADET",
        "SON_AY_KATKI_MIKTARI",
        "SON_CEYREK_KATKI_ADET",
        "SON_CEYREK_KATKI_MIKTARI",
        "SON_SENE_KATKI_ADET",
        "SON_SENE_KATKI_MIKTARI",
    ],
    errors="ignore",
)

In [None]:
# Remove raw income / principal columns from the training features; their binned forms
# live on inside the combined categorical keys and `ANAPARA_qcut`. Only `train_fe` is
# changed. errors="ignore" plus rebinding keeps the cell re-runnable.
train_fe = train_fe.drop(
    columns=["SORU_GELIR_RG", "SORU_GELIR_CVP", "ANAPARA"],
    errors="ignore",
)

# FEATURE ENGINEERING

In [None]:
def replace_non_recurred_categorical_values(cat_col, train, test, inplace=True):
    """Blank out category values that occur in only one of the two frames.

    Every value of ``cat_col`` present in ``train`` or ``test`` but not in both is
    replaced with NaN in both frames. The column is reassigned on the frames passed in.
    ``inplace`` is accepted but not used. Returns None.
    """
    seen_in_train = pd.Index(train[cat_col].unique())
    seen_in_test = pd.Index(test[cat_col].unique())

    shared = seen_in_train.intersection(seen_in_test)
    one_sided = seen_in_train.union(seen_in_test).difference(shared)

    to_nan = {value: np.nan for value in one_sided}
    if not to_nan:
        return

    for frame in (train, test):
        frame[cat_col] = frame[cat_col].replace(to_nan)

In [None]:
#for cat_col in cat_cols:
    #replace_non_recurred_categorical_values(cat_col, train_fe, test_fe)

In [None]:
drop_cols = pd.Index(["MUSTERI_ID"])
label = "LABEL"

# Categorical features = surviving `cat_cols` plus every object-dtype column, minus the
# id and the target. Set subtraction (instead of list.remove) does not fail when LABEL
# or MUSTERI_ID is absent from the candidates, and sorting gives a stable order across
# kernel restarts (plain set iteration order is not stable for strings).
non_feature_cols = set(drop_cols) | {label}
cat_features = sorted(
    (
        set(cat_cols).intersection(train_fe.columns)
        | set(train_fe.dtypes[train_fe.dtypes == object].index)
    )
    - non_feature_cols
)

# All model inputs: every column except the id and the target (sorted by Index.difference).
feat_cols = list(train_fe.columns.difference(drop_cols.union([label])))


print("Categorical columns after categorical FE are {}".format(", ".join(cat_features)))
print("Length of categorical columns after categorical FE are {}".format(len(cat_features)))

Categorical columns after categorical FE are meslek_medenihal, gelir_medenihal_egitim, uyruk_il, SORU_MEDENI_HAL_RG_qcut, SORU_EGITIM_RG_qcut, meslek_medenihal_egitim, ANAPARA_qcut, SORU_YATIRIM_KARAKTERI_RG_qcut, GELIR_qcut_GELIR_RG_RG_qcut, yatirimkarakteri_gelir_anapara, FLAG, SORU_YATIRIM_KARAKTERI_CVP, disrupt, SORU_COCUK_SAYISI_RG_qcut
Length of categorical columns after categorical FE are 14


# PCA

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np

def preprocess_features(train_df, test_df, scaler, pca, feat_cols, cat_features):
    """Scale + PCA-reduce the numeric features and re-attach the categorical ones.

    The numeric inputs are the columns of ``feat_cols`` that are not in ``cat_features``
    and have a numeric dtype in ``train_df``. The same list is used for both frames, so
    extra columns present in only one frame (ids, the label, columns dropped from one
    split) cannot cause a shape mismatch or leak into the PCA. Missing numeric values
    are filled with the training mode of each column. ``scaler`` and ``pca`` are fitted
    on the training frame only.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames containing at least ``feat_cols``.
    scaler, pca : objects with ``fit_transform`` / ``transform``
        For example ``StandardScaler()`` and ``PCA(n_components=...)``.
    feat_cols : iterable of str
        All model input columns.
    cat_features : iterable of str
        Columns passed through as categorical features.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Train and test frames with columns ``PCA_1..PCA_k`` (float) followed by
        ``cat_features``. The rows keep the order and index of the inputs.
    """
    cat_features = list(cat_features)
    cat_set = set(cat_features)
    num_features = [
        col
        for col in feat_cols
        if col not in cat_set and pd.api.types.is_numeric_dtype(train_df[col])
    ]

    # Training mode per numeric column; a column with no observed value has no mode and
    # is left unfilled.
    fill_values = {}
    for col in num_features:
        modes = train_df[col].mode()
        if not modes.empty:
            fill_values[col] = modes.iloc[0]

    train_numerical = train_df[num_features].fillna(fill_values)
    test_numerical = test_df[num_features].fillna(fill_values)

    reduced_train = pca.fit_transform(scaler.fit_transform(train_numerical))
    reduced_test = pca.transform(scaler.transform(test_numerical))

    pca_columns = [f'PCA_{i+1}' for i in range(reduced_train.shape[1])]

    # Build the frames column by column so the PCA components stay float; stacking them
    # with the categorical columns in one array would turn everything into objects.
    combined_train_df = pd.DataFrame(reduced_train, columns=pca_columns, index=train_df.index)
    combined_test_df = pd.DataFrame(reduced_test, columns=pca_columns, index=test_df.index)

    for col in cat_features:
        train_col, test_col = train_df[col], test_df[col]
        if train_col.dtype.name == 'category':
            # Encode both splits with the training categories so codes agree.
            categories = train_col.cat.categories
            combined_train_df[col] = train_col.cat.codes
            combined_test_df[col] = pd.Series(
                pd.Categorical(test_col, categories=categories).codes,
                index=test_col.index,
            )
        else:
            combined_train_df[col] = train_col
            combined_test_df[col] = test_col

    return combined_train_df, combined_test_df


In [None]:
# Standardize the numeric features, then keep the first 10 principal components.
scaler = StandardScaler()
pca = PCA(n_components=10)

# Both objects are fitted on `train_fe` inside preprocess_features and only applied to
# `test_fe`; `feat_cols` and `cat_features` come from the FEATURE ENGINEERING section.
processed_train, processed_test = preprocess_features(train_fe, test_fe, scaler, pca, feat_cols, cat_features)

In [None]:
# Keep the PCA feature names under their own variable. `feat_cols` is still used by the
# cells below to index `train_fe` / `test_fe`, which have no PCA_* columns, so it must
# not be overwritten here. Excluding "LABEL" keeps the list correct on a re-run.
pca_feat_cols = [col for col in processed_train.columns if col != "LABEL"]
# Positional assignment: the rows of processed_train are in the same order as train_fe.
processed_train["LABEL"] = train_fe['LABEL'].to_numpy()

In [None]:
processed_train

# MODEL - CATBOOST

In [None]:
train_fe

In [None]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd
from tqdm import tqdm

# Open question: why did eval_metric="MultiLogloss" cause an error?
SEED = 42
FOLD_CNT = 4

# CatBoost settings shared by every fold model of the LABEL task.
# `class_weights` holds the same eight numbers, in the same order, as the values of the
# `weights` dict below.
# NOTE(review): CatBoost presumably applies a weight list in sorted-label order, which
# would match HU06 ... UA — confirm.
params = dict(
    allow_writing_files=False,
    cat_features=cat_features,
    iterations=10000,
    border_count = None ,
    random_state=SEED,
    early_stopping_rounds=1000,
    loss_function = "MultiClassOneVsAll",
    depth=2,
    learning_rate=0.01,
    max_ctr_complexity=12,
    task_type="GPU",
    class_weights = [0.0385, 0.0328, 0.2791, 0.1812, 0.0113, 0.2952, 0.1614, 0.0001],
    devices="-1",
    verbose=100
)

# Stratified folds over the target. `cv.split` yields positional row indices; they
# coincide with index labels only while `train_fe` has a default RangeIndex.
cv = StratifiedKFold(n_splits=FOLD_CNT, shuffle=True, random_state=SEED)
cv_splits = list(cv.split(train_fe.drop(label, axis=1).index, train_fe[label]))

# Per-class weights of the custom weighted-F1 metric
weights = {'HU06': 0.0385, 'HU07': 0.0328, 'HU11': 0.2791, 'HU12': 0.1812, 'HU14': 0.0113, 'HU15': 0.2952, 'HU19': 0.1614, 'UA': 0.0001}


# Custom weighted F1 metric function
def weighted_f1_metric(y_true, y_pred):
    """Return the sum over classes of (one-vs-rest F1) * (class weight).

    Classes are the unique values of ``y_true``; weights come from the notebook-level
    ``weights`` dict, and a class missing from it contributes 0.

    Both inputs are flattened first: CatBoost's multi-class ``predict`` returns a column
    vector of shape (n, 1), which would otherwise reach ``f1_score`` as a 2-D array.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    return sum(
        f1_score(y_true == class_name, y_pred == class_name, average='binary')
        * weights.get(class_name, 0)
        for class_name in np.unique(y_true)
    )

def calculate_f1_scores_by_class(y_true, y_pred, weights):
    """Return ``{class: one-vs-rest F1 * weights.get(class, 0)}`` for each class in ``y_true``.

    Inputs are flattened first so that column-vector predictions of shape (n, 1), as
    returned by CatBoost's multi-class ``predict``, are handled as 1-D labels.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    return {
        class_name: f1_score(y_true == class_name, y_pred == class_name, average='binary')
        * weights.get(class_name, 0)
        for class_name in np.unique(y_true)
    }

In [None]:
from catboost import CatBoostClassifier
from tqdm import tqdm

def train_and_evaluate(train_fe, cv_splits, feat_cols, label, params, weighted_f1_metric):
    """Train one CatBoost model per CV fold and score it on that fold's validation rows.

    Parameters
    ----------
    train_fe : pandas.DataFrame
        Feature frame containing ``feat_cols`` and ``label``.
    cv_splits : iterable of (train_positions, val_positions)
        Positional row indices, as produced by ``StratifiedKFold.split``.
    feat_cols : sequence of str
        Model input columns.
    label : str
        Target column name.
    params : dict
        Keyword arguments for ``CatBoostClassifier``.
    weighted_f1_metric : callable
        ``metric(y_true, y_pred) -> float``; higher is better.

    Returns
    -------
    (best_model, models, f1_errs, best_score)
        The highest-scoring fold model, all fold models, the per-fold scores, and the
        best score (0 when there are no folds).
    """
    feat_cols = list(feat_cols)
    f1_errs = []
    models = []
    best_score = 0
    best_model = None

    for split_train, split_val in tqdm(cv_splits):
        # The split arrays are positions, so rows are selected with .iloc; .loc would
        # pick the wrong rows on any index other than a default RangeIndex.
        fold_train = train_fe.iloc[split_train]
        fold_val = train_fe.iloc[split_val]

        model = CatBoostClassifier(**params)
        model.fit(
            fold_train[feat_cols],
            fold_train[label],
            eval_set=(fold_val[feat_cols], fold_val[label]),
            verbose=100,
        )

        preds = model.predict(fold_val[feat_cols])
        f1_err = weighted_f1_metric(fold_val[label].values, preds)
        f1_errs.append(f1_err)

        # `best_model is None` guarantees a model is returned even if every fold scores 0.
        if best_model is None or f1_err > best_score:
            best_score = f1_err
            best_model = model

        models.append(model)
        # Add a `break` here to run a single fold for quick testing.

    return best_model, models, f1_errs, best_score

best_model, models, f1_errs, best_score = train_and_evaluate(train_fe, cv_splits, feat_cols, label, params, weighted_f1_metric)

  0%|          | 0/4 [00:00<?, ?it/s]

0:	learn: 0.6873818	test: 0.6873733	best: 0.6873733 (0)	total: 7.7ms	remaining: 1m 17s
100:	learn: 0.4138489	test: 0.4138036	best: 0.4138036 (100)	total: 627ms	remaining: 1m 1s
200:	learn: 0.3519264	test: 0.3522183	best: 0.3522183 (200)	total: 1.27s	remaining: 1m 1s
300:	learn: 0.3326349	test: 0.3330504	best: 0.3330504 (300)	total: 1.91s	remaining: 1m 1s
400:	learn: 0.3243250	test: 0.3252223	best: 0.3252223 (400)	total: 2.54s	remaining: 1m
500:	learn: 0.3192100	test: 0.3206741	best: 0.3206741 (500)	total: 3.21s	remaining: 1m
600:	learn: 0.3151700	test: 0.3171018	best: 0.3171018 (600)	total: 3.87s	remaining: 1m
700:	learn: 0.3117708	test: 0.3140993	best: 0.3140993 (700)	total: 4.54s	remaining: 1m
800:	learn: 0.3088575	test: 0.3115610	best: 0.3115610 (800)	total: 5.24s	remaining: 1m
900:	learn: 0.3063914	test: 0.3094778	best: 0.3094778 (900)	total: 5.91s	remaining: 59.7s
1000:	learn: 0.3043117	test: 0.3077417	best: 0.3077417 (1000)	total: 6.55s	remaining: 58.9s
1100:	learn: 0.3025127	tes

  0%|          | 0/4 [00:23<?, ?it/s]

2900:	learn: 0.2869227	test: 0.2949766	best: 0.2949766 (2900)	total: 18.7s	remaining: 45.8s





KeyboardInterrupt: 

In [None]:
# Fit a fresh model on all training rows. Calling `.fit` on `models[1]` would retrain
# that fold model in place (the same object stays in `models`, and possibly in
# `best_model`), which would invalidate the fold scores already recorded for it.
# NOTE(review): no eval_set is given, so early stopping cannot trigger and all
# `iterations` are trained — confirm this is intended.
full_model = CatBoostClassifier(**params).fit(train_fe[feat_cols], train_fe[label])

In [None]:
# Per-class breakdown for the last fold. The validation rows are taken from `cv_splits`
# (not from a loop variable local to train_and_evaluate) and the model is addressed as
# `models[-1]`, which equals `models[3]` when FOLD_CNT is 4.
last_val_positions = cv_splits[-1][1]
last_fold_model = models[-1]
f1_scores_by_class = calculate_f1_scores_by_class(
    train_fe.iloc[last_val_positions]["LABEL"],
    last_fold_model.predict(train_fe.iloc[last_val_positions][feat_cols]),
    weights,
)

print("Averaged Weighted f1_err score:", np.mean(f1_errs))
print("Weighted f1_err score best model for each label:")
# The loop variable is deliberately not called `label`: that would overwrite the
# notebook-level `label` ("LABEL") used by other cells.
for class_name, score in f1_scores_by_class.items():
    print(f"{class_name}: {score}")

# STACK

In [None]:
# Out-of-fold class probabilities used to train the meta-model:
# one row per training sample, one column per class.
X_meta = np.zeros((train_fe.shape[0], len(np.unique(train_fe.LABEL))))

kf = StratifiedKFold(n_splits=FOLD_CNT, shuffle=True, random_state=SEED)

# Collect base-model predictions fold by fold.
for train_index, val_index in kf.split(train_fe, train_fe.LABEL):
    X_tr, X_val = train_fe.iloc[train_index][feat_cols], train_fe.iloc[val_index][feat_cols]
    # The target column is named explicitly so this cell does not depend on the current
    # value of the notebook-level `label` variable.
    y_tr, y_val = train_fe.iloc[train_index]["LABEL"], train_fe.iloc[val_index]["LABEL"]

    # Train the CatBoost base model for this fold.
    model = CatBoostClassifier(
        iterations=10000,
        cat_features=cat_features,
        depth=7,
        learning_rate=0.06,
        loss_function="MultiClass",
        early_stopping_rounds=1000,
        random_state=SEED,
        verbose=100,
        devices = "-1",
        task_type = "GPU"
    )
    model.fit(X_tr, y_tr, eval_set=(X_val, y_val))

    # Store the meta features (predictions on this fold's validation rows). This must
    # happen inside the loop, otherwise every fold but the last stays all-zero.
    X_meta[val_index] = model.predict_proba(X_val)


In [None]:
# NOTE(review): this statement is outside the fold loop, so by itself it only writes the
# probabilities for the final fold's validation rows (the last `val_index` / `model`).
X_meta[val_index] = model.predict_proba(X_val)

In [None]:
from sklearn.linear_model import LogisticRegression
# Meta-model: multinomial logistic regression on the class probabilities in X_meta.
# NOTE(review): newer scikit-learn releases reportedly deprecate the `multi_class`
# argument — confirm against the installed version.
meta_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000, random_state=SEED)
meta_model.fit(X_meta, train_fe.LABEL)

In [None]:
# Test-time meta features must have the same layout as X_meta (one column per class).
# The previous all-zero matrix had n_classes * FOLD_CNT columns, which the meta-model
# rejects, and it carried no information.
# NOTE(review): `model` is the base model of the last stacking fold; averaging the
# probabilities of all fold models would be the usual choice, but they are not kept.
X_meta_test = model.predict_proba(test_fe[feat_cols])

In [None]:
meta_model.predict(X_meta_test)

In [None]:
X_meta

# MODEL SAVE / UPLOAD & DOWNLOAD

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Persist a trained CatBoost model to the mounted Google Drive.
# NOTE(review): `model` is whatever that name was last bound to in the kernel
# (presumably the last fold model of the STACK section), not `best_model` — confirm
# which model is meant to be saved.
model_save_path = '/content/drive/My Drive/Colab Model/AHE24_model_PCA'
model.save_model(model_save_path)

In [None]:
#!pip install catboost
#from catboost import CatBoostClassifier
#model = CatBoostClassifier()
#model_path = '/content/drive/My Drive/Colab Model/AHE24_model_1'

#model.load_model(model_path)

# Feature Importance

In [None]:
train_fe.LABEL.value_counts()

UA      832770
HU14     10754
HU07      3728
HU06      3178
HU19       759
HU12       676
HU11       439
HU15       415
Name: LABEL, dtype: int64

In [None]:
# Small CatBoost Pool used only for the importance computations below.
# `.loc[:5000]` is label-based and inclusive, so on a default RangeIndex this selects
# 5001 rows.
interpret_pool = Pool(
    train_fe.loc[:5000, feat_cols],
    train_fe.loc[:5000, "LABEL"],
    cat_features=cat_features,
)

# Bar colour for the importance plots. `pallette` is not used by any cell visible here.
color = "#005DAA"
pallette = [
    color,
    "#FF0B04",
]


In [None]:
import catboost as cb

# Model of the last CV fold; equals `models[3]` when FOLD_CNT is 4, without hard-coding
# the fold count.
importance_model = models[-1]

for type_ in [cb.EFstrType.FeatureImportance, cb.EFstrType.LossFunctionChange]:
    feature_score = pd.DataFrame(
        list(
            zip(
                feat_cols,
                importance_model.get_feature_importance(interpret_pool, type=type_),
            )
        ),
        columns=["Feature", "Score"],
    )

    # Normalize to shares of the total, prettify the names, keep the 20 largest.
    feature_score["Score"] = feature_score["Score"] / feature_score["Score"].sum()
    feature_score["Feature"] = (
        feature_score["Feature"].str.replace("_", " ").str.title()
    )
    top_features = feature_score.sort_values("Score", ascending=False).head(20)

    # Per-figure size instead of changing the global rcParams.
    fig, ax = plt.subplots(figsize=(16, 15 * 0.5))
    sns.barplot(
        x="Score",
        y="Feature",
        data=top_features,
        color=color,
        ax=ax,
    )
    ax.set_title("Feature Importance using {}".format(type_.name), fontsize=18)
    ax.set_xlabel("Score", fontsize=18)
    ax.set_ylabel("Feature", fontsize=18)
    plt.show()

In [None]:
col_ord = pd.Index(feat_cols)

In [None]:
# Pairwise interaction strengths; each entry is (first feature index, second feature
# index, score).
fi = models[3].get_feature_importance(interpret_pool, type="Interaction")

# Translate the index pairs into "name -- name" labels, skipping self-pairs.
fi_new = [
    [col_ord[entry[0]] + " -- " + col_ord[entry[1]], entry[2]]
    for entry in fi
    if col_ord[entry[0]] != col_ord[entry[1]]
]

feature_score = pd.DataFrame(fi_new, columns=["Feature", "Score"])

In [None]:
# Normalize the interaction scores to shares of the total and keep the 25 strongest
# pairs. The previous `.tail(25)` after a descending sort selected the 25 weakest ones.
# The result goes into a new variable so `feature_score` is not changed and the cell
# gives the same output on a re-run.
top_pair_scores = (
    feature_score.assign(Score=lambda df: df["Score"] / df["Score"].sum())
    .sort_values(by="Score", ascending=False, na_position="last")
    .head(25)
    .assign(Feature=lambda df: df["Feature"].str.replace("_", " ").str.title())
)

fig, ax = plt.subplots(figsize=(20 * 1, 10))
sns.barplot(
    x="Score",
    y="Feature",
    data=top_pair_scores,
    color=color,
    ax=ax,
)
ax.set_title("Pair Importance", fontsize=18)
ax.set_xlabel("Score", fontsize=18)
ax.set_ylabel("Feature - Pair", fontsize=18)
plt.show()

# PRED

In [None]:
predictions = best_model.predict(test_fe[feat_cols])

In [None]:
unique_values, counts = np.unique(predictions, return_counts=True)

In [None]:
counts

array([61902, 10181,  1092, 20031,  3892,    62, 28698, 69773])

In [None]:
counts

array([88347, 14734,  1223, 52692,  4270,    57, 34308])

In [None]:
counts

array([51295,  8223,  2266, 17422, 10355,   122, 20798, 85150])

# SUB

In [None]:
submission["LABEL"] = predictions

In [None]:
submission.to_csv("submission_27.csv", index=False)

# Manual Replace
This did not work: the F1 score dropped significantly.

In [None]:
# Experiment: overwrite a random 60% of the predictions with the majority class 'UA'.
# `predictions` is modified in place, and no random seed is set here, so the chosen
# rows differ between runs.
import numpy as np
num_elements = len(predictions)
num_to_replace = int(num_elements * (60 / 100.0))

# Pick the positions to overwrite at random (without repetition)
indices_to_replace = np.random.choice(num_elements, size=num_to_replace, replace=False)

# Replace the values at the chosen positions with 'UA'
predictions[indices_to_replace] = 'UA'

print(predictions)

# YATIRIM KARAKTERİ MODEL

In [None]:
# Second task: predict the investment-character answer. The target is taken from the raw
# training frame (aligned on the index), so it still contains missing values.
# NOTE(review): `train_fe` appears to still carry SORU_YATIRIM_KARAKTERI_CVP and
# yatirimkarakteri_gelir_anapara, both derived from this same answer; if they are used
# as features they leak the target — confirm.
train_fe_yatirim = train_fe.copy()
train_fe_yatirim["YATIRIM_KARAKTERI"] = train_df["SORU_YATIRIM_KARAKTERI_CVP"]

In [None]:
# Keep only columns whose name contains none of "BU", "HU" or "_RG".
# A list is used (not a set): it keeps the original column order, and recent pandas
# versions reject a set as a column indexer.
fe_columns = train_fe_yatirim.columns.to_list()
excluded_name_parts = ("BU", "HU", "_RG")
fe_columns_num = [
    col for col in fe_columns if not any(part in col for part in excluded_name_parts)
]
train_fe_yatirim = train_fe_yatirim[fe_columns_num]

In [None]:
# The main-task target and the customer id are not inputs of this model.
# Rebinding with errors="ignore" keeps the cell re-runnable and avoids an in-place drop
# on a frame that was itself produced by a column selection.
train_fe_yatirim = train_fe_yatirim.drop(columns=["LABEL", "MUSTERI_ID"], errors="ignore")

In [None]:
train_fe_yatirim = train_fe_yatirim.loc[train_fe_yatirim["YATIRIM_KARAKTERI"].notna(), :]

In [None]:
# Renumber the rows 0..n-1 after filtering out rows with a missing target, so that
# positional CV indices and index labels coincide. drop=True discards the old index
# directly instead of materializing it as an "index" column and dropping that afterwards.
train_fe_yatirim = train_fe_yatirim.reset_index(drop=True)

In [None]:
# 12 stratified folds over the investment-character target.
# This rebinds the notebook-level `cv` and `cv_splits` that were created for the LABEL
# task; earlier cells re-run after this point will see these splits instead.
cv = StratifiedKFold(n_splits=12, shuffle=True, random_state=SEED)
cv_splits = list(cv.split(train_fe_yatirim.drop("YATIRIM_KARAKTERI", axis=1).index, train_fe_yatirim["YATIRIM_KARAKTERI"]))

In [None]:
# Rebinds the notebook-level `label` and `feat_cols` for the investment-character task.
label = "YATIRIM_KARAKTERI"
# NOTE(review): set("YATIRIM_KARAKTERI") is a set of single characters, not a set holding
# the column name, so this subtraction removes nothing and the target column is still in
# `feat_cols` after this line. `feat_cols` is also a set here, i.e. unordered.
feat_cols = set(train_fe_yatirim.columns) - set("YATIRIM_KARAKTERI")

In [None]:
# Remove the target from the feature list and turn it into a sorted list.
# Set subtraction does not fail when the cell is re-run (set.remove raises KeyError once
# the name is gone); a sorted list gives a stable column order and is a valid DataFrame
# column indexer, which a set is not in recent pandas versions.
feat_cols = sorted(set(feat_cols) - {"YATIRIM_KARAKTERI"})

In [None]:
feat_cols

In [None]:
f1_errs = []
models = []
best_score = None  # tracked across folds; it used to be reset to 0 on every iteration
best_model = None

# Columns derived from the target answer itself must not be used as inputs.
target_derived_cols = {label, "SORU_YATIRIM_KARAKTERI_CVP", "yatirimkarakteri_gelir_anapara"}
yatirim_feat_cols = sorted(col for col in feat_cols if col not in target_derived_cols)

# `params` was built for the LABEL task: its class weights are tied to the HU*/UA classes
# and its categorical list names columns that were removed from this frame.
yatirim_params = {key: value for key, value in params.items() if key != "class_weights"}
yatirim_params["cat_features"] = [
    col for col in params["cat_features"] if col in yatirim_feat_cols
]

for split_train, split_val in tqdm(cv_splits):
    # cv_splits holds row positions, hence .iloc.
    fold_train = train_fe_yatirim.iloc[split_train]
    fold_val = train_fe_yatirim.iloc[split_val]

    model = CatBoostClassifier(**yatirim_params)
    model.fit(
        fold_train[yatirim_feat_cols],
        fold_train[label],
        eval_set=(fold_val[yatirim_feat_cols], fold_val[label]),
        verbose=100,
    )
    preds = model.predict(fold_val[yatirim_feat_cols])

    # The target has more than two classes, so f1_score's default average="binary"
    # raises; macro averaging weights every class equally.
    f1_err = f1_score(fold_val[label].values, np.ravel(preds), average="macro")
    f1_errs.append(f1_err)

    # Keep the best model seen so far.
    if best_score is None or f1_err > best_score:
        best_score = f1_err
        best_model = model

    models.append(model)
    # Add a `break` here to train and evaluate on the first fold only.

In [None]:
train_fe_yatirim.loc[split_train, feat_cols]

Unnamed: 0,odenen_skew,ODEME_TUTAR_8,sum_katkı_ADET,odeme_existence_count,sum_katkı_MİKTAR,GELIR_qcut,ODEME_TUTAR_2,ODEME_TUTAR_5,odenen_var,SON_SENE_KATKI_ADET,gelir_medenihal_egitim,SON_CEYREK_KATKI_MIKTARI,VADE_TUTAR_2,VADE_TUTAR_3,uyruk_il,SORU_COCUK_SAYISI_CVP,BILGI_TALEP_ADET,odenen_q75,SON_CEYREK_KATKI_ADET,vade_q25,ODEME_TUTAR_0,odenen_q25,ODEME_TUTAR_4,vade_q95,vade_q75,vade_skew,ODEME_TUTAR_6,ODEME_TUTAR_7,odenen_max,VADE_TUTAR_9,VADE_TUTAR_11,odenen_increase_last_month,odeme_diff_vade_sum,ODEME_TUTAR_3,vade_mean,odenen_min,ODEME_TUTAR_9,vade_q05,vade_kurt,VADE_TUTAR_0,vade_std_mean_rat,VADE_TUTAR_6,vade_std,FLAG,ODEME_TUTAR_11,odenen_kurt,odenen_sum,BES_AYRILMA_TALEP_ADET,vade_var,PP_YAS_class,VADE_TUTAR_7,odenen_q95,vade_max,SORU_GELIR_CVP,SON_SENE_KATKI_MIKTARI,ANAPARA,HAYAT_AYRILMA_TALEP_ADET,SON_AY_KATKI_MIKTARI,GETIRI,SON_AY_KATKI_ADET,VADE_TUTAR_10,vade_increase_last_month,vade_increase_month_count,odeme_diff_vade_max,VADE_TUTAR_4,YATIRIM_KARAKTERI,VADE_TUTAR_5,ODEME_TUTAR_1,ODEME_TUTAR_10,odeme_diff_vade_min,meslek_medenihal_egitim,odenen_increase_month_count,odeme_diff_vade_std,vade_min,ODEMEME_TALEP_ADET,odenen_mean,vade_sum,meslek_medenihal,POLICY_AGE_AT_FLAG,odenen_std,VADE_TUTAR_1,VADE_TUTAR_8,odenen_q05
0,0.388403,631,0.0,12,0.0,-1,1172,631,77604.810606,,-1_Single_Lisans,,1172,1172,1_6,0.0,0,1172.0,,631.0,1172,631.0,1172,1172.00,1172.0,0.388403,631,631,1172,631,631,-1.0,0,1172,856.416667,631,631,631.0,-2.262857,1172,0.325281,631,278.576400,6,631,-2.262857,10277,0,77604.810606,2,631,1172.00,1172,,,22929,0,,16055,,631,-1.0,0.0,0,1172,Yetkin,631,1172,631,0,Muhasebe_Single_Lisans,0.0,0.000000,631,0,856.416667,10277,Muhasebe_Single,-60,278.576400,1172,631,631.0
1,0.166762,0,0.0,6,0.0,-1,353,191,30972.545455,,-1_Single_(Diğer),,353,353,1_6,,0,353.0,,0.0,353,0.0,353,353.00,353.0,0.166762,0,0,353,0,0,-1.0,0,353,163.000000,0,0,0.0,-2.267172,353,1.079694,0,175.990186,6,0,-2.267172,1956,0,30972.545455,0,0,353.00,353,,,1749,0,,15,,0,-1.0,0.0,0,353,Cesur,191,353,0,0,UNKNOWN_Single_(Diğer),0.0,0.000000,0,0,163.000000,1956,UNKNOWN_Single,2,175.990186,353,0,0.0
2,1.067933,267,0.0,12,0.0,6,382,267,6011.363636,,6_Married_Önlisans,,382,382,1_34,0.0,0,382.0,,267.0,382,267.0,497,433.75,382.0,1.067933,267,267,497,267,267,3.0,0,382,324.500000,267,267,267.0,0.352000,382,0.238931,267,77.532984,6,267,0.352000,3894,0,6011.363636,1,267,433.75,497,10000.0,,17647,0,,39857,,267,3.0,1.0,0,497,Bilge,267,382,267,0,Satış_Married_Önlisans,1.0,0.000000,267,0,324.500000,3894,Satış_Married,-92,77.532984,382,267,267.0
3,0.431647,689,0.0,12,0.0,2,957,957,30796.992424,,2_Married_UNKNOWN,,957,957,1_6,0.0,0,957.0,,689.0,957,689.0,957,1098.75,957.0,0.431647,957,1272,1272,689,689,6.0,0,957,893.916667,689,689,689.0,0.551721,957,0.196317,957,175.490719,9,689,0.551721,10727,0,30796.992424,2,1272,1098.75,1272,2500.0,,50569,0,,135715,,689,6.0,1.0,0,957,Dengeli,957,957,689,0,Öğretmen_Married_UNKNOWN,1.0,0.000000,689,0,893.916667,10727,Öğretmen_Married,-115,175.490719,957,689,689.0
4,0.000000,0,0.0,6,0.0,5,478,478,62313.818182,,5_Single_(Diğer),,478,478,1_33,,0,478.0,,0.0,478,0.0,478,478.00,478.0,0.000000,0,0,478,0,0,-1.0,0,478,239.000000,0,0,0.0,-2.444444,478,1.044466,0,249.627359,5,0,-2.444444,2868,0,62313.818182,0,0,478.00,478,8000.0,,2663,0,,198,,0,-1.0,0.0,0,478,Bilge,478,478,0,0,Öğrenci_Single_(Diğer),0.0,0.000000,0,0,239.000000,2868,Öğrenci_Single,1,249.627359,478,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132693,3.037459,1263,0.0,12,0.0,4,1435,1263,94989.113636,,4_Married_(Diğer),,1435,1435,1_34,0.0,0,1435.0,,1263.0,1435,1263.0,2354,1848.55,1435.0,3.037459,1263,1263,2354,1263,1263,3.0,0,1435,1411.250000,1263,1263,1263.0,9.820935,1435,0.218390,1263,308.203040,6,1263,9.820935,16935,0,94989.113636,2,1263,1848.55,2354,5000.0,,79033,0,,170938,,1263,3.0,1.0,0,2354,Dengeli,1263,1435,1263,0,Danışman_Married_(Diğer),1.0,0.000000,1263,0,1411.250000,16935,Danışman_Married,-101,308.203040,1435,1263,1263.0
132694,0.552426,287,0.0,12,0.0,-1,535,287,19192.242424,,-1_Single_(Diğer),,535,535,1_35,,0,535.0,,287.0,535,287.0,287,535.00,535.0,0.552426,287,287,535,287,191,-1.0,0,535,353.666667,191,287,191.0,-1.533426,535,0.391714,287,138.536069,5,191,-1.533426,4244,0,19192.242424,0,287,535.00,535,,,5205,0,,1150,,191,-1.0,0.0,0,287,Temkinli,287,535,191,0,UNKNOWN_Single_(Diğer),0.0,0.000000,191,0,353.666667,4244,UNKNOWN_Single,-13,138.536069,535,287,191.0
132695,0.388403,210,0.0,12,0.0,3,392,210,8782.878788,,3_Single_Lisans,,392,392,1_41,,0,392.0,,210.0,392,210.0,392,392.00,392.0,0.388403,210,210,392,210,210,-1.0,0,392,285.833333,210,210,210.0,-2.262857,392,0.327873,210,93.717014,6,210,-2.262857,3430,0,8782.878788,0,210,392.00,392,4000.0,,4973,0,,1766,,210,-1.0,0.0,0,392,Dengeli,210,392,210,0,Satış_Single_Lisans,0.0,0.000000,210,0,285.833333,3430,Satış_Single,-15,93.717014,392,210,210.0
132696,-0.314841,354,0.0,9,0.0,-1,0,354,38359.636364,,-1_Married_İlköğretim,,660,660,1_35,,0,354.0,,354.0,0,265.5,354,660.00,660.0,0.812404,354,354,660,354,354,2.0,1980,660,456.000000,0,354,354.0,-1.650000,660,0.330403,354,150.663985,5,354,0.269294,3492,0,22699.636364,2,354,491.70,660,,,14191,0,,12830,,354,-1.0,0.0,660,354,Dengeli,354,0,354,0,Diğer_Married_İlköğretim,1.0,285.788383,354,0,291.000000,5472,Diğer_Married,-54,195.856162,660,354,0.0


In [None]:
train_fe_yatirim.info()