In [1]:
import numpy as np
import pandas as pd
import gc
from xgboost import XGBClassifier
import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('data/train_preprocessed.csv', low_memory=False)
test = pd.read_csv('data/test_preprocessed.csv', low_memory=False)

In [3]:
train.head()

Unnamed: 0,MUSTERI_ID,LABEL,FLAG,PP_CINSIYET,PP_YAS,PP_MESLEK,IL,SORU_YATIRIM_KARAKTERI_RG,SORU_MEDENI_HAL_RG,SORU_EGITIM_RG,...,Regions_Akdeniz,Regions_Doğu Anadolu,Regions_Ege,Regions_Güneydoğu Anadolu,Regions_Karadeniz,Regions_Marmara,Regions_Rare,Regions_İç Anadolu,PC_1,PC_2
0,59fd829fc435bf3ed0aa832ca0c89397e5197d8cc13518...,-1,5,2,325.0,-0.939987,47,0.0,46.0,46.0,...,False,False,False,True,False,False,False,False,-0.327933,-0.119758
1,040f50ce230c6c89d1ae9fa2d7187ad34212bd20623350...,-1,5,2,541.0,-0.907518,34,0.0,225.0,225.0,...,False,False,False,False,False,True,False,False,-1.130885,-0.624636
2,1372676078ce2f7863232bd9bd76b17dd65d07ebf4fb93...,-1,5,1,622.0,-0.986912,10,0.0,97.0,0.0,...,False,False,True,False,False,False,False,False,-0.443118,-0.163713
3,625516d0afe66e8c9c207ed98b18a0123f8528f152fa7a...,-1,5,1,495.0,-0.986912,63,0.0,132.0,0.0,...,False,False,False,True,False,False,False,False,-0.470187,-0.248751
4,84bd7c9b3526166b6ae1c1873d5f699d7a59232b55c445...,-1,5,2,22.0,-0.986912,63,0.0,20.0,20.0,...,False,False,False,True,False,False,False,False,-0.631859,-0.326021


**Veri seti çok dengesiz olduğu için(~97% - ~3%), ilk olarak `UA` sınıfını diğer sınıflardan ayırt etmeyi düşündük. Bunun için `UA` sınıfını 0, diğer sınıfları 1 olarak etiketleyerek `binary classification` modeli eğittik. Bu eğitim sırasında, accuracy kaybetmeden pozitif olan sınıfı yakalamak istediğimiz için `Recall - Accuracy` metriklerinin trade-off'una dikkat ettik.**

In [4]:
train.loc[train[train["LABEL"] == -1].index, "DummyLabel"] = 0
train.loc[train[train["LABEL"] != -1].index, "DummyLabel"] = 1
train.drop("LABEL", axis=1, inplace=True)

In [5]:
train.drop(["MUSTERI_ID"], axis=1, inplace=True)
test.drop(["MUSTERI_ID"], axis=1, inplace=True)

**Bu aşamada birden çok model denemesi yaptık ve en iyi sonuçları XGBoostClassifier ile aldık. O yüzden nihai `binary classification` modelimiz `XGBoostClassifier` oldu.**

**`Scale Pos Weight Parametresi:`** `UA` dışında kalan sınıfların ağırlıkları dengesizlikten dolayı eşit olmadığı için, `scale_pos_weight` parametresini kullanarak bu dengesizliği gidermeye çalıştık. Bu parametre, negatif (çoğunluk) sınıfın ağırlığını pozitif (azınlık) sınıfın ağırlığına oranlar. Bu sayede model, negatif sınıfı pozitif sınıf kadar önemli görmeye başlar. Bu parametreyi, `UA` sınıfının oranını diğer sınıfların oranına bölerek hesapladık.

In [6]:
%%time
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from xgboost import XGBClassifier

recall_scores = []
acc_scores = []
X = train.drop("DummyLabel", axis=1)
y = train["DummyLabel"]

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Şimdi kf.split() metodunu doğru şekilde kullanalım
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
    model = XGBClassifier(
        scale_pos_weight = scale_pos_weight, 
        n_estimators=350,
        tree_method='gpu_hist',  
        device='cuda',
        n_jobs=-1,
        random_state=42
    )
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=50)
    y_pred = model.predict(X_test)
    print(f"Fold {i+1}")
    print(f"Recall: {recall_score(y_test, y_pred)}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"Precision: {precision_score(y_test, y_pred)}")
    print(f"F1: {f1_score(y_test, y_pred)}")
    recall_scores.append(recall_score(y_test, y_pred))
    acc_scores.append(accuracy_score(y_test, y_pred))

print(f"Recall: {np.mean(recall_scores)},  Accuracy: {np.mean(acc_scores)}")

[0]	validation_0-logloss:0.59021
[50]	validation_0-logloss:0.33869
[100]	validation_0-logloss:0.30561
[150]	validation_0-logloss:0.28377
[200]	validation_0-logloss:0.26609
[250]	validation_0-logloss:0.25069
[300]	validation_0-logloss:0.23717
[349]	validation_0-logloss:0.22436
Fold 1
Recall: 0.7097744360902256
Accuracy: 0.90754292147481
Precision: 0.16236670106639148
F1: 0.2642777155655095
[0]	validation_0-logloss:0.59060
[50]	validation_0-logloss:0.33713
[100]	validation_0-logloss:0.30732
[150]	validation_0-logloss:0.28594
[200]	validation_0-logloss:0.26850
[250]	validation_0-logloss:0.25310
[300]	validation_0-logloss:0.23916
[349]	validation_0-logloss:0.22673
Fold 2
Recall: 0.7152882205513784
Accuracy: 0.9071911061075147
Precision: 0.162658155705004
F1: 0.2650445765230312
[0]	validation_0-logloss:0.59076
[50]	validation_0-logloss:0.33696
[100]	validation_0-logloss:0.30420
[150]	validation_0-logloss:0.28194
[200]	validation_0-logloss:0.26475
[250]	validation_0-logloss:0.24914
[300]	val

In [7]:
gc.collect()

195

**Prediction yapıyoruz.**

In [8]:
pred = model.predict(test)

In [9]:
test = pd.concat([test, pd.Series(pred, name="DummyLabel")], axis=1)

In [10]:
test.head()

Unnamed: 0,FLAG,PP_CINSIYET,PP_YAS,PP_MESLEK,IL,SORU_YATIRIM_KARAKTERI_RG,SORU_MEDENI_HAL_RG,SORU_EGITIM_RG,SORU_GELIR_CVP,SORU_GELIR_RG,...,Regions_Doğu Anadolu,Regions_Ege,Regions_Güneydoğu Anadolu,Regions_Karadeniz,Regions_Marmara,Regions_Rare,Regions_İç Anadolu,PC_1,PC_2,DummyLabel
0,11,2,467.0,-0.926782,1.0,0.0,101.0,0.0,3000.0,101.0,...,False,False,False,False,False,False,False,0.208574,-0.442149,0
1,11,1,658.0,-0.926782,54.0,0.0,112.0,112.0,2800.0,112.0,...,False,False,False,False,True,False,False,-0.314349,-0.039449,0
2,11,1,634.0,-0.963381,48.0,0.0,219.0,219.0,2000.0,219.0,...,False,True,False,False,False,False,False,-0.602051,0.69381,0
3,11,2,638.0,-0.992338,67.0,0.0,208.0,208.0,0.0,208.0,...,False,False,False,True,False,False,False,-0.494246,-0.248365,0
4,11,1,80.0,-0.965346,34.0,0.0,4.0,4.0,0.0,0.0,...,False,False,False,False,True,False,False,-0.631607,-0.330536,0


In [11]:
test['DummyLabel'].value_counts(normalize=True)

DummyLabel
0    0.868513
1    0.131487
Name: proportion, dtype: float64

**Şimdi `1` olarak tahmin ettiğimiz azınlık sınıflar için model geliştirdik.**

In [12]:
train = pd.read_csv('data/train_preprocessed.csv', low_memory=False)

In [13]:
train = train[train["LABEL"] != -1]

**Test setinde trend değişim ihtimali olduğunu düşündüğümüz içim modelimizde `FLAG` özelliğini kullanmak istemedik.**

In [14]:
train.drop(columns=['MUSTERI_ID', 'FLAG'], inplace=True) 
test.drop(columns=['FLAG'], inplace=True)

In [15]:
train['LABEL'] = train['LABEL'].astype(int)

In [16]:
%%time
from autogluon.tabular import TabularDataset, TabularPredictor
import autogluon.common as ag
from autogluon.common import space
from custom_metrics import weighted_f1_scorer

time_limit = 3600*10

automl = TabularPredictor(label='LABEL', problem_type='multiclass',
                          eval_metric=weighted_f1_scorer
                          )

automl.fit(train, presets='best_quality', time_limit=time_limit, num_bag_folds=5, num_bag_sets=0, num_stack_levels=1, dynamic_stacking=False, 
            included_model_types=['XGB', 'CAT', 'GBM', 'RF', 'XT'], ag_args_fit={'num_gpus': 1, 'num_cpus': 8},
            infer_limit_batch_size=10, 
           hyperparameters = {
              # 'GBM': [
              #     {'max_depth': 10, 'n_estimators': 10000},
              #     {'max_depth': 9, 'n_estimators': 10000},
              #     {'max_depth': 8, 'n_estimators': 10000},
              #     {'max_depth': 7, 'n_estimators': 10000},
              #     {'max_depth': 6, 'n_estimators': 10000},
              #     {'max_depth': 5, 'n_estimators': 10000},
              #     {'max_depth': 4, 'n_estimators': 10000},
              #     {'max_depth': 3, 'n_estimators': 10000},
              #     {'max_depth': 2, 'n_estimators': 10000},
              # ],
              'XGB': [
                  
                  # {'max_depth': 10, 'n_estimators': 10000},
                  # {'max_depth': 9, 'n_estimators': 10000},
                  # {'max_depth': 8, 'n_estimators': 10000},
                  # {'max_depth': 7, 'n_estimators': 10000},
                  {'max_depth': 6, 'n_estimators': 100000},
                  {'max_depth': 5, 'n_estimators': 100000},
                  {'max_depth': 4, 'n_estimators': 100000},
                  {'max_depth': 3, 'n_estimators': 100000},
                  {'max_depth': 2, 'n_estimators': 100000},
              ],
              'CAT': [
                  # {'depth': 10, 'iterations': 10000},
                  # {'depth': 9, 'iterations': 10000},
                  # {'depth': 8, 'iterations': 10000},
                  # {'depth': 7, 'iterations': 10000},
                  {'depth': 6, 'iterations': 100000},
                  {'depth': 5, 'iterations': 100000},
                  {'depth': 4, 'iterations': 100000}, 
                  {'depth': 3, 'iterations': 100000},
                  {'depth': 2, 'iterations': 100000},
              ],
              # 'XT': [
              #     {'max_depth': 10, 'n_estimators': 10000},
              #     {'max_depth': 9, 'n_estimators': 10000},
              #     {'max_depth': 8, 'n_estimators': 10000},
              #     {'max_depth': 7, 'n_estimators': 10000},
              #     {'max_depth': 6, 'n_estimators': 10000},
              #     {'max_depth': 5, 'n_estimators': 10000},
              #     {'max_depth': 4, 'n_estimators': 10000},
              #     {'max_depth': 3, 'n_estimators': 10000},
              #     {'max_depth': 2, 'n_estimators': 10000},
              # ],
              # 'RF': [
              #     {'max_depth': 10, 'n_estimators': 10000},
              #     {'max_depth': 9, 'n_estimators': 10000},
              #     {'max_depth': 8, 'n_estimators': 10000},
              #     {'max_depth': 7, 'n_estimators': 10000},
              #     {'max_depth': 6, 'n_estimators': 10000},
              #     {'max_depth': 5, 'n_estimators': 10000},
              #     {'max_depth': 4, 'n_estimators': 10000},
              #     {'max_depth': 3, 'n_estimators': 10000},
              #     {'max_depth': 2, 'n_estimators': 10000},
              # ],
              }
          )

No path specified. Models will be saved in: "AutogluonModels\ag-20240605_065455"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=5, num_bag_sets=0
Beginning AutoGluon training ... Time limit = 36000s
AutoGluon will save models to "AutogluonModels\ag-20240605_065455"
AutoGluon Version:  1.1.1b20240426
Python Version:     3.10.10
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          12
Memory Avail:       7.32 GB / 15.42 GB (47.5%)
Disk Space Avail:   556.95 GB / 931.51 GB (59.8%)
Train Data Rows:    19949
Train Data Columns: 126
Label Column:       LABEL
Problem Type:       multiclass
Preprocessing data ...
Train Data Class Count: 7
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    7492.24 MB
	Train Data (Original)  Memory Usage: 14.52 MB (0.2% of available memory)
	Inferring data type of each fe

CPU times: total: 48min 32s
Wall time: 33min 55s


<autogluon.tabular.predictor.predictor.TabularPredictor at 0x2242e0f73d0>

In [17]:
automl.leaderboard()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,XGBoost_2_BAG_L2,0.443945,custom_weighted_f1,7.156369,1471.460238,0.414371,98.694752,2,True,18
1,WeightedEnsemble_L3,0.443945,custom_weighted_f1,7.163345,1473.482795,0.006976,2.022557,3,True,22
2,CatBoost_5_BAG_L2,0.443146,custom_weighted_f1,6.838387,1409.467266,0.096388,36.701779,2,True,16
3,XGBoost_5_BAG_L1,0.443143,custom_weighted_f1,1.938882,313.739692,1.938882,313.739692,1,True,10
4,WeightedEnsemble_L2,0.443143,custom_weighted_f1,1.946871,315.822878,0.007988,2.083186,2,True,11
5,XGBoost_BAG_L2,0.441537,custom_weighted_f1,7.077991,1490.602757,0.335992,117.837271,2,True,17
6,XGBoost_3_BAG_L2,0.440647,custom_weighted_f1,7.240001,1451.711408,0.498003,78.945922,2,True,19
7,CatBoost_3_BAG_L2,0.43884,custom_weighted_f1,6.827994,1405.422011,0.085995,32.656525,2,True,14
8,XGBoost_4_BAG_L1,0.437551,custom_weighted_f1,1.240052,152.759021,1.240052,152.759021,1,True,9
9,XGBoost_4_BAG_L2,0.436006,custom_weighted_f1,7.241196,1448.252599,0.499198,75.487113,2,True,20


In [18]:
predictions = automl.predict(test[test["DummyLabel"] == 1].drop("DummyLabel", axis=1), model='WeightedEnsemble_L3')

In [19]:
test.loc[test[test["DummyLabel"] == 0].index, "LABEL"] = "UA"

test.loc[test[test["DummyLabel"] == 1].index, "LABEL"] = predictions

In [20]:
encoding_map = {'HU06': 0, 'HU07': 1, 'HU11': 2,
                'HU12': 3, 'HU14': 4, 'HU15': 5, 'HU19': 6,
                }

test["LABEL"] = test["LABEL"].map({v: k for k, v in encoding_map.items()})
test["LABEL"].fillna("UA", inplace=True)

In [21]:
submission = pd.read_csv("sample_submission.csv")
submission["LABEL"] = test["LABEL"]
submission.to_csv("submission.csv", index=False)

In [22]:
submission.LABEL.value_counts(normalize=True)

LABEL
UA      0.868513
HU14    0.091356
HU07    0.025379
HU06    0.010796
HU19    0.002091
HU12    0.001196
HU11    0.000491
HU15    0.000179
Name: proportion, dtype: float64