# Import Libraries

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

In [2]:
def eval_metric(model, X_train, y_train, X_test, y_test):
    """Print the confusion matrix and classification report for both splits.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_train, y_train : features and true labels of the training split
    X_test, y_test : features and true labels of the held-out split

    Returns
    -------
    None. Everything is written to stdout.
    """
    # Predict both splits up front, then report them in train -> test order.
    reports = {
        "Train Set": (y_train, model.predict(X_train)),
        "Test Set": (y_test, model.predict(X_test)),
    }

    for position, (title, (y_true, y_hat)) in enumerate(reports.items()):
        if position:
            print()  # blank separator line between the two reports
        print(title)
        print(confusion_matrix(y_true, y_hat))
        print(classification_report(y_true, y_hat))

# Data Preprocessing

In [3]:
# Label -> integer code for the two-valued columns (keys are the raw CSV column names).
BINARY_CODES = {
    "Cinsiyet": {"Kadın": 0, "Erkek": 1},
    "Medeni Durum": {"Evli": 1, "Bekar": 0},
    "Eğitime Devam Etme Durumu": {"Ediyor": 1, "Etmiyor": 0},
}

# Target labels "obek_1".."obek_8" become the integer classes 0..7.
CLUSTER_CODES = {f"obek_{k + 1}": k for k in range(8)}

# Raw CSV header -> short snake_case name, shared by the train and test frames.
FEATURE_NAMES = {
    "Cinsiyet": "cinsiyet",
    "Yaş Grubu": "yas_grubu",
    "Medeni Durum": "medeni_durum",
    "Eğitim Düzeyi": "egitim_duzeyi",
    "İstihdam Durumu": "istihdam_durumu",
    "Yıllık Ortalama Gelir": "ort_gelir",
    "Yaşadığı Şehir": "sehir",
    "En Çok İlgilendiği Ürün Grubu": "ilgilendiği_ürün",
    "Yıllık Ortalama Satın Alım Miktarı": "ort_satın_alma",
    "Yıllık Ortalama Sipariş Verilen Ürün Adedi": "ort_adet",
    "Eğitime Devam Etme Durumu": "egitim_devam",
    "Yıllık Ortalama Sepete Atılan Ürün Adedi": "ort_sepete_atma",
}

# Columns to one-hot encode, in the order their dummy columns are appended,
# together with the short names given to the generated dummy columns.
DUMMY_RENAMES = {
    "ilgilendiği_ürün": {"Ev ve Mobilya": "ev_mobilya", "Giyim": "giyim",
                         "Kozmetik": "kozmetik", "Spor Malzemeleri": "spor_malz"},
    "yas_grubu": {},  # dummy columns keep their original labels
    "istihdam_durumu": {"Kendi İşinin Sahibi": "kendi_is",
                        "İşsiz veya Düzenli Bir İşi Yok": "issiz",
                        "Emekli": "emekli"},
    "sehir": {"Küçük Şehir": "küc_sehir", "Köy veya Kasaba": "köy",
              "Kırsal": "kırsal"},
    "egitim_duzeyi": {"Üniversite Mezunu": "üniversite", "Yüksekokul Mezunu": "yüksek_okul",
                      "Yüksek Lisans Mezunu": "yük_lisans", "İlkokul Mezunu": "ilkokul",
                      "Ortaokul Mezunu": "ortaokul", "Eğitimsiz": "egitimsiz",
                      "Lise Mezunu": "lise", "Doktora Ötesi": "doktora+"},
}


def _recode(frame, column, mapping):
    """Replace the labels in ``frame[column]`` with the codes given by ``mapping``.

    The result is assigned back to the frame. The chained form
    ``frame[column].replace(mapping, inplace=True)`` operates on a temporary
    Series: it raises a FutureWarning on recent pandas and leaves the frame
    untouched once Copy-on-Write is active.
    """
    # infer_objects() keeps an integer dtype on pandas versions where
    # replace() no longer downcasts object columns by itself.
    frame[column] = frame[column].replace(mapping).infer_objects()


def _one_hot(frame, column, new_names):
    """Return ``frame`` with ``column`` swapped for its dummy columns.

    The dummies are appended at the end, the first level is dropped, and the
    generated columns are renamed through ``new_names`` when it is non-empty.
    """
    dummies = pd.get_dummies(frame[column], drop_first = True, dtype = "int")
    encoded = pd.concat([frame, dummies], axis = 1).drop(columns = column)
    return encoded.rename(columns = new_names) if new_names else encoded


def _preprocess(raw):
    """Apply the feature encoding shared by the train and test files.

    Steps: drop the ``index`` column, encode the two-valued columns as 0/1,
    shorten the column names, then one-hot encode the categorical columns.
    ``raw`` itself is not modified; a new DataFrame is returned.
    """
    frame = raw.drop(columns = "index")
    for column, codes in BINARY_CODES.items():
        _recode(frame, column, codes)
    frame = frame.rename(columns = FEATURE_NAMES)
    for column, new_names in DUMMY_RENAMES.items():
        frame = _one_hot(frame, column, new_names)
    return frame


# Running one pipeline on both files keeps their feature columns built the same way.
train = _preprocess(pd.read_csv("btk_train.csv"))
test = _preprocess(pd.read_csv("btk_test_x.csv"))

# Only the training file carries the target column; encode it as 0..7.
train = train.rename(columns = {"Öbek İsmi" : "öbek"})
_recode(train, "öbek", CLUSTER_CODES)

# Train | Test Split

In [4]:
# Separate the target column "öbek" from the features, then hold out half of
# the labelled rows (fixed seed) for evaluation.
y = train["öbek"]
X = train.drop("öbek", axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.5
)

# Modelling

In [5]:
# Baseline: a random forest with default hyper-parameters.
# random_state is pinned (same seed as the split and the tuned model below) so
# the reported metrics are reproducible after a kernel restart.
rf = RandomForestClassifier(random_state = 1)

rf.fit(X_train, y_train)

eval_metric(rf, X_train, y_train, X_test, y_test)

Train Set
[[362   0   0   0   0   0   0   0]
 [  0 284   0   0   0   0   0   0]
 [  0   0 337   0   0   0   0   0]
 [  0   0   0 347   0   0   0   0]
 [  0   0   0   0 354   0   0   0]
 [  0   0   0   0   0 350   0   0]
 [  0   0   0   0   0   0 330   0]
 [  0   0   0   0   0   0   0 366]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       362
           1       1.00      1.00      1.00       284
           2       1.00      1.00      1.00       337
           3       1.00      1.00      1.00       347
           4       1.00      1.00      1.00       354
           5       1.00      1.00      1.00       350
           6       1.00      1.00      1.00       330
           7       1.00      1.00      1.00       366

    accuracy                           1.00      2730
   macro avg       1.00      1.00      1.00      2730
weighted avg       1.00      1.00      1.00      2730


Test Set
[[316   0   4   3   1   3   3   0]
 [  0 244   2

In [6]:
# Hyper-parameter grid: 5 * 3 * 4 * 3 = 180 candidate forests.
param_grid = {
    "n_estimators": [50, 64, 100, 128, 300],  # we noted earlier that the scores for 64 and 128 must always be checked
    "max_features": [2, 3, 4],
    "max_depth": [3, 5, 7, 9],
    "min_samples_split": [2, 5, 8],
}

rf = RandomForestClassifier(random_state=1)

# Exhaustive search over the grid, scored on accuracy, using all available cores.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring="accuracy",
    n_jobs=-1,
    verbose=3,
)
rf_grid_model = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 180 candidates, totalling 900 fits


In [7]:
# Report the train/test metrics of the grid-searched model.
eval_metric(rf_grid_model, X_train, y_train, X_test, y_test)

Train Set
[[343   2   3   5   3   0   3   3]
 [  3 273   2   1   0   1   3   1]
 [  1   0 328   1   1   2   3   1]
 [  2   1   4 329   2   2   4   3]
 [  1   0   2   1 344   2   1   3]
 [  2   0   2   3   1 340   1   1]
 [  2   1   0   3   1   2 320   1]
 [  2   3   1   2   3   1   3 351]]
              precision    recall  f1-score   support

           0       0.96      0.95      0.96       362
           1       0.97      0.96      0.97       284
           2       0.96      0.97      0.97       337
           3       0.95      0.95      0.95       347
           4       0.97      0.97      0.97       354
           5       0.97      0.97      0.97       350
           6       0.95      0.97      0.96       330
           7       0.96      0.96      0.96       366

    accuracy                           0.96      2730
   macro avg       0.96      0.96      0.96      2730
weighted avg       0.96      0.96      0.96      2730


Test Set
[[316   0   4   3   1   3   3   0]
 [  1 242   2

In [8]:
# Class probabilities predicted for the hold-out rows, one column per class,
# labelled "1".."8".
class_names = [str(label) for label in range(1, 9)]
holdout_proba = rf_grid_model.predict_proba(X_test)
df2 = pd.DataFrame(data=holdout_proba, columns=class_names)
df2.head()

Unnamed: 0,1,2,3,4,5,6,7,8
0,0.019624,0.035843,0.027235,0.059362,0.00984,0.708084,0.12086,0.019153
1,0.012314,0.005545,0.005104,0.874132,0.00813,0.022119,0.046,0.026657
2,0.07428,0.08056,0.013848,0.033275,0.727114,0.007857,0.032341,0.030725
3,0.701019,0.078683,0.108118,0.015665,0.057616,0.007509,0.023119,0.008272
4,0.084502,0.159285,0.04445,0.047348,0.516072,0.00742,0.076473,0.06445


In [9]:
# Append every probability column to the hold-out features. ``.values`` strips
# df2's own index, so rows are matched by position rather than by index label.
for proba_col in ["1", "2", "3", "4", "5", "6", "7", "8"]:
    X_test[proba_col] = df2[proba_col].values
X_test.head()

Unnamed: 0,cinsiyet,medeni_durum,ort_gelir,ort_satın_alma,ort_adet,egitim_devam,ort_sepete_atma,ev_mobilya,giyim,kozmetik,...,üniversite,ilkokul,1,2,3,4,5,6,7,8
447,0,1,483278.5,37087.310094,27.73314,0,45.499872,0,0,1,...,1,0,0.019624,0.035843,0.027235,0.059362,0.00984,0.708084,0.12086,0.019153
4211,1,0,1000670.0,32009.536022,43.983499,0,111.382354,0,0,0,...,1,0,0.012314,0.005545,0.005104,0.874132,0.00813,0.022119,0.046,0.026657
2993,0,0,233688.1,3846.614717,10.005812,0,89.930217,0,0,1,...,0,0,0.07428,0.08056,0.013848,0.033275,0.727114,0.007857,0.032341,0.030725
1953,1,1,209071.6,5356.637656,8.878684,0,11.264575,0,1,0,...,0,1,0.701019,0.078683,0.108118,0.015665,0.057616,0.007509,0.023119,0.008272
4258,1,0,197642.7,4285.278539,10.715907,0,107.576176,0,0,0,...,0,0,0.084502,0.159285,0.04445,0.047348,0.516072,0.00742,0.076473,0.06445


In [10]:
# Class probabilities predicted for the competition test rows, one column per
# class, labelled "1".."8".
class_names = [str(label) for label in range(1, 9)]
test_proba = rf_grid_model.predict_proba(test)
df3 = pd.DataFrame(data=test_proba, columns=class_names)
df3.head()

Unnamed: 0,1,2,3,4,5,6,7,8
0,0.137104,0.048723,0.532411,0.018145,0.044963,0.07182,0.128272,0.018562
1,0.065636,0.048063,0.683541,0.024889,0.089127,0.008394,0.050266,0.030085
2,0.150031,0.619674,0.034021,0.009996,0.12002,0.015676,0.031139,0.019443
3,0.015288,0.035831,0.043736,0.037444,0.019286,0.784775,0.052482,0.011158
4,0.630292,0.018109,0.178524,0.014611,0.047295,0.022067,0.07398,0.015123


In [11]:
# Append every probability column to the test features, matched by row position.
for proba_col in ["1", "2", "3", "4", "5", "6", "7", "8"]:
    test[proba_col] = df3[proba_col].values
test.head()

Unnamed: 0,cinsiyet,medeni_durum,ort_gelir,ort_satın_alma,ort_adet,egitim_devam,ort_sepete_atma,ev_mobilya,giyim,kozmetik,...,üniversite,ilkokul,1,2,3,4,5,6,7,8
0,1,1,365331.930346,5566.587664,15.442635,0,19.64646,0,1,0,...,0,0,0.137104,0.048723,0.532411,0.018145,0.044963,0.07182,0.128272,0.018562
1,1,0,414899.236174,6756.766203,14.957246,0,19.675863,0,0,0,...,0,0,0.065636,0.048063,0.683541,0.024889,0.089127,0.008394,0.050266,0.030085
2,0,1,152825.872271,4493.915682,10.90793,0,46.904518,0,1,0,...,0,0,0.150031,0.619674,0.034021,0.009996,0.12002,0.015676,0.031139,0.019443
3,0,1,489052.908472,42670.646595,32.333021,0,48.083609,0,0,1,...,0,0,0.015288,0.035831,0.043736,0.037444,0.019286,0.784775,0.052482,0.011158
4,1,1,323635.711534,5959.266948,4.519968,0,12.752491,0,1,0,...,0,0,0.630292,0.018109,0.178524,0.014611,0.047295,0.022067,0.07398,0.015123


In [12]:
# Put the true hold-out labels next to the features as a plain array, so the
# assignment is positional.
X_test["öbek"] = y_test.to_numpy()
X_test.head()

Unnamed: 0,cinsiyet,medeni_durum,ort_gelir,ort_satın_alma,ort_adet,egitim_devam,ort_sepete_atma,ev_mobilya,giyim,kozmetik,...,ilkokul,1,2,3,4,5,6,7,8,öbek
447,0,1,483278.5,37087.310094,27.73314,0,45.499872,0,0,1,...,0,0.019624,0.035843,0.027235,0.059362,0.00984,0.708084,0.12086,0.019153,5
4211,1,0,1000670.0,32009.536022,43.983499,0,111.382354,0,0,0,...,0,0.012314,0.005545,0.005104,0.874132,0.00813,0.022119,0.046,0.026657,3
2993,0,0,233688.1,3846.614717,10.005812,0,89.930217,0,0,1,...,0,0.07428,0.08056,0.013848,0.033275,0.727114,0.007857,0.032341,0.030725,4
1953,1,1,209071.6,5356.637656,8.878684,0,11.264575,0,1,0,...,1,0.701019,0.078683,0.108118,0.015665,0.057616,0.007509,0.023119,0.008272,0
4258,1,0,197642.7,4285.278539,10.715907,0,107.576176,0,0,0,...,0,0.084502,0.159285,0.04445,0.047348,0.516072,0.00742,0.076473,0.06445,4


# Second-Stage Model on Predicted Class Probabilities

In [13]:
# Second stage: the eight probability columns are the only features and the
# labelled hold-out rows are split again 80/20 (fixed seed).
# Note that X, y, X_train, X_test, y_train and y_test are all rebound here.
proba_features = [str(label) for label in range(1, 9)]
X = X_test.loc[:, proba_features]
y = X_test["öbek"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.2
)

In [15]:
# Second-stage baseline: default random forest on the probability features.
# random_state is pinned so these metrics are reproducible; this same
# estimator is later handed to GridSearchCV, so the seed carries over to
# that search as well.
rf = RandomForestClassifier(random_state = 1)

rf.fit(X_train, y_train)

eval_metric(rf, X_train, y_train, X_test, y_test)

Train Set
[[265   0   0   0   0   0   0   0]
 [  0 201   0   0   0   0   0   0]
 [  0   0 287   0   0   0   0   0]
 [  0   0   0 297   0   0   0   0]
 [  0   0   0   0 280   0   0   0]
 [  0   0   0   0   0 274   0   0]
 [  0   0   0   0   0   0 300   0]
 [  0   0   0   0   0   0   0 280]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       265
           1       1.00      1.00      1.00       201
           2       1.00      1.00      1.00       287
           3       1.00      1.00      1.00       297
           4       1.00      1.00      1.00       280
           5       1.00      1.00      1.00       274
           6       1.00      1.00      1.00       300
           7       1.00      1.00      1.00       280

    accuracy                           1.00      2184
   macro avg       1.00      1.00      1.00      2184
weighted avg       1.00      1.00      1.00      2184


Test Set
[[60  0  1  1  0  1  2  0]
 [ 0 59  0  0  0  1  

In [16]:
# Smaller grid for the second-stage model: 2 * 3 * 2 * 2 * 2 = 48 candidates.
param_grid = dict(
    n_estimators=[50, 64],
    max_features=[2, 5, 7],
    max_depth=[3, 5],
    min_samples_split=[2, 4],
    min_samples_leaf=[3, 5],
)

# Exhaustive search scored on accuracy, reusing the `rf` estimator defined above.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring="accuracy",
    n_jobs=-1,
    verbose=3,
)
rf_grid_model = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [17]:
# Report the train/test metrics of the grid-searched second-stage model.
eval_metric(rf_grid_model, X_train, y_train, X_test, y_test)

Train Set
[[256   0   3   2   1   2   1   0]
 [  0 184   2   1   3   5   4   2]
 [  4   0 275   0   1   4   3   0]
 [  3   0   6 273   5   4   1   5]
 [  4   0   1   1 268   3   0   3]
 [  1   3   0   1   1 261   4   3]
 [  1   2   9   2   1   2 280   3]
 [  1   1   3   4   2   0   0 269]]
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       265
           1       0.97      0.92      0.94       201
           2       0.92      0.96      0.94       287
           3       0.96      0.92      0.94       297
           4       0.95      0.96      0.95       280
           5       0.93      0.95      0.94       274
           6       0.96      0.93      0.94       300
           7       0.94      0.96      0.95       280

    accuracy                           0.95      2184
   macro avg       0.95      0.95      0.95      2184
weighted avg       0.95      0.95      0.95      2184


Test Set
[[60  0  1  1  0  1  2  0]
 [ 0 59  0  0  0  1  

In [19]:
# Final second-stage model, built from the hyper-parameters chosen by the grid search.
# The seed is pinned for reproducibility; best_params_ only holds the keys of
# param_grid, so it cannot clash with random_state.
rf = RandomForestClassifier(random_state = 1, **rf_grid_model.best_params_)

# Evaluate BEFORE refitting on everything: X and y contain the X_test rows, so
# scoring a model fitted on (X, y) against X_test would report leaked,
# over-optimistic "test" metrics.
rf.fit(X_train, y_train)

eval_metric(rf, X_train, y_train, X_test, y_test)

# Refit on all labelled rows; this is the model used for the submission below.
rf.fit(X, y)

Train Set
[[256   0   3   2   1   2   1   0]
 [  0 184   2   1   3   5   4   2]
 [  4   0 275   0   1   4   3   0]
 [  3   0   6 273   5   4   1   5]
 [  4   0   1   1 268   3   0   3]
 [  1   3   0   1   1 261   4   3]
 [  1   2   9   2   1   2 280   3]
 [  1   1   3   4   2   0   0 269]]
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       265
           1       0.97      0.92      0.94       201
           2       0.92      0.96      0.94       287
           3       0.96      0.92      0.94       297
           4       0.95      0.96      0.95       280
           5       0.93      0.95      0.94       274
           6       0.96      0.93      0.94       300
           7       0.94      0.96      0.95       280

    accuracy                           0.95      2184
   macro avg       0.95      0.95      0.95      2184
weighted avg       0.95      0.95      0.95      2184


Test Set
[[60  0  1  1  0  1  2  0]
 [ 0 59  0  0  0  1  

In [20]:
# Preview only (result is not stored): predicted classes for the test rows,
# shifted by +1 to undo the 0-based target encoding.
rf.predict(test[["1","2","3","4","5","6","7","8"]]) + 1

array([3, 3, 2, ..., 7, 5, 6], dtype=int64)

In [21]:
# Predict from the probability columns and shift the 0-based classes to 1..8.
proba_features = ["1", "2", "3", "4", "5", "6", "7", "8"]
predicted_cluster = rf.predict(test[proba_features]) + 1

# Build the label strings directly instead of calling
# `sub_proba["Öbek İsmi"].replace(..., inplace = True)`: that chained in-place
# call works on a temporary Series and leaves the integer codes in the frame
# once pandas Copy-on-Write is active.
sub_proba = pd.DataFrame({
    "id": test.index,
    "Öbek İsmi": [f"obek_{cluster}" for cluster in predicted_cluster],
})

sub_proba.head()

Unnamed: 0,id,Öbek İsmi
0,0,obek_3
1,1,obek_3
2,2,obek_2
3,3,obek_6
4,4,obek_1


In [None]:
# Write the submission file without the DataFrame index column.
sub_proba.to_csv("proba_rf_submission.csv", index = False)