In [None]:
import time
import shap
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from xgboost import plot_importance
from catboost import CatBoostClassifier
import optuna

# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation warnings raised by the
# libraries imported above are hidden as well.
warnings.filterwarnings("ignore")

# DATA PREPROCESSING

In [None]:
# Load each raw CSV from the local data/ directory into its own DataFrame.
# Paths are relative, so the notebook must be run from the directory containing data/.
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
customer = pd.read_csv("data/customer.csv")
customeraccount = pd.read_csv("data/customeraccount.csv")
genel_kategoriler = pd.read_csv("data/genel_kategoriler.csv")
product_groups = pd.read_csv("data/product_groups.csv")
transaction_header = pd.read_csv("data/transaction_header.csv")
transaction_sale = pd.read_csv("data/transaction_sale.csv")

In [None]:
# Display the raw frame loaded from data/train.csv.
train

In [None]:
# Display the raw frame loaded from data/customer.csv.
customer

In [None]:
# Left-join the customer table onto the training rows (every train row is kept).
# No `on=` is passed, so pandas joins on all column names the two frames share.
# NOTE(review): the shared key is presumably individualnumber - confirm against the CSVs.
t_c = pd.merge(train, customer, how="left")
t_c

In [None]:
# Display the raw frame loaded from data/genel_kategoriler.csv.
genel_kategoriler

In [None]:
# Left-join genel_kategoriler onto the train+customer frame.
# The join keys are implicit: pandas uses every column name common to both frames.
t_c_gk = pd.merge(t_c, genel_kategoriler, how="left")
t_c_gk

In [None]:
# Display the raw frame loaded from data/product_groups.csv.
product_groups

In [None]:
def quantile25(data):
    """Return the 25th percentile (first quartile) of ``data``.

    The function name is what pandas uses as the statistic label when this
    callable is passed to ``groupby(...).agg``.
    """
    lower_quartile = data.quantile(q=0.25)
    return lower_quartile
def quantile75(data):
    """Return the 75th percentile (third quartile) of ``data``.

    The function name is what pandas uses as the statistic label when this
    callable is passed to ``groupby(...).agg``.
    """
    upper_quartile = data.quantile(q=0.75)
    return upper_quartile

# Summarise product_groups per category_number: every column after the first
# gets the same seven statistics (min, quartiles, median, max, mean, std).
aggregator_dict = {
    col: ["min", quantile25, "median", quantile75, "max", "mean", "std"]
    for col in product_groups.columns[1:]
}

pg_grpd = product_groups.groupby('category_number')
# reset_index turns the group key back into an ordinary column for later merges.
pg_grpd_agg = pg_grpd.agg(aggregator_dict).reset_index()
pg_grpd_agg

In [None]:
# Flatten the two-level (column, statistic) header into "column-statistic" strings.
columns = ["-".join(col) for col in pg_grpd_agg.columns]
columns

In [None]:
# The group key has an empty second header level, so its flattened name ends
# with a dangling "-" separator. Strip that separator rather than slicing off
# the last character: slicing is not idempotent, and re-running this cell would
# otherwise eat a real character of the key name and break the later merge.
columns[0] = columns[0].rstrip("-")
pg_grpd_agg.columns = columns
pg_grpd_agg

In [None]:
# Left-join the per-category product-group statistics onto the training frame.
# No `on=` is passed, so the join uses every column name the two frames share.
# NOTE(review): presumably that is category_number only - confirm.
t_c_gk_pg = pd.merge(t_c_gk, pg_grpd_agg, how = "left")
t_c_gk_pg

In [None]:
# Display the raw frame loaded from data/transaction_header.csv.
transaction_header

In [None]:
# Left-join transaction headers onto customer accounts (every account row is kept).
# The join keys are implicit: pandas uses every column name common to both frames.
ca_th = pd.merge(customeraccount, transaction_header, how = "left")
ca_th

In [None]:
# Left-join the sale lines onto the account+header frame, again on all shared columns.
ca_th_ts = pd.merge(ca_th, transaction_sale, how = "left")
ca_th_ts

In [None]:
# Per-customer transaction features: each listed column is summarised with the
# same seven statistics (min, quartiles, median, max, mean, std).
agg_cols = ["is_sanal", "category_level_1", "category_level_2", "category_level_3", "category_level_4",
           "amount", "quantity", "discount_type_1", "discount_type_2", "discount_type_3"]
aggregator_dict = {
    col: ["min", quantile25, "median", quantile75, "max", "mean", "std"]
    for col in agg_cols
}

ca_th_ts_grpd = ca_th_ts.groupby('individualnumber')
# reset_index turns individualnumber back into an ordinary column for later merges.
ca_th_ts_grpd_agg = ca_th_ts_grpd.agg(aggregator_dict).reset_index()
ca_th_ts_grpd_agg

In [None]:
# Flatten the two-level (column, statistic) header into "column_statistic" strings.
columns = ["_".join(col) for col in ca_th_ts_grpd_agg.columns]
columns

In [None]:
# The group key has an empty second header level, so its flattened name ends
# with a dangling "_" separator. Strip that separator rather than slicing off
# the last character: slicing is not idempotent, and re-running this cell would
# otherwise turn "individualnumber" into "individualnumbe" and break the merges
# that join on that column.
columns[0] = columns[0].rstrip("_")
ca_th_ts_grpd_agg.columns = columns
ca_th_ts_grpd_agg

In [None]:
# Attach the per-customer transaction statistics, joined explicitly on individualnumber.
# This rebinds `train`: from here on it is the fully merged feature table,
# not the raw frame read from data/train.csv.
train = pd.merge(t_c_gk_pg, ca_th_ts_grpd_agg, how = "left", on = "individualnumber")
train

In [None]:
# Feature matrix: every merged column except the customer id and the target.
X = train.drop(["individualnumber", "response"], axis = 1)
X

In [None]:
# Target column taken from the merged training frame (a pandas Series at this point).
y = train["response"]
y

In [None]:
# Class balance of the target. `y` is the same column as train["response"],
# so both lines print identical counts.
print(train["response"].value_counts())
print(y.value_counts())

In [None]:
# Cache the engineered features and target in the working directory (row index omitted).
X.to_csv("X.csv", index=False)
y.to_csv("y.csv", index=False)

In [None]:
# Reload the cached features and target.
# Note: read_csv returns a DataFrame, so `y` is now a one-column frame rather
# than the Series it was before being written out.
X = pd.read_csv("X.csv")
y = pd.read_csv("y.csv")

In [None]:
# Integer-encode the two categorical columns. Values are cast to str first so
# that missing entries become the literal label "nan" instead of raising.
# One encoder object is re-fitted per column, so after the loop `lbl` holds
# only the genel_kategori mapping.
lbl = LabelEncoder()
for categorical_col in ('gender', 'genel_kategori'):
    X[categorical_col] = lbl.fit_transform(X[categorical_col].astype(str))
X

# MODEL SELECTION

In [None]:
# Baseline LightGBM with default hyperparameters: mean validation F1 over
# 10 random stratified 80/20 splits (no fixed seed, so the number varies per run).
f1_scores = []

for _ in range(10):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y)
    # y is a one-column frame here, so flatten it to 1-D for LightGBM.
    lgbmc_base = LGBMClassifier().fit(X_train, y_train.values.ravel())
    f1_scores.append(f1_score(y_val, lgbmc_base.predict(X_val)))

np.mean(f1_scores)

In [None]:
# Baseline XGBoost with default hyperparameters: mean validation F1 over
# 10 random stratified 80/20 splits (no fixed seed, so the number varies per run).
f1_scores = []

for _ in range(10):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y)
    xgbc_base = XGBClassifier().fit(X_train, y_train)
    f1_scores.append(f1_score(y_val, xgbc_base.predict(X_val)))

np.mean(f1_scores)

In [None]:
# Baseline CatBoost with default hyperparameters (training log silenced):
# mean validation F1 over 10 random stratified 80/20 splits.
f1_scores = []

for _ in range(10):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y)
    cb_base = CatBoostClassifier(verbose = 0)
    cb_base.fit(X_train, y_train)
    val_predictions = cb_base.predict(X_val)
    f1_scores.append(f1_score(y_val, val_predictions))

np.mean(f1_scores)

# HYPERPARAMETER TUNING

In [None]:
def objective(trial):
    """Optuna objective for LightGBM.

    Samples one hyperparameter configuration from ``trial``, trains a
    ``LGBMClassifier`` on 5 independent random stratified 80/20 splits of the
    notebook-level ``X`` / ``y`` and returns the mean validation F1, which the
    study maximises.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean F1 score over the 5 validation splits.
    """
    params = {
        # 'mlogloss' is XGBoost's spelling and is not a LightGBM metric name.
        # The target is scored with binary F1, so use the binary log-loss.
        'metric': 'binary_logloss',
        # Each Optuna name must match the estimator keyword it feeds. This was
        # previously registered as 'num_leaves', so the later num_leaves
        # suggestion silently reused this 1000-10000 value.
        'n_estimators': trial.suggest_int('n_estimators', 1000, 10000),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        # LightGBM only applies row subsampling when the bagging frequency is
        # non-zero; without this the tuned `subsample` has no effect.
        'subsample_freq': 1,
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        # LightGBM requires num_leaves > 1, so the search starts at 2.
        'num_leaves' : trial.suggest_int('num_leaves', 2, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        # Registered under its real name so study.best_params can be passed
        # straight back to LGBMClassifier.
        'cat_smooth' : trial.suggest_int('cat_smooth', 1, 100),
        "scale_pos_weight": trial.suggest_int('scale_pos_weight', 3, 10),
    }

    f1_scores = []

    for _ in range(5):
        X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2)
        lgbmc = LGBMClassifier(**params)
        # y is a one-column frame here, so flatten it to 1-D for LightGBM.
        lgbmc.fit(X_train, y_train.values.ravel())
        y_pred = lgbmc.predict(X_val)
        f1 = f1_score(y_val, y_pred)
        f1_scores.append(f1)

    return np.mean(f1_scores)

# Maximise mean F1 over 500 trials. Rebinds the notebook-level `study`.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=500)

In [None]:
def objective(trial):
    """Optuna objective for XGBoost.

    Samples one hyperparameter configuration from ``trial``, trains an
    ``XGBClassifier`` on 5 independent random stratified 80/20 splits of the
    notebook-level ``X`` / ``y`` and returns the mean validation F1, which the
    study maximises.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean F1 score over the 5 validation splits.
    """
    # suggest_float(..., log=True) is the non-deprecated spelling of
    # suggest_loguniform and samples from the same log-uniform range.
    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 9),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        "scale_pos_weight": trial.suggest_int('scale_pos_weight', 3, 10),
        # 'mlogloss' is the multiclass metric; the target is scored with binary
        # F1 and weighted with scale_pos_weight, so use the binary log-loss.
        'eval_metric': 'logloss',
        'use_label_encoder': False
    }

    f1_scores = []

    for _ in range(5):
        X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2)
        xgbc = XGBClassifier(**params)
        xgbc.fit(X_train, y_train)
        y_pred = xgbc.predict(X_val)
        f1 = f1_score(y_val, y_pred)
        f1_scores.append(f1)

    return np.mean(f1_scores)

# Maximise mean F1 over 500 trials. Rebinds the notebook-level `study`.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=500)

In [None]:
def objective(trial):
    """Optuna objective for CatBoost.

    Samples one hyperparameter configuration from ``trial``, trains a
    ``CatBoostClassifier`` on 5 independent random stratified 80/20 splits of
    the notebook-level ``X`` / ``y`` and returns the mean validation F1, which
    the study maximises.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean F1 score over the 5 validation splits.
    """
    # suggest_float(..., log=True) is the non-deprecated spelling of
    # suggest_loguniform and samples from the same log-uniform range; it also
    # matches the suggest_float call already used for random_strength.
    params = {
        'iterations': trial.suggest_int('iterations', 500, 5000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 100, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.1, 20.0, log=True),
        'random_strength': trial.suggest_float('random_strength', 1.0, 2.0),
        'depth': trial.suggest_int('depth', 1, 10),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 300),
         "scale_pos_weight": trial.suggest_int('scale_pos_weight', 3, 10),
        # Silence CatBoost's per-iteration training log.
        "verbose" : 0
    }

    f1_scores = []

    for _ in range(5):
        X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2)
        cbc = CatBoostClassifier(**params)
        cbc.fit(X_train, y_train)
        y_pred = cbc.predict(X_val)
        f1 = f1_score(y_val, y_pred)
        f1_scores.append(f1)

    return np.mean(f1_scores)

# Maximise mean F1 over 500 trials. Rebinds the notebook-level `study`.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=500)

In [None]:
# Report the outcome of whichever Optuna study `study` currently refers to,
# and keep its best trial in `trial` for the cells below.
print(f"Number of finished trials: {len(study.trials)}")

print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# FEATURE IMPORTANCE

In [None]:
# Train the final XGBoost model on the full training set with the best
# hyperparameters found, then plot its built-in feature importances.
#
# NOTE(review): `trial` belongs to whichever study ran last. On a top-to-bottom
# run that is the CatBoost study, whose parameter names (iterations, depth,
# l2_leaf_reg, ...) mean nothing to XGBoost. Only forward keys XGBClassifier
# actually knows and say explicitly what was dropped, so a mismatch is visible
# instead of silently producing a mostly-default model.
xgb_param_names = set(XGBClassifier().get_params())
xgb_params = {name: value for name, value in trial.params.items() if name in xgb_param_names}
ignored_params = sorted(set(trial.params) - xgb_param_names)
if ignored_params:
    # print rather than warnings.warn: warnings are globally silenced in this notebook.
    print(f"Ignoring non-XGBoost hyperparameters from the best trial: {ignored_params}")

xgbc_final = XGBClassifier(**xgb_params)
xgbc_final.fit(X, y)

# Tall figure so that every feature label stays readable.
plt.rcParams["figure.figsize"] = (14, 24)
plot_importance(xgbc_final)
plt.show()

In [None]:
# Explain the final XGBoost model with SHAP: compute per-row, per-feature
# SHAP values over the full training features and draw the summary plot.
explainerxgbc = shap.TreeExplainer(xgbc_final)
shap_values_XGBoost_train = explainerxgbc.shap_values(X)
shap.summary_plot(shap_values_XGBoost_train, X)

# PREPARE AND PREDICT TEST DATA

In [None]:
# Display the raw frame loaded from data/test.csv.
test

In [None]:
# Repeat the training-side feature pipeline for the test rows, step 1:
# left-join the customer table (implicit keys: all shared column names).
test_c = pd.merge(test, customer, how="left")
test_c

In [None]:
# Step 2: left-join genel_kategoriler (implicit keys: all shared column names).
test_c_gk = pd.merge(test_c, genel_kategoriler, how="left")
test_c_gk

In [None]:
# Step 3: left-join the per-category product-group statistics computed earlier.
# NOTE(review): unlike the training path (which introduces t_c_gk_pg), this
# rebinds test_c_gk, so the cell's input is overwritten by its own output.
test_c_gk = pd.merge(test_c_gk, pg_grpd_agg, how = "left")
test_c_gk

In [None]:
# Step 4: attach the per-customer transaction statistics on individualnumber.
# This rebinds `test`: from here on it is the fully merged feature table,
# not the raw frame read from data/test.csv.
test = pd.merge(test_c_gk, ca_th_ts_grpd_agg, how = "left", on = "individualnumber")
test

In [None]:
# Test feature matrix: every merged column except the customer id.
X_test = test.drop(["individualnumber"], axis = 1)
X_test

In [None]:
# Encode the categorical columns with the mapping learned from the TRAINING
# data. Fitting a fresh LabelEncoder on the test set assigns codes from the
# test set's own sorted labels, so whenever the two sets do not contain exactly
# the same categories the same label gets a different integer than the one the
# model was trained on.
#
# `train` is the merged training frame and still holds the raw (un-encoded)
# columns. NOTE(review): this assumes the str() form of those raw values equals
# what X held after its CSV round-trip - confirm for numeric-looking labels.
for categorical_col in ("gender", "genel_kategori"):
    lbl = LabelEncoder()
    lbl.fit(train[categorical_col].astype(str))
    code_of_label = {label: code for code, label in enumerate(lbl.classes_)}
    # Labels never seen in training have no code; mark them with -1 instead of
    # letting the encoder raise.
    X_test[categorical_col] = (
        X_test[categorical_col].astype(str).map(code_of_label).fillna(-1).astype(int)
    )
X_test

In [None]:
# Predict class labels for the test rows with the final XGBoost model.
y_test_pred = xgbc_final.predict(X_test)

# CREATING SUBMISSION

In [None]:
# Build the submission table: the customer id next to its predicted label,
# with the label stored as an integer.
submission = (
    test[["individualnumber"]]
    .assign(response=y_test_pred)
    .astype({"response": int})
)
submission

In [None]:
# Write the submission to the working directory. The file name carries the
# day of month and the time (HH-MM-SS) so successive runs do not overwrite each other.
submission.to_csv(f"Submission-{time.strftime('%d - %H-%M-%S')}.csv", index=False)