In [1]:
!pip install catboost



You should consider upgrading via the 'C:\Users\Fikri\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [2]:
from pathlib import Path
from warnings import simplefilter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from xgboost import XGBClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier

# Suppress every warning category for the rest of the session.
simplefilter('ignore')

# Lift pandas' display truncation for both columns and rows.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Shared look-and-feel for every figure drawn in this notebook.
plt.style.use('ggplot')

figure_defaults = {'autolayout': True, 'figsize': (11, 4)}
axes_defaults = {
    'labelweight': "bold",
    'labelsize': "large",
    'titleweight': "bold",
    'titlesize': 18,
    'titlepad': 10,
}
plt.rc('figure', **figure_defaults)
plt.rc("axes", **axes_defaults)

In [3]:
# All input files live in one directory, resolved relative to the notebook.
data_dir = Path('../data')

# Training frame, test frame, and the sample submission file.
train_path = data_dir / 'train_final.parquet'
test_path = data_dir / 'test_final.parquet'
sample_submission_path = data_dir / 'submission_sample_final.parquet'

df_train = pd.read_parquet(train_path)
df_test = pd.read_parquet(test_path)
ss = pd.read_parquet(sample_submission_path)

### Basic Feature Engineering

In [4]:
def binarize_targets(dataframe, menu_count=9):
    """Expand the comma-separated ``target`` column into 0/1 indicator columns.

    One column per menu (``menu1`` .. ``menu<menu_count>``) is added to
    ``dataframe``; a cell is 1 when that menu name appears in the row's
    ``target`` string (items separated by ``', '``) and 0 otherwise. The
    ``target`` column is then dropped.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding a string ``target`` column. It is modified in place.
    menu_count : int, default 9
        Number of menu columns to create.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, for call-chaining convenience.
    """
    unique_menus = ['menu' + str(i) for i in range(1, menu_count + 1)]

    # Split each target string once. Working row-by-row on the Series (rather
    # than writing through ``.loc[label, menu]``) keeps the result correct even
    # when index labels repeat, and avoids one DataFrame write per (row, menu).
    # Non-string targets (e.g. missing values) are treated as "no menus".
    chosen_menus = dataframe['target'].map(
        lambda value: set(value.split(', ')) if isinstance(value, str) else set()
    )

    for menu in unique_menus:
        dataframe[menu] = chosen_menus.map(
            lambda menus, menu=menu: int(menu in menus)
        ).astype('int64')

    dataframe.drop('target', axis=1, inplace=True)

    return dataframe

def month_as_category(dataframe):
    """Re-type the ``month`` column as ``object`` so it is handled as a label.

    The frame is modified in place and also returned.
    """
    month_as_object = dataframe['month'].astype('object')
    dataframe['month'] = month_as_object
    return dataframe

def normalize_n_seconds(dataframe):
    """Add a row total ``n_seconds`` and turn the three parts into shares of it.

    ``n_seconds`` is the row-wise sum of ``n_seconds_1``..``n_seconds_3``;
    each of those three columns is then divided by that total. Rows whose
    total is zero end up with NaN shares (0 / 0). The frame is modified in
    place and also returned.
    """
    part_columns = ['n_seconds_1', 'n_seconds_2', 'n_seconds_3']

    row_total = dataframe[part_columns].sum(axis=1)
    dataframe['n_seconds'] = row_total

    # Divide all three parts by the per-row total in one aligned operation.
    dataframe[part_columns] = dataframe[part_columns].div(dataframe['n_seconds'], axis=0)

    return dataframe


def change_carrier(carrier):
    """Keep the three named carriers as-is; map anything else to ``'OTHER'``."""
    kept_carriers = ('TURKCELL', 'VODAFONE TR', 'TURK TELEKOM')
    return carrier if carrier in kept_carriers else 'OTHER'

def change_device_brand(devicebrand):
    """Collapse device brands to two values: ``'Apple'`` stays, all else is ``'Android'``."""
    return devicebrand if devicebrand == 'Apple' else 'Android'

def create_age_groups(dataframe):
    """Bucket ``feature_49`` into labelled decade groups stored in ``age_group``.

    Bins are left-closed / right-open so that each value falls in the bucket
    its label names: 20 and 29 map to ``'20-29'``, 30 maps to ``'30-39'``, and
    so on. The last bucket, ``'70+'``, has no upper bound. Values below 20
    (and missing values) get no group (NaN).

    NOTE(review): the bucket names suggest ``feature_49`` is an age in years;
    confirm against the data dictionary.

    The frame is modified in place and also returned.
    """
    # Open-ended final edge so that '70+' really means "70 and above".
    bins = [20, 30, 40, 50, 60, 70, float('inf')]
    labels = ['20-29', '30-39', '40-49', '50-59', '60-69', '70+']

    # right=False -> intervals are [20, 30), [30, 40), ... which is what the
    # labels describe. The pandas default (right=True) would put 30 into
    # '20-29', 40 into '30-39', etc.
    dataframe['age_group'] = pd.cut(
        dataframe['feature_49'], bins, labels=labels, right=False
    )

    return dataframe

def encode_categoricals(dataframe, encoder, categorical_features):
    """Replace categorical columns with the encoder's one-hot indicator columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing every column named in ``categorical_features``.
    encoder : object
        An already-fitted encoder exposing ``transform`` and
        ``get_feature_names_out`` (e.g. a fitted ``OneHotEncoder``).
    categorical_features : list of str
        Columns to encode and drop.

    Returns
    -------
    pandas.DataFrame
        A new frame: the non-categorical columns of ``dataframe`` followed by
        the encoded columns cast to ``int``. The row index of ``dataframe`` is
        preserved.
    """
    encoded_names = encoder.get_feature_names_out(categorical_features)

    encoded_data = encoder.transform(dataframe[categorical_features])
    # An encoder configured for sparse output returns a SciPy sparse matrix.
    if hasattr(encoded_data, 'toarray'):
        encoded_data = encoded_data.toarray()

    # Reuse the input index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex here would misalign (and NaN-pad) rows whenever ``dataframe``
    # has been filtered, shuffled or otherwise carries a non-default index.
    encoded_frame = pd.DataFrame(
        encoded_data, columns=encoded_names, index=dataframe.index
    ).astype(int)

    df_encoded = pd.concat(
        [dataframe.drop(categorical_features, axis=1), encoded_frame],
        axis=1,
    )
    return df_encoded

# Only the training frame is passed through target binarization.
df_train = binarize_targets(dataframe=df_train)

categorical_features = ['month','carrier','devicebrand','age_group']

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4; fall back to the old keyword only on releases that predate the rename.
try:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Apply the same feature engineering to train and test. The helper functions
# modify the frame they receive in place, so their return values are not
# re-bound here.
for df in [df_train, df_test]:
    month_as_category(dataframe=df)
    normalize_n_seconds(dataframe=df)
    df['carrier'] = df['carrier'].apply(change_carrier)
    df['devicebrand'] = df['devicebrand'].apply(change_device_brand)
    create_age_groups(dataframe=df)

# Fit the encoder on the training frame only, then apply it to both frames.
encoder.fit(df_train[categorical_features])

df_train = encode_categoricals(dataframe=df_train, encoder=encoder,categorical_features=categorical_features)
df_test = encode_categoricals(dataframe=df_test, encoder=encoder,categorical_features=categorical_features)

print(f'Shape DF Train {df_train.shape}, Shape DF Test {df_test.shape}')

Shape DF Train (94049, 79), Shape DF Test (11955, 70)


### Train Test Split

In [5]:
# Names of the nine binary label columns.
targets = [f'menu{i}' for i in range(1, 10)]

# Features are everything except the labels and the row identifier.
non_feature_columns = targets + ['id']
X = df_train.drop(columns=non_feature_columns)
y = df_train[targets]

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

for split_name, features, labels in (('train', X_train, y_train), ('test', X_test, y_test)):
    print(f'X_{split_name} shape {features.shape}, y_{split_name} shape {labels.shape}')

X_train shape (75239, 69), y_train shape (75239, 9)
X_test shape (18810, 69), y_test shape (18810, 9)


### Modelling

In [6]:
# Fit one XGBoost classifier on the full 9-column label frame, using the
# histogram-based tree construction method.
xgb_params = {'tree_method': 'hist'}
xgb_clf = XGBClassifier(**xgb_params)
xgb_clf.fit(X_train, y_train)

In [7]:
def binarize_preds(preds, top_k=3):
    """Mark the ``top_k`` highest-scoring columns of every row with 1.

    Parameters
    ----------
    preds : array-like of shape (n_rows, n_columns)
        Per-row scores (e.g. class probabilities).
    top_k : int, default 3
        How many columns to flag per row. Values larger than the number of
        columns flag every column.

    Returns
    -------
    numpy.ndarray
        Array with the same shape and dtype as ``preds`` holding 1 in the
        ``top_k`` highest-scoring positions of each row and 0 elsewhere.

    Raises
    ------
    ValueError
        If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError(f'top_k must be non-negative, got {top_k}')

    preds = np.asarray(preds)

    # Ascending argsort, reversed -> column indices from highest to lowest score.
    top_indices = np.argsort(preds, axis=1)[:, ::-1][:, :top_k]

    binary_array = np.zeros_like(preds)
    # Set all selected positions in one vectorized call instead of a row loop.
    np.put_along_axis(binary_array, top_indices, 1, axis=1)

    return binary_array


# Score the rows the model was trained on (in-sample predictions), then flag
# the highest-scoring menus per row.
y_proba = xgb_clf.predict_proba(X_train)
top_menu_flags = binarize_preds(preds=y_proba)

# Carry over y_train's column names and row index so the two frames line up.
binarized_preds = pd.DataFrame(
    top_menu_flags,
    columns=y_train.columns,
    index=y_train.index,
)


In [11]:
# Store the prediction flags as integers, one label column at a time.
for menu in targets:
    binarized_preds[menu] = binarized_preds[menu].astype(int)

binarized_preds.head()

Unnamed: 0,menu1,menu2,menu3,menu4,menu5,menu6,menu7,menu8,menu9
26543,0,1,0,1,0,0,0,0,1
61839,0,1,0,0,0,1,0,0,1
84693,0,1,0,0,0,1,0,1,0
39528,0,1,0,1,0,1,0,0,0
8287,0,1,0,1,0,1,0,0,0


In [12]:
# Show the first rows of the true training labels, for comparison with the
# binarized predictions displayed in the previous cell.
y_train.head()

Unnamed: 0,menu1,menu2,menu3,menu4,menu5,menu6,menu7,menu8,menu9
26543,0,1,0,1,0,0,0,0,1
61839,0,1,0,0,0,1,0,0,1
84693,0,1,0,0,0,1,0,1,0
39528,0,1,0,1,0,0,0,0,1
8287,0,1,0,1,0,1,0,0,0
