In [75]:
!pip install catboost



In [76]:
from pathlib import Path
from warnings import simplefilter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

simplefilter('ignore')
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

plt.style.use('ggplot')
plt.rc('figure',autolayout=True, figsize=(11,4))
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=18,
    titlepad=10
)

In [77]:
data_dir = Path('data')

df = pd.read_parquet(
    data_dir / 'train_final.parquet'
)

test = pd.read_parquet(
    data_dir / 'test_final.parquet'
)

ss = pd.read_parquet(
    data_dir / 'submission_sample_final.parquet'
)

### Basic FE

#### Binarize Targets

In [78]:

def binarize_targets(dataframe,menu_count = 9):
  unique_menus = ['menu' + str(i) for i in range(1,menu_count + 1)]
  dataframe[unique_menus] = 0
  for i,row in df.iterrows():
    menus = row['target'].split(', ')
    for menu in menus:
        if menu in unique_menus:
            dataframe.loc[i,menu] = 1

  df.drop('target',axis=1,inplace=True)

  return df

df = binarize_targets(dataframe=df)

#### Normalize N Seconds

In [79]:
def normalize_n_seconds(dataframe):
  dataframe['n_seconds'] = dataframe[['n_seconds_1','n_seconds_2','n_seconds_3']].sum(axis=1)

  for col in ['n_seconds_1','n_seconds_2','n_seconds_3']:
      dataframe[col] = dataframe[col] / dataframe['n_seconds']

  return dataframe

df = normalize_n_seconds(dataframe=df)

#### Device Brand & Device Brand

In [80]:
def change_carrier(carrier):
  if carrier not in ['TURKCELL','VODAFONE TR', 'TURK TELEKOM']:
    carrier = 'OTHER'

  return carrier

def change_device_brand(devicebrand):
  if devicebrand != 'Apple':
    devicebrand = 'Android'

  return devicebrand

df['carrier'] = df['carrier'].apply(change_carrier)
df['devicebrand'] = df['devicebrand'].apply(change_device_brand)

#### Age Groups

In [81]:
def create_age_groups(dataframe):
  bins = [20,30,40,50,60,70,120]
  labels = ['20-29','30-39','40-49','50-59','60-69','70+']
  dataframe['age_group'] = pd.cut(dataframe.feature_49, bins, labels=labels, include_lowest=True)

  return dataframe

df = create_age_groups(dataframe=df)

### Train Test Split

In [82]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

targets = ['menu1','menu2','menu3','menu4','menu5','menu6','menu7','menu8','menu9']

X = df.drop(targets + ['id'],axis=1)
y = df[targets]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    stratify=df['age_group']
)

print(f'Features: {features}')
print(f'X Train set shape: {X_train.shape} \nX Test set shape: {X_test.shape}')
print(f'y Train set shape: {y_train.shape} \ny Test set shape: {y_test.shape}')

Features: Index(['month', 'n_seconds_1', 'n_seconds_2', 'n_seconds_3', 'carrier',
       'devicebrand', 'feature_0', 'feature_1', 'feature_2', 'feature_3',
       'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8',
       'feature_9', 'feature_10', 'feature_11', 'feature_12', 'feature_13',
       'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_18',
       'feature_19', 'feature_20', 'feature_21', 'feature_22', 'feature_23',
       'feature_24', 'feature_25', 'feature_26', 'feature_27', 'feature_28',
       'feature_29', 'feature_30', 'feature_31', 'feature_32', 'feature_33',
       'feature_34', 'feature_35', 'feature_36', 'feature_37', 'feature_38',
       'feature_39', 'feature_40', 'feature_41', 'feature_42', 'feature_43',
       'feature_44', 'feature_45', 'feature_46', 'feature_47', 'feature_48',
       'feature_49', 'n_seconds', 'age_group'],
      dtype='object')
X Train set shape: (75239, 58) 
X Test set shape: (18810, 58)
y Train set shape: (75239

#### One Hot Encoding

In [83]:
ohe_cols = ['devicebrand','carrier','age_group']
ohe = OneHotEncoder(handle_unknown='ignore')

ct = ColumnTransformer([
    ('ohe',ohe,ohe_cols)
],remainder='passthrough', n_jobs=-1)

encoder = ct.fit(X_train)

X_train =  encoder.transform(X_train)
X_test = encoder.transform(X_test)

### Modelling

In [84]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

from xgboost import XGBClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier


xgb_clf = XGBClassifier(tree_method='hist')
xgb_clf.fit(X_train,y_train)
y_proba = xgb_clf.predict(X_test)

print(y_proba)

[[0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 1. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]
