In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, log_loss
import optuna


class Train_Split:
    
    ROWID = ['f_0']
    DATE = ['f_1']
    CATEGORIES = [ f'f_{i}' for i in range(2,33) ]
    BINARY = [ f'f_{i}' for i in range(33,42) ]
    NUMERICAL = [ f'f_{i}' for i in range(42,80) ]
    IS_CLICKED = ['is_clicked']
    IS_INSTALLED =['is_installed']

    def __init__(self, val_type='random', class_type='binary',split_date=66,impute=True):

        print("Loading the data")
        self.data = pd.read_csv('../Data/miss_combine.csv')
        self.impute = impute
        self.val_type = val_type
        self.class_type = class_type
        self.split_date = split_date
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.impute_data()
        self.train_test_split()

    def impute_data(self):
        if self.impute:
            self.data['f_30'].fillna(self.data['f_30'].mode()[0],inplace=True)
            self.data['f_31'].fillna(self.data['f_31'].mode()[0],inplace=True)
            fmiss = "f_43,f_51,f_58,f_59,f_64,f_65,f_66,f_67,f_68,f_69,f_70".split(',')
            for f in tqdm(fmiss,desc="NUM IMPUTE"):
                self.data[f].fillna(self.data[f].mean(),inplace=True)

    def get_split(self):
        return self.X_train, self.X_test, self.y_train, self.y_test

    def train_test_split(self):
        print(f"Spliting the Data based on {self.val_type}")
        if self.val_type == 'random':
            self.random_split()
        elif self.val_type == 'time':
            self.time_split()
        elif self.val_type == 'No':
            self.final_split()
        else:
            raise Exception('Invalid validation type')
    
    # def get_label(self,data):
    #     '''
    #     data: Numpy array
    #     '''
    #     if self.class_type == 'binary':
    #         return data
    #     elif self.class_type == 'multi':
    #         labels = []
    #         for a, b in zip(data[:,0], data[:,1]):
    #             if a==0 and b==0:# None
    #                 labels.append(0)
    #             elif a==1 and b==0:# Clicked
    #                 labels.append(1)
    #             elif a==0 and b==1:# Installed
    #                 labels.append(2)
    #             elif a==1 and b==1:# Clicked and Installed
    #                 labels.append(3)
    #         return np.array(labels)

    def random_split(self):
        """
        Randomly split the data into train and test set
        """
        y = self.data[Train_Split.IS_CLICKED + Train_Split.IS_INSTALLED]
        X = self.data.drop(Train_Split.IS_CLICKED + Train_Split.IS_INSTALLED, axis=1)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    def time_split(self):
        """
        Split the data into train and test set based on Date
        """
        self.X_train = self.data[self.data[Train_Split.DATE[0]] < self.split_date ].drop(Train_Split.IS_CLICKED + Train_Split.IS_INSTALLED, axis=1)
        self.X_test = self.data[self.data[Train_Split.DATE[0]] >= self.split_date ].drop(Train_Split.IS_CLICKED + Train_Split.IS_INSTALLED, axis=1)
        self.y_train = self.data[self.data[Train_Split.DATE[0]] < self.split_date ][Train_Split.IS_CLICKED + Train_Split.IS_INSTALLED]
        self.y_test = self.data[self.data[Train_Split.DATE[0]] >= self.split_date ][Train_Split.IS_CLICKED + Train_Split.IS_INSTALLED]
        print(f"X_train:{self.X_train.shape}, X_test:{self.X_test.shape} , y_train:{self.y_train.shape} , y_test:{self.y_test.shape}")

    def final_split(self):
        self.X_train = self.data.drop(Train_Split.IS_CLICKED + Train_Split.IS_INSTALLED, axis=1).values
        self.y_train = self.get_label(self.data[Train_Split.IS_CLICKED + Train_Split.IS_INSTALLED].values)
        self.X_test = None
        self.y_test = None

In [2]:
train = Train_Split(val_type='time',class_type='binary',split_date=66,impute=True)
X_train, X_test, y_train, y_test = train.get_split()

Loading the data


NUM IMPUTE: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 46.73it/s]


Spliting the Data based on time
X_train:(3387880, 80), X_test:(97972, 80) , y_train:(3387880, 2) , y_test:(97972, 2)


In [4]:
import gc
gc.collect()

0

In [5]:
use_features = Train_Split.CATEGORIES + Train_Split.NUMERICAL + Train_Split.BINARY

In [6]:
model_for = 'install'

In [7]:
def objective(trail):
    params = {
        'max_depth':trail.suggest_int('max_depth',5,8),
        'learning_rate':trail.suggest_float('learning_rate',0.05,0.3),
        'n_estimators':trail.suggest_int('n_estimators',100,650),
        # 'gamma':trail.suggest_loguniform('gamma',0.01,1),
        # 'colsample_bytree':trail.suggest_loguniform('colsample_bytree',0.7,1),
        'tree_method':'gpu_hist',
        'objective':'binary:logistic'
    }
    target = Train_Split.IS_CLICKED[0]
    if model_for == 'install':
        target = Train_Split.IS_INSTALLED[0]
    # print(target)
    model = XGBClassifier(**params)
    # print("Training the model")
    model.fit(X_train[use_features],y_train[target])
    # print("Training Done")
    y_pred = model.predict(X_test[use_features])
    f1 = log_loss(y_test[target],y_pred)
    # print(f"F1 Score: {f1}")
    # print(gc.collect())
    return f1

In [None]:
model_for = 'install'
print(f"The MODEL is For :{model_for}")
study_install = optuna.create_study(direction='minimize',study_name='install_log_loss')
study_install.optimize(objective, n_trials=30)

[32m[I 2023-05-20 17:11:06,280][0m A new study created in memory with name: install_log_loss[0m


The MODEL is For :install


[32m[I 2023-05-20 17:11:20,733][0m Trial 0 finished with value: 6.137694720562499 and parameters: {'max_depth': 7, 'learning_rate': 0.16102449679449166, 'n_estimators': 140}. Best is trial 0 with value: 6.137694720562499.[0m
[32m[I 2023-05-20 17:11:38,516][0m Trial 1 finished with value: 6.101020231715404 and parameters: {'max_depth': 6, 'learning_rate': 0.07311763789389246, 'n_estimators': 297}. Best is trial 1 with value: 6.101020231715404.[0m
[32m[I 2023-05-20 17:11:58,880][0m Trial 2 finished with value: 6.1666035070336465 and parameters: {'max_depth': 8, 'learning_rate': 0.1463907261753371, 'n_estimators': 295}. Best is trial 1 with value: 6.101020231715404.[0m
[32m[I 2023-05-20 17:12:27,591][0m Trial 3 finished with value: 6.024532485108871 and parameters: {'max_depth': 7, 'learning_rate': 0.1325302878245357, 'n_estimators': 577}. Best is trial 3 with value: 6.024532485108871.[0m
[32m[I 2023-05-20 17:12:57,344][0m Trial 4 finished with value: 5.971996694262477 and p

In [None]:
model_for = 'click'
study_click = optuna.create_study(direction='minimize',study_name=model_for)
study_click.optimize(objective, n_trials=30)

In [None]:
X = pd.concat([X_train,X_test])

In [None]:
y = pd.concat([y_train,y_test])

In [None]:
params = study_click.best_params
params['tree_method']= 'gpu_hist'
params['objective'] = 'binary:logistic'
clk_model = XGBClassifier(**params)
clk_model.fit(X[use_features],y[Train_Split.IS_CLICKED])

In [None]:
params = study_install.best_params
params['tree_method']= 'gpu_hist'
params['objective'] = 'binary:logistic'
install_model = XGBClassifier(**params)
install_model.fit(X[use_features],y[Train_Split.IS_INSTALLED])

In [None]:
test = pd.read_csv('../Data/test/000000000000.csv',sep='\t')

In [None]:
test['f_30'].fillna(test['f_30'].mode()[0],inplace=True)
test['f_31'].fillna(test['f_31'].mode()[0],inplace=True)

In [None]:
fmiss = "f_43,f_51,f_58,f_59,f_64,f_65,f_66,f_67,f_68,f_69,f_70".split(',')
for f in fmiss:
    test[f].fillna(test[f].mean(),inplace=True)

In [None]:
y_click_test = clk_model.predict_proba(test[use_features])

In [None]:
y_install_test = install_model.predict_proba(test[use_features])

In [None]:
import numpy as np
result = np.vstack([test['f_0'].to_numpy(dtype=int),y_click_test[:,1],y_install_test[:,1]]).T

In [None]:
result.shape

In [None]:
final = pd.DataFrame(result,columns=['RowId','is_clicked','is_installed'])

In [None]:
final.head()

In [None]:
final['RowId']=final['RowId'].astype('int')

In [None]:
final.to_csv('../Data/final_results/xgb_optuna.csv', sep ='\t', index=False)