<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Loading-train-data" data-toc-modified-id="Loading-train-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Loading train data</a></span><ul class="toc-item"><li><span><a href="#Merging-train-and-test-data" data-toc-modified-id="Merging-train-and-test-data-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Merging train and test data</a></span></li><li><span><a href="#Merging-user-data" data-toc-modified-id="Merging-user-data-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Merging user data</a></span></li><li><span><a href="#Competitions-based-features" data-toc-modified-id="Competitions-based-features-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Competitions based features</a></span></li><li><span><a href="#Competitions-data" data-toc-modified-id="Competitions-data-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Competitions data</a></span></li><li><span><a href="#Time-based-competitions-features" data-toc-modified-id="Time-based-competitions-features-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>Time based competitions features</a></span></li><li><span><a href="#Current-active-competitions-feature" data-toc-modified-id="Current-active-competitions-feature-1.6"><span class="toc-item-num">1.6&nbsp;&nbsp;</span>Current active competitions feature</a></span></li><li><span><a href="#User-Interests-Feature" data-toc-modified-id="User-Interests-Feature-1.7"><span class="toc-item-num">1.7&nbsp;&nbsp;</span>User Interests Feature</a></span></li><li><span><a href="#Submissions-based-features" data-toc-modified-id="Submissions-based-features-1.8"><span class="toc-item-num">1.8&nbsp;&nbsp;</span>Submissions based features</a></span></li><li><span><a href="#Discussion-based-features" data-toc-modified-id="Discussion-based-features-1.9"><span class="toc-item-num">1.9&nbsp;&nbsp;</span>Discussion based features</a></span></li><li><span><a href="#Comments-based-features" 
data-toc-modified-id="Comments-based-features-1.10"><span class="toc-item-num">1.10&nbsp;&nbsp;</span>Comments based features</a></span></li></ul></li><li><span><a href="#Modeling" data-toc-modified-id="Modeling-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Modeling</a></span></li><li><span><a href="#Submission" data-toc-modified-id="Submission-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Submission</a></span></li></ul></div>

In [None]:
# !pip install -r requirements.txt

In [None]:
%load_ext autoreload
%autoreload 2

import os
import gc
import sys
import random
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
from tqdm import tqdm

# Use fully-qualified option names: abbreviated names such as "max_columns" are
# resolved by pattern matching and raise OptionError when more than one registered
# option matches (e.g. display.max_columns and styler.render.max_columns).
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)

# Global matplotlib look: fivethirtyeight style sheet with larger axis/tick labels.
plt.style.use('fivethirtyeight')
plt.rcParams["axes.labelsize"] = 16
plt.rcParams["xtick.labelsize"] = 14
plt.rcParams["ytick.labelsize"] = 14

from sklearn.model_selection import StratifiedKFold, GroupKFold
from catboost import CatBoostClassifier
from category_encoders import CountEncoder
import lightgbm as lgb

%matplotlib inline

In [None]:
class Config:
    """Notebook-wide settings, read as class attributes (the class is never instantiated)."""
    # NOTE(review): LAG is not referenced by the cells visible in this notebook.
    LAG = 3
    # Version tag; used as the file name of the submission CSV.
    VER = 'final_sub_v1'
    # Directory the submission CSV is written to.
    OUTPUT_DIR = './outputs'
    # Directory the input CSVs are read from (engineered features are written here too).
    DATA_DIR = './data'
    # NOTE(review): DEBUG is not referenced by the cells visible in this notebook.
    DEBUG = True
    # Number of cross-validation folds (presumably; confirm against the CV cell).
    N_SPLITS = 5

In [None]:
# When True, later cells hold out part of the training data as a local test set
# instead of reading Test.csv.
LOCAL_TEST_RUN = False


def seed_everything(seed=0):
    """Seed the global RNGs of the `random` module and of NumPy with `seed`."""
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything()

# Loading train data

In [None]:
def determine_target(df):
    """Collapse the four activity flags of each row into one class label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'CompPart', 'Sub', 'Comment' and 'Disc'.

    Returns
    -------
    list of str
        One label per row, in row order:
        'CompPart' when CompPart == 1; otherwise 'Sub' when any of Sub, Comment
        or Disc equals 1; otherwise 'NoActivity'.
    """
    is_comp_part = (df['CompPart'] == 1).to_numpy()
    is_other_activity = (
        (df['Sub'] == 1) | (df['Comment'] == 1) | (df['Disc'] == 1)
    ).to_numpy()
    # np.select takes the first matching condition, so 'CompPart' keeps priority
    # over 'Sub' exactly as in a row-by-row if/elif chain, without iterating rows.
    labels = np.select(
        [is_comp_part, is_other_activity],
        ['CompPart', 'Sub'],
        default='NoActivity',
    )
    return labels.tolist()

In [None]:
# Read the raw training table and derive the 3-class label from its activity flags.
train_path = os.path.join(Config.DATA_DIR, "Train.csv")
train = pd.read_csv(train_path, index_col=None)
print(train.shape)
target_labels = determine_target(train)
train['Target'] = target_labels
train.head()

(259832, 8)


Unnamed: 0,User_ID,month,year,CompPart,Comment,Sub,Disc,Target
0,ID_XI7BAR4Y,8,3,0,0,0,0,NoActivity
1,ID_XI7BAR4Y,8,2,0,0,0,0,NoActivity
2,ID_XI7BAR4Y,9,2,0,0,0,0,NoActivity
3,ID_XI7BAR4Y,9,3,0,0,0,0,NoActivity
4,ID_XI7BAR4Y,10,3,0,0,0,0,NoActivity


In [None]:
# Local validation mode: carve the rows of year 3, months 10-12 out of `train`
# and use them as a pseudo test set; otherwise read the real Test.csv.
if LOCAL_TEST_RUN:
    test_index = (train['year']==3)&(train['month'].isin([10,11,12]))
    test = train[test_index].reset_index(drop=True)
    train = train[~test_index].reset_index(drop=True)
else:
    test = pd.read_csv(os.path.join(Config.DATA_DIR,"Test.csv"), index_col=None)

print(test.shape)
test.head()

(65223, 3)


Unnamed: 0,User_ID,month,year
0,ID_H1ELY25E,1,4
1,ID_H1ELY25E,2,4
2,ID_H1ELY25E,3,4
3,ID_463Q2BCO,1,4
4,ID_463Q2BCO,2,4


## Merging train and test data

In [None]:
def determine_timestamp(df):
    """Number the (year, month) rows of `df` chronologically.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'year' and 'month' columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by 'year_month' with a fresh 0..n-1 index and two
        extra columns: 'year_month' (int built from the year digits followed by
        the zero-padded two-digit month) and 'timestamp' (1..n in sorted order).
    """
    # Work on a copy so the caller's frame does not silently gain a column.
    out = df.copy()
    out['year_month'] = (
        out['year'].astype(str) +
        out['month'].apply(lambda x: str(x).zfill(2))
    ).astype(int)
    out = out.sort_values(by='year_month').reset_index(drop=True)
    out['timestamp'] = np.arange(1, len(out) + 1)

    return out

print(train.shape, test.shape)
# Flag the origin of every row so the combined frame can be split again later.
train['is_train'] = 1
test['is_train'] = 0

# pd.concat replaces DataFrame.append, which is deprecated and was removed in pandas 2.0.
overall = pd.concat([train, test], ignore_index=True)
# One row per distinct (year, month); determine_timestamp numbers them chronologically.
timestamp = overall[['year', 'month']].drop_duplicates()
timestamp = determine_timestamp(timestamp)
overall = overall.merge(timestamp, how='left')
all_timestamps = overall[['User_ID', 'timestamp', 'year', 'month']].drop_duplicates().reset_index(drop=True)

overall = overall.sort_values(by='timestamp').reset_index(drop=True)
overall['Record'] = 1
# Running count of the rows seen so far for each user (rows are in timestamp order).
# groupby(...).cumsum() keeps the original index, so the assignment aligns on every
# pandas version, unlike groupby(...).apply(lambda x: x.cumsum()).
overall['Total_Num_User_Months'] = overall.groupby('User_ID')['Record'].cumsum()

(259832, 8) (65223, 3)


## Merging user data

In [None]:
# Load the user table and express the join date as a notebook timestamp.
users = pd.read_csv(os.path.join(Config.DATA_DIR,"Users.csv"), index_col=None)
users.columns = ['User_ID', 'FeatureX', 'Country', 'FeatureY', 'Points', 'year', 'month', 'dayofweek']
users = (
    users
    .merge(timestamp, how='left')  # joins on the shared year/month columns
    .rename(columns={"timestamp": "Zindi_Joining_Timestamp"})
    .drop(columns=['dayofweek', 'year', 'month', 'year_month'])
)
users.head()

Unnamed: 0,User_ID,FeatureX,Country,FeatureY,Points,Zindi_Joining_Timestamp
0,ID_N5LTBAPU,0,ID_DMRM,1,group 3,13
1,ID_CLSFQB0S,0,ID_Q02,3,group 3,2
2,ID_RE6T58Y4,0,ID_Q02,0,group 3,21
3,ID_XJQQRJV3,0,ID_Z8BI,0,group 3,18
4,ID_1JHU6A8S,0,ID_Q02,3,group 3,19


In [None]:
# Attach the user attributes to every user-month row (left join on the shared columns).
overall = overall.merge(users, how='left')

# Treat these user attributes as strings; missing values become the literal 'nan'.
sel_cols = ['FeatureX', 'Country', 'FeatureY', 'Points']
overall[sel_cols] = overall[sel_cols].astype(str)

## Competitions based features

In [None]:
usr_comp = pd.read_csv(os.path.join(Config.DATA_DIR,"CompetitionPartipation.csv"), index_col=None)
usr_comp.columns = ['CompID', 'User_ID', 'PublicRank', 'Successful_Sub_Count',
                    'year', 'month', 'dayofweek']
if LOCAL_TEST_RUN:
    # Drop the held-out months so they cannot leak into the features.
    test_index = (usr_comp['year']==3)&(usr_comp['month'].isin([10,11,12]))
    usr_comp = usr_comp[~test_index].reset_index(drop=True)

# One row per (user, month) in which the user joined at least one competition.
usr_comp_timestamp = usr_comp.merge(timestamp, how='left')
usr_comp_timestamp = usr_comp_timestamp[['User_ID', 'month', 'year', 'timestamp']].drop_duplicates()
usr_comp_timestamp.columns = ['User_ID', 'month', 'year', 'comp_timestamp']
overall = overall.merge(usr_comp_timestamp, how='left')

overall = overall.sort_values(by='timestamp').reset_index(drop=True)
# Per user: carry the latest participation timestamp forward, then shift by one row
# so each row only sees participations from its earlier rows.
# transform() returns a result indexed like `overall` on every pandas version;
# groupby.apply with the same lambda yields a MultiIndex on pandas >= 2.0.
overall['comp_timestamp'] = overall.groupby('User_ID')['comp_timestamp'].transform(lambda x: x.ffill().shift())
overall['Months_Since_Last_Comp'] = overall['timestamp'] - overall['comp_timestamp']
# Distance between the user's joining timestamp and that last participation.
overall['Months_Since_Joining_Zindi'] = overall['comp_timestamp'] - overall['Zindi_Joining_Timestamp']

## Competitions data

In [None]:
# Load the competition catalogue and convert its start/end (year, month) pairs
# into notebook timestamps.
competitions = pd.read_csv(os.path.join(Config.DATA_DIR,"Competitions.csv"),
                           index_col=None,
                           skipinitialspace=True)
# End years given as 'not mapped' are replaced by the sentinel 999 so the column is all ints.
competitions['CompEndTime Year'] = [int(val) if val!='not mapped' else 999 for val in competitions['CompEndTime Year']]
# Missing FeatureC values are encoded as -1.
competitions['FeatureC'] = competitions['FeatureC'].fillna(-1).astype(np.int8)
# Start date -> comp_start_timestamp.
competitions = competitions.merge(
    timestamp,
    left_on=['CompStartTime Year', 'CompStartTime Month'],
    right_on=['year', 'month'],
    how='left')
competitions = competitions.rename(columns={
    'timestamp': 'comp_start_timestamp',
})
competitions.drop(['year', 'month', 'year_month'], axis=1, inplace=True)
# End date -> comp_end_timestamp.
competitions = competitions.merge(
    timestamp,
    left_on=['CompEndTime Year', 'CompEndTime Month'],
    right_on=['year', 'month'],
    how='left')
competitions = competitions.rename(columns={
    'timestamp': 'comp_end_timestamp',
})
# End dates with no matching (year, month) in `timestamp` get the placeholder 99.
competitions['comp_end_timestamp'] = competitions['comp_end_timestamp'].fillna(99)
competitions.drop(['year', 'month', 'year_month'], axis=1, inplace=True)
competitions['comp_duration'] = competitions['comp_end_timestamp'] - competitions['comp_start_timestamp']

In [None]:
import ast
# These columns are read from the CSV as strings; ast.literal_eval turns each string
# into the Python literal it spells (presumably lists, since the next cell explodes
# them - confirm against the data).
for col in ['FeatureA', 'FeatureB', 'FeatureE']:
    competitions[col] = competitions[col].map(ast.literal_eval)

In [None]:
# Build one count column per distinct value of every competition feature
# (columns are named "<Feature>_<value>").
comp_features = competitions[['CompID']].copy()
for col in ['FeatureA', 'FeatureB', 'FeatureC', 'FeatureD', 'FeatureE']:
    # explode gives one row per element of a list-valued cell.
    tmp = competitions[['CompID', col]].explode(column=[col])
    tmp[col] = tmp[col].fillna('empty')
    tmp['count'] = 1

    # Rows: competitions; columns: feature values; cells: number of occurrences.
    tmp = tmp.pivot_table(index='CompID', 
                    columns=col,
                    values='count',
                    aggfunc='count')
    tmp.columns = [tmp.columns.name + "_" + str(col) for col in tmp.columns]
    tmp = tmp.reset_index()
    comp_features = comp_features.merge(tmp, how='left')
# Value never seen for a competition -> count 0.
comp_features = comp_features.fillna(0)
comp_features = comp_features.merge(competitions[['CompID', 'comp_start_timestamp']])

## Time based competitions features

In [None]:
# Collect one (timestamp, CompID) pair for every competition that is running at
# that timestamp, i.e. comp_start_timestamp <= timestamp <= comp_end_timestamp.
running_pairs = []
for t in tqdm(timestamp.timestamp):
    is_running = (t>=competitions.comp_start_timestamp)&(t<=competitions.comp_end_timestamp)
    running_pairs.extend((t, comp_id) for comp_id in competitions.CompID[is_running])

timestamp_comp = pd.DataFrame(running_pairs, columns=["timestamp", "CompID"])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36/36 [00:00<00:00, 2398.73it/s]


## Current active competitions feature

In [None]:
# Count, for every (user, timestamp) row, the competitions the user joined earlier
# that are still open at that timestamp.
active_comp = usr_comp.merge(timestamp, how='left')
active_comp = active_comp.rename(columns={"timestamp": "comp_timestamp"})
active_comp = active_comp[['User_ID', 'CompID', 'comp_timestamp']].merge(competitions[['CompID', 'comp_start_timestamp', 'comp_end_timestamp']], how='left')
# 99 is the placeholder given to competitions whose end date could not be mapped; skip them.
active_comp = active_comp[active_comp['comp_end_timestamp']!=99].reset_index(drop=True)
# Pair every user-month with every participation of that user.
active_comp = all_timestamps.merge(active_comp, how='left')

# Active = joined strictly before this timestamp and not yet ended.
active_comp['Current_Active_Competitions'] = (
    (active_comp['timestamp'] > active_comp['comp_timestamp']) &
    (active_comp['timestamp'] <= active_comp['comp_end_timestamp'])
).astype(np.int8)

active_comp = active_comp.groupby(['User_ID', 'timestamp'])['Current_Active_Competitions'].sum()
active_comp = active_comp.reset_index()

overall = overall.merge(active_comp, how='left')

## User Interests Feature

In [None]:
# Per timestamp: summed feature counts of all competitions running at that time.
timestamp_comp = timestamp_comp.merge(comp_features, how='left')
timestamp_comp = timestamp_comp.drop(['comp_start_timestamp', 'CompID'], axis=1)
timestamp_comp = timestamp_comp.groupby('timestamp').agg(np.sum).reset_index()
timestamp_comp = timestamp_comp.sort_values('timestamp').reset_index(drop=True)

# Per (user, timestamp): summed feature counts of the competitions joined that month.
usr_comp_timestamp = usr_comp.merge(timestamp, how='left')
usr_comp_features = usr_comp_timestamp[['User_ID', 'CompID', 'timestamp']].merge(comp_features, how='left')
usr_comp_features = usr_comp_features.drop(['CompID', 'comp_start_timestamp'], axis=1)
usr_comp_features = usr_comp_features.groupby(['User_ID', 'timestamp']).agg(np.sum)#.groupby(level=0).cumsum()
usr_comp_features = usr_comp_features.reset_index()
usr_comp_features = all_timestamps.merge(usr_comp_features, how='left')
# The first four columns are User_ID, timestamp, year, month; the rest are features.
sel_cols = usr_comp_features.columns[4:]
print(sel_cols)

usr_comp_features = usr_comp_features.sort_values(by='timestamp').reset_index(drop=True)
# Carry each user's latest feature vector forward in time. A single grouped ffill
# over all feature columns replaces the per-column groupby.apply loop: same values,
# one pass over the groups, and an index-aligned result on every pandas version.
feature_cols = list(sel_cols)
filled = usr_comp_features.groupby('User_ID')[feature_cols].ffill()
usr_comp_features[feature_cols] = filled[feature_cols]

# Months before a user's first participation have no history: use zeros.
usr_comp_features = usr_comp_features.fillna(0)

Index(['FeatureA_1', 'FeatureA_2', 'FeatureA_3', 'FeatureA_4', 'FeatureA_5',
       'FeatureA_6', 'FeatureA_7', 'FeatureA_8', 'FeatureA_9', 'FeatureA_10',
       'FeatureA_empty', 'FeatureB_5', 'FeatureB_6', 'FeatureB_7',
       'FeatureB_8', 'FeatureB_9', 'FeatureB_10', 'FeatureB_12', 'FeatureB_14',
       'FeatureB_15', 'FeatureB_16', 'FeatureB_empty', 'FeatureC_-1',
       'FeatureC_1', 'FeatureC_2', 'FeatureC_3', 'FeatureC_4', 'FeatureC_5',
       'FeatureC_6', 'FeatureC_7', 'FeatureC_8', 'FeatureC_9', 'FeatureC_10',
       'FeatureC_11', 'FeatureC_12', 'FeatureC_13', 'FeatureC_14',
       'FeatureC_15', 'FeatureC_16', 'FeatureC_17', 'FeatureC_18',
       'FeatureC_19', 'FeatureC_20', 'FeatureC_21', 'FeatureC_22',
       'FeatureC_23', 'FeatureC_24', 'FeatureC_25', 'FeatureC_26',
       'FeatureC_27', 'FeatureC_28', 'FeatureC_29', 'FeatureC_30',
       'FeatureC_31', 'FeatureC_32', 'FeatureC_33', 'FeatureC_34',
       'FeatureC_35', 'FeatureC_36', 'FeatureC_37', 'FeatureD_1', 'Feat

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 74/74 [03:14<00:00,  2.63s/it]


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

timestamp_ids = []
usr_ids = []
user_interests = []

# For every timestamp t (except the first), score each user by the dot product of
# the user's feature vector at t-1 with the summed feature vector of the
# competitions running at t.
for t in timestamp_comp['timestamp']:
    if t == 1:
        continue
    usr_f = usr_comp_features[usr_comp_features['timestamp']==t-1]
    comp_f = timestamp_comp[timestamp_comp['timestamp']==t]
    timestamp_ids.extend([t]*len(usr_f))
    usr_ids.extend(usr_f['User_ID'])

    # Build the numeric matrices from new frames instead of popping/dropping columns
    # in place on the filtered slices.
    usr_matrix = usr_f.drop(['User_ID', 'timestamp', 'year', 'month'], axis=1).values
    comp_matrix = comp_f.drop('timestamp', axis=1).values

    interests = np.matmul(usr_matrix, comp_matrix.T).flatten()
#     interests = cosine_similarity(usr_matrix, comp_matrix).flatten()
    user_interests.extend(interests)

usr_interest_f = pd.DataFrame({
    "timestamp": timestamp_ids,
    "User_ID": usr_ids,
    "user_interests": user_interests
})

overall = overall.merge(usr_interest_f, how='left')
overall = overall.sort_values(by='timestamp').reset_index(drop=True)
# Carry the last known score forward per user; users without any score get 0.
overall['user_interests'] = overall.groupby('User_ID')['user_interests'].ffill()
overall['user_interests'] = overall['user_interests'].fillna(0)

In [None]:
# Number of distinct competitions each user joined per month.
comp_hist = usr_comp.groupby(['User_ID', 'year', 'month'])['CompID'].nunique()
comp_hist = comp_hist.reset_index()
comp_hist.columns = [*comp_hist.columns[:-1]] + ['Num_Comp_Prev_Month']

overall = overall.merge(comp_hist, how='left')
overall = overall.sort_values(by=['User_ID', 'timestamp']).reset_index(drop=True)
overall['Num_Comp_Prev_Month'] = overall['Num_Comp_Prev_Month'].fillna(0)
# Cumulative count per user, then two normalisations of it. Every feature is shifted
# by one row per user so a row only sees values from its earlier rows.
# groupby(...).shift() is index-aligned on every pandas version, unlike
# groupby(...).apply(lambda x: x.shift()).
overall['Num_Comp_Per_Month'] = overall.groupby('User_ID')['Num_Comp_Prev_Month'].cumsum()
overall['Num_Comp_Per_Month_trend'] = overall['Num_Comp_Per_Month']/overall['Total_Num_User_Months']
overall['Num_Comp_Per_Month_trend'] = overall.groupby('User_ID')['Num_Comp_Per_Month_trend'].shift()
overall['Num_Comp_Per_Month'] = overall['Num_Comp_Per_Month']/(overall['timestamp'].max() - overall['Zindi_Joining_Timestamp'])
overall['Num_Comp_Per_Month'] = overall.groupby('User_ID')['Num_Comp_Per_Month'].shift()

# Previous row's count and its change versus one and two rows further back.
overall['Num_Comp_Prev_Month'] = overall.groupby('User_ID')['Num_Comp_Prev_Month'].shift()
overall['Num_Comp_Prev_Month_momentum'] = overall['Num_Comp_Prev_Month'] - overall.groupby('User_ID')['Num_Comp_Prev_Month'].shift(1)
overall['Num_Comp_Prev_Month_momentum2'] = overall['Num_Comp_Prev_Month'] - overall.groupby('User_ID')['Num_Comp_Prev_Month'].shift(2)

In [None]:
# Per (user, month): share of that month's competitions falling in each PublicRank value.
tmp = usr_comp.groupby(['User_ID', 'year', 'month', 'PublicRank'])['CompID'].nunique().unstack('PublicRank').apply(lambda x: x/x.sum(), axis=1)
col_names = [tmp.columns.name + "_" + str(col) for col in tmp.columns]
tmp.columns = col_names
tmp = tmp.fillna(0)
tmp = tmp.reset_index()

tmp = all_timestamps.merge(tmp, how='left')
tmp = tmp.sort_values(by='timestamp').reset_index(drop=True)
for col in col_names:
    # Running total per user, carried forward over inactive months and shifted one
    # row so the current month is excluded. transform() keeps the frame's index on
    # every pandas version (groupby.apply does not on pandas >= 2.0).
    tmp[col] = tmp.groupby('User_ID')[col].transform(lambda x: x.cumsum().ffill().shift())

overall = overall.merge(tmp, how='left')

In [None]:
# Per (user, month): share of that month's competitions falling in each
# Successful_Sub_Count value.
tmp = usr_comp.groupby(['User_ID', 'year', 'month', 'Successful_Sub_Count'])['CompID'].nunique().unstack('Successful_Sub_Count').apply(lambda x: x/x.sum(), axis=1)
col_names = [tmp.columns.name + "_" + str(col) for col in tmp.columns]
tmp.columns = col_names
tmp = tmp.fillna(0)
tmp = tmp.reset_index()

tmp = all_timestamps.merge(tmp, how='left')
tmp = tmp.sort_values(by='timestamp').reset_index(drop=True)
for col in col_names:
    # Running total per user, carried forward over inactive months and shifted one
    # row so the current month is excluded. transform() keeps the frame's index on
    # every pandas version (groupby.apply does not on pandas >= 2.0).
    tmp[col] = tmp.groupby('User_ID')[col].transform(lambda x: x.cumsum().ffill().shift())

overall = overall.merge(tmp, how='left')

In [None]:
# Drop the competition-participation intermediates and trigger a garbage collection.
del usr_comp, usr_comp_timestamp, comp_hist
gc.collect()

0

## Submissions based features

In [None]:
usr_sub = pd.read_csv(os.path.join(Config.DATA_DIR,"Submissions.csv"), index_col=None)
usr_sub.columns = ['User_ID', 'FeatureG', 'CompID', 'year', 'month', 'dayofweek']

if LOCAL_TEST_RUN:
    # Drop the held-out months so they cannot leak into the features.
    test_index = (usr_sub['year']==3)&(usr_sub['month'].isin([10,11,12]))
    usr_sub = usr_sub[~test_index].reset_index(drop=True)

# One row per (user, month) with at least one submission.
usr_sub_timestamp = usr_sub.merge(timestamp, how='left')
usr_sub_timestamp = usr_sub_timestamp[['User_ID', 'month', 'year', 'timestamp']].drop_duplicates()
usr_sub_timestamp.columns = ['User_ID', 'month', 'year', 'sub_timestamp']
overall = overall.merge(usr_sub_timestamp, how='left')

overall = overall.sort_values(by='timestamp').reset_index(drop=True)
# Per user: latest submission timestamp carried forward, shifted one row so each row
# only sees its earlier rows. transform() is index-aligned on every pandas version
# (groupby.apply with the same lambda is not on pandas >= 2.0).
overall['sub_timestamp'] = overall.groupby('User_ID')['sub_timestamp'].transform(lambda x: x.ffill().shift())
overall['Months_Since_Last_Sub'] = overall['timestamp'] - overall['sub_timestamp']
overall['Months_Since_Sub_Joining_Zindi'] = overall['sub_timestamp'] - overall['Zindi_Joining_Timestamp']

In [None]:
# Per (user, month): distinct competitions submitted to divided by the number of submissions.
# NOTE(review): the intermediate is called Sub_Per_Comp but is computed as
# nunique / count (competitions per submission); confirm the intended direction
# before changing it, since the model consumes the resulting columns.
sub_hist = usr_sub.groupby(['User_ID', 'year', 'month']).agg({'CompID': ['nunique', 'count']})
sub_hist.columns = ["_".join(col) for col in sub_hist.columns]
sub_hist['Sub_Per_Comp'] = sub_hist['CompID_nunique']/sub_hist['CompID_count']
sub_hist = sub_hist.drop(['CompID_nunique', 'CompID_count'], axis=1)
sub_hist = sub_hist.reset_index()
sub_hist.columns = [*sub_hist.columns[:-1]] + ['Num_Sub_Prev_Month']

overall = overall.merge(sub_hist, how='left')
overall = overall.sort_values(by=['User_ID', 'timestamp']).reset_index(drop=True)
overall['Num_Sub_Prev_Month'] = overall['Num_Sub_Prev_Month'].fillna(0)
# Cumulative value per user and two normalisations of it, each shifted by one row per
# user so a row only sees its earlier rows. groupby(...).shift() is index-aligned on
# every pandas version, unlike groupby(...).apply(lambda x: x.shift()).
overall['Num_Sub_Per_Month'] = overall.groupby('User_ID')['Num_Sub_Prev_Month'].cumsum()
overall['Num_Sub_Per_Month_trend'] = overall['Num_Sub_Per_Month']/overall['Total_Num_User_Months']
overall['Num_Sub_Per_Month_trend'] = overall.groupby('User_ID')['Num_Sub_Per_Month_trend'].shift()
overall['Num_Sub_Per_Month'] = overall['Num_Sub_Per_Month']/(overall['timestamp'].max() - overall['Zindi_Joining_Timestamp'])
overall['Num_Sub_Per_Month'] = overall.groupby('User_ID')['Num_Sub_Per_Month'].shift()

# Previous row's value and its change versus one and two rows further back.
overall['Num_Sub_Prev_Month'] = overall.groupby('User_ID')['Num_Sub_Prev_Month'].shift()
overall['Num_Sub_Prev_Month_momentum'] = overall['Num_Sub_Prev_Month'] - overall.groupby('User_ID')['Num_Sub_Prev_Month'].shift(1)
overall['Num_Sub_Prev_Month_momentum2'] = overall['Num_Sub_Prev_Month'] - overall.groupby('User_ID')['Num_Sub_Prev_Month'].shift(2)

In [None]:
# Per (user, month): number of distinct competitions submitted to, per FeatureG value.
tmp = usr_sub.groupby(['User_ID', 'year', 'month', 'FeatureG'])['CompID'].nunique().unstack('FeatureG')#.apply(lambda x: x/x.sum(), axis=1)
col_names = [tmp.columns.name + "_" + str(col) for col in tmp.columns]
tmp.columns = col_names
tmp = tmp.fillna(0)
tmp = tmp.reset_index()

# Rebuild the (user, timestamp) grid from the current state of `overall`.
all_timestamps = overall[['User_ID', 'timestamp', 'year', 'month']].drop_duplicates().reset_index(drop=True)
tmp = all_timestamps.merge(tmp, how='left')
tmp = tmp.sort_values(by='timestamp').reset_index(drop=True)
for col in col_names:
    # Latest known value per user, shifted one row so the current month is excluded.
    # transform() keeps the frame's index on every pandas version (groupby.apply
    # does not on pandas >= 2.0).
    tmp[col] = tmp.groupby('User_ID')[col].transform(lambda x: x.ffill().shift())

overall = overall.merge(tmp, how='left')

In [None]:
# Drop the submission intermediates and trigger a garbage collection.
del usr_sub, usr_sub_timestamp, sub_hist
gc.collect()

0

## Discussion based features

In [None]:
usr_dis = pd.read_csv(os.path.join(Config.DATA_DIR,"Discussions.csv"), index_col=None)
usr_dis.columns = ['FeatureF', 'year', 'month', 'dayofweek', 'DiscID', 'User_ID']

if LOCAL_TEST_RUN:
    # Drop the held-out months so they cannot leak into the features.
    test_index = (usr_dis['year']==3)&(usr_dis['month'].isin([10,11,12]))
    usr_dis = usr_dis[~test_index].reset_index(drop=True)

# One row per (user, month) with at least one discussion.
usr_dis_timestamp = usr_dis.merge(timestamp, how='left')
usr_dis_timestamp = usr_dis_timestamp[['User_ID', 'month', 'year', 'timestamp']].drop_duplicates()
usr_dis_timestamp.columns = ['User_ID', 'month', 'year', 'discussion_timestamp']
overall = overall.merge(usr_dis_timestamp, how='left')

overall = overall.sort_values(by='timestamp').reset_index(drop=True)
# Per user: latest discussion timestamp carried forward, shifted one row so each row
# only sees its earlier rows. transform() is index-aligned on every pandas version
# (groupby.apply with the same lambda is not on pandas >= 2.0).
overall['discussion_timestamp'] = overall.groupby('User_ID')['discussion_timestamp'].transform(lambda x: x.ffill().shift())
overall['Months_Since_Last_Dis'] = overall['timestamp'] - overall['discussion_timestamp']
overall['Months_Since_Dis_Joining_Zindi'] = overall['discussion_timestamp'] - overall['Zindi_Joining_Timestamp']

In [None]:
# Number of distinct discussions per user and month.
dis_hist = usr_dis.groupby(['User_ID', 'year', 'month'])['DiscID'].nunique()
dis_hist = dis_hist.reset_index()
dis_hist.columns = [*dis_hist.columns[:-1]] + ['Num_Dis_Prev_Month']

overall = overall.merge(dis_hist, how='left')
overall = overall.sort_values(by=['User_ID', 'timestamp']).reset_index(drop=True)
overall['Num_Dis_Prev_Month'] = overall['Num_Dis_Prev_Month'].fillna(0)
# Cumulative count per user and two normalisations of it, each shifted by one row per
# user so a row only sees its earlier rows. groupby(...).shift() is index-aligned on
# every pandas version, unlike groupby(...).apply(lambda x: x.shift()).
overall['Num_Dis_Per_Month'] = overall.groupby('User_ID')['Num_Dis_Prev_Month'].cumsum()
overall['Num_Dis_Per_Month_trend'] = overall['Num_Dis_Per_Month']/overall['Total_Num_User_Months']
overall['Num_Dis_Per_Month_trend'] = overall.groupby('User_ID')['Num_Dis_Per_Month_trend'].shift()
overall['Num_Dis_Per_Month'] = overall['Num_Dis_Per_Month']/(overall['timestamp'].max() - overall['Zindi_Joining_Timestamp'])
overall['Num_Dis_Per_Month'] = overall.groupby('User_ID')['Num_Dis_Per_Month'].shift()

# Previous row's count and its change versus one and two rows further back.
overall['Num_Dis_Prev_Month'] = overall.groupby('User_ID')['Num_Dis_Prev_Month'].shift()
overall['Num_Dis_Prev_Month_momentum'] = overall['Num_Dis_Prev_Month'] - overall.groupby('User_ID')['Num_Dis_Prev_Month'].shift(1)
overall['Num_Dis_Prev_Month_momentum2'] = overall['Num_Dis_Prev_Month'] - overall.groupby('User_ID')['Num_Dis_Prev_Month'].shift(2)

In [None]:
# Drop the discussion intermediates and trigger a garbage collection.
del usr_dis, usr_dis_timestamp, dis_hist
gc.collect()

0

## Comments based features

In [None]:
usr_comments = pd.read_csv(os.path.join(Config.DATA_DIR,"Comments.csv"), index_col=None)
usr_comments.columns = ['User_ID', 'year', 'month', 'dayofweek']
# The file has no comment identifier; use the row position as one.
usr_comments['CommID'] = np.arange(len(usr_comments))

if LOCAL_TEST_RUN:
    # Drop the held-out months so they cannot leak into the features.
    test_index = (usr_comments['year']==3)&(usr_comments['month'].isin([10,11,12]))
    usr_comments = usr_comments[~test_index].reset_index(drop=True)

# One row per (user, month) with at least one comment.
usr_comm_timestamp = usr_comments.merge(timestamp, how='left')
usr_comm_timestamp = usr_comm_timestamp[['User_ID', 'month', 'year', 'timestamp']].drop_duplicates()
usr_comm_timestamp.columns = ['User_ID', 'month', 'year', 'comment_timestamp']
overall = overall.merge(usr_comm_timestamp, how='left')

overall = overall.sort_values(by='timestamp').reset_index(drop=True)
# Per user: latest comment timestamp carried forward, shifted one row so each row
# only sees its earlier rows. transform() is index-aligned on every pandas version
# (groupby.apply with the same lambda is not on pandas >= 2.0).
overall['comment_timestamp'] = overall.groupby('User_ID')['comment_timestamp'].transform(lambda x: x.ffill().shift())
overall['Months_Since_Last_Comment'] = overall['timestamp'] - overall['comment_timestamp']
overall['Months_Since_Comment_Joining_Zindi'] = overall['comment_timestamp'] - overall['Zindi_Joining_Timestamp']

In [None]:
# Number of comments per user and month.
comm_hist = usr_comments.groupby(['User_ID', 'year', 'month'])['CommID'].nunique()
comm_hist = comm_hist.reset_index()
comm_hist.columns = [*comm_hist.columns[:-1]] + ['Num_Comm_Prev_Month']

overall = overall.merge(comm_hist, how='left')
overall = overall.sort_values(by=['User_ID', 'timestamp']).reset_index(drop=True)
overall['Num_Comm_Prev_Month'] = overall['Num_Comm_Prev_Month'].fillna(0)
# Cumulative count per user and two normalisations of it, each shifted by one row per
# user so a row only sees its earlier rows. groupby(...).shift() is index-aligned on
# every pandas version, unlike groupby(...).apply(lambda x: x.shift()).
overall['Num_Comm_Per_Month'] = overall.groupby('User_ID')['Num_Comm_Prev_Month'].cumsum()
overall['Num_Comm_Per_Month_trend'] = overall['Num_Comm_Per_Month']/overall['Total_Num_User_Months']
overall['Num_Comm_Per_Month_trend'] = overall.groupby('User_ID')['Num_Comm_Per_Month_trend'].shift()
overall['Num_Comm_Per_Month'] = overall['Num_Comm_Per_Month']/(overall['timestamp'].max() - overall['Zindi_Joining_Timestamp'])
overall['Num_Comm_Per_Month'] = overall.groupby('User_ID')['Num_Comm_Per_Month'].shift()

# Previous row's count and its change versus one and two rows further back.
overall['Num_Comm_Prev_Month'] = overall.groupby('User_ID')['Num_Comm_Prev_Month'].shift()
overall['Num_Comm_Prev_Month_momentum'] = overall['Num_Comm_Prev_Month'] - overall.groupby('User_ID')['Num_Comm_Prev_Month'].shift(1)
overall['Num_Comm_Prev_Month_momentum2'] = overall['Num_Comm_Prev_Month'] - overall.groupby('User_ID')['Num_Comm_Prev_Month'].shift(2)

In [None]:
# Drop the comment intermediates and trigger a garbage collection.
del usr_comments, usr_comm_timestamp, comm_hist
gc.collect()

0

In [None]:
# Per-timestamp population statistics: number of distinct users and the
# mean / max / std of Total_Num_User_Months among rows with that timestamp.
tmp = overall.groupby('timestamp').agg({
    "User_ID": ["nunique"],
    "Total_Num_User_Months": ["mean", "max", "std"],
})
# Flatten the (column, aggregation) pairs into names such as "User_ID_nunique".
tmp.columns = ["_".join(col) for col in tmp.columns]
tmp = tmp.reset_index()

overall = overall.merge(tmp, how='left')

In [None]:
# Row-wise spread of the four "months since last ..." columns.
# NOTE(review): the column is named *_Mean but holds the standard deviation (.std).
# Confirm which one is intended before renaming or changing it - the model consumes
# this column, so either change alters the trained features.
sel_cols = ['Months_Since_Last_Comp', 'Months_Since_Last_Dis', 'Months_Since_Last_Sub', 'Months_Since_Last_Comment']
overall['Months_Since_Last_Activity_Mean'] = overall[sel_cols].std(axis=1)

In [None]:
# Time-like columns that are rescaled relative to the row's own timestamp.
time_cols = [
    'Zindi_Joining_Timestamp',
    'comment_timestamp',
    'comp_timestamp',
    'discussion_timestamp',
    'sub_timestamp',
    'Months_Since_Last_Comp',
    'Months_Since_Last_Sub',
    'Months_Since_Last_Dis',
    'Months_Since_Last_Comment',
]

# Overwrites each column with its ratio to `timestamp`; re-running this cell divides again.
for col in time_cols:
    overall[col] = overall[col]/overall['timestamp']

In [None]:
# Per timestamp: number of distinct users whose Zindi_Joining_Timestamp equals 1.
# NOTE(review): when the preceding cell has already divided Zindi_Joining_Timestamp
# by `timestamp`, the value 1 means "joined in this row's month", not "joined at
# timestamp 1" - confirm this is the intended selection.
tmp_time = overall[overall['Zindi_Joining_Timestamp']==1]
tmp_time = tmp_time.groupby('timestamp')['User_ID'].nunique().to_frame("unique_user_count")
tmp_time = tmp_time.reset_index()

overall = overall.merge(tmp_time, how='left')

In [None]:
# Dense rank of each user's interest score within its timestamp (highest score = rank 1).
# The built-in grouped rank returns a Series indexed like `overall` on every pandas
# version, which groupby.apply with a rank lambda does not on pandas >= 2.0.
overall['user_interests_rank'] = overall.groupby('timestamp')['user_interests'].rank(
    method='dense', ascending=False)

In [None]:
# overall.loc[overall['user_interests']==0, 'user_interests'] = np.NaN
# Split the engineered frame back into train and test using the is_train flag;
# the shapes are printed before and after for comparison.
print(train.shape, test.shape)
train, test = overall[overall['is_train']==1], overall[overall['is_train']==0]
print(train.shape, test.shape)

(259832, 9) (65223, 4)
(259832, 81) (65223, 81)


# Modeling

In [None]:
def train_model(df_trainX, df_trainY, df_evalX, df_evalY, cat_cols, model_name='CAT', params=None):
    """Fit one gradient-boosting classifier with early stopping on the evaluation split.

    Parameters
    ----------
    df_trainX, df_trainY : training features and labels.
    df_evalX, df_evalY : evaluation features and labels used for early stopping
        and for the returned validation score.
    cat_cols : list of categorical column names; only used to build the default
        CatBoost parameters.
    model_name : 'CAT' (CatBoost) or 'LGB' (LightGBM).
    params : optional dict of constructor parameters replacing the defaults.

    Returns
    -------
    (clf, valid_score, best_iteration, feature_score)
        The fitted model, its validation AUC, the best boosting iteration and the
        per-feature importances.

    Raises
    ------
    ValueError
        If `model_name` is not one of the supported names.
    """
    if model_name not in ('CAT', 'LGB'):
        # Fail early with a clear message; without this check an unknown name skips
        # both branches and the return statement hits unbound local variables.
        raise ValueError(
            f"Unsupported model_name {model_name!r}; expected 'CAT' or 'LGB'")

    if model_name == 'CAT':
        if params is None:
            params={'n_estimators':10000,'random_state':123,'cat_features':cat_cols}
        clf=CatBoostClassifier(**params,early_stopping_rounds=50,eval_metric='AUC')
        clf.fit(df_trainX,df_trainY,eval_set=(df_evalX,df_evalY),plot=False, verbose=50)
        valid_score = clf.get_best_score().get('validation').get('AUC')
        best_iteration = clf.get_best_iteration()
        feature_score = clf.get_feature_importance()
    else:
        # Imported here because only the LightGBM branch needs it.
        from sklearn.metrics import roc_auc_score
        if params is None:
            params={'verbose':0,'n_estimators':10000,'random_state':123,'learning_rate':0.01,'force_row_wise':True,'colsample_bytree':0.3}
        clf = lgb.LGBMClassifier(**params, importance_type='gain', metric='auc_mu', num_leaves=127, min_child_samples=5)
        callbacks = [lgb.early_stopping(500, verbose=0)]
        clf.fit(df_trainX,
                df_trainY,
                eval_set=[(df_evalX, df_evalY)],
                callbacks=callbacks,
                verbose=0
               )

        # Score "any activity vs. none": 1 - P(NoActivity). Look the column up in
        # classes_ instead of assuming 'NoActivity' is always at index 1.
        no_activity_col = list(clf.classes_).index('NoActivity')
        valid_score = roc_auc_score(df_evalY!='NoActivity', 1-clf.predict_proba(df_evalX)[:,no_activity_col])
        best_iteration = clf.booster_.best_iteration
        feature_score = clf.feature_importances_
    return clf, valid_score, best_iteration, feature_score

In [None]:
# Persist the engineered train/test frames as gzip-compressed CSVs (index included).
train.to_csv(os.path.join(Config.DATA_DIR,"Train_fe.csv.gz"), compression='gzip')
test.to_csv(os.path.join(Config.DATA_DIR,"Test_fe.csv.gz"), compression='gzip')

In [None]:
%%time

# Columns that are labels, bookkeeping or otherwise excluded from the model inputs.
drop_cols = [
    'year', 'month', 'Target', 'Sub', 'CompPart', 'Comment', 'Disc',
    'is_train', 'timestamp', 'Record', 'Active_Month', 'Total_Num_User_Months',
    'user_interests'
]
# sorted() instead of list(set(...)): set iteration order of strings changes between
# kernels (hash randomisation), which made the feature order - and with it the
# column-subsampled model - differ from run to run.
cat_cols = sorted(
    set(train.columns[train.dtypes == 'object']) - set(drop_cols) - set(['User_ID'])
)
num_cols = sorted(set(train.columns) - set(cat_cols + drop_cols))

# .copy() so the dtype conversion below writes to an independent frame, not a slice.
train_X = train[cat_cols + num_cols].copy()
train_X[cat_cols] = train_X[cat_cols].astype('category')
train_Y = train['Target']

test_X = test[cat_cols + num_cols].copy()
test_X[cat_cols] = test_X[cat_cols].astype('category')

# Folds are grouped by timestamp, so all rows of one month land in the same fold.
fold = GroupKFold(n_splits=Config.N_SPLITS)
cb_scores, pred_cb, feat_scores = [], [], []
for it, (idxT, idxV) in enumerate(
        fold.split(train_X, train_Y, groups=train['timestamp'])):
    df_trainX, df_trainY = train_X.iloc[idxT].copy(), train_Y.iloc[idxT]
    df_evalX, df_evalY = train_X.iloc[idxV].copy(), train_Y.iloc[idxV]
    df_testX = test_X.copy()

    # Add a frequency-encoded copy of Country; User_ID is frequency-encoded in place.
    # The encoder is fitted on the training part of the fold only.
    selected_cat_cols = ['Country']
    cat_cols_count = [f'{col}_count' for col in selected_cat_cols]
    df_trainX[cat_cols_count] = df_trainX[selected_cat_cols].copy()
    df_evalX[cat_cols_count] = df_evalX[selected_cat_cols].copy()
    df_testX[cat_cols_count] = df_testX[selected_cat_cols].copy()

    encoder = CountEncoder(cols=cat_cols_count + ['User_ID'])
    df_trainX = encoder.fit_transform(df_trainX, df_trainY)
    df_evalX = encoder.transform(df_evalX)
    df_testX = encoder.transform(df_testX)

    clf, valid_score, best_iteration, feature_score = train_model(
        df_trainX, df_trainY, df_evalX, df_evalY, cat_cols, model_name='LGB')
    cb_scores.append(valid_score)
    # Keep P(NoActivity) for the test rows; locate the column via classes_ rather
    # than relying on 'NoActivity' being at index 1.
    no_activity_col = list(clf.classes_).index('NoActivity')
    pred_cb.append(clf.predict_proba(df_testX)[:, no_activity_col])
    feat_scores.append(feature_score)
    print('Fold {} {} at {}'.format(it + 1, valid_score, best_iteration))

# Fold weights proportional to the validation scores (they sum to 1).
weights = np.array(cb_scores) / np.sum(cb_scores)
print('The local CV is {}'.format(np.sum(weights * cb_scores)))

Fold 1 0.9050635572161991 at 668
Fold 2 0.9044670487830438 at 913
Fold 3 0.9080523301191019 at 424
Fold 4 0.9024576875811107 at 746
Fold 5 0.9081277085785924 at 1184
The local CV is 0.9056389319496082
CPU times: user 1h 6min 33s, sys: 1min 3s, total: 1h 7min 36s
Wall time: 6min 6s


In [None]:
# Only in local validation mode, where the held-out `test` rows carry a Target label:
# blend the fold predictions with score-proportional weights and report the AUC of
# "any activity" (1 - blended prediction) on the hold-out.
if LOCAL_TEST_RUN:
    weights=cb_scores/sum(np.array(cb_scores))
    print ('The local CV is {}'.format(np.sum(weights*cb_scores)))

    prediction = np.sum(weights*np.transpose(pred_cb),1)
    from sklearn.metrics import roc_auc_score
    print("Test score is {}".format(roc_auc_score(test['Target']!='NoActivity', 1-prediction)))

In [None]:
# Feature importances averaged over the folds and rescaled to percentages.
# Feature names come from df_trainX as left by the last fold of the CV loop.
featureImp=pd.DataFrame({'feature':df_trainX.columns,'importance':np.mean(np.array(feat_scores),0)})
featureImp=featureImp.sort_values('importance',ascending=False)
featureImp['importance']=featureImp['importance']*100/featureImp['importance'].sum()
featureImp.reset_index(drop=True)

Unnamed: 0,feature,importance
0,Num_Comp_Prev_Month,9.037149
1,Num_Sub_Prev_Month,9.007915
2,Num_Comp_Per_Month_trend,8.885045
3,Num_Comp_Per_Month,7.023674
4,Points,4.678487
5,Current_Active_Competitions,4.198269
6,Zindi_Joining_Timestamp,3.375272
7,sub_timestamp,3.290834
8,Country,2.535924
9,Months_Since_Last_Comp,2.481734


# Submission 

In [None]:
# Blend the per-fold test predictions with the score-proportional fold weights, then
# flip them: the folds store P(NoActivity), the submission wants P(any activity).
test['Target'] = np.sum(weights * np.transpose(pred_cb), 1)
test['Target'] = 1 - test['Target']
test['UserMonthYear'] = test['User_ID'] + "_" + test['month'].astype(str) + "_" + test['year'].astype(str)
# Create the output directory first so a fresh checkout does not fail on to_csv.
os.makedirs(Config.OUTPUT_DIR, exist_ok=True)
test[['UserMonthYear', 'Target']].to_csv(os.path.join(Config.OUTPUT_DIR, f'{Config.VER}.csv'), index=False)

In [None]:
# Preview the two submission columns.
test[['UserMonthYear', 'Target']]

Unnamed: 0,UserMonthYear,Target
13,ID_000VV0KM_1_4,0.014111
14,ID_000VV0KM_2_4,0.014824
15,ID_000VV0KM_3_4,0.010529
16,ID_003OCIYO_1_4,0.328175
17,ID_003OCIYO_2_4,0.017162
...,...,...
325041,ID_ZZVPF22K_2_4,0.161856
325042,ID_ZZVPF22K_3_4,0.108418
325052,ID_ZZXDLYXB_1_4,0.008774
325053,ID_ZZXDLYXB_2_4,0.008380
