In [1]:
import glob
import csv
import tqdm
import copy
import numpy as np
import pandas as pd
import catboost
import math

In [63]:
pd.set_option('display.max_columns', 800)

# Load track features into dataframe

In [2]:
track_features_dir = "/media/data2/Data/wsdm2019/data/track_features/"
track_feature_files = sorted(glob.glob(track_features_dir+"*.csv"))
track_feature_dir_0 = track_feature_files[0]
track_feature_0 = pd.read_csv(track_feature_dir_0)

track_feature_dir_1 = track_feature_files[1]
track_feature_1 = pd.read_csv(track_feature_dir_1)

track_feature_all = track_feature_0.append(track_feature_1)

In [212]:
# get this so that it's easier to merge with train indexes
track_feature_id_index = track_feature_all.set_index('track_id')

# Load session features into dataframe

In [352]:
train_set_dir = "/media/data2/Data/wsdm2019/data/training_set/"
second_stage_file_dir = "/media/data2/Data/wsdm2019/python/data/second_stage/"

train_files = sorted(glob.glob(train_set_dir+"*.csv"))

catboost_files = train_files[500:]

temp_catboost_train_file_size = 4
temp_catboost_valid_file_size = 1

catboost_train_files = catboost_files[:temp_catboost_train_file_size][0:1]
catboost_valid_files = catboost_files[temp_catboost_train_file_size:temp_catboost_valid_file_size + temp_catboost_train_file_size]

In [364]:
train_df_list = []
valid_df_list = []

for f in catboost_train_files:
    s_f = second_stage_file_dir + f[45:-4] + "_second_stage.csv"
    df_raw = pd.read_csv(f).rename(index=str, columns={"track_id_clean": "track_id"})
    df_f = pd.read_csv(s_f)
    df_raw['logits'] = df_f.iloc[:,0].values
    train_df_list.append(df_raw)    

for f in catboost_valid_files:
    s_f = second_stage_file_dir + f[45:-4] + "_second_stage.csv"
    df_raw = pd.read_csv(f).rename(index=str, columns={"track_id_clean": "track_id"})
    df_f = pd.read_csv(s_f)
    df_raw['logits'] = df_f.iloc[:,0].values
    valid_df_list.append(df_raw)
    
catboost_train_raw = pd.concat(train_df_list, ignore_index=True)
catboost_valid_raw = pd.concat(valid_df_list, ignore_index=True)

In [548]:
catboost_train_raw_group = catboost_train_raw.groupby('session_id')
ngroups = catboost_train_raw_group.ngroups

In [366]:
print(ngroups)

940033


In [368]:
session_dropped_columns = ['session_id', 'track_id', 'session_position']
session_bool_columns = ['skip_1', 'skip_2', 'skip_3', 'not_skipped', 'context_switch', 'no_pause_before_play', 'short_pause_before_play',
                'long_pause_before_play', 'hist_user_behavior_is_shuffle', 'premium']
session_numerical_columns = ['hist_user_behavior_n_seekfwd', 'hist_user_behavior_n_seekback']
session_numerical_columns_range_map = {'hist_user_behavior_n_seekfwd': (0,100), 'hist_user_behavior_n_seekback': (0,100)}
session_categorical_columns = ['context_type', 'hist_user_behavior_reason_start', 'hist_user_behavior_reason_end', 'session_length', 'hour_of_day', 'date']

In [369]:
track_dropped_columns = ['track_id']
track_numerical_columns = ['duration', 'us_popularity_estimate', 'acousticness', 'beat_strength', 'bounciness', 'danceability',
                             'dyn_range_mean', 'energy', 'flatness', 'instrumentalness', 'liveness', 'loudness', 'mechanism', 
                             'organism', 'speechiness', 'tempo', 'valence']
track_vector_columns = ['acoustic_vector_0', 'acoustic_vector_1', 'acoustic_vector_2', 
                        'acoustic_vector_3', 'acoustic_vector_4', 'acoustic_vector_5',
                        'acoustic_vector_6', 'acoustic_vector_7']
track_categorical_columns = ['release_year', 'key', 'mode', 'time_signature']

In [504]:
# def get_session_history_summarization(test_group_history):
#     num_feature_list = []
#     cat_feature_list = []
#     for feature in test_group_history.columns:
#         if feature in history_bool_columns:
#             col = test_group_history[feature].astype(int)
#             num_feature_list.extend([col.mean(), col.var(), col.mode()[0]])
#         elif feature in history_numerical_columns:
#             (mi, ma) = history_numerical_columns_range_map[feature]
#             col = test_group_history[feature].clip(mi, ma)
#             col = (col - mi) / (ma - mi)
#             num_feature_list.extend([col.mean(), col.var()])
#         elif feature in history_categorical_columns:
#             col = test_group_history[feature]
#             cat_feature_list.append(col.mode()[0])
#     return num_feature_list, cat_feature_list
def get_session_history_summarization(test_group_history):
    num_feature_list = []
    cat_feature_list = []
    
    for feature in test_group_history.columns:
        if feature in numerical_columns:
            (mi, ma) = history_numerical_columns_range_map[feature]
            col = test_group_history[feature].clip(mi, ma)
            col = (col - mi) / (ma - mi)
            num_feature_list.extend([col.mean(), col.var()])
    
    bool_cols = test_group_history[history_bool_columns].astype(int)
    num_feature_list.extend(bool_cols.mean().tolist())
    num_feature_list.extend(bool_cols.var().tolist())
    num_feature_list.extend(bool_cols.mode().iloc[0].tolist())
    
    cat_cols = test_group_history[history_categorical_columns]
    
    cat_feature_list.extend(cat_cols.mode().iloc[0].astype(str).tolist())
    
    return num_feature_list, cat_feature_list

In [505]:
def get_track_df(track_list):
    return track_feature_id_index.loc[track_list]

In [506]:
def get_current_track_feature(self_track_df):
    num_feature_list = []
    cat_feature_list = []
#     for feature in self_track_df.columns:
#         print(feature)
#     if feature in predict_numerical_columns:
    num_feature_list.extend(self_track_df[predict_numerical_columns].tolist())
    cat_feature_list.extend(self_track_df[predict_categorical_columns].astype(str).tolist())
    return num_feature_list, cat_feature_list

In [507]:
def get_tracks_df_summarization_feature(tracks_df):
    num_feature_list = []
    cat_feature_list = []
    
    num_feature_list.extend(tracks_df[track_numerical_columns].mean().tolist())
#     num_feature_list.extend(tracks_df[track_numerical_columns].var().tolist())
    num_feature_list.extend(tracks_df[track_numerical_columns].min().tolist())
    num_feature_list.extend(tracks_df[track_numerical_columns].max().tolist())
    
    cat_feature_list.extend(tracks_df[track_categorical_columns].mode().iloc[0].astype(str).tolist())
    
    return num_feature_list, cat_feature_list

In [566]:
num_feat_train = []
cat_feat_train = []
target_train = []

In [575]:
counter = 0

num_feat_train = []
cat_feat_train = []
target_train = []

for name, test_group in catboost_train_raw_group:
    
    test_group_history = test_group.iloc[:math.floor(test_group['session_position'].size / 2)].rename(index=str, columns={"track_id_clean": "track_id"})
    test_group_predict_all = test_group.iloc[math.floor(test_group['session_position'].size / 2):].rename(index=str, columns={"track_id_clean": "track_id"})
    test_group_predict = test_group_predict_all.iloc[:,:4]
    test_group_predict['logits'] = test_group_predict_all['logits']
    test_group_target = test_group_predict_all.iloc[:, 5].astype(int)

    target_train.extend(test_group_target.tolist())

    # first get the summarization data
    num_feat_hist, cat_feat_hist = get_session_history_summarization(test_group_history)

    # use the append of two piecese because of the structure of test set
    all_track_ids = test_group_history['track_id'].append(test_group_predict['track_id'])

    # used to fill in 0s if it's the last song in the current history since it don't have after tracks
    num_feat_after_sum_len = 51
    cat_feat_after_sum_len = 4

    isLast = False
    for index, row in test_group_predict.iterrows():
        num_feat_predict_row = []
        cat_feat_predict_row = []

        num_feat_predict_row.extend(num_feat_hist)
        cat_feat_predict_row.extend(cat_feat_hist)

        # add categorical variable session_position (session length is already included in cat_feat_hist)
        session_position = row[1]
        cat_feat_predict_row.append(session_position)

        # use the append of two piecese because of the structure of test set
        entire_track_id = test_group_history['track_id'].append(test_group_predict['track_id'])

        before_tracks = all_track_ids[:(session_position - 1)]
        current_track = all_track_ids[(session_position - 1)]
        after_tracks = all_track_ids[(session_position):]
        if (len(after_tracks) == 0):
            isLast = True

        # count of how many times this item have appeared before and after current track
        before_count = (before_tracks == current_track).sum() / len(before_tracks)
        before_proportion = before_count / len(before_tracks)
        if isLast:
            after_count = 0
            after_proportion = 0
        else:
            after_count = (after_tracks == current_track).sum() / len(after_tracks)
            after_proportion = after_count / len(after_tracks)
        num_feat_predict_row.extend([after_count, after_proportion, before_count, before_proportion])

        # get the three corresponding dataframes
        current_track_df = get_track_df(current_track)
        before_tracks_df = get_track_df(before_tracks)
        after_tracks_df = get_track_df(after_tracks)

        # get current track data
        num_feat_current, cat_feat_current = get_current_track_feature(current_track_df)
        num_feat_predict_row.extend(num_feat_current)
        cat_feat_predict_row.extend(cat_feat_current)

        # get before tracks summarization data
        num_feat_before_sum, cat_feat_before_sum = get_tracks_df_summarization_feature(before_tracks_df)
        num_feat_predict_row.extend(num_feat_before_sum)
        cat_feat_predict_row.extend(cat_feat_before_sum)

        # get after tracks summarizaiton data
        if isLast:
            num_feat_after_sum = [0] * num_feat_after_sum_len
            cat_feat_after_sum = ['last'] * cat_feat_after_sum_len
        else:
            num_feat_after_sum, cat_feat_after_sum = get_tracks_df_summarization_feature(after_tracks_df)

        num_feat_predict_row.extend(num_feat_after_sum)
        cat_feat_predict_row.extend(cat_feat_after_sum)

        # add numerical feature logits
        logit = row[4]
        num_feat_predict_row.append(logit)

        num_feat_train.append(num_feat_predict_row)
        cat_feat_train.append(cat_feat_predict_row)
        
    counter += 1
    if counter % 100 == 0:
        print(counter)
        
    if counter == 1000:
        break

100
200
300
400
500
600
700
800
900
1000


In [577]:
len(num_feat_train)

8334

In [578]:
len(cat_feat_train)

8334

In [579]:
len(target_train)

8334

In [580]:
num_feat_valid = num_feat_train[-1000:]
cat_feat_valid = cat_feat_train[-1000:]
target_valid = target_train[-1000:]

In [582]:
n_train = num_feat_train[:-1000]
c_train = cat_feat_train[:-1000]
t = target_train[:-1000]

In [597]:
train = [a+b for a,b in zip(c_train, n_train)]
valid = [a+b for a,b in zip(cat_feat_valid, num_feat_valid)]

In [590]:
from catboost import CatBoostClassifier, Pool
p = Pool(train, t, list(range(19)))

In [600]:
cb_model = CatBoostClassifier(iterations=500,
                              eval_metric='AUC',
                              learning_rate=0.1,
                              objective='Precision'
                              max_depth=15)

In [601]:
cb_model.fit(p)

0:	learn: 0.6876328	total: 98.5ms	remaining: 49.1s
1:	learn: 0.6825583	total: 191ms	remaining: 47.5s
2:	learn: 0.6783092	total: 271ms	remaining: 44.9s
3:	learn: 0.6737668	total: 360ms	remaining: 44.6s
4:	learn: 0.6686857	total: 448ms	remaining: 44.3s
5:	learn: 0.6640080	total: 522ms	remaining: 43s
6:	learn: 0.6593562	total: 603ms	remaining: 42.5s
7:	learn: 0.6551011	total: 677ms	remaining: 41.6s
8:	learn: 0.6503446	total: 761ms	remaining: 41.5s
9:	learn: 0.6458050	total: 846ms	remaining: 41.4s
10:	learn: 0.6419549	total: 934ms	remaining: 41.5s
11:	learn: 0.6386669	total: 1.02s	remaining: 41.5s
12:	learn: 0.6355102	total: 1.09s	remaining: 40.9s
13:	learn: 0.6318590	total: 1.18s	remaining: 41s
14:	learn: 0.6287358	total: 1.26s	remaining: 40.9s
15:	learn: 0.6255940	total: 1.35s	remaining: 40.7s
16:	learn: 0.6226129	total: 1.43s	remaining: 40.5s
17:	learn: 0.6207163	total: 1.47s	remaining: 39.5s
18:	learn: 0.6175146	total: 1.55s	remaining: 39.3s
19:	learn: 0.6147962	total: 1.64s	remaining:

<catboost.core.CatBoostClassifier at 0x7f8e712182b0>

In [603]:
np.sum(cb_model.predict(valid) == target_valid)

679

In [553]:
def process_group(name, test_group):
    num_feat_train = []
    cat_feat_train = []
    target_train = []
    
    test_group_history = test_group.iloc[:math.floor(test_group['session_position'].size / 2)].rename(index=str, columns={"track_id_clean": "track_id"})
    test_group_predict_all = test_group.iloc[math.floor(test_group['session_position'].size / 2):].rename(index=str, columns={"track_id_clean": "track_id"})
    test_group_predict = test_group_predict_all.iloc[:,:4]
    test_group_predict['logits'] = test_group_predict_all['logits']
    test_group_target = test_group_predict_all.iloc[:, 5].astype(int)

    target_train.extend(test_group_target.tolist())

    # first get the summarization data
    num_feat_hist, cat_feat_hist = get_session_history_summarization(test_group_history)

    # use the append of two piecese because of the structure of test set
    all_track_ids = test_group_history['track_id'].append(test_group_predict['track_id'])

    # used to fill in 0s if it's the last song in the current history since it don't have after tracks
    num_feat_after_sum_len = 51
    cat_feat_after_sum_len = 4

    isLast = False
    for index, row in test_group_predict.iterrows():
        num_feat_predict_row = []
        cat_feat_predict_row = []

        num_feat_predict_row.extend(num_feat_hist)
        cat_feat_predict_row.extend(cat_feat_hist)

        # add categorical variable session_position (session length is already included in cat_feat_hist)
        session_position = row[1]
        cat_feat_predict_row.append(session_position)

        # use the append of two piecese because of the structure of test set
        entire_track_id = test_group_history['track_id'].append(test_group_predict['track_id'])

        before_tracks = all_track_ids[:(session_position - 1)]
        current_track = all_track_ids[(session_position - 1)]
        after_tracks = all_track_ids[(session_position):]
        if (len(after_tracks) == 0):
            isLast = True

        # count of how many times this item have appeared before and after current track
        before_count = (before_tracks == current_track).sum() / len(before_tracks)
        before_proportion = before_count / len(before_tracks)
        if isLast:
            after_count = 0
            after_proportion = 0
        else:
            after_count = (after_tracks == current_track).sum() / len(after_tracks)
            after_proportion = after_count / len(after_tracks)
        num_feat_predict_row.extend([after_count, after_proportion, before_count, before_proportion])

        # get the three corresponding dataframes
        current_track_df = get_track_df(current_track)
        before_tracks_df = get_track_df(before_tracks)
        after_tracks_df = get_track_df(after_tracks)

        # get current track data
        num_feat_current, cat_feat_current = get_current_track_feature(current_track_df)
        num_feat_predict_row.extend(num_feat_current)
        cat_feat_predict_row.extend(cat_feat_current)

        # get before tracks summarization data
        num_feat_before_sum, cat_feat_before_sum = get_tracks_df_summarization_feature(before_tracks_df)
        num_feat_predict_row.extend(num_feat_before_sum)
        cat_feat_predict_row.extend(cat_feat_before_sum)

        # get after tracks summarizaiton data
        if isLast:
            num_feat_after_sum = [0] * num_feat_after_sum_len
            cat_feat_after_sum = ['last'] * cat_feat_after_sum_len
        else:
            num_feat_after_sum, cat_feat_after_sum = get_tracks_df_summarization_feature(after_tracks_df)

        num_feat_predict_row.extend(num_feat_after_sum)
        cat_feat_predict_row.extend(cat_feat_after_sum)

        # add numerical feature logits
        logit = row[4]
        num_feat_predict_row.append(logit)

        num_feat_train.append(num_feat_predict_row)
        cat_feat_train.append(cat_feat_predict_row)
        
    return {
        'name': name,
        'num_feat_train': num_feat_train,
        'cat_feat_train': cat_feat_train,
        'target_train': target_train
    }

In [604]:
import multiprocessing

pool = multiprocessing.Pool(processes=32)

In [None]:
result = pool.map(process_group, catboost_train_raw_group)

In [None]:
pool.close()
pool.join()