In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
data_dir = "kkbox-music-recommendation-challenge"

Creating the DataFrames

In [3]:
# Column dtypes for train.csv: identifier and listening-context columns are read
# as pandas categoricals, the `target` label as an unsigned byte.
train_columns = dict(
    msno='category',
    song_id='category',
    source_system_tab='category',
    source_screen_name='category',
    source_type='category',
    target=np.uint8,
)

train_path = os.path.join(data_dir, 'train.csv')
train_df = pd.read_csv(train_path, dtype=train_columns)

In [4]:
# Replace missing values in the listening-context columns with an explicit
# '<UNK>' category (the category has to be registered before fillna can use it).
for context_col in ('source_system_tab', 'source_screen_name', 'source_type'):
    train_df[context_col] = train_df[context_col].cat.add_categories('<UNK>').fillna('<UNK>')

In [5]:
train_df.dtypes

msno                  category
song_id               category
source_system_tab     category
source_screen_name    category
source_type           category
target                   uint8
dtype: object

In [6]:
# Column dtypes for songs.csv: everything categorical except the numeric
# `song_length`, which is read as a 32-bit integer.
songs_columns = dict(
    song_id='category',
    song_length=np.int32,
    genre_ids='category',
    artist_name='category',
    composer='category',
    lyricist='category',
    language='category',
)

songs_path = os.path.join(data_dir, 'songs.csv')
songs_df = pd.read_csv(songs_path, dtype=songs_columns)

In [7]:
# Replace missing song metadata with an explicit '<UNK>' category.
for meta_col in ('genre_ids', 'artist_name', 'composer', 'lyricist'):
    songs_df[meta_col] = songs_df[meta_col].cat.add_categories('<UNK>').fillna('<UNK>')

In [8]:
songs_df.dtypes

song_id        category
song_length       int32
genre_ids      category
artist_name    category
composer       category
lyricist       category
language       category
dtype: object

In [9]:
# Column dtypes for song_extra_info.csv: all three columns are categorical.
song_extra_info_columns = dict(
    song_id='category',
    name='category',
    isrc='category',
)

song_extra_info_path = os.path.join(data_dir, 'song_extra_info.csv')
song_extra_info_df = pd.read_csv(song_extra_info_path, dtype=song_extra_info_columns)


In [10]:
# Songs without an ISRC get the explicit '<UNK>' category instead of NaN.
isrc_filled = song_extra_info_df['isrc'].cat.add_categories('<UNK>').fillna('<UNK>')
song_extra_info_df['isrc'] = isrc_filled

In [11]:
def isrc_to_year(isrc, pivot=17):
  """Derive a four-digit year from an ISRC code.

  Characters 5-6 of the code are read as a two-digit year. Values greater
  than ``pivot`` are placed in the 1900s, all others in the 2000s.

  Args:
    isrc: ISRC string, or the '<UNK>' placeholder.
    pivot: Largest two-digit year still mapped to the 2000s (default 17).

  Returns:
    The year as an int, or '<UNK>' when the code is the placeholder or its
    year field cannot be parsed as an integer.
  """
  if isrc == '<UNK>':
    return '<UNK>'

  # Parse the year field once; short or malformed codes are reported as
  # unknown instead of raising ValueError from int().
  try:
    two_digit_year = int(isrc[5:7])
  except ValueError:
    return '<UNK>'

  century = 1900 if two_digit_year > pivot else 2000
  return century + two_digit_year

def isrc_to_country(isrc):
  """Return the first two characters of an ISRC, or '<UNK>' for the placeholder."""
  return '<UNK>' if isrc == '<UNK>' else isrc[:2]

In [12]:
# Derive the year and country features from the ISRC, then drop the raw
# `isrc` and `name` columns.
isrc_values = song_extra_info_df['isrc']
song_extra_info_df['song_year'] = isrc_values.apply(isrc_to_year).astype("category")
song_extra_info_df['song_country'] = isrc_values.apply(isrc_to_country).astype("category")

song_extra_info_df.drop(columns=['isrc', 'name'], inplace=True)

In [13]:
song_extra_info_df.dtypes

song_id         category
song_year       category
song_country    category
dtype: object

In [14]:
# Column dtypes for members.csv. The two date columns are read as plain strings.
# NOTE(review): uint8 only holds 0..255 — confirm `bd` never contains negative
# or larger values in the raw file.
members_columns = dict(
    msno='category',
    city='category',
    bd=np.uint8,
    gender='category',
    registered_via='category',
    registration_init_time=str,
    expiration_date=str,
)

members_path = os.path.join(data_dir, 'members.csv')
members_df = pd.read_csv(members_path, dtype=members_columns)

In [15]:
# Members with no recorded gender get the explicit '<UNK>' category.
gender_filled = members_df['gender'].cat.add_categories('<UNK>').fillna('<UNK>')
members_df['gender'] = gender_filled

In [16]:
# The registration/expiration date columns are removed before the merge,
# so they never become features.
members_df.drop(columns=['registration_init_time', 'expiration_date'], inplace=True)
members_df.head(50)

Unnamed: 0,msno,city,bd,gender,registered_via
0,XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=,1,0,<UNK>,7
1,UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=,1,0,<UNK>,7
2,D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=,1,0,<UNK>,4
3,mCuD+tZ1hERA/o5GPqk38e041J8ZsBaLcu7nGoIIvhI=,1,0,<UNK>,9
4,q4HRBfVSssAFS9iRfxWrohxuk9kCYMKjHOEagUMV6rQ=,1,0,<UNK>,4
5,zgPOEyUn5a/Fvuzb3m69ajzxjkbblVtObglW89FzLdo=,13,43,female,9
6,Sw9AT8QoR4wWiNUqHZUH6g5ahzGUx4lo1g+Y3xE2f2M=,1,0,<UNK>,4
7,pg6bT2XZkSP1TDBy4qn3HBPY/HffKQ/bg8WIISQYBSY=,1,0,<UNK>,7
8,kfk1AdTNH2dNqF5LzIs4e0vwGPejw2jrnFjJlcYnEgk=,1,0,<UNK>,7
9,tscijwx4dbEp0NXGl+iFtHJ8zrj+TkcMrduOQk9t+gE=,1,0,<UNK>,7


In [17]:
members_df.dtypes

msno              category
city              category
bd                   uint8
gender            category
registered_via    category
dtype: object

In [18]:
# Left joins keep every row of the left-hand table: the extra song info is
# attached to the songs, the songs to the interactions, then the members.
songs_df = pd.merge(songs_df, song_extra_info_df, how="left", on="song_id")
train_df = pd.merge(train_df, songs_df, how="left", on="song_id")
train_df = pd.merge(train_df, members_df, how="left", on="msno")

In [19]:
# Re-cast both join keys to categorical.
# NOTE(review): presumably needed because the merges above changed their dtype — confirm.
train_df['msno'] = train_df['msno'].astype('category')
train_df['song_id'] = train_df['song_id'].astype('category')

In [20]:
train_df.dtypes

msno                  category
song_id               category
source_system_tab     category
source_screen_name    category
source_type           category
target                   uint8
song_length            float64
genre_ids             category
artist_name           category
composer              category
lyricist              category
language              category
song_year             category
song_country          category
city                  category
bd                       uint8
gender                category
registered_via        category
dtype: object

1

In [21]:
# Flag every row that still has a missing value in at least one column.
# `axis` is passed by keyword: positional arguments to DataFrame.any are
# deprecated in pandas 1.5 and rejected (TypeError) by pandas 2.x.
na_mask = train_df.isna().any(axis=1)
print(na_mask.sum())

1605


In [22]:
# Keep only the rows that were not flagged as containing a missing value.
train_df = train_df.loc[~na_mask]

In [23]:
# Keep at most the first 1023 rows of each `msno` group and renumber the index.
# NOTE(review): 1023 is presumably a CatBoost GPU limit on group size — confirm.
train_df = (
    train_df
    .groupby('msno')
    .head(1023)
    .reset_index(drop=True)
)

In [24]:
# Stable in-place sort: rows of the same `msno` become contiguous while keeping
# their relative order.
train_df.sort_values(by='msno', kind='stable', inplace=True)


Metrics

In [25]:
from sklearn.metrics import roc_auc_score

In [26]:
def dcg(score, relevance):
    """Discounted cumulative gain of `relevance` when items are ranked by descending `score`.

    The item at rank i (1-based) contributes relevance / log2(1 + i).
    """
    ranking = np.argsort(-score)
    positions = np.arange(1, len(score) + 1)
    discounts = np.log2(1 + positions)
    gains = relevance[ranking] / discounts
    return np.sum(gains)

def idcg(score, relevance):
    """Ideal DCG: the DCG obtained when items are ordered by descending relevance.

    Args:
        score: Only its length is used (number of ranked positions).
        relevance: Array of relevance labels.

    Returns:
        Sum of relevance / log2(1 + rank) over the ideal ordering.
    """
    n = len(score)
    # Sort ascending and reverse instead of negating: negating an unsigned
    # array (e.g. uint8 labels) wraps around and produces an ascending
    # "ideal" order, which underestimates IDCG.
    ideal = np.sort(relevance)[::-1]
    return np.sum(ideal / np.log2(1 + np.arange(1, n + 1)))

def ndcg(query, score, relevance):
    """Mean NDCG over query groups.

    For every distinct value in `query`, the group's DCG is divided by its
    ideal DCG. Groups whose ideal DCG is not positive are left out of the mean.
    """
    per_query = []
    for group in np.unique(query):
        members = query == group
        group_score, group_relevance = score[members], relevance[members]

        ideal = idcg(group_score, group_relevance)
        if ideal > 0:
            per_query.append(dcg(group_score, group_relevance) / ideal)

    return np.mean(per_query)

def auc(query, score, relevance):
    """Mean per-query ROC AUC.

    Args:
        query: Array of group labels, one per row.
        score: Array of predicted scores.
        relevance: Array of true labels.

    Returns:
        Mean of roc_auc_score over the query groups whose labels contain at
        least two distinct values; groups with a single class are skipped
        because ROC AUC is undefined for them.
    """
    aucs = []
    for label in np.unique(query):
        mask = label == query
        group_relevance = relevance[mask]

        # Skip single-class groups explicitly rather than hiding every
        # exception (including KeyboardInterrupt) behind a bare `except`.
        if np.unique(group_relevance).size < 2:
            continue

        aucs.append(roc_auc_score(group_relevance, score[mask]))

    return np.mean(aucs)

CatBoost

In [27]:
from catboost import CatBoostRanker, Pool
from sklearn.model_selection import GroupKFold, train_test_split

In [28]:
def to_pool(df):
    """Convert a merged interactions frame into a CatBoost Pool.

    `target` becomes the label, the integer codes of the categorical `msno`
    column become the group ids, and every remaining categorical column
    (after dropping `target`, `song_id` and `msno`) is declared a categorical
    feature.
    """
    features = df.drop(columns=['target', 'song_id', 'msno'])
    categorical_cols = features.select_dtypes(['category']).columns.to_numpy()
    return Pool(
        data=features,
        label=df['target'].to_numpy(),
        group_id=df['msno'].cat.codes.to_numpy(),
        cat_features=categorical_cols,
        has_header=True,
    )

In [29]:
class CatBoostModel:
    """Thin wrapper around CatBoostRanker with grouped cross-validation scoring."""

    def __init__(self, loss_function, iterations, task_type=None, random_seed=727272):
        """Create the underlying CatBoostRanker with the given settings."""
        self.model = CatBoostRanker(
            loss_function=loss_function,
            iterations=iterations,
            task_type=task_type,
            random_seed=random_seed,
        )

    def fit(self, df):
        """Fit the ranker on `df` (converted with to_pool) and return self."""
        self.model.fit(to_pool(df))
        return self

    def predict(self, df):
        """Return the ranker's predictions for the rows of `df`."""
        return self.model.predict(to_pool(df))

    def save_scores(self, df):
        """Evaluate with 5-fold GroupKFold, grouping rows by `msno`.

        For each fold the model is fit on the training rows and scored on the
        held-out rows with `ndcg` and `auc`. Nothing is written to disk.

        Returns:
            Dict with keys "NDCG" and "ROC_AUC", each a list with one value
            per fold.
        """
        metrics = {"NDCG": [], "ROC_AUC": []}

        groups = df.msno.cat.codes.to_numpy()
        # GroupKFold.split uses its first argument only for the number of rows,
        # so the frame is passed as is instead of building a feature-only copy.
        for train_index, test_index in GroupKFold(n_splits=5).split(df, groups=groups):
            # np.sort keeps rows in their original order without turning the
            # index arrays into Python lists (as the builtin sorted() does).
            fold_train = df.iloc[np.sort(train_index)].reset_index(drop=True)
            fold_test = df.iloc[np.sort(test_index)].reset_index(drop=True)

            self.fit(fold_train)

            score = self.predict(fold_test)
            query = fold_test.msno.cat.codes.to_numpy()
            label = fold_test.target.to_numpy()

            metrics['NDCG'].append(ndcg(query, score, label))
            metrics['ROC_AUC'].append(auc(query, score, label))

        return metrics


In [None]:
# YetiRank ranker with 150 iterations on GPU, scored via CatBoostModel.save_scores.
model = CatBoostModel(loss_function='YetiRank', iterations=150, task_type='GPU')
scores = model.save_scores(train_df)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)


Groupwise loss function. OneHotMaxSize set to 10


Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric PFound is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	total: 8.99s	remaining: 22m 19s
1:	total: 17.9s	remaining: 22m 3s
2:	total: 27s	remaining: 22m 2s
3:	total: 36s	remaining: 21m 53s
4:	total: 45s	remaining: 21m 46s
5:	total: 55.7s	remaining: 22m 16s
6:	total: 1m 4s	remaining: 22m
7:	total: 1m 15s	remaining: 22m 18s
8:	total: 1m 24s	remaining: 22m 1s
9:	total: 1m 33s	remaining: 21m 46s
10:	total: 1m 44s	remaining: 21m 55s


In [None]:
# Report each metric averaged over the cross-validation folds.
for report_line in (
    f"Average NDCG: {np.mean(scores['NDCG']):.4f}",
    f"Average ROC AUC per user: {np.mean(scores['ROC_AUC']):.4f}",
):
    print(report_line)

In [26]:
# X = train_df.drop(columns=['target', 'song_id', 'msno'], axis=1)
# y = train_df.target.to_numpy()
# q = train_df.msno.cat.codes.to_numpy()
# cat_features = X.select_dtypes(['category']).columns.to_numpy()

In [31]:
# parameters = {
#     'loss_function': 'YetiRank',
#     'iterations': 150,
#     'custom_metric': ['NDCG', 'QueryAUC:type=Ranking'],
#     'random_seed': 0,
# }

In [32]:
# scores = []
# for train_idx, val_idx in GroupKFold(n_splits=5).split(X, y, q):
#     X_train = X.iloc[train_idx]
#     y_train = y[train_idx]
#
#     X_val = X.iloc[val_idx]
#     y_val = y[val_idx]
#
#     q_train = q[train_idx]
#     q_val = q[val_idx]
#
#     pool_train = Pool(data=X_train, label=y_train, group_id=q_train, cat_features=cat_features, has_header=True)
#     pool_val = Pool(data=X_val, label=y_val, group_id=q_val, cat_features=cat_features, has_header=True)
#
#     model = CatBoostRanker(**parameters)
#     model.fit(pool_train, eval_set=pool_val)
#
#     scores.append(model.get_evals_result())

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)


Groupwise loss function. OneHotMaxSize set to 10
0:	test: 0.4403550	best: 0.4403550 (0)	total: 47.8s	remaining: 1h 58m 46s
1:	test: 0.4418691	best: 0.4418691 (1)	total: 1m 35s	remaining: 1h 58m 22s
2:	test: 0.4412057	best: 0.4418691 (1)	total: 2m 23s	remaining: 1h 57m 5s
3:	test: 0.4411647	best: 0.4418691 (1)	total: 3m 11s	remaining: 1h 56m 22s
4:	test: 0.4411581	best: 0.4418691 (1)	total: 3m 57s	remaining: 1h 54m 42s
5:	test: 0.4432833	best: 0.4432833 (5)	total: 4m 42s	remaining: 1h 53m 3s


KeyboardInterrupt: 