### Data preprocessing

In [16]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import scipy.stats as ss
from sklearn.metrics import ndcg_score

In [17]:
# Directory holding train.csv / test.csv. The default is the original local
# path; set the RECSYS_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
DATA_DIR = Path(os.environ.get('RECSYS_DATA_DIR', '/Users/nadys/recsys/hw01_data'))

train_df = pd.read_csv(DATA_DIR / 'train.csv')
test_df = pd.read_csv(DATA_DIR / 'test.csv')

In [18]:
# Seed NumPy's global generator so the random column below is identical on
# every run, then attach one uniform [0, 1) draw per training row.
np.random.seed(42)
n_train_rows = len(train_df)
train_df['rnd'] = np.random.random(size=n_train_rows)

In [19]:
# Distinct user and track identifiers present in the training data, each in
# ascending order; the cell displays how many of each there are.
users, tracks = (sorted(set(train_df[column])) for column in ('user', 'track'))
len(users), len(tracks)

(10000, 50000)

In [20]:
# Two-way lookup between a raw user identifier and its position in the sorted
# `users` list.
id_to_user = dict(enumerate(users))
user_to_id = {user: idx for idx, user in id_to_user.items()}

In [21]:
# Translate every raw user identifier into its integer position via user_to_id.
# A user that is missing from user_to_id raises KeyError.
train_users_ids = list(map(user_to_id.__getitem__, train_df['user']))
test_users_ids = list(map(user_to_id.__getitem__, test_df['user']))

In [22]:
# Store the integer user positions next to the raw identifiers in both frames.
for frame, position_ids in ((train_df, train_users_ids), (test_df, test_users_ids)):
    frame['user_id'] = position_ids

In [23]:
# Keep an independent copy of the training rows whose `time` value exceeds 0.5.
# NOTE: good_train_df is reassigned with a 0.65 cut-off in the lightfm section
# before it is first used, so this 0.5 version is effectively superseded.
above_threshold = train_df['time'] > 0.5
good_train_df = train_df.loc[above_threshold].copy()

## LightFM

In [24]:
import lightfm
import lightfm.data as ld
import lightfm.evaluation as lv

In [25]:
# Training rows used to build the interactions: only those whose `time` value
# is strictly greater than 0.65, copied so later edits do not touch train_df.
good_train_df = train_df[train_df['time'].gt(0.65)].copy()

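Experiment log by submission version (the number is presumably the `time` threshold used in the cell above — confirm):

- v30: 0.6
- v32: 0.55
- v33: 0.65
- v34: 0.50
- v35: 0.70
- v36: new hyperparams, 0.65
- v37: no_components 40
- v38: no_components 80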

In [26]:
# Register every training user and track with a LightFM Dataset, then build
# the interaction matrix from the filtered (user, track) pairs. The second
# value returned by build_interactions is discarded.
dataset = ld.Dataset()
dataset.fit(users=users, items=tracks)

user_track_pairs = zip(good_train_df['user'], good_train_df['track'])
all_interactions, _ = dataset.build_interactions(user_track_pairs)

   0.941
   
   no_components=50,
    loss='warp',
    learning_rate=0.01,
    max_sampled=50,
    user_alpha=0.0,
    item_alpha=0.0001

In [27]:
# LightFM model trained with the WARP loss on the filtered interactions.
# random_state pins the model's random number generator; without it LightFM
# seeds itself afresh on every run, so a submission could not be reproduced.
# NOTE(review): training with num_threads > 1 may still add some run-to-run
# variation — confirm if exact reproducibility is required.
model = lightfm.LightFM(
    no_components=120,
    loss='warp',
    learning_rate=0.01,
    max_sampled=50,
    user_alpha=0.0,
    item_alpha=0.0001,
    random_state=42,
)

model.fit(all_interactions, epochs=400, verbose=True, num_threads=8)

Epoch: 100%|██████████| 400/400 [16:50<00:00,  2.53s/it]


<lightfm.lightfm.LightFM at 0x1552ebd00>

# Exporting results

In [28]:
# model.predict() takes the model's internal user/item indices, not raw ids.
# test_users_ids already holds each user's position in the sorted `users` list,
# the same sequence that was passed to dataset.fit().
submission_users_ids = np.asarray(test_users_ids, dtype=np.int32)

# Tracks need the same translation: look each raw track id up in the item
# mapping built by dataset.fit(). Passing the raw values straight through is
# only correct when the track ids happen to be exactly 0..n_items-1.
item_id_map = dataset.mapping()[2]
mapped_tracks = test_df['track'].map(item_id_map)
unmapped = mapped_tracks.isna()
if unmapped.any():
    examples = test_df.loc[unmapped, 'track'].unique()[:5].tolist()
    raise ValueError(
        f'{int(unmapped.sum())} test rows reference tracks that are not in the '
        f'training data, e.g. {examples}'
    )
submission_items_ids = mapped_tracks.to_numpy(dtype=np.int32)

In [29]:
# Score every (user, track) pair of the test set with the trained model and
# display the resulting array.
predictions = model.predict(
    user_ids=submission_users_ids,
    item_ids=submission_items_ids,
)
predictions

array([ 1.0187615, -1.0247685, -1.1493014, ..., -2.01624  , -1.1689632,
       -1.6350068], dtype=float32)

In [30]:
# Submission file: the raw user and track identifiers from the test set plus
# the predicted score, written without the DataFrame index.
res_df = test_df[['user', 'track']].assign(score=predictions)
res_df.to_csv('submission_v51.csv', index=False)