In [80]:
import numpy as np
import pandas as pd
import lightfm
from lightfm import data as ld
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBRanker

In [81]:
wsl_train_path = "/mnt/c/Users/denis/PycharmProjects/recsys-course-spring-2025/hw/train.csv"
wsl_test_path = "/mnt/c/Users/denis/PycharmProjects/recsys-course-spring-2025/hw/test.csv"
train_data = pd.read_csv(wsl_train_path)
test_data = pd.read_csv(wsl_test_path)

In [82]:
user_id_map = {id: i for i, id in enumerate(train_data['user'].unique())}
train_data["user_id"] = train_data["user"].map(user_id_map).map(int)
test_data["user_id"] = test_data["user"].map(user_id_map).map(int)

In [83]:
users_id = sorted(set(train_data["user_id"]))
max_track_id = max(train_data['track'].max(), test_data['track'].max())

In [84]:
positives = train_data[train_data["time"] > 0.7].copy()

In [85]:
dataset = ld.Dataset()
dataset.fit(users_id, range(max_track_id + 1))

all_interactions, _ = dataset.build_interactions(positives[['user_id', 'track']].itertuples(index=False, name=None))

In [86]:
model = lightfm.LightFM(
    no_components=300,
    loss='warp',
    learning_rate=0.01,
    max_sampled=90,
    user_alpha=0.0001,
    item_alpha=0.0001
)

In [87]:
model.fit(all_interactions, epochs=400, verbose=True, num_threads=8)    

Epoch: 100%|██████████| 400/400 [14:50<00:00,  2.23s/it]


<lightfm.lightfm.LightFM at 0x7f026fe7a4c0>

In [88]:
train_data["lightfm_score"] = model.predict(train_data["user_id"].values, train_data["track"].values)
test_data["lightfm_score"] = model.predict(test_data["user_id"].values, test_data["track"].values)

In [155]:
tr_d = train_data
tes_d = test_data

In [89]:
# track_popularity = train_data.groupby('track').size().reset_index(name='popularity')
# train_data = train_data.merge(track_popularity, on='track')
# test_data = test_data.merge(track_popularity, on='track', how='left').fillna(0)

In [90]:
SIGNIFICANT_LISTEN_THRESHOLD = 0.3

significant_listens = train_data[train_data['time'] > SIGNIFICANT_LISTEN_THRESHOLD]

track_popularity = significant_listens.groupby('track').size().reset_index(name='popularity')

train_data = train_data.merge(track_popularity, on='track', how='left')
test_data = test_data.merge(track_popularity, on='track', how='left')

train_data['popularity'] = train_data['popularity'].fillna(0)
test_data['popularity'] = test_data['popularity'].fillna(0)


In [144]:
def prepare_data_for_lambdarank(df, train_flag=True):
    # features = ['lightfm_score']
    features = ['lightfm_score', 'popularity']
    X = df[features]
    if train_flag:
        y = df['time']
    else:
        y = None
    groups = df.groupby('user_id').size().values
    return X, y, groups

In [145]:
train_data = train_data.sort_values(by=['user_id', 'time'], ascending=[True, False])
test_data = test_data.sort_values(by=['user_id'])

In [146]:
X_train, y_train, groups_train = prepare_data_for_lambdarank(train_data)
X_test, _, groups_test = prepare_data_for_lambdarank(test_data, train_flag=False)

In [147]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# def to_rank(scores, num_ranks=10):
#     return np.digitize(scores, bins=np.linspace(0, 1, num_ranks+1)[1:-1]).astype(int)
# 
# X_train_ranks = to_rank(X_train_scaled)
# X_test_ranks = to_rank(X_test_scaled)

In [148]:
# y_train_int = (y_train * 30).astype(int)

In [149]:
from sklearn.preprocessing import KBinsDiscretizer

# Разбиваем time на 5-10 дискретных рангов
discretizer = KBinsDiscretizer(n_bins=6, encode='ordinal', strategy='quantile')
y_train_disc = discretizer.fit_transform(y_train.values.reshape(-1, 1)).astype(int)



In [150]:
ranker = XGBRanker(
    tree_method='hist',
    booster='gbtree',
    objective='rank:ndcg',
    learning_rate=0.005,
    n_estimators=800,
    
    # lambdarank_num_pair_per_sample=10,
    # lambdarank_pair_method='topk',
    
    early_stopping_rounds=50,
    random_state=42,
    max_depth=7,
    n_jobs=-1,
    ndcg_exp_gain=False,
)

In [151]:
# ranker.fit(X_train, y_train, group=groups_train, eval_set=[(X_train, y_train)], 
#            eval_group=[groups_train], verbose=True)

ranker.fit(X_train_scaled, y_train_disc, group=groups_train, eval_set=[(X_train_scaled, y_train)], 
           eval_group=[groups_train], verbose=True)

[0]	validation_0-ndcg@32:0.93531
[1]	validation_0-ndcg@32:0.93095
[2]	validation_0-ndcg@32:0.93041
[3]	validation_0-ndcg@32:0.92972
[4]	validation_0-ndcg@32:0.92347
[5]	validation_0-ndcg@32:0.92241
[6]	validation_0-ndcg@32:0.92227
[7]	validation_0-ndcg@32:0.92193
[8]	validation_0-ndcg@32:0.92163
[9]	validation_0-ndcg@32:0.92165
[10]	validation_0-ndcg@32:0.92105
[11]	validation_0-ndcg@32:0.92097
[12]	validation_0-ndcg@32:0.92093
[13]	validation_0-ndcg@32:0.92068
[14]	validation_0-ndcg@32:0.92064
[15]	validation_0-ndcg@32:0.92036
[16]	validation_0-ndcg@32:0.92036
[17]	validation_0-ndcg@32:0.92017
[18]	validation_0-ndcg@32:0.92017
[19]	validation_0-ndcg@32:0.92019
[20]	validation_0-ndcg@32:0.92019
[21]	validation_0-ndcg@32:0.92019
[22]	validation_0-ndcg@32:0.92019
[23]	validation_0-ndcg@32:0.92019
[24]	validation_0-ndcg@32:0.92018
[25]	validation_0-ndcg@32:0.92010
[26]	validation_0-ndcg@32:0.91996
[27]	validation_0-ndcg@32:0.91996
[28]	validation_0-ndcg@32:0.91775
[29]	validation_0-ndcg@3

In [152]:
test_data["score"] = ranker.predict(X_test_scaled)

In [153]:
test_data[["user", "track", "score"]].to_csv("./data/lambda_r/test_lambda_r_7.csv", index=False)

In [154]:
import numpy as np
import pandas as pd
import lightfm
from lightfm import data as ld
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRanker

# ... (предыдущий код остается без изменений)

# Подготовка данных для LambdaRank
def prepare_data_for_lambdarank(df, train_flag=True):
    features = ['lightfm_score']
    X = df[features]
    if train_flag:
        # Преобразуем 'time' в дискретные ранги
        y = pd.qcut(df['time'], q=5, labels=False)
    else:
        y = None
    groups = df.groupby('user_id').size().values
    return X, y, groups

X_train, y_train, groups_train = prepare_data_for_lambdarank(train_data)
X_test, _, groups_test = prepare_data_for_lambdarank(test_data, train_flag=False)

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

ranker = XGBRanker(
    tree_method='hist',
    booster='gbtree',
    objective='rank:ndcg',
    learning_rate=0.1,
    early_stopping_rounds=50,
    random_state=42,
    max_depth=6,
    n_estimators=100,
    n_jobs=-1,
)

ranker.fit(X_train_scaled, y_train, group=groups_train, eval_set=[(X_train_scaled, y_train)], 
           eval_group=[groups_train], verbose=True)

# Предсказание и сохранение результатов
test_data["final_score"] = ranker.predict(X_test_scaled)

# Сохранение результатов
test_data[["user", "track", "final_score"]].to_csv("./data/lambda_r/test_lambda_r_1.csv", index=False)


ValueError: Bin edges must be unique: array([0.        , 0.        , 0.04      , 0.3       , 0.66666667,
       1.        ]).
You can drop duplicate edges by setting the 'duplicates' kwarg