In [1]:
!pip install lightgbm



In [2]:
import os
from data_split import *
from feature_additions import *

data_path = 'data'
events_df = pd.read_csv('data/events.csv')  # [user_id, item_id, rating, timestamp]

user_features_df = pd.read_csv(os.path.join(data_path,'user_features.csv')) # [user_id, gender, age]
item_features_df = pd.read_csv('./data/item_features.csv')

train_df, val_df, test_df = split_data_by_user(events_df)

user_features_from_train = get_user_features_from_train(train_df, item_features_df)
item_features_from_train = get_item_features_from_train(train_df, item_features_df)

# Присоединение признаков к тестовому набору данных
train_df_with_features = join_features(train_df, user_features_from_train, item_features_from_train)
test_df_with_features = join_features(test_df, user_features_from_train, item_features_from_train)
val_df_with_features = join_features(val_df, user_features_from_train, item_features_from_train)


target_col = ['rating']
X_train, y_train = train_df_with_features.drop(target_col, axis = 1), train_df_with_features[target_col]
X_test, y_test = test_df_with_features.drop(target_col, axis = 1), test_df_with_features[target_col]
X_val, y_val = val_df_with_features.drop(target_col, axis = 1), val_df_with_features[target_col]

In [3]:
import lightgbm as lgb

lgb_train = lgb.Dataset(X_train, y_train)
lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

# Параметры модели
params = {
    'objective': 'regression',      # Можно использовать 'lambdarank' для ранжирования
    'metric': 'rmse',               # Метрика регрессии, для ранжирования можно использовать NDCG
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
}

# Обучение модели
model = lgb.train(params, lgb_train, valid_sets=[lgb_train, lgb_val], num_boost_round=1000)

# Предсказание на валидации для оценки
y_pred_val = model.predict(X_val, num_iteration=model.best_iteration)

from sklearn.metrics import mean_squared_error
rmse_val = mean_squared_error(y_val, y_pred_val, squared=False)
print(f"RMSE на валидационных данных: {rmse_val}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025449 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6120
[LightGBM] [Info] Number of data points in the train set: 882069, number of used features: 25
[LightGBM] [Info] Start training from score 3.580824
RMSE на валидационных данных: 0.9402131023666197




In [None]:
import itertools

top_k = 10

def get_top_k_recommendations(user_id, k=top_k, df=test_df):
    user_predictions = df[df['user_id'] == user_id]
    top_k_items = user_predictions.nlargest(k, 'predicted_rating')['item_id'].values
    return list(top_k_items)

def make_candidates_generations(data: pd.DataFrame,model = model,  model_name : str = 'LightGBM', k = 10):
    users = data['user_id'].unique()
    items = data['item_id'].unique()
    all_combinations = pd.DataFrame(list(itertools.product(users, items)), columns=['user_id', 'item_id'])
    all_combinations_featured = join_features(all_combinations, user_features_from_train, item_features_from_train)
    if model_name == 'LightGBM':
        all_combinations_featured['predicted_rating'] = model.predict(all_combinations_featured, num_iteration=model.best_iteration)
    user_top_k_recommendations = {user_id: get_top_k_recommendations(user_id, df=all_combinations_featured) for user_id in users}

    recommendations = pd.DataFrame({
    'user_id': user_top_k_recommendations.keys(),
    'recommended_items': user_top_k_recommendations.values()
    })

    return recommendations

recommendations_LGBM = make_candidates_generations(train_df, model = model, model_name='LightGBM', k = 10)

In [5]:

# def get_top_k_recommendations(user_id, k=top_k, df=test_df):
#     user_predictions = df[df['user_id'] == user_id]
#     top_k_items = user_predictions.nlargest(k, 'predicted_rating')['item_id'].values
#     return list(top_k_items)
# users = test_df['user_id'].unique()
# items = test_df['item_id'].unique()

# all_combinations_test = pd.DataFrame(list(itertools.product(users, items)), columns=['user_id', 'item_id'])
# all_combinations_test = join_features(all_combinations_test, user_features_from_train, item_features_from_train)
# # X_test, y_test = all_combinations_test.drop(target_col, axis = 1), all_combinations_test[target_col]
# X_test = all_combinations_test
# all_combinations_test['predicted_rating'] = model.predict(X_test, num_iteration=model.best_iteration)

# # Для каждого пользователя выбираем топ-10 фильмов в тестовом наборе
# user_top_k_recommendations_test = {user_id: get_top_k_recommendations(user_id, df=all_combinations_test) for user_id in test_df.user_id}

# # Преобразуем результат в DataFrame для теста
# recommendations_test = pd.DataFrame({
#     'user_id': user_top_k_recommendations_test.keys(),
#     'recommended_items': user_top_k_recommendations_test.values()
# })


In [9]:
test_df = test_df.reset_index()
recommendations_test['true'] = test_df['item_id'].apply(lambda x: [x])

In [12]:
import numpy as np
recommendations_test['recommended_items'] = recommendations_test['recommended_items'].apply(lambda x: np.array(x))
recommendations_test['true'] = recommendations_test['true'].apply(lambda x: np.array(x))



In [13]:
from recall_at_k import *
recall_at_k(recommendations_test, k = 10, y_test = 'true', y_pred = 'recommended_items')

ValueError: setting an array element with a sequence.

In [None]:
recommendations_test['recommended_items'] = recommendations_test['recommended_items'].apply(lambda x: [str(a) for a in x]).apply(lambda x: ' '.join(x))
recommendations_test.to_csv('recommendations_lightGBM.csv', index=False)

In [36]:
recommendations_test['true']

0       3360
1        584
2       2625
3       3344
4       1868
        ... 
6035    3216
6036    1583
6037    1040
6038    2354
6039    1177
Name: true, Length: 6040, dtype: int64

# BERT

In [14]:
import pandas as pd
data_BERT = pd.read_csv('recommendations_BERT.csv')


Unnamed: 0,score,user_id,item_id
0,1.46917,3,1560
1,1.349339,3,3529
2,1.271518,3,2327
3,1.225739,3,2784
4,1.182122,3,494


In [40]:
import itertools
top_k = 10
def get_top_k_recommendations(user_id, k=top_k, df=test_df):
    user_predictions = df[df['user_id'] == user_id]
    top_k_items = user_predictions.nlargest(k, 'score')['item_id'].values
    return list(top_k_items)
users = data_BERT['user_id'].unique()
items = data_BERT['item_id'].unique()


# Для каждого пользователя выбираем топ-10 фильмов в тестовом наборе
user_top_k_recommendations_BERT = {user_id: get_top_k_recommendations(user_id, df=data_BERT) for user_id in data_BERT.user_id.unique()}

# Преобразуем результат в DataFrame для теста
recommendations_BERT = pd.DataFrame({
    'user_id': user_top_k_recommendations_BERT.keys(),
    'item_id': user_top_k_recommendations_BERT.values()
})
recommendations_BERT


Unnamed: 0,user_id,item_id
0,3,"[1543, 1223, 1560, 3409, 1315, 132, 3529, 1505..."
1,4,"[2528, 1337, 1560, 2297, 1583, 1039, 3409, 131..."
2,18,"[1583, 1039, 2297, 1543, 3529, 1337, 1560, 181..."
3,22,"[1543, 2528, 3409, 1560, 1315, 132, 1505, 3529..."
4,24,"[1583, 2297, 1811, 1039, 1337, 1560, 2528, 584..."
...,...,...
6035,6007,"[2528, 1583, 1560, 2297, 1831, 132, 2327, 1505..."
6036,6014,"[1543, 1039, 132, 3409, 2327, 1505, 494, 1086,..."
6037,6016,"[472, 1583, 1543, 1811, 1223, 1039, 1337, 1560..."
6038,6023,"[472, 1583, 1543, 3529, 2297, 1223, 1039, 1811..."


In [41]:
data_BERT[data_BERT.score == data_BERT[data_BERT.user_id == 0]['score'].max()]

Unnamed: 0,score,user_id,item_id
68639,1.64473,0,1543


In [42]:

recommendations_BERT = recommendations_BERT.sort_values(by = 'user_id')
recommendations_BERT['item_id'] = recommendations_BERT['item_id'].apply(lambda x: [str(a) for a in x]).apply(lambda x: ' '.join(x))
recommendations_BERT

Unnamed: 0,user_id,item_id
516,0,1543 472 1811 2784 1086 494 3151 2688 790 1668
2057,1,1039 2528 1315 1811 1505 132 2327 3529 169 3677
2058,2,1543 2528 1560 1039 132 3409 1315 1831 2327 1223
0,3,1543 1223 1560 3409 1315 132 3529 1505 2327 169
1,4,2528 1337 1560 2297 1583 1039 3409 1315 132 2327
...,...,...
4076,6035,1543 1337 2528 132 1811 2327 3529 1315 3409 584
4526,6036,1583 1039 1811 2297 1337 3529 1560 2528 584 1831
4527,6037,1543 1337 472 1560 1583 2528 2297 1315 1811 2327
3548,6038,1543 1583 1337 1315 494 640 476 523 2175 79


In [43]:
recommendations_BERT.to_csv('recommendations_BERT.csv', index=False)

In [44]:
recommendations_BERT.user_id.unique().shape

(6040,)

In [36]:
train_df.user_id.unique().shape

(6040,)