In [1]:

import os
from data_split import *
from feature_additions import *

path_to_data = 'data'
path_to_submissions = 'submissions'

data_path = 'data'
events_df = pd.read_csv('data/events.csv')  # [user_id, item_id, rating, timestamp]

user_features_df = pd.read_csv(os.path.join(data_path,'user_features.csv')) # [user_id, gender, age]
item_features_df = pd.read_csv('./data/item_features.csv')

train_df, val_df, test_df = split_data_by_user(events_df, test_size=10)

user_features_from_train = get_user_features_from_train(train_df, item_features_df)
item_features_from_train = get_item_features_from_train(train_df, item_features_df)

# Присоединение признаков к тестовому набору данных
train_df_with_features = join_features(train_df, user_features_from_train, item_features_from_train)
test_df_with_features = join_features(test_df, user_features_from_train, item_features_from_train)
val_df_with_features = join_features(val_df, user_features_from_train, item_features_from_train)


target_col = ['rating']
X_train, y_train = train_df_with_features.drop(target_col, axis = 1), train_df_with_features[target_col]
X_test, y_test = test_df_with_features.drop(target_col, axis = 1), test_df_with_features[target_col]
X_val, y_val = val_df_with_features.drop(target_col, axis = 1), val_df_with_features[target_col]

# Light GBM

In [2]:
import lightgbm as lgb

lgb_train = lgb.Dataset(X_train, y_train)
lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

# Параметры модели
params = {
    'objective': 'regression',      # Можно использовать 'lambdarank' для ранжирования
    'metric': 'rmse',               # Метрика регрессии, для ранжирования можно использовать NDCG
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
}

# Обучение модели
model = lgb.train(params, lgb_train, valid_sets=[lgb_train, lgb_val], num_boost_round=1000)

# Предсказание на валидации для оценки
y_pred_val = model.predict(X_val, num_iteration=model.best_iteration)

from sklearn.metrics import mean_squared_error
rmse_val = mean_squared_error(y_val, y_pred_val, squared=False)
print(f"RMSE на валидационных данных: {rmse_val}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019205 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6121
[LightGBM] [Info] Number of data points in the train set: 827709, number of used features: 25
[LightGBM] [Info] Start training from score 3.579057
RMSE на валидационных данных: 0.9378183238623738




In [3]:
import itertools

top_k = 10

def get_top_k_recommendations(user_id, k=top_k, df=test_df):
    user_predictions = df[df['user_id'] == user_id]
    top_k_items = user_predictions.nlargest(k, 'predicted_rating')['item_id'].values
    return list(top_k_items)

def make_candidates_generations(data: pd.DataFrame,model = model,  model_name : str = 'LightGBM', k = 10):
    users = data['user_id'].unique()
    items = data['item_id'].unique()
    all_combinations = pd.DataFrame(list(itertools.product(users, items)), columns=['user_id', 'item_id'])
    all_combinations_featured = join_features(all_combinations, user_features_from_train, item_features_from_train)
    if model_name == 'LightGBM':
        all_combinations_featured['predicted_rating'] = model.predict(all_combinations_featured, num_iteration=model.best_iteration)
    user_top_k_recommendations = {user_id: get_top_k_recommendations(user_id, df=all_combinations_featured) for user_id in users}

    recommendations = pd.DataFrame({
    'user_id': user_top_k_recommendations.keys(),
    'recommended_items': user_top_k_recommendations.values()
    })

    return recommendations



In [9]:
model.save_model('LightGBM.txt')
recommendations_LGBM = make_candidates_generations(train_df, model = model, model_name='LightGBM', k = 10)
recommendations_LGBM['item_id'] = recommendations_LGBM['item_id'].apply(lambda x: [str(a) for a in x]).apply(lambda x: ' '.join(x))
recommendations_LGBM

KeyError: 'item_id'

# BERT

In [8]:
import pandas as pd
data_BERT = pd.read_csv('output.csv')


FileNotFoundError: [Errno 2] No such file or directory: 'output.csv'

In [9]:
import itertools
top_k = 10
def get_top_k_recommendations(user_id, k=top_k, df=test_df):
    user_predictions = df[df['user_id'] == user_id]
    top_k_items = user_predictions.nlargest(k, 'score')['item_id'].values
    return list(top_k_items)
users = data_BERT['user_id'].unique()
items = data_BERT['item_id'].unique()


# Для каждого пользователя выбираем топ-10 фильмов в тестовом наборе
user_top_k_recommendations_BERT = {user_id: get_top_k_recommendations(user_id, df=data_BERT) for user_id in data_BERT.user_id.unique()}

# Преобразуем результат в DataFrame для теста
recommendations_BERT = pd.DataFrame({
    'user_id': user_top_k_recommendations_BERT.keys(),
    'item_id': user_top_k_recommendations_BERT.values()
})
recommendations_BERT


NameError: name 'data_BERT' is not defined

In [10]:
data_BERT[data_BERT.score == data_BERT[data_BERT.user_id == 0]['score'].max()]

NameError: name 'data_BERT' is not defined

In [11]:

recommendations_BERT = recommendations_BERT.sort_values(by = 'user_id')
recommendations_BERT['item_id'] = recommendations_BERT['item_id'].apply(lambda x: [str(a) for a in x]).apply(lambda x: ' '.join(x))
recommendations_BERT

NameError: name 'recommendations_BERT' is not defined

In [12]:
recommendations_BERT.to_csv('recommendations_BERT_2.csv', index=False)

NameError: name 'recommendations_BERT' is not defined

In [13]:
recommendations_BERT.user_id.unique().shape

NameError: name 'recommendations_BERT' is not defined

In [14]:
train_df.user_id.unique().shape

(6040,)

# Метрика

## Собираем датасет эталон

In [25]:
import os
from data_split import *
from feature_additions import *

data_path = 'data'
events_df = pd.read_csv('data/events.csv')  # [user_id, item_id, rating, timestamp]

user_features_df = pd.read_csv(os.path.join(data_path,'user_features.csv')) # [user_id, gender, age]
item_features_df = pd.read_csv('./data/item_features.csv')

train_df, val_df, test_df = split_data_by_user(events_df, test_size=10)

In [26]:
dataset_fo_recall = test_df.groupby(['user_id'])['item_id'].apply(list).reset_index(name='last_10_interactions')
dataset_fo_recall['last_10_interactions'] = dataset_fo_recall['last_10_interactions'].apply(lambda x: [str(a) for a in x]).apply(lambda x: ' '.join(x))
dataset_fo_recall.to_csv('dataset_to_recall.csv', index = False)


## Cобираем датасет с сабмитом

In [27]:
import pandas as pd

submission_file_path = 'recommendations_BERT_2.csv'
real_interactions_file_path = 'dataset_to_recall.csv'

submission_df = pd.read_csv(submission_file_path)
submission_df['item_id'] = submission_df['item_id'].apply(lambda x: x.split())
real_interactions_df = pd.read_csv(real_interactions_file_path)
real_interactions_df['last_10_interactions'] = real_interactions_df['last_10_interactions'].apply(lambda x: x[1:-1].split())

submission_df['y_real']  = real_interactions_df['last_10_interactions']



In [28]:
from recall_at_k import *
print(recall_at_k_overall(df = submission_df, actual_col='y_real', predicted_col='item_id'))
# print(recall_at_k_mean(df = submission_df, actual_col='y_real', predicted_col='item_id'))


0.020047716879846245
0.020042310522442975
