In [5]:

import os

import pandas as pd

from utils.data_split import *
from utils.feature_additions import *

# Project directories, spelled once and reused below.
path_to_data = 'data'
path_to_submissions = 'submissions'

# `data_path` is the name the loading code uses; alias it instead of repeating the literal.
data_path = path_to_data

# Every input file is resolved through `data_path`, so moving the data directory
# is a one-line change.
events_df = pd.read_csv(os.path.join(data_path, 'events.csv'))  # [user_id, item_id, rating, timestamp]
user_features_df = pd.read_csv(os.path.join(data_path, 'user_features.csv'))  # [user_id, gender, age]
item_features_df = pd.read_csv(os.path.join(data_path, 'item_features.csv'))

# Per-user train/test split.
# NOTE(review): presumably `test_size=10` holds out 10 interactions per user —
# confirm in utils.data_split.
train_df, test_df = split_data_by_user(events_df, test_size=10)

# The timestamp is not used as a model input, so drop it from both splits.
train_df = train_df.drop(columns=['timestamp'])
test_df = test_df.drop(columns=['timestamp'])

# Aggregated features are derived from the training split only and then reused for
# the test split, so no statistics are computed from held-out rows.
user_features_from_train = get_user_features_from_train(train_df, item_features_df)
item_features_from_train = get_item_features_from_train(train_df, item_features_df)

# Attach the train-derived user/item features to BOTH the train and the test sets.
train_df_with_features = join_features(train_df, user_features_from_train, item_features_from_train)
test_df_with_features = join_features(test_df, user_features_from_train, item_features_from_train)

# `target_col` is a list, so y_train / y_test are single-column DataFrames.
target_col = ['rating']
X_train, y_train = train_df_with_features.drop(columns=target_col), train_df_with_features[target_col]
X_test, y_test = test_df_with_features.drop(columns=target_col), test_df_with_features[target_col]


# LightGBM

In [6]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# Wrap the feature matrices for LightGBM; the test set reuses the training set's
# binning through `reference`.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)

# Model parameters
params = {
    'objective': 'regression',      # 'lambdarank' could be used instead for ranking
    'metric': 'rmse',               # regression metric; NDCG could be used for ranking
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
}

# Train the model
model = lgb.train(params, lgb_train, valid_sets=[lgb_train, lgb_test], num_boost_round=1000)

# Predict on the held-out split to evaluate the model.
# NOTE(review): no early-stopping callback is configured in this cell, so
# `best_iteration` is not expected to truncate the model — confirm.
y_pred_val = model.predict(X_test, num_iteration=model.best_iteration)

# Take the square root of the MSE explicitly: the `squared=False` keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, while this form works on
# every version.
rmse_val = mean_squared_error(y_test, y_pred_val) ** 0.5
print(f"RMSE на валидационных данных: {rmse_val}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016378 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6119
[LightGBM] [Info] Number of data points in the train set: 833749, number of used features: 25
[LightGBM] [Info] Start training from score 3.579519
RMSE на валидационных данных: 0.9459961927251281




In [7]:
import itertools

top_k = 10


def get_top_k_recommendations(user_id, k=top_k, df=test_df):
    """Return the ``k`` best-scored item ids for one user.

    Args:
        user_id: Value matched against the ``user_id`` column of ``df``.
        k: How many items to keep.
        df: Frame with ``user_id``, ``item_id`` and ``predicted_rating`` columns.

    Returns:
        A list of item ids ordered by descending ``predicted_rating``.
    """
    belongs_to_user = df['user_id'] == user_id
    best_rows = df[belongs_to_user].nlargest(k, 'predicted_rating')
    return list(best_rows['item_id'].values)

def make_candidates_generations(data: pd.DataFrame, model=model, model_name: str = 'LightGBM', k=10):
    """Score every (user, item) pair built from ``data`` and keep the top ``k`` items per user.

    The candidate set is the cartesian product of the unique ``user_id`` and
    ``item_id`` values in ``data``. Pairs that already occur in ``data`` are not
    excluded from the candidates.

    Args:
        data: Frame with ``user_id`` and ``item_id`` columns.
        model: Fitted model exposing ``predict`` and ``best_iteration``.
        model_name: Scoring backend selector; only ``'LightGBM'`` is implemented.
        k: Number of items recommended per user.

    Returns:
        DataFrame with one row per user: ``user_id`` and ``recommended_items``
        (a list of item ids ordered by descending predicted rating).

    Raises:
        ValueError: If ``model_name`` is not a supported backend.
    """
    # Fail before building the (potentially huge) candidate frame; otherwise an
    # unknown backend only surfaces later as a KeyError on 'predicted_rating'.
    if model_name != 'LightGBM':
        raise ValueError(f"Unsupported model_name: {model_name!r}")

    users = data['user_id'].unique()
    items = data['item_id'].unique()
    all_combinations = pd.DataFrame(list(itertools.product(users, items)), columns=['user_id', 'item_id'])
    all_combinations_featured = join_features(all_combinations, user_features_from_train, item_features_from_train)

    # The model is scored on the joined frame as-is (id columns included).
    all_combinations_featured['predicted_rating'] = model.predict(
        all_combinations_featured, num_iteration=model.best_iteration
    )

    # Forward `k` explicitly: without it the helper silently falls back to its own default.
    # NOTE(review): the helper filters the whole candidate frame once per user; a
    # groupby would avoid the repeated scans if this becomes a bottleneck.
    user_top_k_recommendations = {
        user_id: get_top_k_recommendations(user_id, k=k, df=all_combinations_featured)
        for user_id in users
    }

    recommendations = pd.DataFrame({
        'user_id': list(user_top_k_recommendations.keys()),
        'recommended_items': list(user_top_k_recommendations.values()),
    })

    return recommendations



In [8]:
# Persist the trained booster next to the notebook.
model.save_model('LightGBM.txt')

# Top-10 candidates per user, built from the users and items seen in the training split.
recommendations_LGBM = make_candidates_generations(train_df, model=model, model_name='LightGBM', k=10)

# Serialise each list of item ids into a single space-separated string.
recommendations_LGBM['recommended_items'] = recommendations_LGBM['recommended_items'].map(
    lambda item_ids: ' '.join(str(item_id) for item_id in item_ids)
)
recommendations_LGBM

Unnamed: 0,user_id,recommended_items
0,0,2649 2627 2243 1575 2111 3387 896 900 3136 1455
1,1,3022 1543 2649 2627 2243 1575 3136 2111 896 900
2,2,3022 1039 1543 1583 2564 2862 1831 472 1223 1560
3,3,3022 1543 1583 1039 2789 2649 2627 2243 3529 1575
4,4,1575 2649 2627 2243 2111 1543 896 900 1455 3136
...,...,...
6035,6035,896 900 1575 2243 2111 2649 2627 1455 3541 3563
6036,6036,1455 1575 2111 896 900 2649 2627 2243 3541 3563
6037,6037,3541 3563 3387 3136 2649 2627 2243 2111 1455 1575
6038,6038,2243 3541 3563 896 900 1575 2111 1455 2649 2627


# Метрика

## Собираем эталонный датасет

In [10]:
import os
from utils.data_split import *
from utils.feature_additions import *

# Reload the raw inputs and split again: this rebinds `train_df` / `test_df` to
# fresh frames straight from the split (the modelling cell dropped `timestamp`
# from the earlier ones).
data_path = 'data'

# Every input file is resolved through `data_path` so the directory is spelled once.
events_df = pd.read_csv(os.path.join(data_path, 'events.csv'))  # [user_id, item_id, rating, timestamp]
user_features_df = pd.read_csv(os.path.join(data_path, 'user_features.csv'))  # [user_id, gender, age]
item_features_df = pd.read_csv(os.path.join(data_path, 'item_features.csv'))

# NOTE(review): presumably `test_size=10` holds out 10 interactions per user —
# confirm in utils.data_split.
train_df, test_df = split_data_by_user(events_df, test_size=10)

In [11]:
# Ground truth for the recall metric: one row per user with the item ids found in
# the held-out split, collected into a list.
dataset_fo_recall = test_df.groupby(['user_id'])['item_id'].apply(list).reset_index(name='last_10_interactions')

# Store each list as one space-separated string so it survives the CSV round trip.
dataset_fo_recall['last_10_interactions'] = dataset_fo_recall['last_10_interactions'].apply(
    lambda item_ids: ' '.join(str(item_id) for item_id in item_ids)
)

# Write under data/: the evaluation cell reads 'data/dataset_to_recall.csv', so
# saving to the working directory would leave it with a missing or stale file.
dataset_fo_recall.to_csv('data/dataset_to_recall.csv', index=False)


In [12]:
from utils.recall_at_k import *
submission_file_path = 'submissions/recommendations_lightGBM.csv'
real_interactions_file_path = 'data/dataset_to_recall.csv'

dataset_for_recall = pd.read_csv(real_interactions_file_path)
submission_df = pd.read_csv(submission_file_path)

# Both files store item ids as one space-separated string per user; turn them back into lists.
submission_df['item_id'] = submission_df['item_id'].apply(lambda x: x.split())
dataset_for_recall['last_10_interactions'] = dataset_for_recall['last_10_interactions'].apply(lambda x: x.split())

# Attach the ground truth by user id rather than by row position: the two files
# are produced independently, so equal row order is not guaranteed.
# NOTE(review): assumes the submission file carries a `user_id` column — confirm.
actual_by_user = dataset_for_recall.set_index('user_id')['last_10_interactions']
submission_df['y_real'] = submission_df['user_id'].map(actual_by_user)

result = recall_at_k_overall(submission_df, actual_col='y_real', predicted_col='item_id')
print(f"Recall@10 = {result:.4f}")

Recall@10 = 0.0138
