In [1]:
!pip install lightgbm



In [2]:
import os
from data_split import *
from feature_additions import *

data_path = 'data'
events_df = pd.read_csv('data/events.csv')  # [user_id, item_id, rating, timestamp]
user_features_df = pd.read_csv(os.path.join(data_path,'user_features.csv')) # [user_id, gender, age]
item_features_df = pd.read_csv('./data/item_features.csv')

train_df, val_df, test_df = split_data_by_user(events_df)

user_features_from_train = get_user_features_from_train(train_df, item_features_df)
item_features_from_train = get_item_features_from_train(train_df, item_features_df)

# Присоединение признаков к тестовому набору данных
train_df_with_features = join_features_to_test(train_df, user_features_from_train, item_features_from_train)
test_df_with_features = join_features_to_test(test_df, user_features_from_train, item_features_from_train)
val_df_with_features = join_features_to_test(val_df, user_features_from_train, item_features_from_train)


target_col = ['rating']
X_train, y_train = train_df_with_features.drop(target_col, axis = 1), train_df_with_features[target_col]
X_test, y_test = test_df_with_features.drop(target_col, axis = 1), test_df_with_features[target_col]
X_val, y_val = val_df_with_features.drop(target_col, axis = 1), val_df_with_features[target_col]



In [8]:
import lightgbm as lgb

lgb_train = lgb.Dataset(X_train, y_train)
lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

# Параметры модели
params = {
    'objective': 'regression',      # Можно использовать 'lambdarank' для ранжирования
    'metric': 'rmse',               # Метрика регрессии, для ранжирования можно использовать NDCG
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
}

# Обучение модели
model = lgb.train(params, lgb_train, valid_sets=[lgb_train, lgb_val], num_boost_round=1000)

# Предсказание на валидации для оценки
y_pred_val = model.predict(X_val, num_iteration=model.best_iteration)

from sklearn.metrics import mean_squared_error
rmse_val = mean_squared_error(y_val, y_pred_val, squared=False)
print(f"RMSE на валидационных данных: {rmse_val}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015142 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6374
[LightGBM] [Info] Number of data points in the train set: 857909, number of used features: 26
[LightGBM] [Info] Start training from score 3.580388
RMSE на валидационных данных: 0.9336433451801084




In [15]:
import itertools
top_k = 10
def get_top_k_recommendations(user_id, k=top_k, df=test_df):
    user_predictions = df[df['user_id'] == user_id]
    top_k_items = user_predictions.nlargest(k, 'predicted_rating')['item_id'].values
    return list(top_k_items)
users = test_df['user_id'].unique()
items = test_df['item_id'].unique()

all_combinations_test = pd.DataFrame(list(itertools.product(users, items)), columns=['user_id', 'item_id'])
all_combinations_test = join_features_to_test(all_combinations_test, user_features_from_train, item_features_from_train)
X_test, y_test = test_df_with_features.drop(target_col, axis = 1), test_df_with_features[target_col]
test_df['predicted_rating'] = model.predict(X_test, num_iteration=model.best_iteration)

# Для каждого пользователя выбираем топ-10 фильмов в тестовом наборе
user_top_k_recommendations_test = {user_id: get_top_k_recommendations(user_id, df=test_df) for user_id in test_df.user_id}

# Преобразуем результат в DataFrame для теста
recommendations_test = pd.DataFrame({
    'user_id': user_top_k_recommendations_test.keys(),
    'recommended_items': user_top_k_recommendations_test.values()
})

# Сохраняем результат для тестового набора
recommendations_test.to_csv('recommendations_test.csv', index=False)
