In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import random

# Seed both global random generators (stdlib `random` and NumPy's legacy global
# RNG) with the same value so stochastic steps repeat between runs.
for seed_rng in (random.seed, np.random.seed):
    seed_rng(42)

### Распакуем данные в отдельную папку

In [2]:
import os
import zipfile

# Competition archive and the directory it gets unpacked into.
zip_filepath = './hse-rec-sys-challenge-2024.zip'
destination_folder = 'data'

# Create the target directory if it is missing, then extract every archive member.
os.makedirs(destination_folder, exist_ok=True)
with zipfile.ZipFile(zip_filepath, mode='r') as archive:
    archive.extractall(destination_folder)

### Посмотрим на содержимое

In [3]:
# Load the events table (user_id, item_id, rating, timestamp per the preview below)
# and show its first rows.
events_df = pd.read_csv('./data/events.csv')
events_df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,1505,4,0
1,0,3669,3,1
2,0,584,4,2
3,0,3390,3,3
4,0,2885,4,4


In [4]:
events_df[events_df['user_id']==0]

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,1505,4,0
1,0,3669,3,1
2,0,584,4,2
3,0,3390,3,3
4,0,2885,4,4
...,...,...,...,...
282,0,1124,3,321
283,0,1809,4,322
284,0,3602,4,323
285,0,2924,3,325


In [5]:
# Load the per-item table (item_id plus genre_0..genre_17 flags per the preview below)
# and show its first rows.
item_features_df = pd.read_csv('./data/item_features.csv')
item_features_df.head()

Unnamed: 0,item_id,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17
0,0,0,1,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0
1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
4,4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [14]:
from sklearn.preprocessing import LabelEncoder
# Encoder used to turn the `gender` column into integer codes.
le = LabelEncoder()

user_features_df = pd.read_csv('./data/user_features.csv')
# Overwrite the raw gender labels with the codes produced by the encoder.
user_features_df['gender'] = le.fit_transform(user_features_df['gender'])
user_features_df.head()

Unnamed: 0,user_id,gender,age
0,4855,0,1
1,4065,1,56
2,3331,1,25
3,5373,1,45
4,2032,1,25


In [15]:
# Load the sample submission; per the preview below each row holds a user_id and a
# space-separated string of item ids.
submission_sample_df = pd.read_csv('./data/submission_sample.csv')
submission_sample_df.head()

Unnamed: 0,user_id,item_id
0,0,0 1 2 3 4 5 6 7 8 9
1,1,0 1 2 3 4 5 6 7 8 9
2,2,0 1 2 3 4 5 6 7 8 9
3,3,0 1 2 3 4 5 6 7 8 9
4,4,0 1 2 3 4 5 6 7 8 9


In [16]:
submission_sample_df.item_id

0       0 1 2 3 4 5 6 7 8 9
1       0 1 2 3 4 5 6 7 8 9
2       0 1 2 3 4 5 6 7 8 9
3       0 1 2 3 4 5 6 7 8 9
4       0 1 2 3 4 5 6 7 8 9
               ...         
6035    0 1 2 3 4 5 6 7 8 9
6036    0 1 2 3 4 5 6 7 8 9
6037    0 1 2 3 4 5 6 7 8 9
6038    0 1 2 3 4 5 6 7 8 9
6039    0 1 2 3 4 5 6 7 8 9
Name: item_id, Length: 6040, dtype: object

### Разобьем выборку на тренировочную, валидационную и тестовую

In [17]:
from data_split import *
# Split the events into train and test parts. `split_data_by_user` is not defined in
# this notebook (presumably it comes from the `data_split` star import — verify).
# NOTE(review): the recorded output has 6040 test rows, which matches the number of
# users in the sample submission — presumably one held-out event per user; confirm.
train_df, test_df = split_data_by_user(events_df)

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

Train shape: (888109, 4)
Test shape: (6040, 4)


In [18]:
train_df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
285,0,2924,3,325
284,0,3602,4,323
283,0,1809,4,322
282,0,1124,3,321
281,0,1792,4,320


In [19]:
# Downcast integer columns to int16 to save memory, but only when every value in
# BOTH frames fits into int16: astype('int16') performs no overflow check, so an
# out-of-range id or timestamp would be silently corrupted instead of raising.
int16_limits = np.iinfo(np.int16)
for column in train_df.columns:
    if train_df[column].dtype not in ['int64', 'int32']:
        continue
    lowest = min(train_df[column].min(), test_df[column].min())
    highest = max(train_df[column].max(), test_df[column].max())
    if int16_limits.min <= lowest and highest <= int16_limits.max:
        train_df[column] = train_df[column].astype('int16')
        test_df[column] = test_df[column].astype('int16')

### Негативное сэмплирование

In [26]:
from negative_sampling import *

In [27]:
# Применение функции негативного сэмплирования
negative_samples_df = negative_sampling(train_df)
negative_samples_df

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,1358,0,2311
1,0,1410,0,2311
2,0,3197,0,2311
3,0,898,0,2311
4,0,484,0,2311
...,...,...,...,...
888104,6039,5,0,2311
888105,6039,2290,0,2311
888106,6039,1837,0,2311
888107,6039,1635,0,2311


In [29]:
# Stack the real interactions and the sampled negatives into one frame.
combined_train_df = pd.concat([train_df, negative_samples_df], ignore_index=True)
# Shuffle the rows (fixed seed) so the negatives are not all at the end of the frame.
# NOTE(review): `combined_train_df` is not referenced again in the visible notebook;
# the feature/model cells below work from `train_df` — confirm this is intended.
combined_train_df = combined_train_df.sample(frac=1, random_state=42).reset_index(drop=True)
combined_train_df

Unnamed: 0,user_id,item_id,rating,timestamp
0,1522,3601,0,2311
1,2543,3068,0,2311
2,1337,2868,0,2311
3,3155,3048,0,2311
4,1453,3156,0,2311
...,...,...,...,...
1776213,1756,1640,3,37
1776214,3548,3296,0,2311
1776215,880,1809,4,286
1776216,4508,578,3,96


### Добавляем фичи из двух других таблиц

In [11]:
from feature_additions import *

# Build user-level and item-level feature tables from the training interactions
# (helper functions are presumably provided by the `feature_additions` star import).
user_features_from_train = get_user_features_from_train(train_df, item_features_df)
item_features_from_train = get_item_features_from_train(train_df, item_features_df)

# Join those features onto BOTH the train and the test interactions.
# NOTE(review): the join uses `train_df`, not `combined_train_df`, so the sampled
# negatives are not part of `featured_train_df` — confirm this is intended.
featured_train_df = join_features(train_df, user_features_from_train, item_features_from_train)
featured_test_df = join_features(test_df, user_features_from_train, item_features_from_train)

# Preview the featured TRAIN frame.
featured_train_df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,user_avg_rating,user_num_ratings,user_avg_rating_genre_0,user_avg_rating_genre_1,user_avg_rating_genre_2,user_avg_rating_genre_3,...,user_avg_rating_genre_11,user_avg_rating_genre_12,user_avg_rating_genre_13,user_avg_rating_genre_14,user_avg_rating_genre_15,user_avg_rating_genre_16,user_avg_rating_genre_17,num_genres,movie_avg_rating,movie_num_ratings
0,0,2924,3,325,3.986014,286,0.328671,0.213287,0.090909,0.15035,...,0.062937,0.045455,0.188811,0.164336,0.227273,0.059441,0.013986,1,2.708995,189.0
1,0,3602,4,323,3.986014,286,0.328671,0.213287,0.090909,0.15035,...,0.062937,0.045455,0.188811,0.164336,0.227273,0.059441,0.013986,4,2.735499,431.0
2,0,1809,4,322,3.986014,286,0.328671,0.213287,0.090909,0.15035,...,0.062937,0.045455,0.188811,0.164336,0.227273,0.059441,0.013986,2,3.549116,1018.0
3,0,1124,3,321,3.986014,286,0.328671,0.213287,0.090909,0.15035,...,0.062937,0.045455,0.188811,0.164336,0.227273,0.059441,0.013986,2,3.49497,497.0
4,0,1792,4,320,3.986014,286,0.328671,0.213287,0.090909,0.15035,...,0.062937,0.045455,0.188811,0.164336,0.227273,0.059441,0.013986,1,3.677316,626.0


### Функции для классификации фильмов (выставление оценок)

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from models.models_to_generate_candidates import *

# Candidate classifiers keyed by a display name.
classifiers = {
    # max_iter is raised above scikit-learn's default of 100: the recorded training
    # run stopped with "TOTAL NO. of ITERATIONS REACHED LIMIT" before the solver
    # converged. Scaling the features is the other remedy the warning suggests.
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
}


In [13]:
# Training took about 32 minutes on the author's machine.
# NOTE(review): the recorded output of this cell ends with KeyboardInterrupt while
# GradientBoosting was training, so in that run `trained_models` was not assigned here.
trained_models = train_models(featured_train_df, 'rating', classifiers)


Training LogisticRegression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Training GradientBoosting


KeyboardInterrupt: 

In [52]:
import pickle


# Persist the fitted models and immediately read them back from the same file.
# NOTE(review): only unpickle files you produced yourself — pickle.load can execute
# arbitrary code embedded in the file.
with open('trained_models.pkl', mode='wb') as dump_file:
    pickle.dump(trained_models, dump_file, protocol=pickle.HIGHEST_PROTOCOL)

with open('trained_models.pkl', mode='rb') as load_file:
    trained_models = pickle.load(load_file)

### Создадим датасет для фильмов, которые не были просмотрены пользователем (таких пар нет в обучающей выборке)

In [20]:
def create_non_seen_films_dataset(df):
    """Return every (user_id, item_id) pair that does NOT occur in ``df``.

    The candidate set is the Cartesian product of the distinct users and the
    distinct items found in ``df``; pairs already present in ``df`` are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id`` and ``item_id`` columns; other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id`` and ``item_id``, one row per unseen pair. The index keeps
        the row's position in the full product (it is not reset after filtering).
    """
    # Only the two key columns matter; de-duplicating them keeps the lookup small.
    seen_pairs = df[['user_id', 'item_id']].drop_duplicates()

    unique_users = seen_pairs["user_id"].unique()
    unique_items = seen_pairs["item_id"].unique()

    # Build the product from the UNIQUE items. Using the raw item column here would
    # repeat every item once per interaction, inflating the product by orders of
    # magnitude and emitting duplicate unseen pairs.
    all_pairs = pd.MultiIndex.from_product(
        [unique_users, unique_items], names=["user_id", "item_id"]
    )

    # Anti-join: keep only the pairs that are absent from the input.
    is_unseen = ~all_pairs.isin(pd.MultiIndex.from_frame(seen_pairs))
    non_seen_films_df = all_pairs.to_frame(index=False)[is_unseen]

    return non_seen_films_df

In [23]:
# Downcast integer columns to int16 to save memory, but only when every value in
# BOTH frames fits into int16: astype('int16') performs no overflow check, so an
# out-of-range id or count would be silently corrupted instead of raising.
int16_limits = np.iinfo(np.int16)
for column in featured_train_df.columns:
    if featured_train_df[column].dtype not in ['int64', 'int32']:
        continue
    lowest = min(featured_train_df[column].min(), featured_test_df[column].min())
    highest = max(featured_train_df[column].max(), featured_test_df[column].max())
    if int16_limits.min <= lowest and highest <= int16_limits.max:
        featured_train_df[column] = featured_train_df[column].astype('int16')
        featured_test_df[column] = featured_test_df[column].astype('int16')

In [None]:
# Enumerate the (user, item) pairs that are absent from the featured training frame.
non_seen_films_df = create_non_seen_films_dataset(featured_train_df)

In [None]:
non_seen_films_df

### Теперь воспользуемся нашими обученными модельками, чтобы предсказать рейтинг непросмотренных фильмов

In [None]:
trained_models

In [None]:
# One prediction result per trained model, in the iteration order of `trained_models`.
predictions = [
    make_predictions(non_seen_films_df, trained_models[model])
    for model in trained_models
]

In [24]:
# Work on a copy of the key columns so adding prediction columns does not touch
# `non_seen_films_df`.
non_seen_films_with_predictions = non_seen_films_df[['user_id', 'item_id']].copy()

In [26]:
# Add one column per model, named after the model. This relies on `predictions`
# being ordered like the iteration order of `trained_models`.
for i, model in enumerate(trained_models):
    non_seen_films_with_predictions[model] = predictions[i]

In [None]:
non_seen_films_with_predictions.head()

In [None]:
non_seen_films_with_predictions.shape

### Теперь добавим фичи другими моделями

In [None]:
# Size the matrix by the largest id + 1, not by the number of distinct ids: rows
# and columns are addressed by the raw ids below, so any gap in the id range (e.g.
# an item that never appears in train) would otherwise raise IndexError.
n_users = int(featured_train_df['user_id'].max()) + 1
n_items = int(featured_train_df['item_id'].max()) + 1

# Dense user x item matrix of training ratings; 0 means "no rating in train".
# A single fancy-indexed assignment replaces the row-by-row itertuples loop.
user_item_matrix = np.zeros((n_users, n_items))
user_item_matrix[
    featured_train_df['user_id'].to_numpy(),
    featured_train_df['item_id'].to_numpy(),
] = featured_train_df['rating'].to_numpy()
user_item_matrix

In [None]:
def predict_top(user_item_matrix):
    """Score every (user, item) pair by the item's popularity (the "Top" model).

    Popularity is the number of users with a positive entry for the item, min-max
    scaled to the range [0, 5]. Every user receives the same score for an item.

    Parameters
    ----------
    user_item_matrix : array-like of shape (num_users, num_items)
        Rating matrix in which values > 0 mark an existing rating.

    Returns
    -------
    pandas.DataFrame
        Columns ``user``, ``item``, ``topn_score`` with one row per matrix cell,
        ordered by user and then by item.
    """
    user_item_matrix = np.asarray(user_item_matrix)
    num_users, num_items = user_item_matrix.shape

    # Number of users who rated each item.
    popularity = np.sum(user_item_matrix > 0, axis=0)

    # Min-max normalisation to [0, 5]. When all items are equally popular (or there
    # are no items) the range is zero; use 0 for every item instead of dividing 0/0.
    popularity_range = (popularity.max() - popularity.min()) if num_items else 0
    if popularity_range > 0:
        normalized_popularity = 5 * (popularity - popularity.min()) / popularity_range
    else:
        normalized_popularity = np.zeros(num_items, dtype=float)

    # Build the long-format frame with array operations instead of appending one
    # Python dict per cell; this also removes the need for a progress bar.
    return pd.DataFrame({
        'user': np.repeat(np.arange(num_users), num_items),
        'item': np.tile(np.arange(num_items), num_users),
        'topn_score': np.tile(normalized_popularity, num_users),
    })

In [None]:
# Popularity-based scores for every cell of the user-item matrix.
topn_scores = predict_top(user_item_matrix)
topn_scores.head()

In [None]:
def predict_lastn(user_item_matrix):
    """Generate LastN-model predictions in ``user | item | lastn_score`` format.

    For each user that has at least one positive entry, the rating of the user's
    "last" rated item is used as the predicted score for every item.

    NOTE(review): "last" here means the rated item with the HIGHEST column index.
    The matrix carries no timestamps, so this is not the chronologically last
    interaction — confirm that this is the intended definition.

    Parameters
    ----------
    user_item_matrix : array-like of shape (num_users, num_items)
        Rating matrix in which values > 0 mark an existing rating.

    Returns
    -------
    pandas.DataFrame
        Columns ``user``, ``item``, ``lastn_score``; ``num_items`` rows per user
        with at least one rating. Users without ratings are omitted. The three
        columns are present even when the result is empty, so later merges on
        ``user``/``item`` do not fail.
    """
    user_item_matrix = np.asarray(user_item_matrix)
    num_users, num_items = user_item_matrix.shape

    rated = user_item_matrix > 0
    users_with_ratings = np.flatnonzero(rated.any(axis=1))

    if users_with_ratings.size:
        # argmax over the reversed row returns the first True counted from the
        # right, i.e. the highest rated column index for that user.
        last_item_ids = num_items - 1 - np.argmax(rated[users_with_ratings, ::-1], axis=1)
        last_scores = user_item_matrix[users_with_ratings, last_item_ids]
    else:
        last_scores = np.empty(0, dtype=float)

    # Array-based construction replaces one Python dict per (user, item) cell.
    return pd.DataFrame({
        'user': np.repeat(users_with_ratings, num_items),
        'item': np.tile(np.arange(num_items), users_with_ratings.size),
        'lastn_score': np.repeat(last_scores, num_items),
    })

In [None]:
# LastN scores for every user that has at least one rating in the matrix.
last_scores = predict_lastn(user_item_matrix)
last_scores.head()

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

from sklearn.neighbors import NearestNeighbors

from tqdm import tqdm
from sklearn.metrics import mean_squared_error

In [None]:
def simple_knn_train(user_item_matrix, n_neighbors=10, metric='cosine'):
    """Fit a brute-force ``NearestNeighbors`` index on the matrix rows and return it.

    ``n_neighbors`` and ``metric`` are passed straight to the estimator.
    """
    return NearestNeighbors(
        algorithm='brute',
        metric=metric,
        n_neighbors=n_neighbors,
    ).fit(user_item_matrix)

def predict_for_user(knn_model, user_id, user_item_matrix):
    """Look up the nearest neighbours of one user's row.

    The row ``user_item_matrix[user_id]`` is reshaped to a single-sample 2-D query
    and passed to ``knn_model.kneighbors``.

    Returns
    -------
    tuple
        ``(indices, distances)`` — note the order is swapped relative to what
        ``kneighbors`` itself returns.
    """
    query_row = user_item_matrix[user_id].reshape(1, -1)
    neighbor_distances, neighbor_indices = knn_model.kneighbors(
        query_row, return_distance=True
    )
    return neighbor_indices, neighbor_distances

In [None]:
def predict_knn(knn_model, user_item_matrix, n_neighbors=10):
    """Generate KNN-model predictions in ``user | item | knn_score`` format.

    For every item a user has NOT rated (matrix value 0), the score is the mean of
    the positive ratings that the user's nearest neighbours gave to that item.
    Pairs for which no neighbour has a rating are omitted.

    Parameters
    ----------
    knn_model : object
        Fitted model exposing ``kneighbors(X, n_neighbors=..., return_distance=...)``
        that returns ``(distances, indices)``.
    user_item_matrix : array-like of shape (num_users, num_items)
        Rating matrix in which values > 0 mark an existing rating.
    n_neighbors : int, default 10
        Number of neighbours requested from ``knn_model`` for each user. This value
        is passed to ``kneighbors`` explicitly, so it takes precedence over whatever
        the model was constructed with.

    Returns
    -------
    pandas.DataFrame
        Columns ``user``, ``item``, ``knn_score``, ordered by user then item. The
        columns exist even when there are no predictions, so later merges on
        ``user``/``item`` do not fail.

    Notes
    -----
    NOTE(review): if the model was fitted on this same matrix, each user is
    presumably returned as one of its own neighbours; that row contributes nothing
    (its rating for an unrated item is 0) but still occupies a neighbour slot.
    """
    user_item_matrix = np.asarray(user_item_matrix)
    num_users, num_items = user_item_matrix.shape

    user_parts, item_parts, score_parts = [], [], []

    if num_users:
        # Query all users in one call instead of one call per user.
        _, neighbor_ids = knn_model.kneighbors(
            user_item_matrix, n_neighbors=n_neighbors, return_distance=True
        )
        neighbor_ids = np.asarray(neighbor_ids)

        for user_id in range(num_users):
            # (k, num_items) block with the ratings of this user's neighbours.
            neighbor_ratings = user_item_matrix[neighbor_ids[user_id]]
            has_rating = neighbor_ratings > 0
            votes = has_rating.sum(axis=0)

            # Predict only for items the user has not rated and that at least one
            # neighbour has rated.
            item_ids = np.flatnonzero((user_item_matrix[user_id] == 0) & (votes > 0))
            if item_ids.size == 0:
                continue

            rating_sums = np.where(has_rating, neighbor_ratings, 0).sum(axis=0)
            user_parts.append(np.full(item_ids.size, user_id))
            item_parts.append(item_ids)
            score_parts.append(rating_sums[item_ids] / votes[item_ids])

    def _stack(parts, dtype):
        # Concatenate the per-user chunks; fall back to a typed empty array.
        return np.concatenate(parts) if parts else np.empty(0, dtype=dtype)

    return pd.DataFrame({
        'user': _stack(user_parts, int),
        'item': _stack(item_parts, int),
        'knn_score': _stack(score_parts, float),
    })

In [None]:
# Fit the neighbour index and score the unrated pairs.
# NOTE(review): the model is created with n_neighbors=5, but predict_knn is called
# with its default n_neighbors=10 and passes that value to kneighbors(), so 10
# neighbours are actually used — align the two values.
knn_model = simple_knn_train(user_item_matrix, n_neighbors=5)
knn_scores = predict_knn(knn_model, user_item_matrix)
knn_scores.head()

In [None]:
# Replace the scores computed above with a copy read from disk, dropping the first
# column (presumably an index column written by an earlier to_csv — verify).
# NOTE(review): nothing in the visible notebook writes 'knn_scores.csv', so this cell
# depends on a file produced outside a clean top-to-bottom run.
knn_scores = pd.read_csv('knn_scores.csv').iloc[:,1:]
knn_scores

In [None]:
from surprise import Dataset, Reader
from surprise import SVD
from surprise.model_selection import train_test_split
import numpy as np

def train_als_with_surprise(user_item_matrix):
    """Fit a Surprise ``SVD`` model on the positive entries of the matrix.

    Every cell with a value > 0 becomes one ``(user, item, rating)`` record. The
    records are split 80/20 with Surprise's ``train_test_split`` and the model is
    fitted on the 80% part.

    NOTE(review): despite the function name, the model is ``surprise.SVD`` (which
    Surprise trains by stochastic gradient descent), not an ALS solver.
    NOTE(review): no ``random_state`` is passed to the split or to the model, so
    results depend on the state of the global NumPy RNG when this runs.

    Parameters
    ----------
    user_item_matrix : array-like of shape (num_users, num_items)
        Rating matrix in which values > 0 mark an existing rating.

    Returns
    -------
    tuple
        ``(als_model, testset)`` — the fitted model and the held-out 20% split.
    """
    user_item_matrix = np.asarray(user_item_matrix)

    # np.nonzero scans in row-major order (user, then item), replacing the nested
    # Python loop over every cell of the matrix.
    user_ids, item_ids = np.nonzero(user_item_matrix > 0)
    ratings_df = pd.DataFrame({
        'user': user_ids,
        'item': item_ids,
        'rating': user_item_matrix[user_ids, item_ids],
    })

    reader = Reader(rating_scale=(1, 5))
    dataset = Dataset.load_from_df(ratings_df, reader)
    trainset, testset = train_test_split(dataset, test_size=0.2)

    # Unbiased matrix factorisation: 50 latent factors, 20 epochs.
    als_model = SVD(n_factors=50, n_epochs=20, biased=False)
    als_model.fit(trainset)

    return als_model, testset

def predict_als_with_surprise(als_model, testset):
    """Return the model's predictions for ``testset`` by delegating to ``als_model.test``."""
    return als_model.test(testset)

In [None]:
# Fit the factorisation model and score the held-out split that is returned with it.
# NOTE(review): `testset` is built from existing ratings (see train_als_with_surprise),
# so these predictions cover already-rated pairs only, not unseen ones.
als_model, testset = train_als_with_surprise(user_item_matrix)
als_predictions = predict_als_with_surprise(als_model, testset)

# Print the first five predictions as a quick sanity check.
for pred in als_predictions[:5]:
    print(f'User {pred.uid} predicted rating for Item {pred.iid} is {pred.est}')

In [None]:
# Flatten the prediction objects into a user | item | als_score frame.
als_data = [
    {'user': pred.uid, 'item': pred.iid, 'als_score': pred.est}
    for pred in als_predictions
]

als_scores = pd.DataFrame(als_data)
als_scores

In [None]:
# Left-join each model's scores onto the popularity frame by (user, item); pairs
# that a model did not score end up as 0 after fillna.
combined_df = (
    topn_scores
    .merge(last_scores, on=['user', 'item'], how='left')
    .merge(knn_scores, on=['user', 'item'], how='left')
    .merge(als_scores, on=['user', 'item'], how='left')
    .fillna(0)
)
combined_df.head()

### Рейтинг есть! Теперь выберем топ 100 фильмов для каждого пользователя по всем моделям

In [61]:
def get_top_n(user_item_ratings, model_name, n=100):
    """Return the ``n`` highest-scored rows for each user.

    Rows are ordered by the ``model_name`` column in descending order, then the
    first ``n`` rows of every ``user_id`` group are kept. The result has a fresh
    0..N-1 index and stays in global score order (it is not grouped by user).
    """
    ranked = user_item_ratings.sort_values(by=model_name, ascending=False)
    per_user_head = ranked.groupby('user_id').head(n)
    return per_user_head.reset_index(drop=True)

In [54]:
def create_top_films_df(non_seen_films_with_predictions, trained_models, n=100):
    """Collect the per-user top-``n`` rows for every model and stack them.

    For each name in ``trained_models`` the frame is ranked by the column of that
    name via ``get_top_n``; the per-model results are concatenated in iteration
    order. The same (user, item) row can therefore appear once per model.
    """
    per_model_tops = {
        model: get_top_n(non_seen_films_with_predictions, model, n=n)
        for model in trained_models
    }
    return pd.concat(list(per_model_tops.values()))

In [43]:
# Per-user top candidates from every trained model (default n=100 per model).
top_films_df = create_top_films_df(non_seen_films_with_predictions, trained_models)

In [None]:
top_films_df

In [None]:
# Remove rows that are exact duplicates across all columns (the same candidate can
# be selected by more than one model), then report the resulting shape.
top_films_df = top_films_df.drop_duplicates()
top_films_df.shape

### Теперь нужно добавить в этот датафрейм фичи

In [None]:
# Attach user and item features to the candidate pairs.
# NOTE(review): `add_features_to_train_data` is not defined in the visible notebook —
# presumably it comes from one of the star imports; confirm. It is given the raw
# `user_features_df` / `item_features_df`, unlike the earlier join_features call,
# which used the tables derived from train.
featured_top_films_df = add_features_to_train_data(
    top_films_df, user_features_df, item_features_df)
featured_top_films_df

### Кажется, финишная прямая... Теперь предскажем топ 10 фильмов для каждого пользователя

In [71]:
from sklearn.linear_model import LinearRegression

# Создаем новую колонку с усредненной оценкой от двух моделей
# Target for the final ranker: the mean of the per-model prediction columns.
featured_top_films_df['average_prediction'] = (sum([featured_top_films_df[model_name] for model_name in trained_models])) / len(trained_models)

# Split into feature matrix and target.
# NOTE(review): only 'average_prediction' is dropped, so X still contains the
# per-model prediction columns that y is the exact average of (plus every other
# column of the frame). A linear model can reproduce y from those columns
# directly — confirm this is intended.
X = featured_top_films_df.drop(['average_prediction'], axis=1)
y = featured_top_films_df['average_prediction']

In [72]:
# Обучаем линейную регрессию
# Fit a linear regression on the candidate rows.
lr = LinearRegression()
lr.fit(X, y)

# Score the same rows the model was just fitted on.
# NOTE(review): this rebinds `predictions`, which earlier held the list of per-model
# predictions; re-running the earlier column-assignment cell after this one would
# use the wrong object.
predictions = lr.predict(X)

# Store the regression output next to the other columns.
featured_top_films_df['final_prediction'] = predictions

In [73]:
# Keep the 10 best candidates per user by 'final_prediction' and retain only the ids.
top_10_films = get_top_n(featured_top_films_df, 'final_prediction', n=10)[['user_id', 'item_id']]

# Export the result to a CSV file (without the index column).
top_10_films.to_csv('top_10_films.csv', index=False)

In [None]:
top_10_films

In [None]:
top_10_films.user_id.nunique()

### Осталось привести к нужному формату сабмита

In [76]:
def format_for_submission(df):
    """Collapse the rows of ``df`` into one row per user.

    Rows are grouped by ``user_id``; within each group the ``item_id`` values are
    converted to strings and joined with single spaces, in their existing row order.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id`` and ``item_id`` (the space-separated string).
    """
    def _join_items(item_ids):
        return ' '.join(item_ids.astype(str))

    items_per_user = df.groupby('user_id')['item_id'].apply(_join_items)
    return items_per_user.reset_index()

In [None]:
# One row per user with a space-separated list of recommended item ids.
submission = format_for_submission(top_10_films)
submission

In [78]:
# Write the submission file without the index column.
submission.to_csv('submission.csv', index=False)