In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

### Распакуем данные в отдельную папку

In [4]:
import os
import zipfile

# Path to the ZIP file
zip_filepath = './hse-rec-sys-challenge-2024.zip'

# Destination folder where the extracted files will be placed
destination_folder = 'data'

# Ensure that the destination folder exists
os.makedirs(destination_folder, exist_ok=True)

# Unzip the file to the specified folder
with zipfile.ZipFile(zip_filepath, 'r') as zf:
    zf.extractall(path=destination_folder)

### Посмотрим на содержимое

In [5]:
events_df = pd.read_csv('./data/events.csv')
events_df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,1505,4,0
1,0,3669,3,1
2,0,584,4,2
3,0,3390,3,3
4,0,2885,4,4


In [6]:
events_df[events_df['user_id']==0]

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,1505,4,0
1,0,3669,3,1
2,0,584,4,2
3,0,3390,3,3
4,0,2885,4,4
...,...,...,...,...
282,0,1124,3,321
283,0,1809,4,322
284,0,3602,4,323
285,0,2924,3,325


In [7]:
item_features_df = pd.read_csv('./data/item_features.csv')
item_features_df.head()

Unnamed: 0,item_id,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17
0,0,0,1,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0
1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
4,4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [8]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

user_features_df = pd.read_csv('./data/user_features.csv')
user_features_df['gender'] = le.fit_transform(user_features_df['gender'])
user_features_df.head()

Unnamed: 0,user_id,gender,age
0,4855,0,1
1,4065,1,56
2,3331,1,25
3,5373,1,45
4,2032,1,25


In [9]:
submission_sample_df = pd.read_csv('./data/submission_sample.csv')
submission_sample_df.head()

Unnamed: 0,user_id,item_id
0,0,0 1 2 3 4 5 6 7 8 9
1,1,0 1 2 3 4 5 6 7 8 9
2,2,0 1 2 3 4 5 6 7 8 9
3,3,0 1 2 3 4 5 6 7 8 9
4,4,0 1 2 3 4 5 6 7 8 9


In [10]:
submission_sample_df.item_id

0       0 1 2 3 4 5 6 7 8 9
1       0 1 2 3 4 5 6 7 8 9
2       0 1 2 3 4 5 6 7 8 9
3       0 1 2 3 4 5 6 7 8 9
4       0 1 2 3 4 5 6 7 8 9
               ...         
6035    0 1 2 3 4 5 6 7 8 9
6036    0 1 2 3 4 5 6 7 8 9
6037    0 1 2 3 4 5 6 7 8 9
6038    0 1 2 3 4 5 6 7 8 9
6039    0 1 2 3 4 5 6 7 8 9
Name: item_id, Length: 6040, dtype: object

### Разобьем выборку на тренировочную, валидационнную и тестовую

In [11]:
import pandas as pd

def split_data_by_user(df, test_size=1, val_size=1):
    """
    Функция разделения данных на обучающую, валидационную и тестовую выборки.
    
    :param df: DataFrame с данными
    :param test_size: размер тестового набора (по умолчанию 1)
    :param val_size: размер валидационного набора (по умолчанию 1)
    :return: три DataFrame: train_df, val_df, test_df
    """
    # Группируем данные по каждому пользователю
    grouped = df.groupby('user_id')
    
    # Список для хранения индексов каждой группы
    train_indices = []
    val_indices = []
    test_indices = []
    
    # Проходимся по каждой группе
    for _, group in grouped:
        # Сортируем группу по timestamp
        sorted_group = group.sort_values(by='timestamp', ascending=False)
        
        # Получаем индексы для каждой выборки
        test_idx = sorted_group.index[:test_size]
        val_idx = sorted_group.index[test_size:test_size + val_size]
        train_idx = sorted_group.index[test_size + val_size:]
        
        # Добавляем индексы в соответствующие списки
        train_indices.extend(train_idx)
        val_indices.extend(val_idx)
        test_indices.extend(test_idx)
    
    # Формируем DataFrames для каждой выборки
    train_df = df.loc[train_indices].copy()
    val_df = df.loc[val_indices].copy()
    test_df = df.loc[test_indices].copy()
    
    return train_df, val_df, test_df

# Пример использования функции
train_df, val_df, test_df = split_data_by_user(events_df)

print("Train shape:", train_df.shape)
print("Val shape:", val_df.shape)
print("Test shape:", test_df.shape)

Train shape: (882069, 4)
Val shape: (6040, 4)
Test shape: (6040, 4)


In [12]:
train_df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
284,0,3602,4,323
283,0,1809,4,322
282,0,1124,3,321
281,0,1792,4,320
280,0,1773,2,319


### Добавляем фичи из двух других таблиц

In [13]:
def add_features_to_train_data(df, user_features_df, item_features_df):
    # Слияние с таблицей признаков пользователей
    merged_with_users = pd.merge(
        left=df,
        right=user_features_df,
        on="user_id",
        how="left"
    )
    
    # Слияние с таблицей признаков фильмов
    final_merged = pd.merge(
        left=merged_with_users,
        right=item_features_df,
        on="item_id",
        how="left"
    )
    
    return final_merged

In [32]:
featured_train_df = add_features_to_train_data(train_df, user_features_df, item_features_df).drop(['timestamp'], axis=1)
featured_val_df = add_features_to_train_data(val_df, user_features_df, item_features_df).drop(['timestamp'], axis=1)
featured_test_df = add_features_to_train_data(test_df, user_features_df, item_features_df).drop(['timestamp'], axis=1)

featured_train_df.head()

Unnamed: 0,user_id,item_id,rating,gender,age,genre_0,genre_1,genre_2,genre_3,genre_4,...,genre_8,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17
0,0,3602,4,1,35,1,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
1,0,1809,4,1,35,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,1124,3,1,35,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,1792,4,1,35,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,1773,2,1,35,1,1,0,1,0,...,0,0,0,0,0,0,1,0,0,0


### Функции для классификации фильмов (выставление оценок)

In [33]:
from sklearn.linear_model import LogisticRegression

def train_classifier(train_df, target_column, classifier):
    # Разделяем данные на признаки (X) и цели (y)
    X = train_df.drop(target_column, axis=1)
    y = train_df[target_column]
        
    # Обучаем классификатор
    model = classifier.fit(X, y)
    
    return model

def make_predictions(features_df, model):
    predictions = model.predict(features_df)
    return predictions

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

classifiers = {
    'LogisticRegression': LogisticRegression(),
    # 'RandomForest': RandomForestClassifier(),
    # 'SupportVectorMachine': SVC(),
    'GradientBoosting': GradientBoostingClassifier()
}

def train_and_evaluate(train_df, test_df, target_column):
    trained_models = {}
    for name, classifier in classifiers.items():
        print(f"\nTraining {name}:")
        trained_model = train_classifier(train_df, target_column, classifier)
        trained_models[name] = trained_model
        predictions = make_predictions(test_df.drop([target_column], axis=1), trained_model)
        accuracy = accuracy_score(test_df[target_column], predictions)
        print(f"{name} Accuracy: {accuracy:.2%}")

    return trained_models

In [None]:
# у меня обучалось 12 минут...
trained_models = train_and_evaluate(featured_train_df, featured_val_df, 'rating')


Training LogisticRegression:


### Создадим датасет для фильмов, которые не были просмотрены пользователем (таких пар нет в обучающей выборке)

In [18]:
def create_non_seen_films_dataset(featured_train_df, user_features_df, item_features_df):
    # Создаем копию featured_train_df, чтобы не изменять оригинальный DataFrame
    df = featured_train_df.copy()
    
    # Получаем список всех уникальных пользователей и фильмов
    unique_users = df["user_id"].unique()
    unique_items = df["item_id"].unique()
    
    # Создаем полный DataFrame с всеми возможными комбинациями пользователей и фильмов
    index = pd.MultiIndex.from_product([unique_users, item_features_df["item_id"]], names=["user_id", "item_id"])
    non_seen_films_df = pd.DataFrame(index=index).reset_index()
    
    # Добавляем признаки пользователей
    non_seen_films_df = non_seen_films_df.merge(user_features_df, on="user_id", how="left")
    
    # Добавляем признаки фильмов
    non_seen_films_df = non_seen_films_df.merge(item_features_df, on="item_id", how="left")
    
    # Оставляем только те пары, которых нет в тренировочном наборе
    non_seen_films_df = non_seen_films_df[~non_seen_films_df.set_index(["user_id", "item_id"]).index.isin(df.set_index(["user_id", "item_id"]).index)]
    
    return non_seen_films_df

In [19]:
non_seen_films_df = create_non_seen_films_dataset(featured_train_df, user_features_df, item_features_df)

In [20]:
non_seen_films_df

Unnamed: 0,user_id,item_id,gender,age,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,...,genre_8,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17
0,0,0,1,35,0,1,0,1,1,0,...,1,0,0,0,0,1,0,0,0,0
1,0,1,1,35,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,2,1,35,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,3,1,35,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,4,1,35,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22384234,6039,3700,1,35,0,1,1,1,0,0,...,0,0,0,1,0,0,0,0,0,0
22384235,6039,3701,1,35,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
22384236,6039,3702,1,35,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
22384237,6039,3703,1,35,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
non_seen_films_df_no_timestamp = non_seen_films_df.drop('timestamp', axis=1)

KeyError: "['timestamp'] not found in axis"

### Теперь воспользуемся нашими обученными модельками, чтобы предсказать рейтинг непросмотренных фильмов

In [22]:
trained_models

{'LogisticRegression': LogisticRegression(),
 'GradientBoosting': GradientBoostingClassifier()}

In [29]:
predictions = []
for model in trained_models:
    predictions.append(make_predictions(non_seen_films_df_no_timestamp, trained_models[model]))

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- timestamp
