In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import re

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from tqdm import tqdm


### Загрузка данных

In [2]:
# Render every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

# Read the games table from the parquet file.
games_df = pd.read_parquet("source/games.parquet")

### Объединение текстовых данных

In [3]:
# Treat the literal placeholder 'Unknown' as a missing value across the whole frame.
games_df.replace('Unknown', np.nan, inplace=True)

# Build one lowercase text field per game from the four text columns;
# missing values become empty strings before the space-separated join.
text_columns = ['game_description', 'reviews_summary', 'popular_tags', 'game_features']
joined_text = games_df[text_columns].fillna('').agg(' '.join, axis=1)
games_df['combined_text'] = joined_text.str.lower()

### Преобразование цен в числовой формат

In [4]:
# Parse 'original_price' into a float column: strip the '$' sign and the ','
# separators, and map the exact value 'Free' to zero.
price_text = games_df['original_price'].str.replace('$', '', regex=False)
price_text = price_text.str.replace(',', '', regex=False)
games_df['price'] = price_text.replace('Free', '0').astype(float)

# Replace the index with a fresh 0..n-1 range, discarding the old one.
games_df.reset_index(drop=True, inplace=True)
games_df

Unnamed: 0,title,original_price,release_date,game_description,reviews_summary,developer,publisher,supported_languages,popular_tags,game_features,minimum_requirements,normalized_title,combined_text,price
0,Baldur's Gate 3,$29.99,2023-08-03,"Baldur’s Gate 3 is a story-rich, party-based R...",Very Positive,Larian Studios,Larian Studios,"['English', 'French', 'German', 'Spanish - Spa...","['RPG', 'Choices Matter', 'Character Customiza...","['Single-player', 'Online Co-op', 'LAN Co-op',...",Requires a 64-bit processor and operating syst...,baldurs gate 3,"baldur’s gate 3 is a story-rich, party-based r...",29.99
1,Counter-Strike: Global Offensive,$14.99,2012-08-21,Counter-Strike: Global Offensive (CS: GO) expa...,Very Positive,"Valve, Hidden Path Entertainment",Valve,"['English', 'Czech', 'Danish', 'Dutch', 'Finni...","['FPS', 'Shooter', 'Multiplayer', 'Competitive...","['Steam Achievements', 'Full controller suppor...",OS: | Windows® 7/Vista/XP | Processor: | Int...,counterstrike global offensive,counter-strike: global offensive (cs: go) expa...,14.99
2,Apex Legends™,Free,2020-11-04,"Apex Legends is the award-winning, free-to-pla...",Very Positive,Respawn Entertainment,Electronic Arts,"['English', 'French', 'Italian', 'German', 'Sp...","['Free to Play', 'Multiplayer', 'Battle Royale...","['Online PvP', 'Online Co-op', 'Steam Achievem...",Requires a 64-bit processor and operating syst...,apex legends,"apex legends is the award-winning, free-to-pla...",0.00
3,Forza Horizon 5,$34.78,2021-11-08,Your Ultimate Horizon Adventure awaits! Explor...,Very Positive,Playground Games,Xbox Game Studios,"['English', 'French', 'Italian', 'German', 'Sp...","['Racing', 'Open World', 'Driving', 'Multiplay...","['Single-player', 'Online PvP', 'Online Co-op'...",Requires a 64-bit processor and operating syst...,forza horizon 5,your ultimate horizon adventure awaits! explor...,34.78
4,Call of Duty®,Free,2022-10-27,"Welcome to Call of Duty® HQ, the home of Call ...",Mixed,"Infinity Ward, Raven Software, Beenox, Treyarc...",Activision,"['English', 'French', 'Italian', 'German', 'Sp...","['FPS', 'Multiplayer', 'Shooter', 'Action', 'S...","['Single-player', 'Online PvP', 'Online Co-op'...",Requires a 64-bit processor and operating syst...,call of duty,"welcome to call of duty® hq, the home of call ...",0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60024,Cruo Domine,Free,2023-04-21,Survive in a hostile space environment with ra...,,Jiri Otoupal,Jiri Otoupal,['English'],"['Early Access', 'Action', 'Adventure', 'RPG',...","['Single-player', 'Steam Achievements', 'Track...",Requires a 64-bit processor and operating syst...,cruo domine,survive in a hostile space environment with ra...,0.00
60025,Retchid,$12.49,2023-08-18,You are a TEC Engineer from Earth ordered to r...,,From Beneath Software,From Beneath Software,['English'],"['Early Access', 'Action', 'FPS', 'Shooter', '...","['Single-player', 'Partial Controller Support']",Requires a 64-bit processor and operating syst...,retchid,you are a tec engineer from earth ordered to r...,12.49
60026,Dungeon Legend,$14.49,2023-04-01,This is a casual breakout game. The pace of th...,,Louis Winfield Game Studios,Louis Winfield Game Studios,['English'],"['Casual', 'Rhythm', '3D Platformer', '2.5D', ...",['Single-player'],OS: | windows 7/8/10/11 | Processor: | i5 | ...,dungeon legend,this is a casual breakout game. the pace of th...,14.49
60027,Sea of Stars,Free,2023-08-29,Sea of Stars is a turn-based RPG inspired by t...,,Sabotage Studio,Sabotage Studio,"['English', 'French', 'German', 'Spanish - Spa...","['RPG', 'Pixel Graphics', 'Adventure', 'Turn-B...","['Single-player', 'Steam Achievements', 'Full ...","OS: | Windows 7, Windows 10 | Processor: | I...",sea of stars,sea of stars is a turn-based rpg inspired by t...,0.00


### Бейзлайн: рекомендации по популярности (сортировка по закодированной сводке отзывов `review_encoded`)

In [5]:
def baseline_recommendations(df, top_n=10):
    """Return the `top_n` rows of `df` with the highest `review_encoded` value.

    Args:
        df: DataFrame containing the `review_encoded`, `title`,
            `original_price`, `reviews_summary` and `developer` columns.
        top_n: Number of rows to return.

    Returns:
        DataFrame with the `title`, `original_price`, `reviews_summary` and
        `developer` columns of the selected rows, highest `review_encoded`
        first.
    """
    # NOTE(review): in this notebook `review_encoded` is produced by a
    # LabelEncoder, so this ordering may not reflect review quality -- confirm.
    display_columns = ['title', 'original_price', 'reviews_summary', 'developer']
    ranked = df.sort_values(by='review_encoded', ascending=False)
    return ranked.head(top_n)[display_columns]

## Оценка модели

In [6]:
def mean_reciprocal_rank(actual, predicted):
    """Compute the mean reciprocal rank (MRR) over a set of queries.

    Args:
        actual: Sequence with one relevant item per query.
        predicted: Sequence of ranked prediction lists, aligned with `actual`.

    Returns:
        Mean over queries of 1 / (1-based position of the relevant item in its
        prediction list); a query whose item is absent contributes 0.
        Returns 0.0 when there are no queries.
    """
    reciprocal_ranks = []
    for target, ranking in zip(actual, predicted):
        # 1-based position of the first match, or None when the target is absent.
        position = next(
            (pos for pos, item in enumerate(ranking, start=1) if item == target),
            None,
        )
        reciprocal_ranks.append(1 / position if position else 0)

    # np.mean([]) yields nan plus a RuntimeWarning, so handle "no queries" explicitly.
    if not reciprocal_ranks:
        return 0.0
    return np.mean(reciprocal_ranks)

def precision_at_k(actual, predicted, k):
    """Compute mean precision@k with one relevant item per query.

    Args:
        actual: Sequence with one relevant item per query.
        predicted: Sequence of ranked prediction lists, aligned with `actual`.
        k: Cut-off; only the first `k` predictions of each list are inspected.

    Returns:
        Average over queries of (matches among the first `k` predictions) / k.
        Returns 0.0 when `actual` is empty.
    """
    # Without queries the final division would raise ZeroDivisionError.
    if len(actual) == 0:
        return 0.0

    total = 0
    for target, ranking in zip(actual, predicted):
        hits = sum(1 for item in ranking[:k] if item == target)
        total += hits / k
    return total / len(actual)

def recall_at_k(actual, predicted, k):
    """Compute mean recall@k with one relevant item per query.

    Each query has exactly one relevant item (its entry in `actual`), so the
    recall of a single query is 1 when that item is among the first `k`
    predictions and 0 otherwise; the result is the fraction of queries that hit.

    Args:
        actual: Sequence with one relevant item per query.
        predicted: Sequence of ranked prediction lists, aligned with `actual`.
        k: Cut-off; only the first `k` predictions of each list are inspected.

    Returns:
        Fraction of queries whose relevant item is in the top `k` predictions.
        Returns 0.0 when `actual` is empty.
    """
    if len(actual) == 0:
        return 0.0

    # Recall divides by the number of relevant items (one per query), not by
    # the number of returned predictions -- the latter would be precision and
    # would also divide by zero for an empty prediction list.
    found = sum(1 for target, ranking in zip(actual, predicted) if target in ranking[:k])
    return found / len(actual)



### Функция оценки основной модели

In [7]:
def evaluate_model(test_df, top_n=10):
    """Score `recommend` on every title in `test_df`.

    For each row, the row's own title is used both as the query passed to
    `recommend` and as the single relevant item when scoring the returned list.

    Args:
        test_df: DataFrame with a `title` column.
        top_n: Number of recommendations requested per title; also used as the
            cut-off k for precision and recall.

    Returns:
        Tuple `(mrr, precision, recall)`.
    """
    # NOTE(review): `recommend` is written to leave the queried game out of its
    # own result, so a title can only score a hit here if it also appears in
    # another row -- confirm this is the intended ground truth.
    actual_recommendations = []
    predicted_recommendations = []

    for game_title in tqdm(test_df['title'], total=len(test_df)):
        # Get the recommendations for this game.
        recommended = recommend(game_title, top_n)

        # `recommend` returns a plain message string when the title is unknown;
        # indexing that string with 'title' would raise, so skip such titles.
        if isinstance(recommended, str):
            continue

        # Store the actual game and the predicted games.
        actual_recommendations.append(game_title)
        predicted_recommendations.append(recommended['title'].values.tolist())

    # Compute MRR, precision and recall.
    mrr = mean_reciprocal_rank(actual_recommendations, predicted_recommendations)
    precision = precision_at_k(actual_recommendations, predicted_recommendations, top_n)
    recall = recall_at_k(actual_recommendations, predicted_recommendations, top_n)

    return mrr, precision, recall



### Функция оценки бейзлайна

In [8]:
def evaluate_baseline(test_df, top_n=10):
    """Score the popularity baseline on every title in `test_df`.

    The same baseline list (from `baseline_recommendations(test_df, top_n)`)
    is used as the prediction for every game; each game's own title is the
    single relevant item.

    Args:
        test_df: DataFrame with a `title` column plus the columns required by
            `baseline_recommendations`.
        top_n: Length of the baseline list; also used as the cut-off k.

    Returns:
        Tuple `(mrr, precision, recall)`.
    """
    # The baseline list does not depend on the individual game, so build it
    # once instead of re-sorting `test_df` for every row.
    recommended_titles = baseline_recommendations(test_df, top_n)['title'].values.tolist()

    actual_recommendations = test_df['title'].tolist()
    # Every query shares the same (read-only) prediction list.
    predicted_recommendations = [recommended_titles] * len(actual_recommendations)

    # Compute MRR, precision and recall for the baseline.
    mrr = mean_reciprocal_rank(actual_recommendations, predicted_recommendations)
    precision = precision_at_k(actual_recommendations, predicted_recommendations, top_n)
    recall = recall_at_k(actual_recommendations, predicted_recommendations, top_n)

    return mrr, precision, recall


### Разделяем данные на обучающую и тестовую выборку

In [9]:
# Hold out 20% of the row labels for testing; the fixed random_state keeps the split repeatable.
train_idx, test_idx = train_test_split(games_df.index, random_state=42, test_size=0.2)

# Materialise each part as an independent copy of the selected rows.
train_df, test_df = (games_df.loc[labels].copy() for labels in (train_idx, test_idx))


### Обучение Doc2Vec на train_df

In [10]:
# One TaggedDocument per training row: whitespace-separated tokens, tagged
# with the row's position within train_df.
train_documents = [
    TaggedDocument(words=text.split(), tags=[position])
    for position, text in enumerate(train_df['combined_text'].astype(str))
]

# NOTE(review): no seed is passed here, so results may differ between runs -- confirm this is acceptable.
doc2vec_model = Doc2Vec(vector_size=100, window=5, min_count=2, workers=4, epochs=40)
doc2vec_model.build_vocab(train_documents)
doc2vec_model.train(
    train_documents,
    total_examples=doc2vec_model.corpus_count,
    epochs=doc2vec_model.epochs,
)


In [11]:
# Вектора для train
# Vectors for train
train_doc_vectors = np.array(
    [doc2vec_model.infer_vector(tagged.words) for tagged in train_documents]
)

# Vectors for test: tokenise the held-out texts the same way, then infer.
test_documents = [
    TaggedDocument(words=text.split(), tags=[position])
    for position, text in enumerate(test_df['combined_text'].astype(str))
]
test_doc_vectors = np.array(
    [doc2vec_model.infer_vector(tagged.words) for tagged in test_documents]
)

### Оценка основной модели

In [12]:
# Run evaluate_model on the held-out frame and report each metric with two decimals.
main_model_scores = evaluate_model(test_df, top_n=10)
mrr, precision, recall = main_model_scores

print(f'Основная модель - MRR: {mrr:.2f}')
print(f'Основная модель - Precision: {precision:.2f}')
print(f'Основная модель - Recall: {recall:.2f}')



Основная модель - MRR: 43
Основная модель - Precision: 42.34
Основная модель - Recall: 67.04


### Оценка бейзлайна

In [60]:
# Run evaluate_baseline on the held-out frame and report each metric with two decimals.
baseline_scores = evaluate_baseline(test_df, top_n=10)
mrr_baseline, precision_baseline, recall_baseline = baseline_scores

print(f'Бейзлайн - MRR: {mrr_baseline:.2f}')
print(f'Бейзлайн - Precision: {precision_baseline:.2f}')
print(f'Бейзлайн - Recall: {recall_baseline:.2f}')

100%|███████████████████████████████████████████████████████████████████████████| 12006/12006 [00:20<00:00, 585.34it/s]

Бейзлайн - MRR: 0.00024395870847644958
Бейзлайн - Precision: 8.329168748958853e-05
Бейзлайн - Recall: 0.00





## Контентная модель на Doc2Vec:
### Токенизация и обучение модели Doc2Vec


In [44]:
# Replace the index with a fresh 0..n-1 range before enumerating the rows.
games_df.reset_index(drop=True, inplace=True)

# One TaggedDocument per game: whitespace-separated tokens, tagged with the row number.
documents = [
    TaggedDocument(words=text.split(), tags=[position])
    for position, text in enumerate(games_df['combined_text'].astype(str))
]

# This rebinds `doc2vec_model` to a new model trained on all games.
doc2vec_model = Doc2Vec(vector_size=100, window=5, min_count=2, workers=4, epochs=40)
doc2vec_model.build_vocab(documents)
doc2vec_model.train(
    documents,
    total_examples=doc2vec_model.corpus_count,
    epochs=doc2vec_model.epochs,
)


### Векторизация описаний

In [45]:
# Infer a vector for every document with the trained model and collect them
# into one array, in the same order as `documents`.
inferred_vectors = [doc2vec_model.infer_vector(tagged.words) for tagged in documents]
doc_vectors = np.array(inferred_vectors)

## Простая "коллаборативная" модель на признаках:
### Преобразование категориальных признаков

In [7]:
# Integer-encode the two categorical columns. The same encoder object is refit
# for each column, so after the loop it holds the fit for 'reviews_summary'.
encoder = LabelEncoder()
for source_column, encoded_column in (
    ('developer', 'developer_encoded'),
    ('reviews_summary', 'review_encoded'),
):
    games_df[encoded_column] = encoder.fit_transform(games_df[source_column])

### Преобразуем числовые признаки

In [8]:
# Take a copy of the three metadata columns and min-max scale them; the fitted
# scaler is kept in `scaler`.
meta_columns = ['developer_encoded', 'review_encoded', 'price']
meta_features = games_df[meta_columns].copy()

scaler = MinMaxScaler()
meta_scaled = scaler.fit_transform(meta_features)

### Масштабируем векторы Doc2Vec

In [9]:
# Min-max scale the Doc2Vec vectors with a scaler of their own.
doc_vector_scaler = MinMaxScaler()
doc_vectors_scaled = doc_vector_scaler.fit_transform(doc_vectors)

## Гибридная модель:

In [31]:
# Weights applied to the text-embedding block (alpha) and the metadata block (beta).
alpha = 0.7
beta = 0.3

# Scale doc_vectors so that they are comparable in scale with the meta features.
doc_vectors_scaled = MinMaxScaler().fit_transform(doc_vectors)

# Combine the features column-wise into one dense matrix (one row per game).
# `hstack` on its own was never imported in this notebook; np.hstack keeps the
# result a plain ndarray, which the row indexing in `recommend` and the
# np.save call both rely on.
combined_features = np.hstack([
    alpha * doc_vectors_scaled,
    beta * meta_scaled
])

### Функция для рекомендаций

In [38]:
def recommend(game_title, top_n=10):
    """Return the `top_n` games most similar to `game_title`.

    Similarity is the cosine similarity between rows of the module-level
    `combined_features` matrix; rows are looked up in the module-level
    `games_df` by case-insensitive title match (the first match is used).

    Args:
        game_title: Title of the query game.
        top_n: Number of similar games to return.

    Returns:
        DataFrame with the `title`, `original_price`, `reviews_summary` and
        `developer` columns of the most similar games, most similar first, or
        a message string when no game with that title exists.
    """
    title_matches = (games_df['title'].str.lower() == game_title.lower()).to_numpy()
    match_positions = np.flatnonzero(title_matches)
    if match_positions.size == 0:
        return f"Игра '{game_title}' не найдена."

    # Use the row *position* (not the index label) so that indexing into
    # `combined_features` stays correct whatever index `games_df` carries.
    idx = match_positions[0]
    target_vector = combined_features[idx].reshape(1, -1)
    similarities = cosine_similarity(target_vector, combined_features)[0]

    # Rank all games by similarity and drop the query game explicitly: with
    # tied scores (e.g. identical feature rows) it is not guaranteed to be first.
    ranked = similarities.argsort()[::-1]
    similar_indices = ranked[ranked != idx][:top_n]

    return games_df.iloc[similar_indices][['title', 'original_price', 'reviews_summary', 'developer']]

### Сохраняем модель Doc2Vec, скейлер, признаки и обработанные данные

In [65]:
# Save the Doc2Vec model to disk.
doc2vec_model.save("models/doc2vec.model")

# Pickle the scaler that was fitted on the metadata features.
with open("models/scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

# Store the combined feature matrix as a .npy file.
np.save(file="models/combined_features.npy", arr=combined_features)

# Write the processed games table (with the columns added above) to parquet.
games_df.to_parquet(path="source/games_processed.parquet")