Загрузить предобработанные данные по item и user (по отдельности и вместе) и через генерацию промпта получить результат в виде:
а) пользователей со схожими интересами
б) рекомендованных фильмов

Используется Perplexity AI API.

In [1]:
import pandas as pd
import ast

In [22]:
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [28]:
!pip install openai

Collecting openai
  Downloading openai-2.17.0-py3-none-any.whl.metadata (29 kB)
Collecting jiter<1,>=0.10.0 (from openai)
  Downloading jiter-0.13.0-cp313-cp313-win_amd64.whl.metadata (5.3 kB)
Downloading openai-2.17.0-py3-none-any.whl (1.1 MB)
   ---------------------------------------- 0.0/1.1 MB ? eta -:--:--
   --------- ------------------------------ 0.3/1.1 MB ? eta -:--:--
   ---------------------------------------- 1.1/1.1 MB 3.7 MB/s  0:00:00
Downloading jiter-0.13.0-cp313-cp313-win_amd64.whl (202 kB)
Installing collected packages: jiter, openai

   ---------------------------------------- 0/2 [jiter]
   -------------------- ------------------- 1/2 [openai]
   -------------------- ------------------- 1/2 [openai]
   -------------------- ------------------- 1/2 [openai]
   -------------------- ------------------- 1/2 [openai]
   -------------------- ------------------- 1/2 [openai]
   -------------------- ------------------- 1/2 [openai]
   -------------------- ----------------

In [29]:
from openai import OpenAI
import json
import os
from dotenv import load_dotenv
import re

In [2]:
# User ratings; later cells read its userId, movieId, rating and timestamp columns.
ratings = pd.read_csv('../data/The Movies Dataset/ratings_small.csv')

# Movie metadata plus the keyword and credit tables that are merged into it by `id` further down.
movies = pd.read_csv('../data/The Movies Dataset/movies_metadata.csv', low_memory=False)
keywords = pd.read_csv('../data/The Movies Dataset/keywords.csv')
credits = pd.read_csv('../data/The Movies Dataset/credits.csv')

# Mapping between movieId (used by ratings) and tmdbId (matched against movies['id'] later on).
links = pd.read_csv('../data/The Movies Dataset/links_small.csv')

In [3]:
# Make `id` numeric: values that cannot be parsed turn into NaN, those rows are
# discarded, and the remaining ids are cast to int.
movies['id'] = pd.to_numeric(movies['id'], errors='coerce')
movies = movies[movies['id'].notna()]
movies['id'] = movies['id'].astype(int)

In [4]:
# Remove rows that are exact duplicates in each of the three tables.
movies, links, ratings = (
    frame.drop_duplicates() for frame in (movies, links, ratings)
)

In [5]:
# links supplies the tmdbId (the `id` used in movies_metadata) for ratings:
# keep only the links that have one and store it as int.
links = links.dropna(subset=['tmdbId']).astype({'tmdbId': int})

In [6]:
# Attach the tmdbId to every rating through links and discard ratings without a match.
ratings = (
    ratings[['userId', 'movieId', 'rating', 'timestamp']]
    .merge(links[['tmdbId', 'movieId']], how='left', on='movieId')
    .dropna(subset=['tmdbId'])
)

In [7]:
# The left merge introduced NaN and therefore floats; with the NaN rows gone the ids go back to int.
ratings = ratings.astype({'tmdbId': int})

In [8]:
# Genre handling: only the values are needed, not the whole dict records.
def parse_json_field(text, key='name'):
    """Return the `key` value of every dict in a stringified list of dicts.

    Args:
        text: String holding a Python literal such as
            "[{'id': 18, 'name': 'Drama'}]". Anything that cannot be parsed
            (NaN, None, malformed text) is treated as "no data".
        key: Dict key whose values are collected.

    Returns:
        List of the collected values in their original order; an empty list
        when `text` cannot be parsed or is not a list of dicts.
    """
    try:
        items = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, SystemError,
            MemoryError, RecursionError):
        # literal_eval signals malformed input with several exception types;
        # all of them mean the same thing here: nothing usable in this cell.
        return []

    # A literal that parses but is not a sequence (e.g. "5") carries no records.
    if not isinstance(items, (list, tuple)):
        return []

    return [
        item[key]
        for item in items
        if isinstance(item, dict) and key in item
    ]

In [9]:
# genres_list holds the parsed genre names; genres_str is the same list as
# comma-separated text, with 'Unknown' standing in for an empty list.
movies['genres_list'] = movies['genres'].apply(parse_json_field)
movies['genres_str'] = movies['genres_list'].apply(
    lambda names: 'Unknown' if not names else ', '.join(names)
)

In [10]:
# credits and keywords are merged into movies_metadata later, so their ids get
# the same treatment: coerce to numeric, drop unparseable rows, cast to int.
credits['id'] = pd.to_numeric(credits['id'], errors='coerce')
credits = credits[credits['id'].notna()].astype({'id': int})

keywords['id'] = pd.to_numeric(keywords['id'], errors='coerce')
keywords = keywords[keywords['id'].notna()].astype({'id': int})

In [11]:
# The director is an important recommendation feature, but again only the
# value (the person's name) is needed.
def get_director(crew_str):
    """Return the name of the first crew member whose job is 'Director'.

    Args:
        crew_str: String holding a Python literal list of crew dicts, each
            expected to carry 'job' and 'name' keys.

    Returns:
        The director's name, or 'Unknown' when the input cannot be parsed or
        no named director is present.
    """
    try:
        crew = ast.literal_eval(crew_str)
    except (ValueError, SyntaxError, TypeError, SystemError,
            MemoryError, RecursionError):
        return 'Unknown'

    if not isinstance(crew, (list, tuple)):
        return 'Unknown'

    for member in crew:
        # Entries lacking 'job' or 'name' are skipped rather than aborting
        # the search, so one incomplete record cannot hide a real director.
        if (
            isinstance(member, dict)
            and member.get('job') == 'Director'
            and member.get('name')
        ):
            return member['name']
    return 'Unknown'
    
# One director name (or 'Unknown') per credits row.
credits['director'] = credits['crew'].map(get_director)

In [12]:
# Flatten each stringified keyword list into comma-separated keyword names.
keywords['keywords_str'] = [
    ', '.join(parse_json_field(raw)) for raw in keywords['keywords']
]

In [13]:
# Cast members
def get_top_cast(cast_str, n=3):
    """Return the first `n` cast names as a comma-separated string.

    Args:
        cast_str: String holding a Python literal list of cast dicts, each
            expected to carry a 'name' key.
        n: Maximum number of names to include.

    Returns:
        Names joined by ', ', or 'Unknown' when the input cannot be parsed or
        contains no named cast member (so an empty cast list gets the same
        placeholder as a missing one).
    """
    try:
        cast = ast.literal_eval(cast_str)
    except (ValueError, SyntaxError, TypeError, SystemError,
            MemoryError, RecursionError):
        return 'Unknown'

    if not isinstance(cast, (list, tuple)):
        return 'Unknown'

    # Entries without a usable name are skipped instead of discarding the whole cast.
    names = [
        member['name']
        for member in cast
        if isinstance(member, dict) and member.get('name')
    ][:n]
    return ', '.join(names) if names else 'Unknown'
    
# Leading cast names (or 'Unknown') per credits row.
credits['cast_str'] = credits['cast'].map(get_top_cast)

In [14]:
# Add director, cast and keyword information to the movies table.
# Each lookup table is first reduced to one row per id: a left merge against a
# right-hand side with repeated keys would emit the same movie row several
# times. validate='many_to_one' makes pandas raise if that guarantee is broken.
movies = movies.merge(
    credits[['id', 'director', 'cast_str']].drop_duplicates(subset='id'),
    on='id', how='left', validate='many_to_one',
)
movies = movies.merge(
    keywords[['id', 'keywords_str']].drop_duplicates(subset='id'),
    on='id', how='left', validate='many_to_one',
)

In [15]:
# Missing-value handling: one placeholder per column, applied in a single call.
movies = movies.fillna({
    'overview': 'No description available',
    'director': 'Unknown',
    'cast_str': 'Unknown',
    'keywords_str': '',
    'vote_count': 0,
    'vote_average': 0,
})

In [16]:
# Only the release year matters, not the full date; dates that cannot be
# parsed end up as year 0.
movies['year'] = (
    pd.to_datetime(movies['release_date'], errors='coerce')
    .dt.year
    .fillna(0)
    .astype(int)
)

In [17]:
# Build the text description of a movie
def build_item_profile(row):
    """Render one movie row as a single ' | '-separated text profile.

    Args:
        row: Mapping-like row (dict or pandas Series) with the keys 'title',
            'year', 'genres_str', 'director', 'cast_str', 'keywords_str',
            'overview' and 'vote_average'.

    Returns:
        Text of the form
        '"Title" (year) | Genres: ... | Director: ... | Cast: ... |
        Keywords: ... | Plot: ... | Ratings: x/10'. The year is left out when
        it is 0 (the placeholder for an unknown release date) and the
        keywords part is left out when there are none.
    """
    year = row['year']
    # Year 0 only marks "no release date"; showing "(0)" would be misleading.
    heading = f'"{row["title"]}" ({year})' if year else f'"{row["title"]}"'

    # Slicing needs a string; a missing overview gets the same placeholder
    # text that the notebook's fillna step uses.
    overview = row['overview']
    if not isinstance(overview, str):
        overview = 'No description available'

    parts = [
        heading,
        f"Genres: {row['genres_str']}",
        f"Director: {row['director']}",
        f"Cast: {row['cast_str']}",
    ]
    if row['keywords_str']:
        parts.append(f"Keywords: {row['keywords_str']}")
    # The plot is capped at 200 characters to keep profiles compact.
    parts.append(f"Plot: {overview[:200]}")
    parts.append(f"Ratings: {row['vote_average']}/10")

    return ' | '.join(parts)

In [18]:
# Build the text profile of every movie; axis=1 passes each row to build_item_profile.
movies['item_profile'] = movies.apply(build_item_profile, axis=1)

In [19]:
# User data: every rating joined with the attributes of the rated movie.
# The movie side is reduced to one row per id for this join, so a rating is
# never repeated when the same id occurs on several movie rows (repeats would
# inflate genre/director counts and duplicate titles in the user profiles).
user_data = ratings.merge(
    movies[['id', 'title', 'genres_str', 'director', 'year']]
    .drop_duplicates(subset='id'),
    left_on='tmdbId', right_on='id', how='inner',
)

In [20]:
def build_user_profile(user_id, group):
    """Summarise one user's ratings as a structured and a textual profile.

    Args:
        user_id: Identifier stored in the profile and shown in its text.
        group: DataFrame with this user's ratings; must contain the columns
            'timestamp', 'rating', 'genres_str', 'director', 'title', 'year'.

    Returns:
        Dict with the keys 'user_id', 'avg_rating', 'total_rating',
        'top_genres', 'top_director', 'liked_movies', 'disliked_movies' and
        'text_profile' (a one-line summary intended for the LLM prompt).
    """
    from collections import Counter

    # Recent activity matters most, so order chronologically before taking tails.
    group = group.sort_values('timestamp')

    liked_all = group[group['rating'] >= 4.0]
    # Last 15 likes (rating 4 and above)
    liked = liked_all.tail(15)
    # Last 5 dislikes (rating 2 and below)
    disliked = group[group['rating'] <= 2.0].tail(5)

    # Count genres over all liked movies; the 'Unknown' placeholder is not a
    # preference and is left out.
    genre_counts = Counter(
        genre
        for genres in liked_all['genres_str']
        for genre in genres.split(', ')
        if genre != 'Unknown'
    )
    top_genres = [genre for genre, _ in genre_counts.most_common(5)]

    # Most frequent directors among liked movies, again ignoring the placeholder.
    known_directors = liked_all.loc[liked_all['director'] != 'Unknown', 'director']
    top_director = known_directors.value_counts().head(3).index.tolist()

    profile = {
        'user_id': user_id,
        'avg_rating': round(float(group['rating'].mean()), 2),
        'total_rating': len(group),
        'top_genres': top_genres,
        'top_director': top_director,
        'liked_movies': liked[['title', 'year', 'rating']].to_dict('records'),
        'disliked_movies': disliked[['title', 'year', 'rating']].to_dict('records'),
    }

    liked_str = '; '.join(
        f"{m['title']} ({m['rating']})" for m in profile['liked_movies']
    )
    disliked_str = '; '.join(
        f"{m['title']} ({m['rating']})" for m in profile['disliked_movies']
    )

    # Fields are joined with an explicit separator so they do not run into
    # each other; empty lists are spelled out as 'none'.
    fields = [
        f"Favorite genres: {', '.join(top_genres) or 'none'}",
        f"Favorite directors: {', '.join(top_director) or 'none'}",
        f"Average rating: {profile['avg_rating']}",
        f"Recently liked: {liked_str or 'none'}",
        f"Recently disliked: {disliked_str or 'none'}",
    ]
    profile['text_profile'] = f"User {user_id}: " + ' | '.join(fields)

    return profile



In [21]:
# One profile per user, keyed by userId.
user_profiles = {
    uid: build_user_profile(uid, group)
    for uid, group in user_data.groupby('userId')
}

In [23]:
# Finding similar users: a userId x tmdbId rating matrix (0 where a user gave
# no rating), then pairwise cosine similarity between its rows.
user_item_df = ratings.pivot(index='userId', columns='tmdbId', values='rating')
user_item_df = user_item_df.fillna(0)
user_sim_matrix = cosine_similarity(user_item_df.to_numpy())

In [24]:
def get_similar_users(user_id, top_k=5):
    """Return the `top_k` users most similar to `user_id` by cosine similarity.

    Reads the module-level `user_item_df` (its index supplies the user ids)
    and `user_sim_matrix` (row i holds the similarities of user i).

    Args:
        user_id: User whose neighbours are requested.
        top_k: Number of neighbours to return.

    Returns:
        List of (user_id, similarity) tuples, most similar first, with the
        similarity rounded to 3 decimals. The requested user is never part
        of the result.

    Raises:
        ValueError: If `user_id` is not in `user_item_df`.
    """
    try:
        idx = user_item_df.index.get_loc(user_id)
    except KeyError:
        raise ValueError(
            f'user_id {user_id!r} is not present in user_item_df'
        ) from None

    sims = user_sim_matrix[idx]
    # Stable sort on the negated scores: descending similarity, with ties
    # resolved by matrix order so the result is reproducible.
    order = np.argsort(-sims, kind='stable')
    # Exclude the user by position rather than assuming they rank first:
    # another user with an identical rating vector also scores 1.0.
    neighbours = [i for i in order if i != idx][:top_k]
    return [
        (user_item_df.index[i], round(float(sims[i]), 3)) for i in neighbours
    ]

In [25]:
# Next step: take movies from similar users to suggest to our user.
# TF-IDF representation of the movie text profiles (English stop words
# removed, vocabulary capped at 5000 terms).
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf.fit_transform(movies['item_profile'].fillna(value=''))

def get_candidate_items(user_id, n_candidates=20):
    """Collect movies liked by similar users that `user_id` has not rated.

    Reads the module-level `ratings` table and calls `get_similar_users`
    for the 10 nearest users.

    Args:
        user_id: Target user.
        n_candidates: Maximum number of candidate ids to return.

    Returns:
        List of tmdbId values. Candidates are ordered by the summed
        similarity of the neighbours who rated them 4.0 or higher (ties
        broken by id), so the cut-off keeps the best-supported movies and
        the result is deterministic.
    """
    watched = set(ratings[ratings['userId'] == user_id]['tmdbId'])
    similar_users = get_similar_users(user_id, top_k=10)

    scores = {}
    for sim_uid, sim_score in similar_users:
        # Both comparisons are parenthesised: `&` binds tighter than `>=`,
        # so without them the mask would not mean "this user AND liked".
        liked_mask = (ratings['userId'] == sim_uid) & (ratings['rating'] >= 4.0)
        for tmdb_id in ratings.loc[liked_mask, 'tmdbId'].tolist():
            if tmdb_id in watched:
                continue
            scores[tmdb_id] = scores.get(tmdb_id, 0.0) + sim_score

    ranked = sorted(scores, key=lambda tmdb_id: (-scores[tmdb_id], tmdb_id))
    return ranked[:n_candidates]

In [26]:
def build_recommendation_promt(target_user_id, candidate_movis_ids, similar_users):
    """Assemble the LLM prompt for one user.

    Reads the module-level `user_profiles` dict and `movies` table.

    Args:
        target_user_id: User the recommendations are for; must be a key of
            `user_profiles`.
        candidate_movis_ids: Iterable of movie ids (values of movies['id'])
            to offer as candidates. Ids absent from `movies` are skipped.
        similar_users: Iterable of (user_id, similarity) tuples. Users
            without an entry in `user_profiles` are skipped.

    Returns:
        The prompt text: target user profile, numbered candidate movies,
        similar users, instructions and the required JSON output format.

    Raises:
        KeyError: If `target_user_id` has no profile.
    """
    # Profile of the target user
    user_text = user_profiles[target_user_id]['text_profile']

    # Movies that may interest the user. Profiles are looked up in one pass;
    # when an id occurs on several rows the first one is used.
    candidate_ids = list(candidate_movis_ids)
    rows = movies.loc[
        movies['id'].isin(candidate_ids), ['id', 'item_profile']
    ].drop_duplicates(subset='id')
    profile_by_id = dict(zip(rows['id'], rows['item_profile']))

    candidate_lines = []
    for mid in candidate_ids:
        if mid not in profile_by_id:
            continue
        # Numbering follows the movies actually listed, so it has no gaps.
        candidate_lines.append(f"{len(candidate_lines) + 1}. {profile_by_id[mid]}")
    candidates_text = '\n'.join(candidate_lines)

    # Similar users
    similar_users_lines = []
    for sim_uid, sim_score in similar_users:
        sim_profile = user_profiles.get(sim_uid)
        if sim_profile is None:
            continue
        similar_users_lines.append(
            f"- {sim_profile['text_profile']} (similarity: {sim_score})"
        )
    similar_text = '\n'.join(similar_users_lines)

    # Prompt built from similarity and interests. Section titles match the
    # names the instructions refer to (CANDIDATE MOVIES, SIMILAR USERS).
    promt = '\n'.join([
        'You are a movie recommendation expert. Your task is to recommend',
        'movies and find similar users based on viewing preferences.',
        '',
        'TARGET USER:',
        user_text,
        '',
        'CANDIDATE MOVIES:',
        candidates_text,
        '',
        'SIMILAR USERS (by rating patterns):',
        similar_text,
        '',
        'INSTRUCTIONS:',
        "1. Analyze the target user's genre preferences, liked and disliked movies.",
        '2. From the CANDIDATE MOVIES list, select the TOP 10 movies that best match',
        "this user's taste. For each movie, explain why it fits.",
        '3. From the SIMILAR USERS list, select the TOP 3 most similar users.',
        'For each, explain what makes their taste similar to the target user.',
        "4. Note that the user's most recently watched movies are the most indicative",
        'of their current preferences.',
        '',
        'Return ONLY valid JSON in this exact format:',
        '{',
        '"recommended_movies": [',
        '    {"rank": 1, "title": "...", "reason": "..."},',
        '    ...',
        '],',
        '"similar_users": [',
        '    {"user_id": ..., "similarity_reason": "..."},',
        '    ...',
        ']',
        '}',
    ])

    return promt

In [None]:
# Read the Perplexity key from the environment (optionally populated from a .env file).
load_dotenv()
api_key = os.getenv("PERPLEXITY_API_KEY")
if not api_key:
    # Fail here with a clear message. Passing None on would leave key
    # resolution to the OpenAI client, which is documented to fall back to
    # the OPENAI_API_KEY environment variable - a key meant for another
    # service would then be sent to the Perplexity endpoint.
    raise RuntimeError(
        "PERPLEXITY_API_KEY is not set; define it in the environment or in .env"
    )

# Perplexity exposes an OpenAI-compatible API, so the OpenAI client is pointed at its base URL.
client = OpenAI(
    api_key=api_key,
    base_url="https://api.perplexity.ai"
)

# JSON schema the model's reply has to follow.
_RECOMMENDATION_SCHEMA = {
    "type": "object",
    "properties": {
        "recommended_movies": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "rank": {"type": "integer"},
                    "title": {"type": "string"},
                    "reason": {"type": "string"}
                },
                "required": ["rank", "title", "reason"]
            }
        },
        "similar_users": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "user_id": {"type": "integer"},
                    "similarity_reason": {"type": "string"}
                },
                "required": ["user_id", "similarity_reason"]
            }
        }
    },
    "required": ["recommended_movies", "similar_users"]
}


def _parse_llm_json(content):
    """Decode the model reply, tolerating extra text around the JSON object."""
    if not content:
        raise ValueError("The model returned an empty response")
    try:
        return json.loads(content)
    except json.JSONDecodeError:
        # Replies are sometimes wrapped in code fences or prefixed with text;
        # retry on the outermost {...} span before giving up.
        start, end = content.find('{'), content.rfind('}')
        if start == -1 or end <= start:
            raise
        return json.loads(content[start:end + 1])


def get_recommendations(user_id):
    """Ask the LLM for movie recommendations and similar users for `user_id`.

    Builds the prompt from the 5 most similar users and up to 20 candidate
    movies, sends it through the module-level `client`, and decodes the reply.

    Args:
        user_id: Target user; must be known to `get_similar_users` and
            `user_profiles`.

    Returns:
        Dict parsed from the model's JSON reply; the requested schema has
        the keys 'recommended_movies' and 'similar_users'.

    Raises:
        ValueError: If the reply is empty or contains no decodable JSON
            (json.JSONDecodeError is a ValueError subclass).
    """
    similar_users = get_similar_users(user_id, top_k=5)
    candidate_ids = get_candidate_items(user_id, n_candidates=20)

    promt = build_recommendation_promt(user_id, candidate_ids, similar_users)

    # LLM call; the low temperature keeps the output close to deterministic.
    response = client.chat.completions.create(
        model="sonar-pro",
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a precise movie recommendation engine. "
                    "Use ONLY the data provided. Do NOT search the web."
                )
            },
            {"role": "user", "content": promt}
        ],
        temperature=0.2,
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "recommendations",
                "schema": _RECOMMENDATION_SCHEMA
            }
        }
    )

    # Parse the reply into a readable structure
    return _parse_llm_json(response.choices[0].message.content)

In [None]:
# Fetch the recommendations for user 20.
result = get_recommendations(20)

In [None]:
# Print both result sections: a heading followed by one line per entry.
for heading, key, render in (
    ("Рекомендуемые фильмы:", 'recommended_movies',
     lambda m: f"  {m['rank']}. {m['title']} — {m['reason']}"),
    ("Похожие пользователи:", 'similar_users',
     lambda u: f"  User {u['user_id']} — {u['similarity_reason']}"),
):
    print(heading)
    for entry in result[key]:
        print(render(entry))


Рекомендуемые фильмы:
  1. The Incredibles — Animated comedy-adventure similar to liked Kung Fu Panda 3, matching comedy genre preference and praised by similar users 509 and 461.
  2. Napoleon Dynamite — Quirky comedy liked by similar user 509 (4.0), aligns with strong comedy preference and recent likes like Team America: World Police.
  3. Little Miss Sunshine — Comedy-drama liked by similar user 388 (4.0), fits comedy focus and recent wedding comedy like Mike and Dave Need Wedding Dates.
  4. Step Brothers — Absurd comedy liked by similar user 388 (4.0), matches recent likes like Father of the Bride and The Man Who Knew Too Little.
  5. The Nice Guys — Comedy-crime film liked by similar user 73 (4.0), complements comedy genre and directors like Soderbergh/Fincher style.
  6. WALL·E — Animated film similar to Kung Fu Panda per search results, fits family comedy and liked animated Kung Fu Panda 3.
  7. Ratatouille — Animated comedy with animal protagonist, matches Kung Fu Panda style 