In [3]:
import re

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Load the movie catalogue from the ml-latest dataset folder.
movies = pd.read_csv(filepath_or_buffer='D:/ml-latest/ml-latest/movies.csv')

In [5]:
# Load the remaining tables from the same ml-latest folder.
# The four reads are independent of each other, so their order does not matter.
ratings = pd.read_csv('D:/ml-latest/ml-latest/ratings.csv')
tags = pd.read_csv('D:/ml-latest/ml-latest/tags.csv')
genome_tags = pd.read_csv('D:/ml-latest/ml-latest/genome-tags.csv')
genome_scores = pd.read_csv('D:/ml-latest/ml-latest/genome-scores.csv')

In [6]:
# Attach the tag text from genome_tags to every genome score row (matched on tagId).
# A left join keeps every score row even if its tagId has no entry in genome_tags.
merged_genome = pd.merge(genome_scores, genome_tags, how='left', on='tagId')

In [7]:
# Add the genome (tagId, relevance, tag) rows to each movie.
# A left join keeps movies that have no genome data; their genome columns are NaN.
merged_movies = pd.merge(movies, merged_genome, how='left', on='movieId')

In [8]:
# Preview the first ten rows of the movie/genome join.
merged_movies.head(n=10)

Unnamed: 0,movieId,title,genres,tagId,relevance,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,0.032,007
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2.0,0.02225,007 (series)
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,0.07,18th century
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0,0.059,1920s
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,0.123,1930s
5,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6.0,0.131,1950s
6,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,0.06175,1960s
7,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,0.1955,1970s
8,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,9.0,0.26625,1980s
9,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10.0,0.033,19th century


In [9]:
# Interpret the raw timestamp values as seconds since the Unix epoch and store them as datetimes.
ratings['timestamp'] = pd.to_datetime(arg=ratings['timestamp'], unit='s')

In [10]:
# Preview the first five ratings rows after the timestamp conversion.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,2008-11-03 17:52:19
1,1,110,4.0,2008-11-05 06:04:46
2,1,158,4.0,2008-11-03 17:31:43
3,1,260,4.5,2008-11-03 18:00:04
4,1,356,5.0,2008-11-03 17:58:39


In [11]:
# Interpret the raw timestamp values as seconds since the Unix epoch and store them as datetimes.
tags['timestamp'] = pd.to_datetime(arg=tags['timestamp'], unit='s')

In [12]:
# Preview the first five user-tag rows after the timestamp conversion.
tags.head(n=5)

Unnamed: 0,userId,movieId,tag,timestamp
0,10,260,good vs evil,2015-05-03 15:22:38
1,10,260,Harrison Ford,2015-05-03 15:21:45
2,10,260,sci-fi,2015-05-03 15:22:18
3,14,1221,Al Pacino,2011-07-25 13:32:36
4,14,1221,mafia,2011-07-25 13:32:26


Обработка данных

Задача 1

In [13]:
# Aggregate ratings per movie: mean rating and number of ratings for each movieId.
agg_ratings = (
    ratings.groupby('movieId')['rating']
    .agg(['mean', 'count'])
    # Rename the statistics to the names used by the scoring cells further down.
    .rename(columns={'mean': 'vote_average', 'count': 'vote_count'})
    # Turn movieId back into an ordinary column so it can be used as a merge key.
    .reset_index()
)

In [29]:
# Combine movie metadata with its rating statistics.
# The inner join (the default for merge) drops movies without any rating.
movies_with_ratings = pd.merge(movies, agg_ratings, how='inner', on='movieId')

In [30]:
# C: mean of the per-movie average ratings (each movie counts once, regardless of vote count).
C = movies_with_ratings.loc[:, 'vote_average'].mean()

In [38]:
# m: vote-count threshold, set to the 90th percentile of ratings-per-movie.
m = movies_with_ratings.loc[:, 'vote_count'].quantile(q=0.90)

In [45]:
def has_sequel(title):
    """Return True when ``title`` contains a sequel-style keyword as a whole word.

    Note: despite the name, this flags titles that *look like* a sequel
    (e.g. "Godfather: Part II, The (1974)"); it is a keyword heuristic and
    knows nothing about whether a film actually has follow-ups.

    Keywords are matched case-sensitively on word boundaries, so "Part" no
    longer fires on "Party" and "Final" no longer fires on "Finalist".

    Args:
        title: Movie title. Non-string values (e.g. NaN from a missing title)
            are treated as "not a sequel" instead of raising TypeError.

    Returns:
        bool: True if any keyword occurs as a standalone word in ``title``.
    """
    sequel_keywords = ["II", "III", "Part", "Returns", "Revenge", "Resurrection", "Final", "Chapter"]
    if not isinstance(title, str):
        return False
    # \b on both sides restricts each keyword to a whole-word match.
    pattern = r"\b(?:" + "|".join(re.escape(keyword) for keyword in sequel_keywords) + r")\b"
    return re.search(pattern, title) is not None

In [41]:
# Keep only the movies whose number of ratings is at least the threshold m.
# .copy() detaches the result from movies_with_ratings so it can be modified freely.
qualified_movies = movies_with_ratings.loc[
    movies_with_ratings['vote_count'].ge(m)
].copy()

In [46]:
# Remove titles flagged by has_sequel.
# .copy() makes the filtered result an independent frame (as the threshold filter does),
# so adding a column to it later does not raise SettingWithCopyWarning.
qualified_movies = qualified_movies[~qualified_movies['title'].apply(has_sequel)].copy()

In [25]:
def weighted_rating(x, m=m, C=C):
    """Compute the IMDB-style weighted rating for ``x``.

    WR = v / (v + m) * R  +  m / (m + v) * C

    where ``v`` is ``x['vote_count']`` and ``R`` is ``x['vote_average']``.

    Args:
        x: Mapping-like object with 'vote_count' and 'vote_average' entries.
        m: Vote-count weight of the prior. The default is the value the global
            ``m`` had when this ``def`` was executed; re-run this cell after
            recomputing ``m`` to pick up a new value.
        C: Prior mean rating. The default is captured the same way as ``m``.

    Returns:
        The weighted rating.
    """
    votes = x['vote_count']
    average = x['vote_average']
    # Calculation based on the IMDB formula: blend the movie's own average
    # with the prior C, weighted by how many votes the movie has.
    own_weight = votes / (votes + m)
    prior_weight = m / (m + votes)
    return (own_weight * average) + (prior_weight * C)

In [47]:
# Score all qualifying movies in a single vectorized call: weighted_rating only does
# column lookups and arithmetic, so it accepts the whole frame as well as a single row.
# This avoids a slow Python-level apply(axis=1) over every row.
# .assign returns a new frame instead of writing a column into a filtered slice,
# and the result is sorted by the weighted score, best first.
qualified_movies = (
    qualified_movies
    .assign(score=weighted_rating(qualified_movies))
    .sort_values('score', ascending=False)
)

# Show the top 10 movies with title, mean rating, rating count, weighted score and genres.
# Leaving the frame as the cell's last expression uses the notebook's rich table display.
qualified_movies[['title', 'vote_average', 'vote_count', 'score', 'genres']].head(10)

                                                  title  vote_average  \
314                    Shawshank Redemption, The (1994)      4.416792   
49                           Usual Suspects, The (1995)      4.267865   
522                             Schindler's List (1993)      4.242337   
2867                                  Fight Club (1999)      4.236019   
292                                 Pulp Fiction (1994)      4.191778   
1164             One Flew Over the Cuckoo's Nest (1975)      4.212801   
2480                                 Matrix, The (1999)      4.160631   
585                    Silence of the Lambs, The (1991)      4.150287   
1182                                  Goodfellas (1990)      4.191918   
734   Dr. Strangelove or: How I Learned to Stop Worr...      4.199991   

      vote_count     score                       genres  
314       122296  4.374471                  Crime|Drama  
49         72893  4.207519       Crime|Mystery|Thriller  
522        84232  4.19

Shawshank Redemption, The (1994)      4.416792      122296   
840                     Godfather, The (1972)      4.326603       75004   
49                 Usual Suspects, The (1995)      4.267865       72893   
522                   Schindler's List (1993)      4.242337       84232   
2867                        Fight Club (1999)      4.236019       86207   
1190           Godfather: Part II, The (1974)      4.269510       47271   
292                       Pulp Fiction (1994)      4.191778      108756   
1164   One Flew Over the Cuckoo's Nest (1975)      4.212801       49316   
12221                 Dark Knight, The (2008)      4.187539       65349   
2480                       Matrix, The (1999)      4.160631      107056   

Задача 2

In [None]:
# Keep only (movie, tag) rows whose genome relevance is at least 0.6.
# Rows with a missing relevance (movies without genome data) compare False and drop out.
filtered_movies = merged_movies.loc[merged_movies['relevance'].ge(0.6)]

In [None]:
# Build one text "document" per movie by joining its high-relevance genome tags.
# `movies` has no 'tag' column (tags only exist on the genome-merged frames),
# so the tags must come from filtered_movies.
movie_tag_docs = (
    filtered_movies
    .dropna(subset=['tag'])
    .groupby('movieId')['tag']
    .agg(lambda movie_tags: ' '.join(map(str, movie_tags)))
)

vectorizer = TfidfVectorizer()
# Row i of tfidf_matrix describes the movie whose id is movie_tag_docs.index[i].
tfidf_matrix = vectorizer.fit_transform(movie_tag_docs)