# 2110010114 李佳琪

## 基于知识推荐

练习：运用 MovieTweetings 数据集构建基于约束的推荐系统，于11月30日前提交到BB平台。

假设有一个电影推荐系统，电影有属性标签，用户有兴趣标签

In [1]:
class KnowledgeBasedRecommender:
    """Constraint-based recommender matching user interest tags to movie tags.

    A movie is recommended only when its tags contain every one of the
    user's interest tags.
    """

    def __init__(self, user_interests, movie_attributes):
        """Store the inputs.

        user_interests   - iterable of interest tags; kept as a set
        movie_attributes - mapping of movie title -> iterable of attribute tags
        """
        self.movie_attributes = movie_attributes
        self.user_interests = set(user_interests)

    def recommend_movies(self):
        """Return the titles, in catalogue order, whose tags cover all interests."""
        wanted = self.user_interests
        return [
            title
            for title, tags in self.movie_attributes.items()
            if wanted.issubset(tags)
        ]

# Sample data: the user's interest tags and the attribute tags of each movie
user_interests = ["科幻", "动作"]
movie_attributes = {
    "电影1": ["科幻", "动作", "悬疑"],
    "电影2": ["科幻", "动作", "喜剧"],
    "电影3": ["悬疑", "惊悚"],
    "电影4": ["喜剧", "爱情"],
}

# Build the recommender
recommender = KnowledgeBasedRecommender(user_interests, movie_attributes)

# Get the recommended movies
recommendations = recommender.recommend_movies()

# Print the recommendations
print("推荐电影:", recommendations)


推荐电影: ['电影1', '电影2']


### 数据预处理

#### 加载库 & 读取数据

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
%matplotlib inline

In [3]:
# Load the raw files. Fields are separated by '::'; a separator longer than
# one character is parsed with the Python engine, hence engine='python'.
# Identifier columns are read as object so their text form is kept as-is.
movies = pd.read_csv(
    'D:\\大三上\\推荐系统\\第三次作业\\RS with MovieTweeting\\movies.dat',
    delimiter='::',
    header=None,
    names=['movie_id', 'movie', 'genre'],
    dtype={'movie_id': object},
    engine='python',
)
reviews = pd.read_csv(
    'D:\\大三上\\推荐系统\\第三次作业\\RS with MovieTweeting\\ratings.dat',
    delimiter='::',
    header=None,
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
    dtype={'movie_id': object, 'user_id': object, 'timestamp': object},
    engine='python',
)

In [4]:
# number of movies
print("The number of movies is {}.".format(movies.shape[0]))

# number of ratings
print("The number of ratings is {}.".format(reviews.shape[0]))

# unique users
print("The number of unique users is {}.".format(reviews.user_id.nunique()))

# missing ratings: count the nulls directly. Scaling the null *fraction* back
# up by the row count goes through floating point, and int() can then
# truncate the product to one less than the true count.
print("The number of missing reviews is {}.".format(int(reviews.rating.isnull().sum())))

# the average (rounded to a whole number), min, and max ratings given
print("The average, minimum, and max ratings given are {}, {}, and {}, respectively.".format(np.round(reviews.rating.mean(), 0), reviews.rating.min(), reviews.rating.max()))

The number of movies is 37342.
The number of ratings is 906831.
The number of unique users is 70783.
The number of missing reviews is 0.
The average, minimum, and max ratings given are 7.0, 0, and 10, respectively.


In [5]:
movies

Unnamed: 0,movie_id,movie,genre
0,0000008,Edison Kinetoscopic Record of a Sneeze (1894),Documentary|Short
1,0000010,La sortie des usines Lumière (1895),Documentary|Short
2,0000012,The Arrival of a Train (1896),Documentary|Short
3,25,The Oxford and Cambridge University Boat Race ...,
4,0000091,Le manoir du diable (1896),Short|Horror
...,...,...,...
37337,14499632,22 vs. Earth (2021),Animation|Short|Adventure
37338,14527836,Recalled (2021),Drama|Mystery|Thriller
37339,14544192,Bo Burnham: Inside (2021),Comedy|Drama|Music
37340,14735160,Mum is Pregnant (2021),


In [6]:
reviews

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,0114508,8,1381006850
1,2,0499549,9,1376753198
2,2,1305591,8,1376742507
3,2,1428538,1,1371307089
4,3,0075314,1,1595468524
...,...,...,...,...
906826,70781,9893250,10,1613857551
906827,70781,9898858,3,1585958452
906828,70782,0172495,10,1587107015
906829,70782,0414387,10,1587107852


In [7]:
# Number of genres: gather every distinct label found in the
# pipe-separated `genre` column.
genres = set()
for val in movies.genre:
    # Entries without a `split` method (e.g. missing genres) are skipped
    if hasattr(val, 'split'):
        genres.update(val.split('|'))

print("The number of genres is {}.".format(len(genres)))

The number of genres is 28.


In [8]:
genres

{'Action',
 'Adult',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Film-Noir',
 'Game-Show',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Reality-TV',
 'Romance',
 'Sci-Fi',
 'Short',
 'Sport',
 'Talk-Show',
 'Thriller',
 'War',
 'Western'}

#### 数据清洗


* movies

从标题中提取日期并创建新的特征；

为电影所属的世纪（1800 年代、1900 年代、2000 年代）各建一列，用 0/1 指示电影是否属于该世纪

为每个 genre 各建一列，用 1/0 指示电影是否属于该类型

* reviews

timestamp转换为datetime

In [9]:
def create_date(val):
    """Return the four characters inside the trailing parentheses of a title.

    For a title such as 'Name (1994)' this yields '1994'. Anything that is
    not a string ending in ')' (including an empty string or a missing
    value) yields NaN instead of raising.
    """
    if isinstance(val, str) and val.endswith(')'):
        return val[-5:-1]
    return np.nan

# Derive the 'date' column from each movie title via create_date
# (the text inside the trailing parentheses, e.g. '1894')
movies['date'] = movies['movie'].apply(create_date)

def add_movie_year(val, prefix=None):
    """Return 1 if the year string `val` starts with the century prefix, else 0.

    val    - year as text (e.g. '1894'); non-string values such as NaN give 0
             instead of raising
    prefix - two-character century prefix such as '18'; when omitted, the
             module-level variable `yr` is used, which keeps the original
             one-argument call working
    """
    if prefix is None:
        prefix = yr
    if not isinstance(val, str):
        return 0
    return 1 if val[:2] == prefix else 0

    
# Add one 0/1 column per century ("1800's", "1900's", "2000's").
# NOTE: the loop variable must stay named `yr`; add_movie_year reads it as a
# global when it is called with a single argument.
for yr in ['18', '19', '20']:
    movies[str(yr) + "00's"] = movies['date'].apply(add_movie_year)

In [10]:
movies

Unnamed: 0,movie_id,movie,genre,date,1800's,1900's,2000's
0,0000008,Edison Kinetoscopic Record of a Sneeze (1894),Documentary|Short,1894,1,0,0
1,0000010,La sortie des usines Lumière (1895),Documentary|Short,1895,1,0,0
2,0000012,The Arrival of a Train (1896),Documentary|Short,1896,1,0,0
3,25,The Oxford and Cambridge University Boat Race ...,,1895,1,0,0
4,0000091,Le manoir du diable (1896),Short|Horror,1896,1,0,0
...,...,...,...,...,...,...,...
37337,14499632,22 vs. Earth (2021),Animation|Short|Adventure,2021,0,0,1
37338,14527836,Recalled (2021),Drama|Mystery|Thriller,2021,0,0,1
37339,14544192,Bo Burnham: Inside (2021),Comedy|Drama|Music,2021,0,0,1
37340,14735160,Mum is Pregnant (2021),,2021,0,0,1


In [11]:
def split_genres(val, genre=None):
    """Return 1 if the pipe-separated genre string `val` lists `genre`, else 0.

    val   - genre string such as 'Comedy|Drama'; non-string values (e.g. a
            missing genre) give 0
    genre - the label to look for; when omitted, the module-level variable
            `gene` is used, which keeps the original one-argument call working

    Labels are compared whole. A substring test would report 'Music' for
    every film tagged 'Musical'.
    """
    if genre is None:
        genre = gene
    if not isinstance(val, str):
        return 0
    return 1 if genre in val.split('|') else 0

# Add one 0/1 indicator column per genre.
# NOTE: the loop variable must stay named `gene`; split_genres reads it as a
# global when it is called with a single argument.
for gene in genres:        
    movies[gene] = movies['genre'].apply(split_genres)

In [12]:
movies

Unnamed: 0,movie_id,movie,genre,date,1800's,1900's,2000's,News,Crime,Adventure,...,Western,Musical,Family,History,Game-Show,Mystery,Reality-TV,Drama,Sci-Fi,Biography
0,0000008,Edison Kinetoscopic Record of a Sneeze (1894),Documentary|Short,1894,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0000010,La sortie des usines Lumière (1895),Documentary|Short,1895,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0000012,The Arrival of a Train (1896),Documentary|Short,1896,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,25,The Oxford and Cambridge University Boat Race ...,,1895,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0000091,Le manoir du diable (1896),Short|Horror,1896,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37337,14499632,22 vs. Earth (2021),Animation|Short|Adventure,2021,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
37338,14527836,Recalled (2021),Drama|Mystery|Thriller,2021,0,0,1,0,0,0,...,0,0,0,0,0,1,0,1,0,0
37339,14544192,Bo Burnham: Inside (2021),Comedy|Drama|Music,2021,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
37340,14735160,Mum is Pregnant (2021),,2021,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
import datetime
def change_timestamp(val):
    """Format a timestamp given in seconds as 'YYYY-MM-DD HH:MM:SS'.

    `val` is converted with int() first, so numeric strings are accepted.
    No timezone is passed to fromtimestamp, so the result is expressed in the
    local time of the machine running the code.
    """
    moment = datetime.datetime.fromtimestamp(int(val))
    return moment.strftime('%Y-%m-%d %H:%M:%S')
# Add a formatted 'date' column computed from the raw 'timestamp' column
reviews['date'] = reviews['timestamp'].apply(change_timestamp)

In [14]:
reviews

Unnamed: 0,user_id,movie_id,rating,timestamp,date
0,1,0114508,8,1381006850,2013-10-06 05:00:50
1,2,0499549,9,1376753198,2013-08-17 23:26:38
2,2,1305591,8,1376742507,2013-08-17 20:28:27
3,2,1428538,1,1371307089,2013-06-15 22:38:09
4,3,0075314,1,1595468524,2020-07-23 09:42:04
...,...,...,...,...,...
906826,70781,9893250,10,1613857551,2021-02-21 05:45:51
906827,70781,9898858,3,1585958452,2020-04-04 08:00:52
906828,70782,0172495,10,1587107015,2020-04-17 15:03:35
906829,70782,0414387,10,1587107852,2020-04-17 15:17:32


In [15]:
# Save the cleaned datasets. to_csv is called with its defaults, so the row
# index is written as an extra leading column; the cell that reloads these
# files removes that column again ('Unnamed: 0').
reviews.to_csv('reviews_clean.csv')
movies.to_csv('movies_clean.csv')

### 推荐最受欢迎的电影

In [16]:
# Reload the cleaned datasets. The index saved by to_csv comes back as an
# 'Unnamed: 0' column, which is dropped straight away.
movies = pd.read_csv('movies_clean.csv').drop(columns='Unnamed: 0')
reviews = pd.read_csv('reviews_clean.csv').drop(columns='Unnamed: 0')

* 平均评分越高，电影越好

* 一部电影至少有5个评分

* 如果两部电影的平均评分和评分次数都相同，则以最近一次评分的时间为准（评分时间越新，排名越靠前）

获取一个user_id并返回TOP-N推荐

在实现popular_recommendations函数的代码之前，先建立create_ranked_df的函数

这个辅助函数将电影和评论数据帧转换为ranked_movies数据帧：依次按平均评分、评分次数和最近评分时间从高到低排序，并且只保留至少有5个评分的电影。

In [17]:
def create_ranked_df(movies, reviews):
    '''
    Rank movies by popularity.

    INPUT
    movies - the movies dataframe; must contain a 'movie_id' column
    reviews - the reviews dataframe; must contain 'movie_id', 'rating' and
              'date' columns

    OUTPUT
    ranked_movies - the movies dataframe indexed by movie_id, extended with
                    'avg_rating', 'num_ratings' and 'last_rating', sorted by
                    highest avg rating, then more ratings, then most recent
                    rating, and restricted to movies with at least 5 ratings
    '''
    # Aggregate only the columns that are needed. Taking max() over every
    # column of the grouped frame just to keep 'date' is wasted work.
    rating_stats = reviews.groupby('movie_id').agg(
        avg_rating=('rating', 'mean'),
        num_ratings=('rating', 'count'),
        last_rating=('date', 'max'),
    )

    # Attach the statistics to every movie, matching on movie_id
    movie_recs = movies.set_index('movie_id').join(rating_stats)

    # Best first: average rating, then number of ratings, then recency
    ranked_movies = movie_recs.sort_values(
        ['avg_rating', 'num_ratings', 'last_rating'], ascending=False
    )

    # Keep movies with 5 or more ratings; movies that were never rated have
    # a NaN count and fail this comparison as well.
    return ranked_movies[ranked_movies['num_ratings'] >= 5]

In [18]:
def popular_recommendations(user_id, n_top, ranked_movies):
    '''
    Return the titles of the first n_top rows of an already ranked dataframe.

    INPUT:
    user_id - the user the recommendations are requested for; not used in the
              selection, so every user receives the same list
    n_top - an integer of the number recommendations you want back
    ranked_movies - a pandas dataframe of the already ranked movies; its
                    'movie' column holds the titles

    OUTPUT:
    top_movies - a list of the n_top recommended movies by movie title in order best to worst
    '''
    titles = ranked_movies['movie']
    return list(titles.head(n_top))

In [19]:
# Build the popularity ranking once; the recommendation calls below reuse it
ranked_movies = create_ranked_df(movies, reviews) 

In [20]:
# The 20 most popular movies, requested for the user with user_id=1
recs_20_for_1=popular_recommendations('1', 20, ranked_movies)
recs_20_for_1

['MSG 2 the Messenger (2015)',
 'Avengers: Age of Ultron Parody (2015)',
 'Five Minutes (2017)',
 'Selam (2013)',
 'Let There Be Light (2017)',
 "Quiet Riot: Well Now You're Here, There's No Way Back (2014)",
 'Crawl Bitch Crawl (2012)',
 'Skid Row Marathon (2017)',
 'Chasing Happiness (2019)',
 'Make Like a Dog (2015)',
 'Pandorica (2016)',
 'Third Contact (2011)',
 'Romeo Juliet (2009)',
 'Be Somebody (2016)',
 'Birlesen Gonuller (2014)',
 'Kitbull (2019)',
 'Agnelli (2017)',
 'Sátántangó (1994)',
 'Foster (2011)',
 'CM101MMXI Fundamentals (2013)']

In [21]:
# The 15 most popular movies, requested for the user with user_id=3988
recs_15_for_3988=popular_recommendations('3988', 15, ranked_movies)
recs_15_for_3988

['MSG 2 the Messenger (2015)',
 'Avengers: Age of Ultron Parody (2015)',
 'Five Minutes (2017)',
 'Selam (2013)',
 'Let There Be Light (2017)',
 "Quiet Riot: Well Now You're Here, There's No Way Back (2014)",
 'Crawl Bitch Crawl (2012)',
 'Skid Row Marathon (2017)',
 'Chasing Happiness (2019)',
 'Make Like a Dog (2015)',
 'Pandorica (2016)',
 'Third Contact (2011)',
 'Romeo Juliet (2009)',
 'Be Somebody (2016)',
 'Birlesen Gonuller (2014)']

### 个性化推荐

添加参数，对电影的year和genre进行过滤

使用下面的单元格调整现有函数，允许年份和类型参数作为字符串列表。然后，你的结果将被过滤为仅在提供的年份和类型(作为或条件)列表内的电影。如果没有提供列表，则不应应用过滤器。

可根据需要调整其他输入，检索结果

In [22]:
def popular_recs_filtered(user_id, n_top, years=None, genres=None,
                          reviews_df=None, movies_df=None):
    '''
    Return the most popular movies, optionally restricted by year and genre.

    INPUT:
    user_id - the user the recommendations are requested for; not used in the
              selection, so every user receives the same list
    n_top - an integer of the number recommendations you want back
    years - list of release years (ints or numeric strings) to keep, or None
            to skip the year filter
    genres - list of genre column names; a movie is kept when at least one of
             them is flagged (OR condition). None skips the genre filter
    reviews_df - reviews dataframe with 'movie_id', 'rating' and 'date'
                 columns; defaults to the module-level `reviews`
    movies_df - movies dataframe with 'movie_id', 'movie', 'date' and one 0/1
                column per genre; defaults to the module-level `movies`

    OUTPUT:
    top_movies - a list of up to n_top movie titles in order best to worst

    Movies are ordered by average rating, then number of ratings, then most
    recent rating, and need at least 5 ratings to be considered.
    '''
    # Fall back to the notebook-level frames so existing calls keep working
    if reviews_df is None:
        reviews_df = reviews
    if movies_df is None:
        movies_df = movies

    # Per-movie average rating, number of ratings and latest rating date
    stats = reviews_df.groupby('movie_id').agg({'rating': ['mean', 'count'], 'date': ['max']})
    stats.columns = ['average_rating', 'count_review', 'last_review']
    stats = stats.sort_values(['average_rating', 'count_review', 'last_review'], ascending=False)

    # Attach the movie attributes and drop movies with fewer than 5 ratings
    candidates = stats.merge(movies_df, how='left', on='movie_id')
    candidates = candidates.loc[candidates['count_review'] >= 5]

    if genres is not None:
        # Keep a movie when any of the requested genre flags is set
        candidates = candidates[candidates[list(genres)].sum(axis=1).values >= 1]

    if years is not None:
        # Compare numerically so that 2015, '2015' and 2015.0 all match,
        # whatever type the 'date' column was loaded as. Unparseable
        # requested years are dropped so they cannot match missing dates.
        wanted_years = pd.to_numeric(
            pd.Series(list(years), dtype=object), errors='coerce'
        ).dropna()
        movie_years = pd.to_numeric(candidates['date'], errors='coerce')
        candidates = candidates[movie_years.isin(wanted_years).values]

    return list(candidates['movie'].values[:n_top])

In [23]:
# Top 20 for user 1: released 2015-2018 and tagged History or Comedy
my_output1 = popular_recs_filtered(1, 20, years=[2015, 2016, 2017, 2018], genres=['History', 'Comedy'])
my_output1

['MSG 2 the Messenger (2015)',
 'Avengers: Age of Ultron Parody (2015)',
 'Five Minutes (2017)',
 'Make Like a Dog (2015)',
 'Be Somebody (2016)',
 'Kirik Party (2016)',
 'Poshter Girl (2016)',
 "Hillary's America: The Secret History of the Democratic Party (2016)",
 'I Believe in Miracles (2015)',
 'Bajrangi Bhaijaan (2015)',
 'Inner Workings (2016)',
 'Jim Jefferies: Freedumb (2016)',
 'A Date for Mad Mary (2016)',
 'O.J.: Made in America (2016)',
 'Coco (2017)',
 'Ayla: The Daughter of War (2017)',
 'Hacksaw Ridge (2016)',
 'They Shall Not Grow Old (2018)',
 'Green Book (2018)',
 'Boruto: Naruto the Movie (2015)']

In [24]:
# Top 15 for user 3988: released 2020-2021 and tagged Music, Musical or Crime
my_output2 = popular_recs_filtered(3988, 15, years=[2020, 2021], genres=['Music', 'Musical', 'Crime'])
my_output2

['Hamilton (2020)',
 "Billie Eilish: The World's a Little Blurry (2021)",
 'Clouds (2020)',
 'Soul (2020)',
 'Bo Burnham: Inside (2021)',
 'Raat Akeli Hai (2020)',
 'Black Is King (2020)',
 'Operation Varsity Blues: The College Admissions Scandal (2021)',
 'Cruella (2021)',
 'Biggie: I Got a Story to Tell (2021)',
 'Athlete A (2020)',
 'Promising Young Woman (2020)',
 'Work It (2020)',
 'Bad Boys for Life (2020)',
 'The Devil All the Time (2020)']