# Movie Recommender System
- Author: Alexis
- Created: 2021/10/2
- Updated: 2022/2/27

The content-based recommendation system uses the TF-IDF vector of the movie metadata to represent the movie, calculates the cosine similarity of the vector as the movie similarity, sorts the similarity, and takes the top 10 movies as the recommendation result.

## Text Pre-processing
- Dataset: movies_metadata.csv (45,466 rows)

### Processing for Metadata

In [None]:
import pandas as pd

# Directory that holds the dataset CSV files
PATH = '/content/drive/MyDrive/shared_folder/dataset/imdb/'

# Load the movie metadata table
df_metadata = pd.read_csv(PATH + 'movies_metadata.csv')

# Show the (rows, columns) shape of the loaded table
df_metadata.shape

  interactivity=interactivity, compiler=compiler, result=result)


(45466, 24)

In [None]:
# Show the column names
print(df_metadata.columns)

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')


In [None]:
# Show the first 5 rows
df_metadata.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.85949,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.38752,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [None]:
# Keep only the columns used in the rest of this notebook
df_metadata = df_metadata[['genres','id','overview','status','original_title']]
df_metadata.head()

Unnamed: 0,genres,id,overview,status,original_title
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,"Led by Woody, Andy's toys live happily in his ...",Released,Toy Story
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,When siblings Judy and Peter discover an encha...,Released,Jumanji
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,A family wedding reignites the ancient feud be...,Released,Grumpier Old Men
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,"Cheated on, mistreated and stepped on, the wom...",Released,Waiting to Exhale
4,"[{'id': 35, 'name': 'Comedy'}]",11862,Just when George Banks has recovered from his ...,Released,Father of the Bride Part II


In [None]:
# List the distinct values of the status column. A bare expression in the
# middle of a cell is not displayed, so print it explicitly.
print(df_metadata['status'].unique())

# Keep only the rows whose status is 'Released'. Rows with a missing status
# are dropped as well, because NaN never compares equal to 'Released'.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not operate on a slice of the original frame
# (which triggers pandas' SettingWithCopyWarning).
df_metadata = df_metadata[df_metadata['status'] == 'Released'].copy()
df_metadata

Unnamed: 0,genres,id,overview,status,original_title
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,"Led by Woody, Andy's toys live happily in his ...",Released,Toy Story
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,When siblings Judy and Peter discover an encha...,Released,Jumanji
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,A family wedding reignites the ancient feud be...,Released,Grumpier Old Men
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,"Cheated on, mistreated and stepped on, the wom...",Released,Waiting to Exhale
4,"[{'id': 35, 'name': 'Comedy'}]",11862,Just when George Banks has recovered from his ...,Released,Father of the Bride Part II
...,...,...,...,...,...
45461,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",439050,Rising and falling between a man and woman.,Released,رگ خواب
45462,"[{'id': 18, 'name': 'Drama'}]",111109,An artist struggles to finish his work while a...,Released,Siglo ng Pagluluwal
45463,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",67758,"When one of her hits goes wrong, a professiona...",Released,Betrayal
45464,[],227506,"In a small town live two brothers, one a minis...",Released,Satana likuyushchiy


In [None]:
# Replace missing overviews (NaN) with an empty string
df_metadata['overview'] = df_metadata['overview'].fillna('')
df_metadata['overview']

0        Led by Woody, Andy's toys live happily in his ...
1        When siblings Judy and Peter discover an encha...
2        A family wedding reignites the ancient feud be...
3        Cheated on, mistreated and stepped on, the wom...
4        Just when George Banks has recovered from his ...
                               ...                        
45461          Rising and falling between a man and woman.
45462    An artist struggles to finish his work while a...
45463    When one of her hits goes wrong, a professiona...
45464    In a small town live two brothers, one a minis...
45465    50 years after decriminalisation of homosexual...
Name: overview, Length: 45014, dtype: object

In [None]:
# 文字前處理
import nltk
nltk.download('wordnet')

import re
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Normalise a piece of free text into a space-separated token string.

    The text is tokenised with gensim's ``simple_preprocess``; tokens found in
    gensim's ``STOPWORDS`` are discarded, and each remaining token is
    lemmatised with the module-level WordNet ``lemmatizer``.

    Args:
        text: The raw text to process.

    Returns:
        The surviving lemmatised tokens joined by single spaces.
    """
    def _lemmatize(word):
        # Verb pass first, then a noun pass on the result.
        as_verb = lemmatizer.lemmatize(word, pos='v')
        return lemmatizer.lemmatize(as_verb, pos='n')

    kept_words = (word for word in simple_preprocess(text) if word not in STOPWORDS)
    return ' '.join(_lemmatize(word) for word in kept_words)

# Cast every overview to str, then run the text pre-processing on each one
df_metadata['overview'] = df_metadata['overview'].astype('str')
df_metadata['overview'] = df_metadata['overview'].apply(preprocess)
# Show the processed overview of the row labelled 0
df_metadata['overview'][0]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


'lead woody andy toy live happily room andy birthday bring buzz lightyear scene afraid lose place andy heart woody plot buzz circumstance separate buzz woody owner duo eventually learn aside difference'

In [None]:
# Show the raw genres value of the row labelled 0
df_metadata['genres'][0]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [None]:
# *** eval()是用來執行一個字串表達式(執行字串內容)
# *** literal_eval()則先判斷該表達式是否安全
from ast import literal_eval

# Extract every 'name' value and join them with spaces
def get_names_from_json_string(a_string):
    """Return the 'name' values found in a stringified list of dicts.

    Despite the function name, the input is parsed with ``ast.literal_eval``
    (a Python literal such as "[{'id': 16, 'name': 'Animation'}]"), not with a
    JSON parser.

    Args:
        a_string: String representation of a list of dicts that each have a
            'name' key. Missing values (None, NaN or any other non-string)
            and blank strings are accepted.

    Returns:
        The names joined by single spaces, in their original order; '' when
        the input is missing, blank, or an empty list.

    Raises:
        ValueError, SyntaxError: If a non-blank string is not a valid literal.
        KeyError: If an entry has no 'name' key.
    """
    # pandas represents a missing cell as NaN (a float), and a blank string
    # is not a parsable literal; treat both as "no names" instead of crashing.
    if not isinstance(a_string, str) or not a_string.strip():
        return ''
    a_list = literal_eval(a_string)
    return ' '.join(a_dict['name'] for a_dict in a_list)

# Apply get_names_from_json_string to every row's genres value, replacing the
# raw string with the space-joined genre names
df_metadata['genres'] = df_metadata['genres'].apply(get_names_from_json_string)
df_metadata['genres'][0]

'Animation Comedy Family'

### Processing for Credits

In [None]:
# Load the credits table
df_credits = pd.read_csv(PATH + 'imdb_credits.csv')

# Show the first row
df_credits.head(1)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862


In [None]:
# Show the parsed cast value of the row labelled 0
literal_eval(df_credits['cast'][0])

[{'cast_id': 14,
  'character': 'Woody (voice)',
  'credit_id': '52fe4284c3a36847f8024f95',
  'gender': 2,
  'id': 31,
  'name': 'Tom Hanks',
  'order': 0,
  'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'},
 {'cast_id': 15,
  'character': 'Buzz Lightyear (voice)',
  'credit_id': '52fe4284c3a36847f8024f99',
  'gender': 2,
  'id': 12898,
  'name': 'Tim Allen',
  'order': 1,
  'profile_path': '/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg'},
 {'cast_id': 16,
  'character': 'Mr. Potato Head (voice)',
  'credit_id': '52fe4284c3a36847f8024f9d',
  'gender': 2,
  'id': 7167,
  'name': 'Don Rickles',
  'order': 2,
  'profile_path': '/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg'},
 {'cast_id': 17,
  'character': 'Slinky Dog (voice)',
  'credit_id': '52fe4284c3a36847f8024fa1',
  'gender': 2,
  'id': 12899,
  'name': 'Jim Varney',
  'order': 3,
  'profile_path': '/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg'},
 {'cast_id': 18,
  'character': 'Rex (voice)',
  'credit_id': '52fe4284c3a36847f8024fa5',
  'gender': 2,
  'id': 12900,
 

In [None]:
# Replace the raw cast string with all cast member names joined by spaces
df_credits['cast'] = df_credits['cast'].apply(get_names_from_json_string)

# Show the resulting cast string of the row labelled 0
df_credits['cast'][0]

'Tom Hanks Tim Allen Don Rickles Jim Varney Wallace Shawn John Ratzenberger Annie Potts John Morris Erik von Detten Laurie Metcalf R. Lee Ermey Sarah Freeman Penn Jillette'

In [None]:
# Show the parsed crew value of the row labelled 0
literal_eval(df_credits['crew'][0])

[{'credit_id': '52fe4284c3a36847f8024f49',
  'department': 'Directing',
  'gender': 2,
  'id': 7879,
  'job': 'Director',
  'name': 'John Lasseter',
  'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f4f',
  'department': 'Writing',
  'gender': 2,
  'id': 12891,
  'job': 'Screenplay',
  'name': 'Joss Whedon',
  'profile_path': '/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f55',
  'department': 'Writing',
  'gender': 2,
  'id': 7,
  'job': 'Screenplay',
  'name': 'Andrew Stanton',
  'profile_path': '/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f5b',
  'department': 'Writing',
  'gender': 2,
  'id': 12892,
  'job': 'Screenplay',
  'name': 'Joel Cohen',
  'profile_path': '/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f61',
  'department': 'Writing',
  'gender': 0,
  'id': 12893,
  'job': 'Screenplay',
  'name': 'Alec Sokolow',
  'profile_path': '/v79vlRYi94BZUQnkkyzn

In [None]:
# Extract the director's name from the crew column
# (only the first entry whose job is 'Director' is used)

def get_director_name_from_json_string(string_):
    """Return the name of the first crew member whose job is 'Director'.

    The input is parsed with ``ast.literal_eval`` (a Python literal such as
    "[{'job': 'Director', 'name': 'John Lasseter'}]"), not with a JSON parser.

    Args:
        string_: String representation of a list of dicts that each have a
            'job' key (and a 'name' key for the matching entry). Missing
            values (None, NaN or any other non-string) and blank strings are
            accepted.

    Returns:
        The director's name, or '' when the input is missing or blank or no
        entry has the job 'Director'.

    Raises:
        ValueError, SyntaxError: If a non-blank string is not a valid literal.
        KeyError: If an entry has no 'job' key.
    """
    # pandas represents a missing cell as NaN (a float), and a blank string
    # is not a parsable literal; treat both as "no director" instead of crashing.
    if not isinstance(string_, str) or not string_.strip():
        return ''
    for dict_ in literal_eval(string_):
        if dict_['job'] == 'Director':
            return dict_['name']
    return ''

# Derive a director column from the raw crew string of every row
df_credits['director'] = df_credits['crew'].apply(get_director_name_from_json_string)

# Drop the crew column, which is not used after this point
del df_credits['crew']

# Show the director of the row labelled 0
df_credits['director'][0]

'John Lasseter'

### Processing for Keywords

In [None]:
# Load the keywords table
df_keywords = pd.read_csv(PATH + 'imdb_keywords.csv')

# Show the first row
df_keywords.head(1)

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."


In [None]:
# Show the parsed keywords value of the row labelled 0
literal_eval(df_keywords['keywords'][0])

[{'id': 931, 'name': 'jealousy'},
 {'id': 4290, 'name': 'toy'},
 {'id': 5202, 'name': 'boy'},
 {'id': 6054, 'name': 'friendship'},
 {'id': 9713, 'name': 'friends'},
 {'id': 9823, 'name': 'rivalry'},
 {'id': 165503, 'name': 'boy next door'},
 {'id': 170722, 'name': 'new toy'},
 {'id': 187065, 'name': 'toy comes to life'}]

In [None]:
# Replace the raw keywords string of every row with the keyword names joined
# by spaces (get_names_from_json_string is applied row by row)
df_keywords['keywords'] = df_keywords['keywords'].apply(get_names_from_json_string)

# Show the resulting keywords string of the row labelled 0
df_keywords['keywords'][0]

'jealousy toy boy friendship friends rivalry boy next door new toy toy comes to life'

### Merge DataFrames

In [None]:
# Cast the id column to string in all three tables so the merge keys share
# one dtype
df_metadata['id'] = df_metadata['id'].astype('str')
df_credits['id'] = df_credits['id'].astype('str')
df_keywords['id'] = df_keywords['id'].astype('str')

# Merge the three tables on id, dropping fully duplicated rows after each
# merge. No `how` is given, so pandas' default join type applies.
df_data = df_metadata.merge(df_credits, on='id').drop_duplicates()
df_data = df_data.merge(df_keywords, on='id').drop_duplicates()

# Show the first row
df_data.head(1)

Unnamed: 0,genres,id,overview,status,original_title,cast,director,keywords
0,Animation Comedy Family,862,lead woody andy toy live happily room andy bir...,Released,Toy Story,Tom Hanks Tim Allen Don Rickles Jim Varney Wal...,John Lasseter,jealousy toy boy friendship friends rivalry bo...


In [None]:
df_data.shape

(44985, 8)

In [None]:
# Drop every row that contains a NaN in any column
df_data = df_data.dropna()
df_data.shape

(44985, 8)

In [None]:
# Merge six text columns into one lower-cased string, separated by spaces
def combine_columns(row):
    """Build the combined text of one movie.

    Args:
        row: Record indexable by column name (e.g. a DataFrame row) whose
            'genres', 'overview', 'original_title', 'cast', 'director' and
            'keywords' entries are strings.

    Returns:
        The six values joined by single spaces, in that order, lower-cased.
    """
    fields = ('genres', 'overview', 'original_title', 'cast', 'director', 'keywords')
    pieces = [row[field] for field in fields]
    return ' '.join(pieces).lower()

# Build the combined text of every row (axis=1 passes one row at a time)
df_data['combined_txt'] = df_data.apply(combine_columns, axis=1)

# Show combined_txt for the first five rows
df_data[['combined_txt']].head()

Unnamed: 0,combined_txt
0,animation comedy family lead woody andy toy li...
1,adventure fantasy family sibling judy peter di...
2,romance comedy family wed reignite ancient feu...
3,comedy drama romance cheat mistreat step woman...
4,comedy george bank recover daughter wed receiv...


In [None]:
#import pickle

#with open('/content/drive/MyDrive/Colab Notebooks/推薦系統/movie_texts.pkl','wb') as f:
#    pickle.dump(df_data, f)

#with open('/content/drive/MyDrive/Colab Notebooks/推薦系統/movie_texts.pkl','rb') as f:
#    df_data = pickle.load(f)
#    print(df_data.shape)

#df_data

## Vector Representation of Documents

In [None]:
# Build the TF-IDF representation of every movie

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# fit_transform accepts a DataFrame column (Series) as well as a plain Python list
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_data['combined_txt'])

# Show (number of documents, vocabulary size). Printed explicitly because a
# bare expression in the middle of a cell is not displayed.
print(tfidf_matrix.shape)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
# .tolist() turns the returned array into plain Python strings so the output
# below prints the same way as before. The vocabulary is fetched once and reused.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out().tolist()
else:
    feature_names = tfidf.get_feature_names()

# Show the vocabulary size
print(len(feature_names))

# Show the first 10 vocabulary terms
print(feature_names[:10])

173219
['00', '000', '002', '008', '009', '009ノ1', '01', '02', '03', '04']


### Similarities of Documents

In [None]:
# Build the lookup between row numbers and movie titles:
# the values are the row numbers and the index labels are the movie titles

import pandas as pd

# reset_index() renumbers the rows from 0; since drop=True is not passed, the
# previous index is kept as a regular column
df_data = df_data.reset_index()
indices = pd.Series(df_data.index, index=df_data['original_title'])

In [None]:
# Reference: https://kanoki.org/2019/11/12/how-to-use-regex-in-pandas/
# Disabled helper for searching titles by pattern:
#   foo = df_metadata
#   foo[foo.original_title.str.contains('Matrix', regex=True, na=False)]

#movie_title = 'Toy Story'
movie_title = 'The Matrix'

# Look up the row number of the movie. Titles are not guaranteed to be unique,
# and indexing the Series with a repeated label returns several rows, so
# select by mask and take the first match to always get a single integer.
matching_rows = indices[indices.index == movie_title]
if matching_rows.empty:
    # Same exception type a plain label lookup raises for an unknown title
    raise KeyError(movie_title)
movie_idx = int(matching_rows.iloc[0])
print(movie_idx)

# Take that movie's TF-IDF row from the matrix
movie_vector = tfidf_matrix[movie_idx]
#movie_vector = movie_vector.reshape(-1,1)

2445


In [None]:
# Compute the cosine similarity between the query vector and every movie
# vector (the result is a 2-D array)

from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(movie_vector,tfidf_matrix)
print(similarity.shape)

# Convert to nested Python lists and keep the first row
similarity = similarity.tolist()[0]

(1, 44985)


### Ranking of Documents

In [None]:
TOP_N = 10  # number of recommendations to produce

# Row number(s) of the query movie. movie_idx is normally a single integer,
# but a label lookup on a repeated title yields several rows, so normalise it
# to a set of plain ints either way.
query_rows = set(np.atleast_1d(movie_idx).tolist())

# Pair every movie's row number with its similarity score, leaving out the
# query movie itself. Excluding it by row number is safer than dropping
# rank 0: when another movie ties with the query at the top score, the query
# is not guaranteed to sort first.
sim_scores = [
    (row, score) for row, score in enumerate(similarity) if row not in query_rows
]

# Sort by similarity score, highest first
sorted_sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)

# Keep the TOP_N most similar movies
top_sim_scores = sorted_sim_scores[:TOP_N]

# Row numbers of those movies
movie_indices = [row for row, _ in top_sim_scores]
print(movie_indices)

# Map row numbers back to titles. `indices` stores the row numbers 0..N-1 in
# row order, so row i is the i-th label of its index; this avoids scanning
# the whole Series once per recommended movie.
recommend_movies = [indices.index[row] for row in movie_indices]
recommend_movies

[6747, 6189, 43432, 43720, 9324, 43714, 6481, 43716, 20574, 4221]


['The Matrix Revolutions',
 'The Matrix Reloaded',
 'The Matrix Revisited',
 "Kid's Story",
 'The Animatrix',
 'A Detective Story',
 'Commando',
 'World Record',
 'Underground: The Julian Assange Story',
 'A.I. Artificial Intelligence']