# Hệ gợi ý phim

## 0. Setup

In [74]:
# /*==========================================================================================*\
# **                        _           _ _   _     _  _         _                            **
# **                       | |__  _   _/ | |_| |__ | || |  _ __ | |__                         **
# **                       | '_ \| | | | | __| '_ \| || |_| '_ \| '_ \                        **
# **                       | |_) | |_| | | |_| | | |__   _| | | | | | |                       **
# **                       |_.__/ \__,_|_|\__|_| |_|  |_| |_| |_|_| |_|                       **
# \*==========================================================================================*/

%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

import warnings; warnings.simplefilter('ignore')

## 1. Load Dataset

In [75]:
df  = pd.read_csv('movies_metadata.csv')
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


## 2. Hệ gợi ý đơn giản

In [76]:
def SimpleRcmdSystem(data, genres = "", percentile = 0.95, custom_format = False, output_format = []):
    if(not custom_format):
        data['genres']  = data['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
        if(genres == ""):
            output_format = ['title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres'];
        else:
            output_format = ['title', 'year', 'vote_count', 'vote_average', 'popularity'];
            s = data.apply(
                lambda x: pd.Series(x['genres']),axis=1
            ).stack().reset_index(level=1, drop=True)
            s.name = 'genre'
            gen_data = data.drop('genres', axis=1).join(s)
            data = gen_data[gen_data['genre'] == genres];
        
        data['year'] = pd.to_datetime(
            data['release_date'], errors='coerce'
        ).apply(
            lambda x: str(x).split('-')[0] if x != np.nan else np.nan
        )


    vote_counts   = data[data['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = data[data['vote_average'].notnull()]['vote_average'].astype('int')

    C = vote_averages.mean()
    m = vote_counts.quantile(percentile)
     
    qualified = data[
        (data['vote_count'] >= m) & 
        (data['vote_count'].notnull()) & 
        (data['vote_average'].notnull())
    ][
        output_format
    ]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')

    # print('Qualified Shape:', qualified.shape)
    
    def weighted_rating(x):
        v = x['vote_count']
        R = x['vote_average']
        return (v/(v+m) * R) + (m/(m+v) * C)

    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(250)
    return qualified



In [77]:
SimpleRcmdSystem(data=df.copy(), percentile=0.95).head(30)

Unnamed: 0,title,year,vote_count,vote_average,popularity,genres,wr
15480,Inception,2010,14075,8,29.108149,"[Action, Thriller, Science Fiction, Mystery, A...",7.917588
12481,The Dark Knight,2008,12269,8,123.167259,"[Drama, Action, Crime, Thriller]",7.905871
22879,Interstellar,2014,11187,8,32.213481,"[Adventure, Drama, Science Fiction]",7.897107
2843,Fight Club,1999,9678,8,63.869599,[Drama],7.881753
4863,The Lord of the Rings: The Fellowship of the Ring,2001,8892,8,32.070725,"[Adventure, Fantasy, Action]",7.871787
292,Pulp Fiction,1994,8670,8,140.950236,"[Thriller, Crime]",7.86866
314,The Shawshank Redemption,1994,8358,8,51.645403,"[Drama, Crime]",7.864
7000,The Lord of the Rings: The Return of the King,2003,8226,8,29.324358,"[Adventure, Fantasy, Action]",7.861927
351,Forrest Gump,1994,8147,8,48.307194,"[Comedy, Drama, Romance]",7.860656
5814,The Lord of the Rings: The Two Towers,2002,7641,8,29.423537,"[Adventure, Fantasy, Action]",7.851924


In [78]:
SimpleRcmdSystem(data=df.copy(), genres='Animation', percentile=0.85).head(30)

Unnamed: 0,title,year,vote_count,vote_average,popularity,wr
359,The Lion King,1994,5520,8,21.605761,7.909339
5481,Spirited Away,2001,3968,8,41.048867,7.875933
9698,Howl's Moving Castle,2004,2049,8,16.136048,7.772103
2884,Princess Mononoke,1997,2041,8,17.166725,7.771305
5833,My Neighbor Totoro,1988,1730,8,13.507299,7.735274
40251,Your Name.,2016,1030,8,34.461252,7.58982
5553,Grave of the Fireflies,1988,974,8,0.010902,7.570962
19901,Paperman,2012,734,8,7.198633,7.465676
39386,Piper,2016,487,8,11.243161,7.285132
20779,Wolf Children,2012,483,8,10.249498,7.281198


## 3. Content Based Recommender

### 3.0. Tổng hợp dữ liệu

In [79]:
links_small = pd.read_csv('links_small.csv')
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')

links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')

df = df.drop([19730, 29503, 35587])
df['id'] = df['id'].astype('int')

df = df.merge(credits, on='id')
df = df.merge(keywords, on='id')
df['genres'] = df['genres'].fillna('[]').apply(literal_eval).apply(
    lambda x: [i['name'] for i in x] if isinstance(x, list) else []
)
sdf = df[df['id'].isin(links_small)]


def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

id_map = pd.read_csv('links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(sdf[['title', 'id']], on='id').set_index('title')
indices_map = id_map.set_index('id')

### 3.1. Đánh giá theo tổng quan phim

In [80]:
def DescriptionBasedTrainer(data):
    data['tagline'] = data['tagline'].fillna('')
    data['description'] = data['overview'] + data['tagline']
    data['description'] = data['description'].fillna('')
    tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
    tfidf_matrix = tf.fit_transform(data['description'])
    tfidf_matrix.shape
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    data = data.reset_index()
    titles = data['title']
    indices = pd.Series(data.index, index=data['title'])
    return (cosine_sim, titles, indices)


### 3.2. Đánh giá theo chi tiết phim

In [81]:
def MetadataBasedTrainer(data):
    data['cast'] = data['cast'].apply(literal_eval)
    data['crew'] = data['crew'].apply(literal_eval)
    data['keywords'] = data['keywords'].apply(literal_eval)
    data['cast_size'] = data['cast'].apply(lambda x: len(x))
    data['crew_size'] = data['crew'].apply(lambda x: len(x))

    def get_director(x):
        for i in x:
            if(i['job'] == 'Director'):
                return i['name']
        return np.nan

    # Director
    data['director'] = data['crew'].apply(get_director)
    data['director'] = data['director'].astype('str').apply(
        lambda x: str.lower(x.replace(" ", ""))
    )
    data['director'] = data['director'].apply(lambda x: [x, x, x])

    # Cast
    data['cast']     = data['cast'].apply(
        lambda x: [i['name'] for i in x] if isinstance(x, list) else []
    )
    data['cast']     = data['cast'].apply(
        lambda x: x[:3] if(len(x) >= 3) else x
    )
    data['cast']     = data['cast'].apply(
        lambda x: [str.lower(i.replace(" ", "")) for i in x]
    )

    # Keyword
    data['keywords'] = data['keywords'].apply(
        lambda x: [i['name'] for i in x] if isinstance(x, list) else []
    )
    s = data.apply(
        lambda x: pd.Series(x['keywords']),axis=1
    ).stack().reset_index(level=1, drop=True)
    s.name = 'keyword'
    s = s.value_counts()
    s = s[s > 1]
    stemmer = SnowballStemmer('english')
    def filter_keywords(x):
        words = []
        for i in x:
            if i in s:
                words.append(i)
        return words

    data['keywords'] = data['keywords'].apply(filter_keywords)
    data['keywords'] = data['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
    data['keywords'] = data['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])


    # Agg. Data
    data['soup'] = data['keywords'] + data['cast'] + data['director'] + data['genres']
    data['soup'] = data['soup'].apply(lambda x: ' '.join(x))

    # Training
    count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
    count_matrix = count.fit_transform(data['soup'])
    cosine_sim = cosine_similarity(count_matrix, count_matrix)
    data = data.reset_index()
    titles = data['title']
    indices = pd.Series(data.index, index=data['title'])
    return (cosine_sim, titles, indices)

### 3.3. Dự đoán

In [82]:
def ContentBasedRcmdSystem__Predict(title, cosine_sim, titles, indices):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]


def ContentBasedRcmdSystem__Predict(refdata, title, cosine_sim, titles, indices):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]

    fmt = ['title', 'vote_count', 'vote_average']
    movies = refdata.iloc[movie_indices][fmt]
    return movies


def ContentBasedRcmdSystem__ImprovedPredict(refdata, title, cosine_sim, titles, indices):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]

    fmt = ['title', 'vote_count', 'vote_average']
    movies = refdata.iloc[movie_indices][fmt]
    return SimpleRcmdSystem(data = movies, percentile = 0.6, custom_format = True, output_format = fmt);



In [83]:
(cosine_sim, titles, indices) = DescriptionBasedTrainer(sdf.copy());

In [84]:
ContentBasedRcmdSystem__Predict(sdf, 'The Little Mermaid', cosine_sim, titles, indices).head(10)

Unnamed: 0,title,vote_count,vote_average
5747,Mysterious Island,38.0,6.2
1957,Flight of the Navigator,227.0,6.9
1536,George of the Jungle,508.0,5.4
2011,Splash,328.0,6.1
12722,The Chronicles of Narnia: Prince Caspian,1668.0,6.3
552,The Pagemaster,178.0,6.2
9518,Lost Embrace,10.0,5.9
1024,Alice in Wonderland,1557.0,7.0
7352,Laws of Attraction,104.0,5.6
3712,Sinbad and the Eye of the Tiger,39.0,6.3


In [85]:
ContentBasedRcmdSystem__ImprovedPredict(sdf, 'The Little Mermaid', cosine_sim, titles, indices).head(10)

Unnamed: 0,title,vote_count,vote_average,wr
25245,Song of the Sea,420,8,7.23112
1024,Alice in Wonderland,1557,7,6.851711
2562,The Rocky Horror Picture Show,703,7,6.717788
1194,The Big Blue,364,7,6.560073
1924,The Poseidon Adventure,274,7,6.48343
19329,Ice Age: Continental Drift,2731,6,5.991842
12722,The Chronicles of Narnia: Prince Caspian,1668,6,5.987302
2011,Splash,328,6,5.957483
7291,The Prince & Me,286,5,5.413053
1536,George of the Jungle,508,5,5.290887


In [86]:
(cosine_sim, titles, indices) = MetadataBasedTrainer(sdf.copy());

In [87]:
ContentBasedRcmdSystem__Predict(sdf, 'The Little Mermaid', cosine_sim, titles, indices).head(10)

Unnamed: 0,title,vote_count,vote_average
1959,The Great Mouse Detective,334.0,6.9
1515,Hercules,1741.0,7.3
14668,The Princess and the Frog,1293.0,6.7
581,Aladdin,3495.0,7.4
5806,Treasure Planet,980.0,7.2
13409,Ponyo,953.0,7.5
10409,Valiant,239.0,5.2
588,Beauty and the Beast,3029.0,7.5
16307,Tangled,3419.0,7.4
2007,Sleeping Beauty,1332.0,6.8


In [88]:
ContentBasedRcmdSystem__ImprovedPredict(sdf, 'The Little Mermaid', cosine_sim, titles, indices).head(10)

Unnamed: 0,title,vote_count,vote_average,wr
12812,WALL·E,6439,7,6.895583
21302,Monsters University,3622,7,6.83453
581,Aladdin,3495,7,6.83005
16307,Tangled,3419,7,6.827251
588,Beauty and the Beast,3029,7,6.811305
1818,Mulan,2089,7,6.757306
1515,Hercules,1741,7,6.728548
4272,Atlantis: The Lost Empire,1257,6,6.162508
14668,The Princess and the Frog,1293,6,6.160158
2007,Sleeping Beauty,1332,6,6.157687


## 4. Collaborative Filtering

In [89]:
ratings = pd.read_csv('ratings_small.csv')

def CollaborativeFilteringTrainer(ratings):
    reader = Reader()
    ratings.head()
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
    svd = SVD()
    cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    return svd

def MoviePredict(model, user_id, film_id):
    return model.predict(user_id, film_id);


SVD = CollaborativeFilteringTrainer(ratings);


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9008  0.8962  0.8940  0.8931  0.8988  0.8966  0.0029  
MAE (testset)     0.6914  0.6904  0.6861  0.6900  0.6927  0.6901  0.0022  
Fit time          3.72    3.80    3.82    3.68    3.77    3.76    0.05    
Test time         0.12    0.26    0.12    0.12    0.12    0.15    0.05    


In [90]:
MoviePredict(SVD, 1, 302)

Prediction(uid=1, iid=302, r_ui=None, est=2.891522695165438, details={'was_impossible': False})

## 5. Hybrid System

In [101]:
def HybridRcmdSystem__Predict(userId, title, refdata, cfmodel):
    idx = indices[title]
    tmdbId = id_map.loc[title]['id']
    #print(idx)
    movie_id = id_map.loc[title]['movieId']
    fmt = ['title', 'vote_count', 'vote_average', 'id']
    
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    movies = refdata.iloc[movie_indices][fmt]
    movies['est'] = movies['id'].apply(lambda x: cfmodel.predict(userId, indices_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)

    

    return movies.head(10)
    # return SimpleRcmdSystem(data = movies, percentile = 0.6, custom_format = True, output_format = fmt)

In [107]:
HybridRcmdSystem__Predict(1, 'The Little Mermaid', sdf, SVD)


Unnamed: 0,title,vote_count,vote_average,id,est
23076,Interstellar,11187.0,8.1,157336,3.371335
4126,Memento,4168.0,8.1,77,3.344043
15651,Inception,14075.0,8.1,27205,3.255442
11463,The Prestige,4510.0,8.0,1124,3.201394
14151,Gangster's Paradise: Jerusalema,16.0,6.8,22600,2.927666
7111,The Enforcer,21.0,7.4,26712,2.915643
9212,Thursday,84.0,7.0,9812,2.783937
10210,Batman Begins,7511.0,7.5,272,2.726594
15083,Harry Brown,351.0,6.7,25941,2.726398
18225,Batman: Year One,255.0,7.1,69735,2.69993


In [109]:
HybridRcmdSystem__Predict(500, 'The Little Mermaid', sdf, SVD)


Unnamed: 0,title,vote_count,vote_average,id,est
3336,Creature Comforts,29.0,7.3,54825,3.477688
13237,Bolt,1798.0,6.3,13053,3.357056
13409,Ponyo,953.0,7.5,12429,3.300386
1818,Mulan,2089.0,7.6,10674,3.288261
2944,Robin Hood,1155.0,7.0,11886,3.086891
1515,Hercules,1741.0,7.3,11970,3.084802
16307,Tangled,3419.0,7.4,38757,3.050919
581,Aladdin,3495.0,7.4,812,3.048169
588,Beauty and the Beast,3029.0,7.5,10020,2.981721
19433,Dragon Ball GT: A Hero's Legacy,55.0,6.2,18095,2.943866


In [110]:
HybridRcmdSystem__Predict(600, 'The Little Mermaid', sdf, SVD)

Unnamed: 0,title,vote_count,vote_average,id,est
16307,Tangled,3419.0,7.4,38757,4.117981
581,Aladdin,3495.0,7.4,812,4.114972
13409,Ponyo,953.0,7.5,12429,3.935455
588,Beauty and the Beast,3029.0,7.5,10020,3.935314
3336,Creature Comforts,29.0,7.3,54825,3.882443
2944,Robin Hood,1155.0,7.0,11886,3.872352
1515,Hercules,1741.0,7.3,11970,3.741375
5806,Treasure Planet,980.0,7.2,9016,3.736415
1959,The Great Mouse Detective,334.0,6.9,9994,3.736014
5362,Hey Arnold! The Movie,62.0,5.6,17710,3.717295
