In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os

In [2]:
# traverse root directory, and list directories as dirs and files as files
for root, dirs, files in os.walk("."):
    path = root.split(os.sep)
    print((len(path) - 1) * '---', os.path.basename(root))
    for file in files:
        print(len(path) * '---', file)

 .
--- Simple-Recomender-System.ipynb
--- README.md
--- datasets
------ movies_metadata.csv
------ keywords.csv
------ credits.csv
--- .ipynb_checkpoints
------ Simple-Recomender-System-checkpoint.ipynb


In [3]:
# kaggle datasets
metadt = pd.read_csv('./datasets/movies_metadata.csv', low_memory=False)
print("Raw data shape: ", metadt.shape)
# 5k data (if you only have low compute engine)
metadt = metadt[:5000]
print("Sample data shape: ", metadt.shape)
metadt.head(3)

Raw data shape:  (45466, 24)
Sample data shape:  (5000, 24)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


# Recommender system using weighted rating

Penggunaan rating untuk rekomendasi memiliki kekurangan:  
* rating tidak memberikan gambaran popularitas produk. Produk A memilki rating 9.5 dari 20 voters sedangkan produk B memiliki rating 8.7 dari 1000 voters. Mana yang lebih baik? tentu saja produk B, rating yang diberikan oleh lebih banyak user lebih terpercaya dibandingkan rating tinggi dengan sedikit user
  
Oleh karena itu, perlu dilakukan pembobotan rating sebagaimana rumus dibawah ini:  
\begin{equation}
\text Weighted Rating (\bf WR) = \left({{\bf v} \over {\bf v} + {\bf m}} \cdot R\right) + \left({{\bf m} \over {\bf v} + {\bf m}} \cdot C\right)
\end{equation}  
Keterangan:  
v = jumlah voters  
m = minimum votes yang dibutuhkan untuk masuk dalam list  
R = rata-rata rating  
C = mean atau rata-rata vote secara keseluruhan

In [4]:
# average rating in datasets
C = metadt.vote_average.mean()
print(C)

# calculate minimum number of vote (m)
# here i'm gonna using 90% percentile
m = metadt.vote_count.quantile(0.9)
print(m)

6.069160000000003
568.1000000000004


In [5]:
# filter movie with vote_count more than m
top_movies = metadt.copy().loc[metadt['vote_count'] > m]
print('shape:', metadt.shape)
print('shape:', top_movies.shape)

shape: (5000, 24)
shape: (500, 24)


In [6]:
def weighted_rating(data, m=m, C=C):
    v = data['vote_count']
    R = data['vote_average']
    return (v/(v+m) * R) + (m/(v+m) * C)

In [7]:
top_movies['weighted_rating'] = top_movies.apply(weighted_rating, axis=1)

In [8]:
top_movies = top_movies.sort_values('weighted_rating', ascending=False)
print('shape:', top_movies.shape)
top_movies[['title', 'vote_count', 'vote_average', 'weighted_rating']].head(10)

shape: (500, 25)


Unnamed: 0,title,vote_count,vote_average,weighted_rating
314,The Shawshank Redemption,8358.0,8.5,8.34529
834,The Godfather,6024.0,8.5,8.290513
2843,Fight Club,9678.0,8.3,8.17631
292,Pulp Fiction,8670.0,8.3,8.162814
351,Forrest Gump,8147.0,8.2,8.0611
522,Schindler's List,4436.0,8.3,8.04674
1154,The Empire Strikes Back,5998.0,8.2,8.015639
2211,Life Is Beautiful,3643.0,8.3,7.999048
1178,The Godfather: Part II,3418.0,8.3,7.98206
289,Leon: The Professional,4293.0,8.2,7.950976


# Content-based Recommender
Natural Language Processing (TF-IDF) using **overview** feature on dataset

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# create TF-IDF object and remove all stop_word like 'and', 'or', 'the', etc.
tfidf = TfidfVectorizer(stop_words='english')

metadt['overview'] = metadt['overview'].fillna('')
tfidf_matrix = tfidf.fit_transform(metadt['overview'])

tfidf_matrix.shape

(5000, 22304)

Selanjutnya hitung nilai similarity. Score similarity dapat ditentukan menggunakan manhattan distance, euclidean distance, pearson atau cosine similarity. Berikut merupakan rumus untuk menghitung cosine similarity:
<img src='https://res.cloudinary.com/dyd911kmh/image/upload/f_auto,q_auto:best/v1590782185/cos_aalkpq.png'/>
  
kita akan menggunakan sklearn **linear_kernel()** karena lebih cepat dibandingkan **cosine_similarity()**

In [10]:
from sklearn.metrics.pairwise import linear_kernel
cos_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
print(cos_sim.shape)
cos_sim[1]

(5000, 5000)


array([0.0176179 , 1.        , 0.04691288, ..., 0.        , 0.        ,
       0.        ])

In [11]:
# reverse mapping title and index for recommender function
indices = pd.Series(metadt.index, index=metadt['title']).drop_duplicates()

In [12]:
def get_recommendation(title, cos_sim=cos_sim):
    ind = indices[title] # find index for title
    sim = list(enumerate(cos_sim[ind])) # enumerate all cosine similarity for the title
    sim = sorted(sim, key=lambda x: x[1], reverse=True) # sorted by second column (cosine similarity)
    sim = sim[1:10] # get top 10 highest similarity
    movie_indices = [ x[0] for x in sim ]
    return metadt['title'].loc[movie_indices]

In [13]:
title_sample = metadt.iloc[0]['title']
print(title_sample)
get_recommendation(title_sample)

Toy Story


2997              Toy Story 2
1071    Rebel Without a Cause
3057          Man on the Moon
485                    Malice
1932                Condorman
448         For Love or Money
1032            The Sunchaser
2157        Indecent Proposal
3252          Bound for Glory
Name: title, dtype: object

## add more feature recommendation

In [14]:
credits = pd.read_csv('./datasets/credits.csv')
keywords = pd.read_csv('./datasets/keywords.csv')

In [15]:
def checkInteger(data):
    try:
        int(data)
    except ValueError:
        return True

bad_id = [x for x in metadt.id if checkInteger(x)]
print(bad_id)
index_id = metadt.loc[metadt['id'].isin(bad_id)].index
metadt.drop(index_id, axis=0, inplace=True)

[]


In [16]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
metadt['id'] = metadt['id'].astype('int')

#merge keywords and credits to metadt
metadt = metadt.merge(keywords, on='id')
metadt = metadt.merge(credits, on='id')
metadt.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,keywords,cast,crew
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."


In [17]:
# stringified list
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    metadt[feature] = metadt[feature].apply(literal_eval)

In [18]:
import numpy as np

In [19]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [20]:
# get top 3 from instance (crews, keywords, genres)
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        if len(names) > 3:
            names = names[:3]
        return names
    return []

In [21]:
# apply get_director and get_list
metadt['director'] = metadt['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    metadt[feature] = metadt[feature].apply(get_list)
    
metadt[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

Unnamed: 0,title,cast,director,keywords,genres
0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,"[jealousy, toy, boy]","[Animation, Comedy, Family]"
1,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]"
2,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,"[fishing, best friend, duringcreditsstinger]","[Romance, Comedy]"


In [22]:
# strip and lower case all string
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(' ', '')) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(' ', ''))
        else:
            return ''

In [23]:
# apply clean data
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    metadt[feature] = metadt[feature].apply(clean_data)

In [24]:
# combine all data needed to string
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])

In [25]:
# apply soup
metadt['soup'] = metadt.apply(create_soup, axis=1)
metadt[['soup']].head(3)

Unnamed: 0,soup
0,jealousy toy boy tomhanks timallen donrickles ...
1,boardgame disappearance basedonchildren'sbook ...
2,fishing bestfriend duringcreditsstinger walter...


In [26]:
# recommending section
# Director, genre, cast on relatively more movies doesn't affect their presence
# so we used CountVectorizer instead of TF-IDF
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(metadt['soup'])
count_matrix.shape

(5021, 12184)

In [27]:
# measure distance using cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity

cos_sim2 = cosine_similarity(count_matrix, count_matrix)

In [28]:
# reset index and remapping like before
metadt = metadt.reset_index()
indices = pd.Series(metadt.index, index=metadt['title'])

In [29]:
# get recommendation
title_sample = metadt.iloc[0]['title']
print(title_sample)
get_recommendation(title_sample, cos_sim2)

Toy Story


3012             Toy Story 2
3324       Creature Comforts
1730        Meet the Deedles
1123      The Wrong Trousers
1429         Jungle 2 Jungle
734            A Close Shave
405     Addams Family Values
581                  Aladdin
608           The Aristocats
Name: title, dtype: object