In [1]:
import pandas as pd
import numpy as np
from jcopml.utils import save_model

# Three most common Recommender System

- **Demographic Filtering**
    - Rekomendasi yang general (tidak sepenuhnya personalized) -> **Top 50 movies of the year**
    - Filter hanya berdasarkan fitur demographic seperti genre, durasi, dll.
    - The simplest recommender system
    - Ide: Rekomendasikan apa yang secara umum disukai orang

- **Content Based Filtering**
    - Rekomendasi barang yang serupa -> **Other movies you may like**
    - Filter menggunakan fitur yang lebih spesifik seperti genre, film director, synopsis, aktor, dll. 
    - Ide: Jika seseorang menonton X, maka dia akan direkomendasikan film yang mirip dengan X

- **Collaborative Filtering**
    - Mencocokkan orang dengan preferensi yang serupa -> **Other people also watched**
    - Tidak membutuhkan filter data apapun, hanya perlu mencari similarity dengan yang lain

In [2]:
df = pd.read_csv('./data/movies_metadata.csv', parse_dates=['release_date'])
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


# Preprocessing 

### Select columns 

In [3]:
df = df[['original_title', 'genres', 'release_date', 'runtime', 'vote_average', 'vote_count']]

In [4]:
df.replace('nan', np.nan, inplace=True)

In [5]:
df.dropna(inplace=True)

## Check release_date column

In [6]:
def get_year(data):
    data = data.split('-')[0]
    return data

In [9]:
df['release_year'] = df['release_date'].apply(get_year)
df.drop(columns='release_date', inplace=True)

## Parsing genre data 

In [10]:
df['genres'][0]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [11]:
import json 

In [12]:
# format json kurang tepat, untuk keys harusnya menggunakan quotation mark ""
json.loads([{"id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]")

JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 3 (char 2)

In [22]:
x = json.loads(df['genres'][0].replace("'", '"'))
x

[{'id': 16, 'name': 'Animation'},
 {'id': 35, 'name': 'Comedy'},
 {'id': 10751, 'name': 'Family'}]

In [19]:
x[2]['name']

'Family'

In [20]:
# buat fungsi untuk replace quotation mark biar bisa dibaca sama json.loads
def get_genre(data):
    """
    Data refer to df['genres'].
    How to use:
    df['genres'].apply(get_genre)
    """
    data = json.loads(data.replace("'", '"'))
    genre = [datum['name'] for datum in data]
    return '; '.join(genre)

In [23]:
df['genres'] = df['genres'].apply(get_genre)
df.head()

Unnamed: 0,original_title,genres,runtime,vote_average,vote_count,release_year
0,Toy Story,Animation; Comedy; Family,81.0,7.7,5415.0,1995
1,Jumanji,Adventure; Fantasy; Family,104.0,6.9,2413.0,1995
2,Grumpier Old Men,Romance; Comedy,101.0,6.5,92.0,1995
3,Waiting to Exhale,Comedy; Drama; Romance,127.0,6.1,34.0,1995
4,Father of the Bride Part II,Comedy,106.0,5.7,173.0,1995


## Create one hot encoding / dummy variables for genres 

In [25]:
dummy_genre = df['genres'].str.get_dummies('; ')
dummy_genre

Unnamed: 0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,Foreign,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45460,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
45462,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
45463,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
45464,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [26]:
df = df.join(dummy_genre)
df.head()

Unnamed: 0,original_title,genres,runtime,vote_average,vote_count,release_year,Action,Adventure,Animation,Comedy,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,Toy Story,Animation; Comedy; Family,81.0,7.7,5415.0,1995,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,Jumanji,Adventure; Fantasy; Family,104.0,6.9,2413.0,1995,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Grumpier Old Men,Romance; Comedy,101.0,6.5,92.0,1995,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
3,Waiting to Exhale,Comedy; Drama; Romance,127.0,6.1,34.0,1995,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,Father of the Bride Part II,Comedy,106.0,5.7,173.0,1995,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


# Demographic Filtering

### Step 1: Filter data 

In [29]:
genres = ['Animation', 'Family']
min_runtime = 60
max_runtime = 120
min_year = 2010
max_year = 2020
topN = 10

In [31]:
df.head(1)

Unnamed: 0,original_title,genres,runtime,vote_average,vote_count,release_year,Action,Adventure,Animation,Comedy,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,Toy Story,Animation; Comedy; Family,81.0,7.7,5415.0,1995,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [43]:
# ganti tipe data menjadi integer supaya bisa di query
df['release_year'] = df['release_year'].astype(int)

In [45]:
def filter_movie(df, genres, min_runtime, max_runtime, min_year, max_year, topN):
    filtered_df = df.query('(@max_runtime >= runtime >= @min_runtime) & (@max_year >= release_year >= @min_year)')
    for genre in genres:
        filtered_df = filtered_df[filtered_df[genre] == 1]
    return filtered_df

In [47]:
filtered_movie = filter_movie(df=df, genres=genres, min_runtime=min_runtime, max_runtime=max_runtime, min_year=min_year, max_year=max_year, topN=topN)

In [49]:
filtered_movie.sort_values('vote_average', ascending=False)[:10]

Unnamed: 0,original_title,genres,runtime,vote_average,vote_count,release_year,Action,Adventure,Animation,Comedy,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
26243,The Littlest Angel,Animation; Family,84.0,8.5,2.0,2011,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
41957,Снежная королева 3. Огонь и лед,Family; Animation; Fantasy,80.0,8.5,2.0,2016,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
40472,Over the Garden Wall,Adventure; Animation; Family; Fantasy,110.0,8.2,52.0,2014,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
25044,Song of the Sea,Family; Animation; Fantasy,93.0,8.1,420.0,2014,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
20779,おおかみこどもの雨と雪,Animation; Drama; Family; Fantasy,117.0,8.0,483.0,2012,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
30315,Inside Out,Drama; Comedy; Animation; Family,94.0,7.9,6737.0,2015,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
24455,Big Hero 6,Adventure; Family; Animation; Action; Comedy,102.0,7.8,6289.0,2014,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
40681,Ma vie de courgette,Animation; Drama; Family,66.0,7.7,215.0,2016,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
36253,Zootopia,Animation; Adventure; Family; Comedy,108.0,7.7,4961.0,2016,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
40015,Kubo and the Two Strings,Animation; Adventure; Family,102.0,7.7,982.0,2016,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0


## Step 2: weighted rating 

IMDB Rating Formula

Weighted Rating $= \frac{Rv+Cm}{v+m}$

- $v$: jumlah voting film tersebut
- $m$: syarat minimum vote
- $R$: rata-rata rating film itu
- $C$: rata-rata rating semua film

What is the correlation between our data
- $v$: ready -> vote_count
- $R$: ready -> vote_average
- $m$: we decide this
- $c$: can be calculated by average the data

In [58]:
round(filtered_movie['vote_count'].mean(),0)

724.0

In [82]:
m = round(filtered_movie['vote_count'].mean(),0)
C = (filtered_movie['vote_average'] * filtered_movie['vote_count']).sum() / filtered_movie['vote_count'].sum()

In [61]:
# filter movie berdasarkan nilai m (syarat minimum vote)
filtered_movie = filtered_movie[filtered_movie['vote_count'] >= m]

In [76]:
def weighted_rating(df):
    v = df['vote_count']
    R = df['vote_average']
    return((R*v) + (C*m)) / (v+m)

In [83]:
# kalkulasi score berdasarkan imdb rating
filtered_movie['score'] = filtered_movie.apply(weighted_rating, axis=1)

In [84]:
filtered_movie

Unnamed: 0,original_title,genres,runtime,vote_average,vote_count,release_year,Action,Adventure,Animation,Comedy,...,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western,score
14984,How to Train Your Dragon,Fantasy; Adventure; Animation; Family,98.0,7.5,4319.0,2010,0,1,1,0,...,0,0,0,0,0,0,0,0,0,7.288946
15348,Toy Story 3,Animation; Family; Comedy,103.0,7.6,4710.0,2010,0,0,1,1,...,0,0,0,0,0,0,0,0,0,7.36475
15372,Shrek Forever After,Comedy; Adventure; Fantasy; Animation; Family,93.0,6.0,2021.0,2010,0,1,1,1,...,0,0,0,0,0,0,0,0,0,6.526643
15472,Despicable Me,Animation; Family,95.0,7.1,6595.0,2010,0,0,1,0,...,0,0,0,0,0,0,0,0,0,7.054088
16058,Megamind,Animation; Action; Comedy; Family; Science Fic...,95.0,6.7,1968.0,2010,1,0,1,1,...,0,0,0,0,1,0,0,0,0,6.834869
16130,Tangled,Animation; Family,100.0,7.4,3419.0,2010,0,0,1,0,...,0,0,0,0,0,0,0,0,0,7.200466
16738,Rango,Animation; Comedy; Family; Western; Adventure,107.0,6.6,2094.0,2011,0,1,1,1,...,0,0,0,0,0,0,0,0,1,6.786556
16964,Rio,Animation; Adventure; Comedy; Family,96.0,6.5,2213.0,2011,0,1,1,1,...,0,0,0,0,0,0,0,0,0,6.73588
17213,Kung Fu Panda 2,Animation; Family,91.0,6.7,1925.0,2011,0,0,1,0,...,0,0,0,0,0,0,0,0,0,6.836153
17361,Cars 2,Animation; Family; Adventure; Comedy,106.0,5.8,2088.0,2011,0,1,1,1,...,0,0,0,0,0,0,0,0,0,6.429872


## Display result 

```python
genres = ['Animation', 'Family']
min_runtime = 60
max_runtime = 120
min_year = 2010
max_year = 2020
topN = 10
```

In [87]:
result = filtered_movie[['original_title', 'genres', 'runtime', 'release_year', 'score']]
result.sort_values('score', ascending=False)[:10]

Unnamed: 0,original_title,genres,runtime,release_year,score
30315,Inside Out,Drama; Comedy; Animation; Family,94.0,2015,7.632512
24455,Big Hero 6,Adventure; Family; Animation; Action; Comedy,102.0,2014,7.548204
36253,Zootopia,Animation; Adventure; Family; Comedy,108.0,2016,7.43825
15348,Toy Story 3,Animation; Family; Comedy,103.0,2010,7.36475
23557,How to Train Your Dragon 2,Fantasy; Action; Adventure; Animation; Comedy;...,102.0,2014,7.301512
14984,How to Train Your Dragon,Fantasy; Adventure; Animation; Family,98.0,2010,7.288946
22718,The Lego Movie,Adventure; Animation; Comedy; Family; Fantasy,100.0,2014,7.244956
16130,Tangled,Animation; Family,100.0,2010,7.200466
22110,Frozen,Animation; Adventure; Family,102.0,2013,7.182944
40015,Kubo and the Two Strings,Animation; Adventure; Family,102.0,2016,7.146839


# Sum up 

In [90]:
genres = ['Comedy', 'Family', 'Adventure']
min_runtime = 80
max_runtime = 110
min_year = 2015
max_year = 2020
topN = 5

In [89]:
def demographic_filtering(df, genres=None, min_runtime=None, max_runtime=None, min_year=None, max_year=None, topN=10):
    data = df.copy()
    
    #initial filter
    if min_runtime is not None:
        data = data.query('runtime >= @min_runtime')
    if max_runtime is not None:
        data = data.query('runtime <= @max_runtime')
    if min_year is not None:
        data = data.query('release_year >= @min_year')
    if max_year is not None:
        data = data.query('release_year <= @max_year')
    if genres is not None:
        for genre in genres:
            data = data[data[genre] == 1]
            
    # weighting
    m = round(data['vote_count'].mean(),0)
    C = (data['vote_average'] * data['vote_count']).sum() / data['vote_count'].sum()
    data = data[data['vote_count'] >= m]
    
    def weighted_rating(df):
        v = df['vote_count']
        R = df['vote_average']
        return((R*v) + (C*m)) / (v+m)
    
    data['score'] = data.apply(weighted_rating, axis=1)
    
    # show result
    result = data[['original_title', 'genres', 'runtime', 'release_year', 'score']]
    result.sort_values('score', ascending=False)[:topN]
    return result

In [91]:
demographic_filtering(df=df, max_runtime=max_runtime, min_runtime=min_runtime, max_year=max_year, min_year=min_year, genres=genres, topN=topN)

Unnamed: 0,original_title,genres,runtime,release_year,score
30700,Minions,Family; Animation; Adventure; Comedy,91.0,2015,6.453596
35387,Kung Fu Panda 3,Action; Adventure; Animation; Comedy; Family,95.0,2016,6.698217
36253,Zootopia,Animation; Adventure; Family; Comedy,108.0,2016,7.5248
38176,Finding Dory,Adventure; Animation; Comedy; Family,97.0,2016,6.779637
39334,Ice Age: Collision Course,Adventure; Animation; Family; Comedy,100.0,2016,6.117595
44009,Despicable Me 3,Action; Animation; Adventure; Family; Comedy,96.0,2017,6.370238


---

In [92]:
x = pd.read_csv('./data/demographic_with_overview.csv')
x.head(1)

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year,Action,Adventure,Animation,Comedy,...,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western,overview
0,Toy Story,Animation; Comedy; Family,81.0,7.7,5415.0,1995,0,0,1,1,...,0,0,0,0,0,0,0,0,0,"Led by Woody, Andy's toys live happily in his ..."


```python
genres = ['Comedy', 'Family', 'Adventure']
min_runtime = 80
max_runtime = 110
min_year = 2015
max_year = 2020
topN = 5
```

In [98]:
def demographic_filtering(df, genres=None, min_runtime=None, max_runtime=None, min_year=None, max_year=None, topN=10):
    data = df.copy()
    
    #initial filter
    if min_runtime is not None:
        data = data.query('runtime >= @min_runtime')
    if max_runtime is not None:
        data = data.query('runtime <= @max_runtime')
    if min_year is not None:
        data = data.query('release_year >= @min_year')
    if max_year is not None:
        data = data.query('release_year <= @max_year')
    if genres is not None:
        for genre in genres:
            data = data[data[genre] == 1]
            
    # weighting
    m = round(data['vote_count'].mean(),0)
    C = (data['vote_average'] * data['vote_count']).sum() / data['vote_count'].sum()
    data = data[data['vote_count'] >= m]
    
    def weighted_rating(df):
        v = df['vote_count']
        R = df['vote_average']
        return((R*v) + (C*m)) / (v+m)
    
    data['score'] = data.apply(weighted_rating, axis=1)
    
    # show result
    result = data[["title", "genres", "release_year", "runtime", "vote_average", "vote_count", "score", "overview"]]
    result.sort_values('score', ascending=False)[:topN]
    return result

In [99]:
data_x = demographic_filtering(df=x,genres=genres,max_runtime=max_runtime, min_runtime=min_runtime, max_year=max_year, min_year=min_year)

In [100]:
data_x.head(1)

Unnamed: 0,title,genres,release_year,runtime,vote_average,vote_count,score,overview
698,Aladdin,Animation; Family; Comedy; Adventure; Fantasy;...,1992,90.0,7.4,3495.0,7.282573,Princess Jasmine grows tired of being forced t...
