In [23]:
import pandas as pd

# Three most common Recommender System

## Demographic Filtering

- Rekomendasi yang general (tidak sepenuhnya personalized) → Top 50 movies of the year
- Filter hanya berdasarkan fitur demographic seperti genre, durasi, dll.
- The simplest recommender system
Ide: Rekomendasikan apa yang secara umum disukai orang.

## Content Based Filtering

- Rekomendasi barang yang serupa → Other movies you may like
- Filter menggunakan fitur yang lebih spesifik seperti genre, film director, synopsis, aktor, dll.
- Ide: Jika seseorang menonton X, maka dia akan direkomendasikan film yang mirip dengan X.

## Collaborative Filtering

- Mencocokkan orang dengan preferensi yang serupa → Other people also watched
- Tidak membutuhkan filter data apapun, hanya perlu mencari similarity dengan yang lain.

# Simple Demographic Filtering: Filter -> Scoring -> Sort

In [24]:
df = pd.read_csv('data/demographic.csv')
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year,Action,Adventure,Animation,Comedy,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,Toy Story,Animation; Comedy; Family,81.0,7.7,5415.0,1995,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,Jumanji,Adventure; Fantasy; Family,104.0,6.9,2413.0,1995,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Grumpier Old Men,Romance; Comedy,101.0,6.5,92.0,1995,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
3,Waiting to Exhale,Comedy; Drama; Romance,127.0,6.1,34.0,1995,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,Father of the Bride Part II,Comedy,106.0,5.7,173.0,1995,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [25]:
df.sort_values('vote_average', ascending=False)

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year,Action,Adventure,Animation,Comedy,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
30718,Mortal Kombat: The Journey Begins,Action; Animation,54.0,10.0,1.0,1995,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
31691,Canal Zone,Documentary,174.0,10.0,1.0,1977,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44549,Lad: A Yorkshire Story,,96.0,10.0,1.0,2013,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37671,Bowery Battalion,Comedy,69.0,10.0,1.0,1951,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
43186,Firefly,Comedy,84.0,10.0,1.0,1975,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32120,The Collection,,64.0,0.0,0.0,1976,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21585,The Wet Parade,Drama; History; Romance,118.0,0.0,1.0,1932,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
21582,The Story of Maths,Documentary,240.0,0.0,0.0,2008,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21537,Setouchi Moonlight Serenade,Drama,117.0,0.0,0.0,1997,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
df.sort_values('vote_count', ascending=False)

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year,Action,Adventure,Animation,Comedy,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
15459,Inception,Action; Thriller; Science Fiction; Mystery; Ad...,148.0,8.1,14075.0,2010,1,1,0,0,...,0,0,0,1,0,1,0,1,0,0
12470,The Dark Knight,Drama; Action; Crime; Thriller,152.0,8.3,12269.0,2008,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
14535,Avatar,Action; Adventure; Fantasy; Science Fiction,162.0,7.2,12114.0,2009,1,1,0,0,...,0,0,0,0,0,1,0,0,0,0
17789,The Avengers,Science Fiction; Action; Adventure,143.0,7.4,12000.0,2012,1,1,0,0,...,0,0,0,0,0,1,0,0,0,0
26480,Deadpool,Action; Adventure; Comedy,108.0,7.4,11444.0,2016,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36624,Love After Love,Romance; Drama,104.0,0.0,0.0,1992,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
36623,Fear,Documentary,92.0,0.0,0.0,2009,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36612,Shark in the Head,Drama; Comedy,75.0,0.0,0.0,2004,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
36604,"Everything's Fine, We're Leaving",Comedy; Drama,96.0,0.0,0.0,2000,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


# Step 1: Filter

In [27]:
genre = ['Animation']
duration = (60, 150)
year = (2000, 2019)
topk = 20

In [28]:

df = df[df.release_year.between(year[0], year[1]) &
        df.runtime.between(duration[0], duration[1]) &
        df[genre].all(axis=1)] # all semua yg true

df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year,Action,Adventure,Animation,Comedy,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
3161,The Tigger Movie,Animation; Family,77.0,6.3,146.0,2000,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3354,The Road to El Dorado,Adventure; Animation; Comedy; Family,89.0,7.0,892.0,2000,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3484,Dinosaur,Animation; Family,82.0,6.2,563.0,2000,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3614,Titan A.E.,Animation; Action; Science Fiction; Family; Ad...,94.0,6.3,320.0,2000,1,1,1,0,...,0,0,0,0,0,1,0,0,0,0
3619,Chicken Run,Animation; Comedy; Family,84.0,6.5,1190.0,2000,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


# Step 2: Scoring

kita tinggal pakai vote_average sebagai score

# Step 3: Sort

In [29]:
recommendation = df.loc[:, "title":"release_year"]
recommendation = recommendation.sort_values("vote_average", ascending=False).head(topk)
recommendation


Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year
25671,Rocks in my Pockets,Comedy; Animation; Drama,88.0,9.4,5.0,2014
41890,Kizumonogatari Part 3: Reiketsu,Animation; Fantasy; Mystery,82.0,9.4,5.0,2017
26636,Lotte from Gadgetville,Adventure; Animation; Comedy; Family,81.0,9.0,4.0,2006
41846,Kizumonogatari Part 2: Nekketsu,Animation; Fantasy; Mystery,68.0,8.9,11.0,2016
35577,"Fuse, Memoirs of the Hunter Girl",Action; Animation; Drama; History,110.0,8.8,4.0,2012
43038,In This Corner of the World,Animation; Drama,128.0,8.7,19.0,2016
40018,Your Name.,Romance; Animation; Drama,106.0,8.5,1030.0,2016
28618,The Life and Mind of Mark DeFriest,Animation; Documentary,92.0,8.5,2.0,2014
41695,The Snow Queen 3: Fire and Ice,Family; Animation; Fantasy,80.0,8.5,2.0,2016
26159,The Littlest Angel,Animation; Family,84.0,8.5,2.0,2011


## IMDB weigthed Rating
Deskripsi: Film dengan 100 orang yang memberi rating 9.5 seakan-akan lebih baik daripada 10,000 orang yang memberi rating 9.0. Seharusnya, rating dibobot dengan jumlah orang yang vote juga. Kita akan menggunakan formula IMDB weighted rating berikut:

𝑊 = Rv+Cm/v+m
​ 

Keterangan:

- v: jumlah voting film tersebut → (vote_count)
- m: syarat minimum vote → akan menggunakan quantile
- R: rata-rata rating film itu → (vote_average)
- C: rata-rata rating semua film → dihitung dari data

In [36]:
def imdb_score(df, q=0.9):
    df = df.copy()
    
    m = df.vote_count.quantile(q)
    C = (df.vote_average * df.vote_count).sum() / df.vote_count.sum()
    
    df = df[df.vote_count >= m]
    # imdb weight
    df["score"] = df.apply(lambda x: (x.vote_average * x.vote_count + C * m) / (x.vote_count + m), axis=1)
    
    return df


In [34]:
df.vote_count.quantile(0.85)

507.1999999999972

In [35]:
C = (df.vote_average * df.vote_count).sum() / df.vote_count.sum()
C

6.84418319650027

In [37]:
df = imdb_score(df)

In [38]:
recommendation = df.loc[:, "title":"release_year"]
recommendation = recommendation.sort_values("vote_average", ascending=False).head(topk)
recommendation

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year
40018,Your Name.,Romance; Animation; Drama,106.0,8.5,1030.0,2016
5471,Spirited Away,Fantasy; Adventure; Animation; Family,125.0,8.3,3968.0,2001
9687,Howl's Moving Castle,Fantasy; Animation; Adventure,119.0,8.2,2049.0,2004
30208,Inside Out,Drama; Comedy; Animation; Family,94.0,7.9,6737.0,2015
12693,WALL·E,Animation; Family,98.0,7.8,6439.0,2008
13710,Up,Animation; Comedy; Family; Adventure,96.0,7.8,7048.0,2009
24383,Big Hero 6,Adventure; Family; Animation; Action; Comedy,102.0,7.8,6289.0,2014
36082,Zootopia,Animation; Adventure; Family; Comedy,108.0,7.7,4961.0,2016
15328,Toy Story 3,Animation; Family; Comedy,103.0,7.6,4710.0,2010
23489,How to Train Your Dragon 2,Fantasy; Action; Adventure; Animation; Comedy;...,102.0,7.6,3163.0,2014


# ML Engineering: Sum them all up

In [40]:
class RecommenderSystem:
    def __init__(self, data):
        self.df = pd.read_csv(data)
        
    def recommend(self, genre=None, duration=None, year=None, topk=10):
        df = self.df.copy()
        df = self.demographic_filter(df, genre=genre, duration=duration, year=year)
        df = self.compute_imdb_score(df)

        result = df.loc[:, "title":"release_year"]
        result = result.sort_values("vote_average", ascending=False)
        result = result.head(topk)
        return result
    
    @staticmethod # gak mau pakai self, ini fungsi terpisah dari class namun mau tetap dibungkus didalam class
    def demographic_filter(df, genre=None, duration=None, year=None):
        df = df.copy()
        
        if genre is not None:
            df = df[df[genre].all(axis=1)]
        if duration is not None:
            df = df[df.runtime.between(duration[0], duration[1])]
        if year is not None:
            df = df[df.release_year.between(year[0], year[1])]
        return df
    
    @staticmethod
    def compute_imdb_score(df, q=0.9):
        df = df.copy()
        
        m = df.vote_count.quantile(q)
        C = (df.vote_average * df.vote_count).sum() / df.vote_count.sum()
        
        df = df[df.vote_count >= m]
        df["score"] = df.apply(lambda x: (x.vote_average * x.vote_count + C * m) / (x.vote_count + m), axis=1)
        return df


In [41]:
recsys = RecommenderSystem(data='data/demographic.csv')

In [42]:
recsys.recommend(genre=['Animation', 'Family'], duration=(60, 150), year=(2015, 2020))

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year
30208,Inside Out,Drama; Comedy; Animation; Family,94.0,7.9,6737.0,2015
36082,Zootopia,Animation; Adventure; Family; Comedy,108.0,7.7,4961.0,2016
41203,Moana,Adventure; Animation; Family,107.0,7.3,3471.0,2016
37969,Finding Dory,Adventure; Animation; Comedy; Family,97.0,6.8,4333.0,2016
41433,Sing,Animation; Comedy; Drama; Family; Music,108.0,6.8,2363.0,2016
30588,Minions,Family; Animation; Adventure; Comedy,91.0,6.4,4729.0,2015
30388,The Secret Life of Pets,Animation; Family,87.0,5.9,3536.0,2016
