There are two basic types of recommendation systems:

1. Simple Recommendation System
2. Content-Based Recommendation System

The Simple Recommendation System, as the name suggests, is a system that relies only on ranking as the basis for its recommendations. It is commonly used in “Top 5 movies” lists, where the order is determined by factors such as the most votes, highest ratings, best sales, or other similar metrics.

In this case, we will use a combination of average rating and number of votes to create a new metric derived from these existing metrics, and then we will sort this metric from highest to lowest.

In [29]:
# Import library

import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

In [4]:
url_1 = 'https://storage.googleapis.com/dqlab-dataset/title.basics.tsv'
url_2 = 'https://storage.googleapis.com/dqlab-dataset/title.ratings.tsv'

df_basic = pd.read_csv (url_1, sep = '\t')
df_rating = pd.read_csv (url_2, sep = '\t')

df_basic.head(10)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0221078,short,"Circle Dance, Ute Indians","Circle Dance, Ute Indians",0,1898,\N,\N,"Documentary,Short"
1,tt8862466,tvEpisode,"¡El #TeamOsos va con todo al ""Reality del amor""!","¡El #TeamOsos va con todo al ""Reality del amor""!",0,2018,\N,\N,"Comedy,Drama"
2,tt7157720,tvEpisode,Episode #3.41,Episode #3.41,0,2016,\N,29,"Comedy,Game-Show"
3,tt2974998,tvEpisode,Episode dated 16 May 1987,Episode dated 16 May 1987,0,1987,\N,\N,News
4,tt2903620,tvEpisode,Frances Bavier: Aunt Bee Retires,Frances Bavier: Aunt Bee Retires,0,1973,\N,\N,Documentary
5,tt0043745,short,Lion Down,Lion Down,0,1951,\N,7,"Animation,Comedy,Family"
6,tt1009561,tvEpisode,Saikyô no kikku chîmu,Saikyô no kikku chîmu,0,2006,\N,23,"Animation,Comedy,Drama"
7,tt5038380,tvMovie,(Working) Holiday,(Working) Holiday,0,2015,\N,23,Comedy
8,tt0167491,video,Wicked Covergirls,Wicked Covergirls,1,1998,\N,85,Adult
9,tt7201644,tvEpisode,Episode #1.33,Episode #1.33,0,2006,\N,45,"Drama,Romance"


In [None]:
df_rating.head(10)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.6,1608
1,tt0000002,6.0,197
2,tt0000003,6.5,1285
3,tt0000004,6.1,121
4,tt0000005,6.1,2050
5,tt0000006,5.1,111
6,tt0000007,5.4,639
7,tt0000008,5.4,1760
8,tt0000009,5.8,136
9,tt0000010,6.9,5778


In [6]:
df_basic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9025 entries, 0 to 9024
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          9025 non-null   object
 1   titleType       9025 non-null   object
 2   primaryTitle    9011 non-null   object
 3   originalTitle   9011 non-null   object
 4   isAdult         9025 non-null   int64 
 5   startYear       9025 non-null   object
 6   endYear         9025 non-null   object
 7   runtimeMinutes  9025 non-null   object
 8   genres          9014 non-null   object
dtypes: int64(1), object(8)
memory usage: 634.7+ KB


In [7]:
df_basic.isnull().sum()

tconst             0
titleType          0
primaryTitle      14
originalTitle     14
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            11
dtype: int64

In [8]:
df_basic.loc[(df_basic['primaryTitle'].isnull()) | (df_basic['originalTitle'].isnull())]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
9000,tt10790040,tvEpisode,,,0,2019,\N,\N,\N
9001,tt10891902,tvEpisode,,,0,2020,\N,\N,Crime
9002,tt11737860,tvEpisode,,,0,2020,\N,\N,"Comedy,Drama,Romance"
9003,tt11737862,tvEpisode,,,0,2020,\N,\N,"Comedy,Drama,Romance"
9004,tt11737866,tvEpisode,,,0,2020,\N,\N,"Comedy,Drama,Romance"
9005,tt11737872,tvEpisode,,,0,2020,\N,\N,\N
9006,tt11737874,tvEpisode,,,0,2020,\N,\N,"Comedy,Drama,Romance"
9007,tt1971246,tvEpisode,,,0,2011,\N,\N,Biography
9008,tt2067043,tvEpisode,,,0,1965,\N,\N,Music
9009,tt4404732,tvEpisode,,,0,2015,\N,\N,Comedy


In [9]:
df_basic = df_basic.loc[(df_basic['primaryTitle'].notnull()) & (df_basic['originalTitle'].notnull())]

In [10]:
len(df_basic)

9011

In [11]:
df_basic.loc[df_basic['genres'].isnull()]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
9014,tt10233364,tvEpisode,Rolling in the Deep Dish\tRolling in the Deep ...,0,2019,\N,\N,Reality-TV,
9015,tt10925142,tvEpisode,The IMDb Show on Location: Star Wars Galaxy's ...,0,2019,\N,\N,Talk-Show,
9016,tt10970874,tvEpisode,Die Bauhaus-Stadt Tel Aviv - Vorbild für die M...,0,2019,\N,\N,\N,
9017,tt11670006,tvEpisode,...ein angenehmer Unbequemer...\t...ein angene...,0,1981,\N,\N,Documentary,
9018,tt11868642,tvEpisode,GGN Heavyweight Championship Lungs With Mike T...,0,2020,\N,\N,Talk-Show,
9019,tt2347742,tvEpisode,No sufras por la alergia esta primavera\tNo su...,0,2004,\N,\N,\N,
9020,tt3984412,tvEpisode,"I'm Not Going to Come Last, I'm Just Going to ...",0,2014,\N,\N,Reality-TV,
9021,tt8740950,tvEpisode,Weight Loss Resolution Restart - Ins & Outs of...,0,2015,\N,\N,Reality-TV,
9022,tt9822816,tvEpisode,Zwischen Vertuschung und Aufklärung - Missbrau...,0,2019,\N,\N,\N,
9023,tt9900062,tvEpisode,The Direction of Yuu's Love: Hings Aren't Goin...,0,1994,\N,\N,"Animation,Comedy,Drama",


In [12]:
df_basic.loc[df_basic['genres'].notnull()]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0221078,short,"Circle Dance, Ute Indians","Circle Dance, Ute Indians",0,1898,\N,\N,"Documentary,Short"
1,tt8862466,tvEpisode,"¡El #TeamOsos va con todo al ""Reality del amor""!","¡El #TeamOsos va con todo al ""Reality del amor""!",0,2018,\N,\N,"Comedy,Drama"
2,tt7157720,tvEpisode,Episode #3.41,Episode #3.41,0,2016,\N,29,"Comedy,Game-Show"
3,tt2974998,tvEpisode,Episode dated 16 May 1987,Episode dated 16 May 1987,0,1987,\N,\N,News
4,tt2903620,tvEpisode,Frances Bavier: Aunt Bee Retires,Frances Bavier: Aunt Bee Retires,0,1973,\N,\N,Documentary
...,...,...,...,...,...,...,...,...,...
8995,tt1357878,tvEpisode,Poison,Poison,0,2004,\N,\N,Documentary
8996,tt2252371,tvEpisode,Episode dated 20 February 2012,Episode dated 20 February 2012,0,2012,\N,\N,Talk-Show
8997,tt6934076,tvEpisode,Episode #1.59,Episode #1.59,0,2012,\N,\N,Talk-Show
8998,tt11988828,tvEpisode,Episode #1.263,Episode #1.263,0,\N,\N,\N,Drama


In [13]:
print(len(df_basic))

9011


In [14]:
df_basic['startYear'] = df_basic['startYear'].replace('\\N', np.nan)
df_basic['startYear'] = df_basic['startYear'].astype('float64')
df_basic['startYear'].unique()[:5]

array([1898., 2018., 2016., 1987., 1973.])

In [15]:
df_basic['endYear'] = df_basic['endYear'].replace('\\N', np.nan)
df_basic['endYear'] = df_basic['endYear'].astype('float64')
df_basic['endYear'].unique()[:5]

array([  nan, 2005., 1955., 2006., 1999.])

In [16]:
df_basic['runtimeMinutes'] = df_basic['runtimeMinutes'].replace('\\N', np.nan)
df_basic['runtimeMinutes'] = df_basic['runtimeMinutes'].astype('string')
df_basic['runtimeMinutes'].unique()[:5]

<StringArray>
[<NA>, '29', '7', '23', '85']
Length: 5, dtype: string

In [17]:
def transform_to_list(x):
    if ',' in x:
        return x, split('.')
    else:
        return []
    df_basic['genres'] = df_basic['genres'].apply(lambda x:transform_to_list(x))

In [18]:
df_rating.head(10)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.6,1608
1,tt0000002,6.0,197
2,tt0000003,6.5,1285
3,tt0000004,6.1,121
4,tt0000005,6.1,2050
5,tt0000006,5.1,111
6,tt0000007,5.4,639
7,tt0000008,5.4,1760
8,tt0000009,5.8,136
9,tt0000010,6.9,5778


In [19]:
df_rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030009 entries, 0 to 1030008
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1030009 non-null  object 
 1   averageRating  1030009 non-null  float64
 2   numVotes       1030009 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 23.6+ MB


In [20]:
df = pd.merge(df_basic, df_rating, on = 'tconst', how = 'inner')

In [21]:
df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0043745,short,Lion Down,Lion Down,0,1951.0,,7,"Animation,Comedy,Family",7.1,459
1,tt0167491,video,Wicked Covergirls,Wicked Covergirls,1,1998.0,,85,Adult,5.7,7
2,tt6574096,tvEpisode,Shadow Play - Part 2,Shadow Play - Part 2,0,2017.0,,22,"Adventure,Animation,Comedy",8.5,240
3,tt6941700,tvEpisode,RuPaul Roast,RuPaul Roast,0,2017.0,,,Reality-TV,8.0,11
4,tt7305674,video,UCLA Track & Field Promo,UCLA Track & Field Promo,0,2017.0,,,"Short,Sport",9.7,7
...,...,...,...,...,...,...,...,...,...,...,...
1373,tt0290419,movie,Andru Kanda Mugam,Andru Kanda Mugam,0,1968.0,,164,\N,6.4,5
1374,tt0522596,tvEpisode,The Clampetts Play Cupid,The Clampetts Play Cupid,0,1967.0,,30,"Comedy,Family",7.5,38
1375,tt7393650,tvEpisode,High Hitler and His Nazi Supersoldiers,High Hitler and His Nazi Supersoldiers,0,2016.0,,,"Documentary,History",6.5,8
1376,tt10925142,tvEpisode,The IMDb Show on Location: Star Wars Galaxy's ...,0,2019,,,Talk-Show,,7.1,15


In [22]:
df = df.dropna(subset = ['startYear', 'runtimeMinutes'])

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1004 entries, 0 to 1374
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          1004 non-null   object 
 1   titleType       1004 non-null   object 
 2   primaryTitle    1004 non-null   object 
 3   originalTitle   1004 non-null   object 
 4   isAdult         1004 non-null   int64  
 5   startYear       1004 non-null   float64
 6   endYear         17 non-null     float64
 7   runtimeMinutes  1004 non-null   string 
 8   genres          1004 non-null   object 
 9   averageRating   1004 non-null   float64
 10  numVotes        1004 non-null   int64  
dtypes: float64(3), int64(2), object(5), string(1)
memory usage: 94.1+ KB


Task 5 - Building Simple Recommender System

Question 1: What is the value of C?

In [24]:
c =  df['averageRating'].mean()
c

np.float64(6.829581673306773)

Question 2: What is the value of m?

Let's take an example of a movie with numVotes above 80% of the population, so we will only take 20% of the population. 

In [25]:
m =  df['numVotes'].quantile(0.8)
m

np.float64(229.0)

Pertanyaan 3: Bagaimana cara membuat fungsi weighted formula?

In [26]:
def imdb_weighted_rating(df, var=0.8):
    v = df['numVotes']
    R = df['averageRating']
    C = df['averageRating'].mean()
    m = df['numVotes'].quantile(var)
    df['score'] = (v/(m+v))*R + (m/(m+v))*C #Rumus IMDb 
    return df['score']
    
imdb_weighted_rating(df)

#melakukan pengecekan dataframe
print(df.head())


      tconst  titleType          primaryTitle         originalTitle  isAdult  \
0  tt0043745      short             Lion Down             Lion Down        0   
1  tt0167491      video     Wicked Covergirls     Wicked Covergirls        1   
2  tt6574096  tvEpisode  Shadow Play - Part 2  Shadow Play - Part 2        0   
5  tt2262289      movie               The Pin               The Pin        0   
6  tt0874027  tvEpisode         Episode #32.9         Episode #32.9        0   

   startYear  endYear runtimeMinutes                      genres  \
0     1951.0      NaN              7     Animation,Comedy,Family   
1     1998.0      NaN             85                       Adult   
2     2017.0      NaN             22  Adventure,Animation,Comedy   
5     2013.0      NaN             85                       Drama   
6     2006.0      NaN             29       Comedy,Game-Show,News   

   averageRating  numVotes     score  
0            7.1       459  7.009992  
1            5.7         7  6.79

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['score'] = (v/(m+v))*R + (m/(m+v))*C #Rumus IMDb


Question 4: How to create a simple recommender system?

From the task we have done before, there is an additional field 'score'.
First, we will filter numVotes that are more than m then sort the score from highest to lowest to take the top few scores.

In [27]:
def simple_recommender(df, top=100):
    df = df.loc[df['numVotes'] >= m]
    df = df.sort_values(by='score', ascending=False) #urutkan dari nilai tertinggi ke terendah
    
    #Ambil data 100 teratas
    df = df[:top]
    return df
    
#Ambil data 25 teratas     
print(simple_recommender(df, top=25))


         tconst  titleType                                   primaryTitle  \
68    tt4110822  tvEpisode                                  S.O.S. Part 2   
236   tt2200252      video                    Attack of the Clones Review   
1181  tt7697962  tvEpisode            Chapter Seventeen: The Missionaries   
326   tt7124590  tvEpisode            Chapter Thirty-Four: Judgment Night   
1045  tt0533506  tvEpisode                                       The Prom   
71    tt8399426  tvEpisode                                        Savages   
1234  tt2843830  tvEpisode                                          VIII.   
1087  tt4295140   tvSeries                                   Chef's Table   
1054  tt2503932  tvEpisode                                Trial and Error   
448   tt0337566      video                       AC/DC: Live at Donington   
624   tt0620159  tvEpisode                                     Strike Out   
1281  tt3166390  tvEpisode                         Looking for a Plus-One   

Question 5: How to create a simple recommender system with user preferences?

From the task that has been done before, it can be seen that now the list of movies has been sorted from the highest score to the lowest.
Movies with a high averageRating do not always get a higher position than movies with a lower averageRating, this is because we also take into account the number of votes.

This recommendation system can still be improved by adding specific filters about titleType, startYear, or other filters.
The next work we will do is to create a function to filter by isAdult, startYear, and genres.

In [28]:
df_next = df.copy()

def user_prefer_recommender(df_next, ask_adult, ask_start_year, ask_genre, top=100):
    #ask_adult = yes/no
    if ask_adult.lower() == 'yes':
        df_next = df_next.loc[df['isAdult'] == 1]
    elif ask_adult.lower() == 'no':
        df_next = df_next.loc[df['isAdult'] == 0]

    #ask_start_year = numeric
    df_next = df_next.loc[df['startYear'] >= int(ask_start_year)]

    #ask_genre = 'all' atau yang lain
    if ask_genre.lower() == 'all':
        df_next = df_next
    else:
        def filter_genre(x):
            if ask_genre.lower() in str(x).lower():
                return True
            else:
                return False
        df_next = df_next.loc[df['genres'].apply(lambda x: filter_genre(x))]

    df_next = df_next.loc[df['numVotes'] >= m]  #Mengambil film dengan numVotes yang lebih besar atau sama dengan nilai m 
    df_next = df_next.sort_values(by='score', ascending=False)
    
    #jika kamu hanya ingin mengambil 100 teratas
    df_next = df_next[:top]
    return df_next

print(user_prefer_recommender(df,
                       ask_adult = 'no',
                        ask_start_year = 2000,
                       ask_genre = 'drama'
                       ))



         tconst  titleType                         primaryTitle  \
68    tt4110822  tvEpisode                        S.O.S. Part 2   
1181  tt7697962  tvEpisode  Chapter Seventeen: The Missionaries   
326   tt7124590  tvEpisode  Chapter Thirty-Four: Judgment Night   
71    tt8399426  tvEpisode                              Savages   
1234  tt2843830  tvEpisode                                VIII.   
...         ...        ...                                  ...   
1138  tt1126516      movie              Money Hai Toh Honey Hai   
1208  tt3044882      movie                                Nasha   
1197  tt3016748      movie                       Jacob's Ladder   
846   tt0488164      movie                          Moscow Zero   
90    tt0299981    tvMovie               Highlander: The Source   

                            originalTitle  isAdult  startYear  endYear  \
68                          S.O.S. Part 2        0     2015.0      NaN   
1181  Chapter Seventeen: The Missionaries      