# Knowledge-based Recommender

https://tutorialedge.net/python/building-imdb-top-250-clone-pandas/

https://github.com/PacktPublishing/Hands-On-Recommendation-Systems-with-Python/blob/master/Chapter3/Knowledge%20Recommender.ipynb

https://www.kaggle.com/rounakbanik/the-movies-dataset

In [160]:
import pandas as pd
import numpy as np

path = 'data/MoviesDataset/'
poster_path_tmdb = 'https://image.tmdb.org/t/p/w500'

df = pd.read_csv(path + '/movies_metadata.csv', low_memory=False)

#Print all the features (or columns) of the DataFrame
df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [150]:
#Drop movies without title
df = df[~df.title.isnull()]

In [151]:
#Only keep those features that we require
df = df[['title','genres', 'release_date', 'runtime', 'vote_average', 'vote_count', 'poster_path', 'overview']]

df.head()

Unnamed: 0,title,genres,release_date,runtime,vote_average,vote_count,poster_path,overview
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995-10-30,81.0,7.7,5415.0,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995-12-15,104.0,6.9,2413.0,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995-12-22,101.0,6.5,92.0,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995-12-22,127.0,6.1,34.0,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",1995-02-10,106.0,5.7,173.0,/e64sOI48hQXyru7naBFyssKFxVd.jpg,Just when George Banks has recovered from his ...


### Extract year feature

In [152]:
#Extract year from release_date
df['year'] = df['release_date'].apply(lambda x: str(x)[:4] if x != np.nan else 0)

In [153]:
#Convert years to integers and 'nan' as 0
df['year'] = df['year'].apply(lambda x: int(x) if x != 'nan' else 0)

In [154]:
np.sort(df['year'].unique())

array([   0, 1874, 1878, 1883, 1887, 1888, 1890, 1891, 1892, 1893, 1894,
       1895, 1896, 1897, 1898, 1899, 1900, 1901, 1902, 1903, 1904, 1905,
       1906, 1907, 1908, 1909, 1910, 1911, 1912, 1913, 1914, 1915, 1916,
       1917, 1918, 1919, 1920, 1921, 1922, 1923, 1924, 1925, 1926, 1927,
       1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938,
       1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949,
       1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960,
       1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971,
       1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982,
       1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993,
       1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
       2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
       2016, 2017, 2018, 2020], dtype=int64)

### Genre

In [155]:
#One genre content
df.iloc[0]['genres']

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [156]:
#Import the literal_eval function to convert genre
from ast import literal_eval

#Convert NaNs into string of empty lists
df['genres'] = df['genres'].fillna('[]')

#Apply literal_eval to convert string into list object
df['genres'] = df['genres'].apply(literal_eval)

#Convert list of dictionaries to a list of strings
df['genres'] = df['genres'].apply(lambda x: [g['name'] for g in x] if len(x)>0 else [])
df.head()

Unnamed: 0,title,genres,release_date,runtime,vote_average,vote_count,poster_path,overview,year
0,Toy Story,"[Animation, Comedy, Family]",1995-10-30,81.0,7.7,5415.0,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"Led by Woody, Andy's toys live happily in his ...",1995
1,Jumanji,"[Adventure, Fantasy, Family]",1995-12-15,104.0,6.9,2413.0,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,When siblings Judy and Peter discover an encha...,1995
2,Grumpier Old Men,"[Romance, Comedy]",1995-12-22,101.0,6.5,92.0,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,A family wedding reignites the ancient feud be...,1995
3,Waiting to Exhale,"[Comedy, Drama, Romance]",1995-12-22,127.0,6.1,34.0,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,"Cheated on, mistreated and stepped on, the wom...",1995
4,Father of the Bride Part II,[Comedy],1995-02-10,106.0,5.7,173.0,/e64sOI48hQXyru7naBFyssKFxVd.jpg,Just when George Banks has recovered from his ...,1995


In [157]:
#Create a new feature by exploding genres
g = df.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)

#Name the new feature as 'genre'
g.name = 'genre'

#Create a new dataframe gen_df which by dropping the old 'genres' feature and adding the new 'genre'.
gen_df = df.drop('genres', axis=1).join(g)

#Print the head of the new gen_df
gen_df.head()

  g = df.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)


Unnamed: 0,title,release_date,runtime,vote_average,vote_count,poster_path,overview,year,genre
0,Toy Story,1995-10-30,81.0,7.7,5415.0,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"Led by Woody, Andy's toys live happily in his ...",1995,Animation
0,Toy Story,1995-10-30,81.0,7.7,5415.0,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"Led by Woody, Andy's toys live happily in his ...",1995,Comedy
0,Toy Story,1995-10-30,81.0,7.7,5415.0,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"Led by Woody, Andy's toys live happily in his ...",1995,Family
1,Jumanji,1995-12-15,104.0,6.9,2413.0,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,When siblings Judy and Peter discover an encha...,1995,Adventure
1,Jumanji,1995-12-15,104.0,6.9,2413.0,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,When siblings Judy and Peter discover an encha...,1995,Fantasy


In [143]:
## Input from user in Streamlit form

percentile = 0.8

genre = 'Action'

low_time = int('80')
high_time = int('120')

low_year = int('1990')
high_year = int('2000')

In [158]:
def knowledge_recommender(df, percentile, genre, low_time, high_time, low_year, high_year):   
    #Define a new movies variable to store the preferred movies. Copy the contents to movies
    movies = df.copy()
    
    #Filter based on the condition
    movies = movies[(movies['genre'] == genre) & 
                    (movies['runtime'] >= low_time) & 
                    (movies['runtime'] <= high_time) & 
                    (movies['year'] >= low_year) & 
                    (movies['year'] <= high_year)]
    
    #Compute the values of C and m for the filtered movies
    C = movies['vote_average'].mean()
    m = movies['vote_count'].quantile(percentile)
    
    #Only consider movies that have higher than m votes. Save this in a new dataframe q_movies
    q_movies = movies.copy().loc[movies['vote_count'] >= m]
    
    #Calculate score using the IMDB formula
    q_movies['score'] = q_movies.apply(lambda x: (x['vote_count']/(x['vote_count']+m) * x['vote_average']) 
                                                   + (m/(m+x['vote_count']) * C),
                                       axis=1)

    #Sort movies in descending order of their scores
    q_movies = q_movies.sort_values('score', ascending=False)
    
    return q_movies

In [159]:
#Generate the chart for top animation movies and display top 5.
knowledge_recommender(gen_df, percentile, genre, low_time, high_time, low_year, high_year).head()

Unnamed: 0,title,release_date,runtime,vote_average,vote_count,poster_path,overview,year,genre,score
723,Ghost in the Shell,1995-11-18,83.0,7.8,854.0,/9gC88zYUBARRSThcG93MvW14sqx.jpg,"In the year 2029, the barriers of our world ha...",1995,Action,7.521643
550,True Romance,1993-09-09,120.0,7.5,762.0,/xBO8R3CZfrJ9rrwrZoJ68PgJyAR.jpg,"Clarence marries hooker Alabama, steals cocain...",1993,Action,7.23198
3902,"O Brother, Where Art Thou?",2000-08-30,106.0,7.3,1144.0,/eIqSzq6j3yuxNmifiUOh6iTpG9N.jpg,"In the deep south during the 1930s, three esca...",2000,Action,7.131617
348,The Crow,1994-05-11,102.0,7.3,980.0,/vrQZSCP3WeYXCTXdJQ6mXgxb9L4.jpg,Exactly one year after young rock guitarist Er...,1994,Action,7.106412
3871,"Crouching Tiger, Hidden Dragon",2000-10-01,120.0,7.2,949.0,/2D7kikHUHbecJGoRAspBYiqoxd6.jpg,Two warriors in pursuit of a stolen sword and ...,2000,Action,7.011634
