## The Knowledge based recommender system
#### 1) Ask the user for the generes of movies he/she looking for
#### 2) Ask the user for duration
#### 3) Ask the user for the timeline of the movies recommended
#### 4) Using the information collected, recommend movies to the user that have a high weighted rating (according to the IMDB formula) and that satisfy the preceding conditions.

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("movies_metadata.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [4]:
df = df[["title","genres","release_date","runtime","vote_average","vote_count"]]

In [5]:
df.head()

Unnamed: 0,title,genres,release_date,runtime,vote_average,vote_count
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995-10-30,81.0,7.7,5415.0
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995-12-15,104.0,6.9,2413.0
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995-12-22,101.0,6.5,92.0
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995-12-22,127.0,6.1,34.0
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",1995-02-10,106.0,5.7,173.0


In [6]:
df["year"] = df["release_date"].apply(lambda x: str(x).split("-")[0] if x != np.nan else np.nan)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         45460 non-null  object 
 1   genres        45466 non-null  object 
 2   release_date  45379 non-null  object 
 3   runtime       45203 non-null  float64
 4   vote_average  45460 non-null  float64
 5   vote_count    45460 non-null  float64
 6   year          45466 non-null  object 
dtypes: float64(3), object(4)
memory usage: 1.7+ MB


In [8]:
# Helper function to convert Nan to 0 and all other years to integers
def convert_int(x):
    try:
        return int(x)
    except:
        return 0

In [9]:
# Apply convert _into the year feature
df["year"] = df["year"].apply(convert_int)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         45460 non-null  object 
 1   genres        45466 non-null  object 
 2   release_date  45379 non-null  object 
 3   runtime       45203 non-null  float64
 4   vote_average  45460 non-null  float64
 5   vote_count    45460 non-null  float64
 6   year          45466 non-null  int64  
dtypes: float64(3), int64(1), object(3)
memory usage: 1.9+ MB


In [11]:
# Drop the release date column
df = df.drop("release_date", axis=1)

In [12]:
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",81.0,7.7,5415.0,1995
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",101.0,6.5,92.0,1995
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",127.0,6.1,34.0,1995
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",106.0,5.7,173.0,1995


In [13]:
df.iloc[0]["genres"]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [14]:
# Import the literal_aval function from ast
from ast import literal_eval

# Define a stringified list and output its type
a = "[1,2,3]"
print(type(a))

# Apply literal_eval and output type
b = literal_eval(a)
print(type(b))

<class 'str'>
<class 'list'>


In [15]:
b,a

([1, 2, 3], '[1,2,3]')

In [16]:
# Convert all Nan into stringified empty lists
df["genres"] = df["genres"].fillna("[]")

# Apply literal_eval to convert to the list object
df["genres"] = df["genres"].apply(literal_eval)

# Convert list of dictionaries to a list of strings
df["genres"] = df["genres"].apply(lambda x: [i["name"] for i in x] if isinstance(x, list) else [])

In [17]:
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"[Animation, Comedy, Family]",81.0,7.7,5415.0,1995
1,Jumanji,"[Adventure, Fantasy, Family]",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"[Romance, Comedy]",101.0,6.5,92.0,1995
3,Waiting to Exhale,"[Comedy, Drama, Romance]",127.0,6.1,34.0,1995
4,Father of the Bride Part II,[Comedy],106.0,5.7,173.0,1995


In [18]:
# Create a new feature by exploding genres

s = df.apply(lambda x: pd.Series(x["genres"]), axis=1).stack().reset_index(level=1, drop=True)
s

  s = df.apply(lambda x: pd.Series(x["genres"]), axis=1).stack().reset_index(level=1, drop=True)


0        Animation
0           Comedy
0           Family
1        Adventure
1          Fantasy
           ...    
45461       Family
45462        Drama
45463       Action
45463        Drama
45463     Thriller
Length: 91106, dtype: object

In [19]:
# Name the new feature a genre
s.name = "genre"

#Creating a new dataframe gen_df which by dropping a old genres feature and adding the new genre
gen_df = df.drop("genres", axis=1).join(s)

# Print the head of the new gen_df
gen_df.head()

Unnamed: 0,title,runtime,vote_average,vote_count,year,genre
0,Toy Story,81.0,7.7,5415.0,1995,Animation
0,Toy Story,81.0,7.7,5415.0,1995,Comedy
0,Toy Story,81.0,7.7,5415.0,1995,Family
1,Jumanji,104.0,6.9,2413.0,1995,Adventure
1,Jumanji,104.0,6.9,2413.0,1995,Fantasy


In [20]:
"""
1> Get user input on their preferences
2> Extract all movies that match the conditionsset by the user
3> Calculate the values of m and C for only these movies and proceed to build the charts as in the previous section
"""

'\n1> Get user input on their preferences\n2> Extract all movies that match the conditionsset by the user\n3> Calculate the values of m and C for only these movies and proceed to build the charts as in the previous section\n'

In [32]:
def build_chart(gen_df, percentile=0.95):
    #Ask for preferred genres
    print("All Genres :", gen_df["genre"].value_counts().index)
    print("Select your preferred genres")
    genre = input()
    
    #Ask for lower limit duration
    print("Input shortest duration")
    low_time = int(input())
    
    #Ask for upper_limit of timeline
    print("Input longest duration")
    high_time = int(input())
    
    # Ask for lower limit of timeline
    print("Input earliest year")
    low_year = int(input())
    
    # Ask for upper limit of timeline
    print("Input latest year")
    high_year = int(input())
    
    # define a new movies variable to store the preferred movies. Copy the content of gen_df to movies
    movies = gen_df.copy()
    
    #Filter based on the condition
    movies = movies[(movies["genre"] == genre) & (movies["runtime"] >= low_time) &
                   (movies["runtime"] <= high_time) & (movies["year"] >= low_year) &
                   (movies["year"] <= high_year)] 
    
    # Compute the values of C and m for the filtered movies 
    C = movies["vote_average"].mean()
    m = movies["vote_count"].quantile(percentile)
    
    #Only consider movies that have higher than m votes. Save this in a new dataframe q_movies
    q_movies = movies.copy().loc[movies["vote_count"] >= m]
    
    # Calculate the score using the IMDB formula
    q_movies["score"] = q_movies.apply(lambda x:
                                       (x["vote_count"]/(x["vote_count"]+m)*x["vote_average"])+(m/(m+x["vote_count"])* C),axis=1)
    
    # Sort movies in decsending order of their scores
    q_movies = q_movies.sort_values('score', ascending=False)
    
    return q_movies

In [33]:
build_chart(gen_df).head(10)

All Genres : Index(['Drama', 'Comedy', 'Thriller', 'Romance', 'Action', 'Horror', 'Crime',
       'Documentary', 'Adventure', 'Science Fiction', 'Family', 'Mystery',
       'Fantasy', 'Animation', 'Foreign', 'Music', 'History', 'War', 'Western',
       'TV Movie', 'The Cartel', 'Aniplex', 'Rogue State',
       'Carousel Productions', 'Telescene Film Group Productions',
       'Vision View Entertainment', 'Sentai Filmworks', 'Odyssey Media',
       'GoHands', 'BROSTA TV', 'Mardock Scramble Production Committee',
       'Pulser Productions'],
      dtype='object')
Select your preferred genres
Action
Input shortest duration
30
Input longest duration
120
Input earliest year
1996
Input latest year
2004


Unnamed: 0,title,runtime,vote_average,vote_count,year,genre,score
6725,Kill Bill: Vol. 1,111.0,7.7,5091.0,2003,Action,7.353134
9430,Oldboy,120.0,8.0,2000.0,2003,Action,7.190859
8234,The Incredibles,115.0,7.4,5290.0,2004,Action,7.108285
5284,The Bourne Identity,119.0,7.3,3640.0,2002,Action,6.922485
7938,The Bourne Supremacy,108.0,7.2,2873.0,2004,Action,6.769985
1508,Men in Black,98.0,6.9,4521.0,1997,Action,6.650729
3671,X-Men,104.0,6.8,4172.0,2000,Action,6.55151
3902,"O Brother, Where Art Thou?",106.0,7.3,1144.0,2000,Action,6.463654
7917,"I, Robot",115.0,6.7,3889.0,2004,Action,6.455812
638,Mission: Impossible,110.0,6.7,2677.0,1996,Action,6.373296


In [27]:
build_chart(gen_df).head(10)

All Genres : Index(['Drama', 'Comedy', 'Thriller', 'Romance', 'Action', 'Horror', 'Crime',
       'Documentary', 'Adventure', 'Science Fiction', 'Family', 'Mystery',
       'Fantasy', 'Animation', 'Foreign', 'Music', 'History', 'War', 'Western',
       'TV Movie', 'The Cartel', 'Aniplex', 'Rogue State',
       'Carousel Productions', 'Telescene Film Group Productions',
       'Vision View Entertainment', 'Sentai Filmworks', 'Odyssey Media',
       'GoHands', 'BROSTA TV', 'Mardock Scramble Production Committee',
       'Pulser Productions'],
      dtype='object')
Select your preferred genres
Adventure
Input shortest duration
30
Input longest duration
120
Input earliest year
1990
Input latest year
2005


Unnamed: 0,title,runtime,vote_average,vote_count,year,genre,score
9698,Howl's Moving Castle,119.0,8.2,2049.0,2004,Adventure,7.867861
1798,Mulan,88.0,7.6,2089.0,1998,Adventure,7.349419
8234,The Incredibles,115.0,7.4,5290.0,2004,Adventure,7.303667
2646,The Iron Giant,86.0,7.6,1470.0,1999,Adventure,7.261934
581,Aladdin,90.0,7.4,3495.0,1992,Adventure,7.258138
4178,Shrek,90.0,7.3,4183.0,2001,Adventure,7.186649
10240,Serenity,119.0,7.4,1287.0,2005,Adventure,7.061138
1495,Hercules,93.0,7.3,1741.0,1997,Adventure,7.051228
5084,Ice Age,81.0,7.1,3954.0,2002,Adventure,6.994787
1903,Back to the Future Part III,118.0,7.1,2978.0,1990,Adventure,6.96349


In [28]:
build_chart(gen_df).head(20)

All Genres : Index(['Drama', 'Comedy', 'Thriller', 'Romance', 'Action', 'Horror', 'Crime',
       'Documentary', 'Adventure', 'Science Fiction', 'Family', 'Mystery',
       'Fantasy', 'Animation', 'Foreign', 'Music', 'History', 'War', 'Western',
       'TV Movie', 'The Cartel', 'Aniplex', 'Rogue State',
       'Carousel Productions', 'Telescene Film Group Productions',
       'Vision View Entertainment', 'Sentai Filmworks', 'Odyssey Media',
       'GoHands', 'BROSTA TV', 'Mardock Scramble Production Committee',
       'Pulser Productions'],
      dtype='object')
Select your preferred genres
Music
Input shortest duration
30
Input longest duration
100
Input earliest year
1996
Input latest year
2010


Unnamed: 0,title,runtime,vote_average,vote_count,year,genre,score
13480,Dr. Horrible's Sing-Along Blog,42.0,7.8,236.0,2008,Music,7.513425
11826,Once,85.0,7.4,457.0,2007,Music,7.273242
10385,Corpse Bride,77.0,7.2,1957.0,2005,Music,7.172479
13543,Anvil! The Story of Anvil,90.0,7.7,85.0,2008,Music,7.100563
2585,"South Park: Bigger, Longer & Uncut",81.0,7.1,916.0,1999,Music,7.046749
4513,Hedwig and the Angry Inch,95.0,7.4,81.0,2001,Music,6.880647
9448,Interstella 5555: The 5tory of the 5ecret 5tar...,68.0,7.4,79.0,2003,Music,6.871948
3042,Fantasia 2000,74.0,7.0,300.0,1999,Music,6.862252
2482,SLC Punk,97.0,7.3,93.0,1998,Music,6.85765
2281,The Prince of Egypt,99.0,6.9,847.0,1998,Music,6.851713


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         45460 non-null  object 
 1   genres        45466 non-null  object 
 2   runtime       45203 non-null  float64
 3   vote_average  45460 non-null  float64
 4   vote_count    45460 non-null  float64
 5   year          45466 non-null  int64  
dtypes: float64(3), int64(1), object(2)
memory usage: 1.7+ MB


In [31]:
df.to_csv("movies_metadata_clean.csv",index=False)