# Movie recommender based on IMDB dataset

## Content based recommender system

In [1]:
#Importing packages
import warnings
warnings.simplefilter(action="ignore",category=FutureWarning)
import pandas as pd
import numpy as np

We have two different csv files one of them contains the ratings and the other one contains the movie genres.

In [2]:
#Importing datasets
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")

Im gonna start with the movies dataset.

Lets take a quick look to gather insight

In [3]:
movies.sample(3)

Unnamed: 0,movieId,title,genres
9373,162602,The Girl on the Train (2016),Thriller
8734,127132,Zipper (2015),Drama|Thriller
7335,77893,Merantau (2009),Action|Drama


Does this dataset have any Null values?Lets find Out!

In [4]:
movies.isnull().mean() * 100

movieId    0.0
title      0.0
genres     0.0
dtype: float64

Fortunately theres is no need to handle missing values in the dataset.

In the first dataset there are three columns,each movie has a name movie id and genres.

Since this is a content based recommender system its important to have a genres matrix.

In [5]:
#Lets take a look at the column
movies.genres

0       Adventure|Animation|Children|Comedy|Fantasy
1                        Adventure|Children|Fantasy
2                                    Comedy|Romance
3                              Comedy|Drama|Romance
4                                            Comedy
                           ...                     
9737                Action|Animation|Comedy|Fantasy
9738                       Animation|Comedy|Fantasy
9739                                          Drama
9740                               Action|Animation
9741                                         Comedy
Name: genres, Length: 9742, dtype: object

Different genres are divided by "|" its easy to divide them and make a list of all movie genres.

In [6]:
#Seperate movie genres from one string into a list on strings.
genres_column = movies.genres.str.split("|")
#Add the new genres_column to the dataset.
movies["genres_column"]=genres_column
#Since we dont need the original genres column we can delete it from our dataset.
movies.drop("genres",axis=1,inplace=True)
#In this step the movies dont matter so we only keep the genres.
genres_column

0       [Adventure, Animation, Children, Comedy, Fantasy]
1                          [Adventure, Children, Fantasy]
2                                       [Comedy, Romance]
3                                [Comedy, Drama, Romance]
4                                                [Comedy]
                              ...                        
9737                 [Action, Animation, Comedy, Fantasy]
9738                         [Animation, Comedy, Fantasy]
9739                                              [Drama]
9740                                  [Action, Animation]
9741                                             [Comedy]
Name: genres, Length: 9742, dtype: object

We need to get rid of the duplicated genres names.

To do so we add all of them into a list and make it a set to remove duplicates.

In [7]:
genres = [] #For storing all instances.
for i in genres_column:
    genres.extend(i) #Add new movie instances genres to the genres list.

Now these are all the movie genres.

In [8]:
genres = list(set(genres))
print(genres)

['Horror', '(no genres listed)', 'Action', 'Animation', 'Documentary', 'Comedy', 'Sci-Fi', 'Western', 'War', 'Romance', 'Thriller', 'IMAX', 'Adventure', 'Drama', 'Musical', 'Children', 'Film-Noir', 'Crime', 'Mystery', 'Fantasy']


In [9]:
#Lets check the new movies dataset again
movies

Unnamed: 0,movieId,title,genres_column
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),"[Action, Animation, Comedy, Fantasy]"
9738,193583,No Game No Life: Zero (2017),"[Animation, Comedy, Fantasy]"
9739,193585,Flint (2017),[Drama]
9740,193587,Bungo Stray Dogs: Dead Apple (2018),"[Action, Animation]"


In [10]:
#Now is the time to make the movies by genres dataset.
movie_matrix = pd.DataFrame(movies[["movieId","title"]],columns=genres)
movie_matrix

Unnamed: 0,Horror,(no genres listed),Action,Animation,Documentary,Comedy,Sci-Fi,Western,War,Romance,Thriller,IMAX,Adventure,Drama,Musical,Children,Film-Noir,Crime,Mystery,Fantasy
0,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,,,,,,,,,,,,,,,,,,,,
9738,,,,,,,,,,,,,,,,,,,,
9739,,,,,,,,,,,,,,,,,,,,
9740,,,,,,,,,,,,,,,,,,,,


In [11]:
#Filling the matrix with ones if the movie includes that genre.
for i,value in enumerate(movies["genres_column"]): 
    for j in value:
        movie_matrix.at[i,j] = 1

In [12]:
movie_matrix

Unnamed: 0,Horror,(no genres listed),Action,Animation,Documentary,Comedy,Sci-Fi,Western,War,Romance,Thriller,IMAX,Adventure,Drama,Musical,Children,Film-Noir,Crime,Mystery,Fantasy
0,,,,1.0,,1.0,,,,,,,1.0,,,1.0,,,,1.0
1,,,,,,,,,,,,,1.0,,,1.0,,,,1.0
2,,,,,,1.0,,,,1.0,,,,,,,,,,
3,,,,,,1.0,,,,1.0,,,,1.0,,,,,,
4,,,,,,1.0,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,,,1.0,1.0,,1.0,,,,,,,,,,,,,,1.0
9738,,,,1.0,,1.0,,,,,,,,,,,,,,1.0
9739,,,,,,,,,,,,,,1.0,,,,,,
9740,,,1.0,1.0,,,,,,,,,,,,,,,,


In [13]:
#Making a new movies dataset but including all genres and filled with ones and zeros.
movies = movies[["movieId","title"]]
new_movies = pd.concat([movies,movie_matrix],axis=1)

In [14]:
new_movies.fillna(0,inplace=True)
new_movies

Unnamed: 0,movieId,title,Horror,(no genres listed),Action,Animation,Documentary,Comedy,Sci-Fi,Western,...,Thriller,IMAX,Adventure,Drama,Musical,Children,Film-Noir,Crime,Mystery,Fantasy
0,1,Toy Story (1995),0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,2,Jumanji (1995),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,3,Grumpier Old Men (1995),0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9738,193583,No Game No Life: Zero (2017),0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9739,193585,Flint (2017),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9740,193587,Bungo Stray Dogs: Dead Apple (2018),0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


We have made a new movie dataset which can be used for making user profiles.

In [15]:
#Now its time to take a look at the ratings dataset.

In [16]:
ratings.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
63983,414,5541,3.0,1034345816
53433,352,94864,4.5,1493674430
89585,580,5319,3.5,1167790343
87950,567,82684,0.5,1525289916
7159,50,1278,3.5,1514238500


In [17]:
#This is an example how the code below selects all user ratings from the ratings dataset.
ratings.loc[ratings["userId"]==1,["userId","movieId","rating"]]

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
227,1,3744,4.0
228,1,3793,5.0
229,1,3809,4.0
230,1,4006,4.0


In [18]:
#Creating a user profiles.
users_rating = {}
users_profile = {}
for i in range (1,611):
    profile_df = pd.DataFrame(columns=genres)
    users_rating[i] = ratings.loc[ratings["userId"]==i,["userId","movieId","rating"]]
    movieId = list(users_rating.get(i)["movieId"])
    rating = list(users_rating.get(i)["rating"])
    for j,k in zip(movieId,rating):
        instance = new_movies.loc[new_movies["movieId"]==j,:]
        instance.drop(["movieId","title"],axis=1,inplace=True)
        instance = instance.to_numpy()
        instance = instance * k
        instance = pd.DataFrame(instance,columns=genres)
        profile_df = pd.concat([instance,profile_df],axis=0)
    sum_of_columns = []
    for column in profile_df:
        sum_of_columns.append(profile_df[column].sum())
    sum_of_all = sum(sum_of_columns)
    user_profile_list = sum_of_columns/sum_of_all
    users_profile[i] = user_profile_list

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


We have made user profiles for all users.

They can be used for movie recommendation.

In [20]:
#For example this is the user profile for user number 4.
users_profile.get(4)

array([0.00932529, 0.        , 0.04552935, 0.01316511, 0.00438837,
       0.20021942, 0.01865058, 0.02084476, 0.01371366, 0.10751509,
       0.07405376, 0.00164564, 0.05814591, 0.22929238, 0.03510697,
       0.02084476, 0.00877674, 0.05650027, 0.04388371, 0.03839824])