# Pre-processing and Modeling

In [1]:
# import libraries
import pandas as pd
import numpy as np

In [2]:
# read the cleaned movies data set
movies = pd.read_csv('/Users/Atabay/Desktop/Movie_Recommendation_System_data/Data/m_data_cleaned.csv')

In [3]:
movies.head()

Unnamed: 0,adult,budget,genres,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,runtime,spoken_languages,status,title,video,vote_average,vote_count
0,False,30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Toy Story,False,7.7,5415.0
1,False,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Jumanji,False,6.9,2413.0
2,False,0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Grumpier Old Men,False,6.5,92.0
3,False,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Waiting to Exhale,False,6.1,34.0
4,False,0,"[{'id': 35, 'name': 'Comedy'}]",11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Father of the Bride Part II,False,5.7,173.0


In [4]:
movies.shape

(44035, 20)

In [5]:
credits = pd.read_csv('/Users/Atabay/Desktop/Movie_Recommendation_System_data/Data/c_data_cleaned.csv')

In [6]:
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


## Demographic Filtering

In the demographic filtering, I will use vote average as the only decisive factor to rank the movies. However, some movies have very low vote count which would make the ranking unfair. To solve the issue I will only include the movies with considerable vote avarage to my ranking.

I will use 99 percentile as my cut off for vote count because we have 44035 movies in our data and most of have very few votes which may cause ranking to be biased. 

In [7]:
m_99 = movies['vote_count'].quantile(0.99)
m_99

2279.3199999999924

In [8]:
r_99 = movies.copy().loc[movies['vote_count'] >= m_99]
r_99.shape

(441, 20)

We have 441 movies that have voting cote more than 2279. I will use only those movies in my ranking.

In [9]:
r_99 = r_99.sort_values('vote_average', ascending=False)

In [10]:
# Displaying top 20 movies
r_99[['title', 'vote_count', 'vote_average']].head(20)

Unnamed: 0,title,vote_count,vote_average
818,The Godfather,6024.0,8.5
311,The Shawshank Redemption,8358.0,8.5
5438,Spirited Away,3968.0,8.3
290,Pulp Fiction,8670.0,8.3
12408,The Dark Knight,12269.0,8.3
519,Schindler's List,4436.0,8.3
1133,One Flew Over the Cuckoo's Nest,3001.0,8.3
2178,Life Is Beautiful,3643.0,8.3
2809,Fight Club,9678.0,8.3
23321,Whiplash,4376.0,8.3


This ranking can be used for simple movie recommendation to users. It is not influenced by any user preference or choice therefore It will be basic and same for everyone.

## Content Based Filtering

### a. Plot based recommender 

I will build a content based filtering using similarity scores of words in overview column

In [21]:
movies.overview.head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [11]:
# Importing the neccessary library
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
# Defining a tdif vectorizer and removing all the english words stop words
tfidf = TfidfVectorizer(stop_words='english')

In [13]:
#Replacing NaN with an empty string
movies['overview'] = movies['overview'].fillna('')

In [14]:
#Constructing the required tfidf matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(movies['overview'])

In [18]:
tfidf_matrix.shape

(44035, 75375)

It seems that there are over 75000 words to describe 44035 movies

In [22]:
# Importing linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

We need to define a function taking movie title as an input and recommending 5 similar movies as an output

In [29]:
#Creating a reverse map of indexes and movie titles
indexes = pd.Series(movies.index, index=movies['title']).drop_duplicates()

In [30]:
indexes

title
Toy Story                          0
Jumanji                            1
Grumpier Old Men                   2
Waiting to Exhale                  3
Father of the Bride Part II        4
                               ...  
Robin Hood                     44030
Century of Birthing            44031
Betrayal                       44032
Satan Triumphant               44033
Queerama                       44034
Length: 44035, dtype: int64

#### Defining the recommendation function

In [34]:
# Function that takes in movie title as input and outputs the most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    # Index of the movie that matches the title
    index = indexes[title]

    # Pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[index]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 5 most similar movies
    sim_scores = sim_scores[1:6]

    # Get the movie indixes
    movie_indexes = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return movies['title'].iloc[movie_indexes]

In [40]:
get_recommendations('The Godfather')

1159               The Godfather: Part II
42711    The Godfather Trilogy: 1972-1990
1883              The Godfather: Part III
22780                          Blood Ties
11226                    Household Saints
Name: title, dtype: object

In [43]:
get_recommendations('Spirited Away')

16783              Berta's Motives
497                          North
4817     Jimmy Neutron: Boy Genius
14588                The Butterfly
14624                       Svampe
Name: title, dtype: object

The recommendation system does a good job returning movies with similar plot descriptions. However, we can increase the quality of the recommendation system by adding more metadata.