In [None]:
import numpy as np  # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Load the two CSV exports used throughout the notebook: cast/crew credits and movie metadata.
credits = pd.read_csv('tmdb_5000_credits.csv')
movies = pd.read_csv('tmdb_5000_movies.csv')

In [None]:
movies.head(2)      #Gives info about the first two movies,in the dataset

In [None]:
movies.shape        #gives the dimensions of the matrix

In [None]:
credits.head(2)     #Gives info about the first two movie credits,in the dataset

In [None]:
credits.shape       #gives the dimensions of the matrix

In [None]:
"""
Shows every attribute and value recorded for the crew of the
first movie in the credits dataset (in our case, Avatar)
"""
credits.head(1)['crew'].values

In [None]:
#Merging the two datasets on the movie id instead of the title: titles are not guaranteed to be unique,
#and a title shared by two films would pair every movie row with every credits row carrying that title.
#The credits copy of 'title' is dropped first so the merged frame keeps a single, un-suffixed 'title' column.
#NOTE(review): assumes movies has an 'id' column and credits has 'movie_id' (as in the TMDB 5000 files) - confirm.
movies = movies.merge(credits.drop(columns='title'), left_on='id', right_on='movie_id')

In [None]:
movies.shape                             #gives the dimensions of the merged dataset

In [None]:
movies.head(1)                           #the column count has increased: the new columns are appended at the end

In [None]:
"""
Useful Recommending attributes

genres         :Theme-based recommending
keywords       :Useful when you can't really name the movie, but you have words to describe it
id             :For handling purposes
language       :(but in our dataset 4500+ movies are in English, so it is not very useful)
title          :Exact titles can result in highly accurate results
overview       :Similar stories help in recommending movies to the user
popularity     :Very important factor, though our approach currently avoids numeric data
release date   :For example, some users would like to watch movies from the 90s or 80s
cast           :Recommendation based on actors/actresses
crew           :Recommendation based on directors

"""

In [None]:
#Keeping only the attributes needed to build the tags for each movie.
#.copy() makes the result an independent DataFrame, so the clean-up and column assignments
#that follow modify it directly rather than a selection taken from the merged frame
#(which makes pandas emit SettingWithCopyWarning).
movies = movies[['movie_id','title','genres','overview','keywords','cast','crew']].copy()

In [None]:
#Preprocessing of data begins

#STEP 1 : Checking for any missing data

movies.isnull().sum()       #Counts the null values in each column

In [None]:
#Drop every row that has a missing value in any of the kept columns, then renumber the rows 0..n-1.
#Without the renumbering the index labels keep gaps where rows were removed, so they no longer
#match the row positions used to address the similarity matrix built later in this notebook.
movies = movies.dropna().reset_index(drop=True)

In [None]:
#STEP 2:  Checking for any duplicated data

movies.duplicated().sum()

In [None]:
#STEP 3: Customising and Refining data to get our tags for every movie in the dataset

import ast

In [None]:
#Three Helper Functions to ease the task of customising and refining data

def convert(data):
    """Return the 'name' value of every record encoded in `data`.

    `data` is a string holding a Python-literal sequence of dictionaries. It is
    parsed with ast.literal_eval and the names are returned as a list, in the
    order in which the records appear.
    """
    return [record['name'] for record in ast.literal_eval(data)]

In [None]:
"""
The method for converting the string data to a list of tags is the same as the one used for keywords and genres.
But in the case of the cast column, the idea is to give priority to the top 4 leading actors/actresses for recommendation.
This will increase the efficiency and readability of the code (and keep the working matrix smaller).
This is done to get the recommendation as per the first thought that the user gets when he/she hears the name of a movie.
For example : If the user hears the name Iron Man, the first actor that will pop up in the user's mind will be
              'Robert Downey Jr'
"""
def top_four_people(data, limit=4):
    """Return the names of the first `limit` people listed in `data`.

    Parameters
    ----------
    data : str
        String holding a Python-literal list of dictionaries, each with a
        'name' key; it is parsed with ast.literal_eval.
    limit : int, optional
        Maximum number of names to return (default 4). Values below zero are
        treated as zero.

    Returns
    -------
    list
        The 'name' values of the leading entries, in their original order.
        Shorter than `limit` when `data` holds fewer entries.
    """
    people = ast.literal_eval(data)
    # Slice first so entries past the limit are never visited.
    return [person['name'] for person in people[:max(limit, 0)]]

In [None]:
"""
For the crew column, the only need is to get the name of the director of the movie.
People usually don't remember who the VFX expert was, who did the final editing, or who designed the sets,
but people do remember the director in many cases.
For example, the moment a user hears the name Justice League, the first thought is the Snyder Cut, which actually gives the name
             Zack Snyder, the director of the Snyder Cut
             
Proving point : What was the name of the head of the VFX team?
              : Like it's mentioned, people don't remember :)

"""
def get_me_the_director(data):
    """Return the names of all crew members whose job is exactly 'Director'.

    `data` is a string holding a Python-literal sequence of dictionaries with
    'job' and 'name' keys; it is parsed with ast.literal_eval. The result is a
    list, empty when no entry has the 'Director' job.
    """
    crew = ast.literal_eval(data)
    return [member['name'] for member in crew if member['job'] == 'Director']

In [None]:
#Refining our data with the help of the helper functions

#Each of these columns holds a string-encoded list of records; replace it with a list of raw tags:
#  genres, keywords -> every name found in the column
#  cast             -> the names of the top four actors/actresses
#  crew             -> the names of the directors
for column, to_tags in (('genres', convert),
                        ('keywords', convert),
                        ('cast', top_four_people),
                        ('crew', get_me_the_director)):
    movies[column] = movies[column].apply(to_tags)

#The overview is plain text, so split it on whitespace into a list containing all of its words
movies['overview'] = movies['overview'].apply(lambda text: text.split())

In [None]:
movies.head()             #Displaying all the changes done to refine our data, conversion to a list of tags

In [1]:
"""
STEP 4 : Transformation of data
Before creating the tags that can be used in the recommender system, the spaces inside the raw tags have to be removed,
so that no ambiguity arises when a raw tag that consists of several words is converted into usable tags, where every word of
the raw tag would otherwise become an individual tag.

For example : The system wants 'Science Fiction' as the one-word tag 'ScienceFiction' and not as 'Science' and 'Fiction'
              separately.
            : The space is a problem because the 'science' and 'fiction' tags would be used even when there is no need.
            : For instance, for a real autobiography of a scientist, 'science' will be a tag, but a user who watched a
              science-fiction movie might not be interested in real stories, and would still receive recommendations of
              every movie related to science, whether or not it is fiction, just because the 'science' and 'fiction' tags
              were not dealt with properly.
A similar problem can occur when two actors sharing the same first name are treated as a single entity

Below is the code implementation to do the task

The lambda function used, removes the space between each element(if any) of the list supplied
"""


#Remove the spaces inside every raw tag of the list-valued columns, e.g. 'Science Fiction' -> 'ScienceFiction'
for column in ('cast', 'crew', 'genres', 'keywords'):
    movies[column] = movies[column].apply(lambda tags: [tag.replace(' ', '') for tag in tags])

In [None]:
movies.head()         #Displaying data with the changes done to its spacing

In [None]:
"""
STEP 5 : Providing tags to our data

Now that the missing-data, duplicate-data and spacing problems have been resolved, it's time to build the refined
tags, which will be used to describe our data.
For this, a new column containing the tags will be created, and the five columns that were refined previously will be merged
into it.
Following this step the five columns will be dropped (not strictly necessary, but recommended), so as to improve the space utilisation
"""
#Concatenate the five list-valued columns, in this order, into one list of tags per movie
tag_columns = ['overview','genres','keywords','cast','crew']
merged_tags = movies[tag_columns[0]]
for column in tag_columns[1:]:
    merged_tags = merged_tags + movies[column]
movies['tags'] = merged_tags

#A new DataFrame without the five source columns, so that only movie_id, title and tags remain
Movie = movies.drop(columns=tag_columns)

In [None]:
Movie.head()         #Displaying the new frame that holds the merged tags column

In [None]:
#Collapse each movie's merged list of tags into one space-separated string, i.e. a group of tags
Movie['tags'] = Movie['tags'].map(' '.join)

#Every movie now has a single string of tags; the line below displays the tags of the first movie
Movie.loc[0, 'tags']

In [None]:
"""
The Natural Language Toolkit (NLTK) is used for processing the tags, to remove redundancy and to save on the space occupied by almost identical tags
PorterStemmer is used for stemming the tags
"""
import nltk
from nltk import PorterStemmer
ps = PorterStemmer()

In [None]:
#Function to stem the tags and return them as one space-separated string

def stem(data):
    """Stem every whitespace-separated token of `data`.

    Each token is passed to the stem method of the module-level `ps` object,
    and the results are joined back together with single spaces.
    """
    return " ".join(ps.stem(token) for token in data.split())

In [None]:
Movie['tags'] = Movie['tags'].apply(stem)   #Stemming the tags

In [None]:
"""
Now that tags are ready Vectorisation of data can be done.

The main task at hand is to find the similarities between the movies, so that the recommender can act accordingly and
can provide the appropriate recommendations.

To do this, the entire dataset has to be vectorised, so that each movie represents a point in a high-dimensional space
(one dimension per word of the vocabulary, at most 5000 here; it can be pictured as a point on a 2D graph).

And the recommender system will suggest the closest 'n' points to a given point.
To convert the text to vectors, our system will be using "Bag of Words Technique".

There are other advanced techniques but because it is our first project so we chose to use a simpler yet efficient technique.

While vectorising, the stop words are to be omitted, e.g. words like is, are, to etc.

For this task Scikit-learn library will be used
"""

from sklearn.feature_extraction.text import CountVectorizer
#Bag of words: keep at most 5000 vocabulary entries and ignore the built-in English stop words
cv = CountVectorizer(stop_words='english', max_features=5000)

tag_counts = cv.fit_transform(Movie['tags'])
vector = tag_counts.toarray()   #convert the fitted word counts into a dense array

vector.shape

In [None]:
"""
For getting the cosine similarity
The similarity between two vectors u and v is defined as the ratio between their dot product and the product of their magnitudes: cos(u, v) = (u · v) / (|u| |v|).
By this definition, the similarity equals 1 if the two vectors point in the same direction (for example, when they are identical),
and it equals 0 if the two are orthogonal.
In general the cosine lies between -1 and 1; because our word-count vectors have no negative entries, here the similarity is a number bounded between 0 and 1 that tells us how similar the two vectors are.
"""
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vector)   

In [None]:
similarity[0]      #Shows the similarity of the first movie to every movie in the dataset

In [None]:
def recommend(movie, n=6):
    """Print the titles of the `n` movies most similar to `movie`.

    Parameters
    ----------
    movie : str
        Exact title to look up in the module-level ``Movie`` frame. When
        several rows share the title, the first one is used.
    n : int, optional
        Number of recommendations to print (default 6). Values below zero
        are treated as zero.

    Raises
    ------
    IndexError
        If no row of ``Movie`` has the title `movie`.
    """
    # Locate the movie by row *position*, not by index label: `similarity` is addressed
    # by position, and labels stop matching positions once rows have been dropped.
    positions = [pos for pos, title in enumerate(Movie['title']) if title == movie]
    if not positions:
        raise IndexError(f"Movie {movie!r} was not found in the dataset")
    position = positions[0]

    # Rank every movie by its similarity to the requested one, most similar first.
    ranked = sorted(enumerate(similarity[position]), key=lambda pair: pair[1], reverse=True)
    # Exclude the movie itself explicitly instead of assuming it always sorts first
    # (another movie can tie with it on similarity).
    closest = [pos for pos, _ in ranked if pos != position][:max(n, 0)]
    for pos in closest:
        print(Movie['title'].iloc[pos])

In [None]:
recommend('Ramanujan')