In [1]:
import pandas as pd
import numpy as np

In [2]:
path = r'D:\C++\PYTHON\PYTHON_PRACTICE\recommendation_system\3405_6663_compressed_movies_metadata.csv\movies_metadata.csv'

In [3]:
metadata = pd.read_csv(path,low_memory=False)

In [4]:
metadata.head()
metadata.shape
metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [5]:
# SIMPLE RECOMMENDER
# Considering weighte rating to tackle voting baises

In [6]:
# calculate the mean of vote average column
C = metadata['vote_average'].mean()
print(C)

5.618207215134185


In [7]:
# calculate the minimum number of votes required to be in the chart, m
m = metadata['vote_count'].quantile(0.90)
print(m)

160.0


In [8]:
# Filtering the movies with number of votes greater than m 
q_movies = metadata.copy().loc[metadata['vote_count']>=m]
q_movies.shape

(4555, 24)

In [9]:
# Calculated the weighted rating for the qualified movies
def weighted_rating(x,m=m,C=C):
    v = x['vote_count']
    R = x['vote_average']
    #caculation of weighted average
    weighted_average = (v/(v+m)*R)+(m/(m+v)*C)
    return weighted_average

In [10]:
# new score for the qualifed movies
q_movies['score'] = q_movies.apply(weighted_rating,axis=1)

In [11]:
# sorting the data based on the wrighted scores
q_movies = q_movies.sort_values('score', ascending=False)

In [12]:
# printinng the top 10 movies
q_movies[['title', 'vote_count', 'vote_average','score']].head(10)

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385
292,Pulp Fiction,8670.0,8.3,8.251406
522,Schindler's List,4436.0,8.3,8.206639
23673,Whiplash,4376.0,8.3,8.205404
5481,Spirited Away,3968.0,8.3,8.196055
2211,Life Is Beautiful,3643.0,8.3,8.187171


In [13]:
# CONTENT BASED RECOMMENDER
""" This recommender system recommend movie that are similar to 
a aprticular movie. To achieve this, cosine simlarity is computed
based on the plot description and recommend movies based on that
similarity score threshold."""

' This recommender system recommend movie that are similar to \na aprticular movie. To achieve this, cosine simlarity is computed\nbased on the plot description and recommend movies based on that\nsimilarity score threshold.'

In [14]:
metadata['overview'].head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [15]:
# We have to find the similarity between the movie description
# using natural language processing

In [16]:
'''the TF-IDF score is the frequency of a word occurring in a 
document, down-weighted by the number of documents in which it
occurs. This is done to reduce the importance of words that 
frequently occur in plot overviews and, therefore,
their significance in computing the final similarity score'''

'the TF-IDF score is the frequency of a word occurring in a \ndocument, down-weighted by the number of documents in which it\noccurs. This is done to reduce the importance of words that \nfrequently occur in plot overviews and, therefore,\ntheir significance in computing the final similarity score'

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

#  for removing the stop words from the documents
tfidf = TfidfVectorizer(stop_words='english')

# Replace NA with an empty string
metadata['overview'] = metadata['overview'].fillna('')

# Constructing the required TF_IDF by fitting and 
# transforming the data
tfidf_matrix = tfidf.fit_transform(metadata['overview'])

tfidf_matrix.shape

(45466, 75827)

In [18]:
'''There are 75827 different vocabularies or words in the data
 of 45466 movies.
 
 Somilarity scores can be calculated using metrics like manhattan
 , euclidean, Pearson, cosine similarity
 
 Since we have used the TF-IDF vectorizer, calculating the 
 dot product between each vector will directly give you the 
 cosine similarity score. Therefore, you will use sklearn's 
 linear_kernel() 
 instead of cosine_similarities() since it is faster'''

"There are 75827 different vocabularies or words in the data\n of 45466 movies.\n \n Somilarity scores can be calculated using metrics like manhattan\n , euclidean, Pearson, cosine similarity\n \n Since we have used the TF-IDF vectorizer, calculating the \n dot product between each vector will directly give you the \n cosine similarity score. Therefore, you will use sklearn's \n linear_kernel() \n instead of cosine_similarities() since it is faster"

In [19]:
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
#cosine_sim = linear_kernel(tfidf_matrix,tfidf_matrix)
# too much data, not doing this step, but continuing the other
print('alll doone')

alll doone


In [20]:
tfidf_matrix.shape

(45466, 75827)

In [21]:
a = tfidf_matrix[0:150,0:150]

In [22]:
a.shape

(150, 150)

In [28]:
# this function takes a movie title and outputs the 10 most similar movie

# Reverse map of indices and movie title
indices = pd.Series(metadata.index, index=metadata['title']).drop_duplicates


In [31]:
'''Steps for recommendation function 
1. Given the title, get the index of the movie
2. Get the cosine similarity scores of that movie with the other movies
3. Create a list with the index and the similarity score
4. Sort the list.
5. Get the most similar movies'''

'Steps for recommendation function \n1. Given the title, get the index of the movie\n2. Get the cosine similarity scores of that movie with the other movies\n3. Create a list with the index and the similarity score\n4. Sort the list.\n5. Get the most similar movies'

In [None]:
# Recommnedation function 
def get_recommendation(title, cosine_similarity_matrix = cosine_sim):
    
    # get the index of the title
    idx = indices[title]
    
    #Get the pairwise similarity score of all the movies with that movie
    sim_score = list(enumerate(cosine_sim[idx]))
    
    # Sort the movies based on the similarity score
    sim_score = sorted(sim_score, key=lambda x:x[1], reverse=True)
    
    # Get the scores of the 10 most similar movies
    movie_indices = [i[0] for i in sim_score]
    
    # Return the top 10 most similar movies
    return metadata['title'].iloc[movie_indices]

In [32]:
# Credits, Genre ,Keywords based recommender
# Building a recommendation system based on top 3 actos, director,
# related genre and movie plot keywords
# Loading the cast, crew data in the current dataset

In [37]:
path = r'D:\C++\PYTHON\PYTHON_PRACTICE\recommendation_system\3405_6663_compressed_movies_metadata.csv'
import os
file = os.path.join(path, 'credits.csv')
credits = pd.read_csv(file)
file2 = os.path.join(path, 'keywords.csv')
keywords = pd.read_csv(file2)

In [42]:
keywords.info()
credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46419 entries, 0 to 46418
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        46419 non-null  int64 
 1   keywords  46419 non-null  object
dtypes: int64(1), object(1)
memory usage: 725.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cast    45476 non-null  object
 1   crew    45476 non-null  object
 2   id      45476 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


In [40]:
# Remove rows with bad IDs.
metadata = metadata.drop([19730, 29503, 35587])

In [44]:
# Convert ids to integars
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
metadata['id'] = metadata['id'].astype('int')

In [45]:
# Merge credits and keywords into metadata dataframe
metadata = metadata.merge(credits,on='id')
metadata = metadata.merge(keywords, on='id')

In [47]:
metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'cast', 'crew', 'keywords'],
      dtype='object')

In [53]:
metadata.iloc[1,:]['cast']

"[{'cast_id': 1, 'character': 'Alan Parrish', 'credit_id': '52fe44bfc3a36847f80a7c73', 'gender': 2, 'id': 2157, 'name': 'Robin Williams', 'order': 0, 'profile_path': '/sojtJyIV3lkUeThD7A2oHNm8183.jpg'}, {'cast_id': 8, 'character': 'Samuel Alan Parrish / Van Pelt', 'credit_id': '52fe44bfc3a36847f80a7c99', 'gender': 2, 'id': 8537, 'name': 'Jonathan Hyde', 'order': 1, 'profile_path': '/7il5D76vx6QVRVlpVvBPEC40MBi.jpg'}, {'cast_id': 2, 'character': 'Judy Sheperd', 'credit_id': '52fe44bfc3a36847f80a7c77', 'gender': 1, 'id': 205, 'name': 'Kirsten Dunst', 'order': 2, 'profile_path': '/wBXvh6PJd0IUVNpvatPC1kzuHtm.jpg'}, {'cast_id': 24, 'character': 'Peter Shepherd', 'credit_id': '52fe44c0c3a36847f80a7ce7', 'gender': 0, 'id': 145151, 'name': 'Bradley Pierce', 'order': 3, 'profile_path': '/j6iW0vVA23GQniAPSYI6mi4hiEW.jpg'}, {'cast_id': 10, 'character': 'Sarah Whittle', 'credit_id': '52fe44bfc3a36847f80a7c9d', 'gender': 1, 'id': 5149, 'name': 'Bonnie Hunt', 'order': 4, 'profile_path': '/7spiVQwmr

In [54]:
# From the cast, crew and keywords, we need to extract the 
# most important actors, director and keywords associated with 
# the movie

In [55]:
from ast import literal_eval

In [56]:
features = ['cast', 'crew', 'keywords','genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(literal_eval)

In [58]:
metadata.iloc[1,:]['cast']

[{'cast_id': 1,
  'character': 'Alan Parrish',
  'credit_id': '52fe44bfc3a36847f80a7c73',
  'gender': 2,
  'id': 2157,
  'name': 'Robin Williams',
  'order': 0,
  'profile_path': '/sojtJyIV3lkUeThD7A2oHNm8183.jpg'},
 {'cast_id': 8,
  'character': 'Samuel Alan Parrish / Van Pelt',
  'credit_id': '52fe44bfc3a36847f80a7c99',
  'gender': 2,
  'id': 8537,
  'name': 'Jonathan Hyde',
  'order': 1,
  'profile_path': '/7il5D76vx6QVRVlpVvBPEC40MBi.jpg'},
 {'cast_id': 2,
  'character': 'Judy Sheperd',
  'credit_id': '52fe44bfc3a36847f80a7c77',
  'gender': 1,
  'id': 205,
  'name': 'Kirsten Dunst',
  'order': 2,
  'profile_path': '/wBXvh6PJd0IUVNpvatPC1kzuHtm.jpg'},
 {'cast_id': 24,
  'character': 'Peter Shepherd',
  'credit_id': '52fe44c0c3a36847f80a7ce7',
  'gender': 0,
  'id': 145151,
  'name': 'Bradley Pierce',
  'order': 3,
  'profile_path': '/j6iW0vVA23GQniAPSYI6mi4hiEW.jpg'},
 {'cast_id': 10,
  'character': 'Sarah Whittle',
  'credit_id': '52fe44bfc3a36847f80a7c9d',
  'gender': 1,
  'id': 5

In [59]:
# Creating a metadata soup
# Which is a string that contains all the metadata that i want to 
# fees in the vectoriser, namely actors, directors, keywords

In [72]:
# Extracting the rewuired fields from the data
import numpy as np

In [76]:
# Get the director name from the crew feature, 
# if director is not listed return nan
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [74]:
def get_list(x):
    if isinstance(x,list):
        names = [i['name'] for i in x]
        if len(names) >3:
            names = names[:3]
        return names
    return []

In [77]:
metadata['director'] = metadata['crew'].apply(get_director)

In [80]:
features = ['cast','keywords','genres']
for feature in features:
    metadata[feature] = metadata[feature].apply(get_list)

In [81]:
metadata[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

Unnamed: 0,title,cast,director,keywords,genres
0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,"[jealousy, toy, boy]","[Animation, Comedy, Family]"
1,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]"
2,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,"[fishing, best friend, duringcreditsstinger]","[Romance, Comedy]"


In [82]:
#The next step would be to convert the names and keyword 
#instances into lowercase and strip all the spaces between them.

In [84]:
# Convert string into lowercase and strip spaces
def clean_data(x):
    if isinstance(x,list):
        return [str.lower(i.replace(" ","")) for i in x]
    else:
        if isinstance(x,str):
            return str.lower(x.replace(" ",""))
        else:
            return ''

In [85]:
# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    metadata[feature] = metadata[feature].apply(clean_data)

In [86]:
metadata[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

Unnamed: 0,title,cast,director,keywords,genres
0,Toy Story,"[tomhanks, timallen, donrickles]",johnlasseter,"[jealousy, toy, boy]","[animation, comedy, family]"
1,Jumanji,"[robinwilliams, jonathanhyde, kirstendunst]",joejohnston,"[boardgame, disappearance, basedonchildren'sbook]","[adventure, fantasy, family]"
2,Grumpier Old Men,"[waltermatthau, jacklemmon, ann-margret]",howarddeutch,"[fishing, bestfriend, duringcreditsstinger]","[romance, comedy]"


In [87]:
# Crate soup that will join all the requied columns by space
# The output of this function will be fed into vector model

In [88]:
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])

In [89]:
metadata['soup'] = metadata.apply(create_soup,axis=1)

In [90]:
metadata[['soup']].head()

Unnamed: 0,soup
0,jealousy toy boy tomhanks timallen donrickles ...
1,boardgame disappearance basedonchildren'sbook ...
2,fishing bestfriend duringcreditsstinger walter...
3,basedonnovel interracialrelationship singlemot...
4,baby midlifecrisis confidence stevemartin dian...


In [92]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(metadata['soup'])

In [93]:
count_matrix.shape

(46628, 73881)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim_2 = cosine_similarity(count_matrix,count_matrix)

In [None]:
# rest index
metadata = metadatadata.reset_index()
indicies = pd.Series(metadata.index , index=metadata['title'])

get_recommendation('The Drk Knight Rises', cosine_sim_2)