Let's load the data now.

In [219]:
import pandas as pd 
import numpy as np 
# Raw inputs: TMDB credits/movies plus the id-mapping tables from "the-movies-dataset".
df1 = pd.read_csv('../input/tmdb-movie-metadata/tmdb_5000_credits.csv')
df2 = pd.read_csv('../input/tmdb-movie-metadata/tmdb_5000_movies.csv')
links = pd.read_csv("../input/the-movies-dataset/links.csv")
# Keep only the columns needed to attach an IMDb id to each movie and store the
# id as text. Building the frame in one chain returns a new object instead of
# assigning into a column slice of the raw frame, which is what triggers
# pandas' SettingWithCopyWarning.
movie_md = (
    pd.read_csv("../input/the-movies-dataset/movies_metadata.csv")
    [["id", "imdb_id", "title"]]
    .assign(id=lambda frame: frame["id"].astype(str))
)
movie_md.head(2)

In [220]:
# Expose the TMDB id under the column name 'id'; errors='raise' makes a
# missing 'tmdbId' column fail loudly instead of being ignored.
links = links.rename(columns={'tmdbId': 'id'}, errors='raise')
links.head(2)

In [221]:
# Show the row(s) of df2 whose id equals 862.
df2.loc[df2['id'] == 862]

The first dataset contains the following features:-

* movie_id - A unique identifier for each movie.
* cast - The name of lead and supporting actors.
* crew - The name of Director, Editor, Composer, Writer etc.

The second dataset has the following features:- 

* budget - The budget in which the movie was made.
* genre - The genre of the movie: Action, Comedy, Thriller, etc.
* homepage - A link to the homepage of the movie.
* id - This is in fact the same as movie_id in the first dataset.
* keywords - The keywords or tags related to the movie.
* original_language - The language in which the movie was made.
* original_title - The title of the movie before translation or adaptation.
* overview - A brief description of the movie.
* popularity - A numeric quantity specifying the movie popularity.
* production_companies - The production house of the movie.
* production_countries - The country in which it was produced.
* release_date - The date on which it was released.
* revenue - The worldwide revenue generated by the movie.
* runtime - The running time of the movie in minutes.
* status - "Released" or "Rumored".
* tagline - Movie's tagline.
* title - Title of the movie.
* vote_average - The average rating the movie received.
* vote_count - The number of votes the movie received.

Let's join the two datasets on the 'id' column.


In [222]:
# Rename the credits columns positionally so its key column is called 'id'.
# NOTE: the spelling 'tittle' is used by later cells and in the exported files,
# so it is kept as-is here.
df1.columns = ['id','tittle','cast','crew']
df2 = df2.merge(df1, on='id')
# movie_md stores its ids as strings, so cast before joining on them.
df2["id"] = df2["id"].astype(str)
# An inner join emits one output row per matching right-hand row. If the
# metadata table repeats an id, the same film would therefore appear several
# times in df2 (and later in the similarity matrices), so the right side is
# de-duplicated on the join key first.
df2 = df2.merge(movie_md.drop_duplicates(subset="id"), on="id")
df2.head(3)

In [223]:
# (rows, columns) of the merged frame.
# df2 = df2.drop_duplicates()
df2.shape

In [224]:
# Peek at the key and title columns of the merged frame.
df2.loc[:, ['genres', 'imdb_id', 'tittle', 'original_title', 'id']].head(4)

In [225]:
# Mean vote_average over every row of df2 (the C term used by weighted_rating).
C = df2.loc[:, 'vote_average'].mean()
C

So, the mean rating for all the movies is approximately 6 on a scale of 10. The next step is to determine an appropriate value for m, the minimum number of votes required to be listed in the chart. We will use the 90th percentile as our cutoff. In other words, for a movie to feature in the charts, it must have more votes than at least 90% of the movies in the list.

In [226]:
# 90th percentile of vote_count: the minimum-votes term m.
m = df2.loc[:, 'vote_count'].quantile(0.9)
m

Now, we can filter out the movies that qualify for the chart 

In [227]:
# Keep only the movies whose vote count reaches the cutoff m computed from the
# 90th percentile. Using m here (rather than a literal number) keeps the
# filter consistent with the m that weighted_rating() plugs into the formula.
# .copy() gives q_movies its own data so adding columns does not touch df2.
q_movies = df2.loc[df2['vote_count'] >= m].copy()
q_movies.shape

In [228]:
def weighted_rating(x, m=m, C=C):
    """Blend a movie's own rating with the global mean, weighted by vote count.

    x is indexed by 'vote_count' and 'vote_average'. m and C default to the
    module-level values that exist when this cell is executed.
    Returns v/(v+m) * R + m/(v+m) * C.
    """
    votes, rating = x['vote_count'], x['vote_average']
    total = votes + m
    # The own-rating share grows with the number of votes; the rest goes to C.
    return (votes / total * rating) + (m / total * C)

In [229]:
# Add a 'score' column holding weighted_rating() evaluated row by row.
q_movies = q_movies.assign(score=q_movies.apply(weighted_rating, axis=1))

Finally, let's sort the DataFrame based on the score feature and output the title, vote count, vote average and weighted rating or score of the top 10 movies.

In [230]:
# Order the qualifying movies from highest to lowest score.
q_movies = q_movies.sort_values(by='score', ascending=False)

# Show the top 10 movies.
q_movies.loc[:, ['tittle', 'vote_count', 'vote_average', 'score']].head(10)

In [231]:
# First few plot overviews.
df2.loc[:, 'overview'].head(5)

In [232]:
# TF-IDF representation of the plot overviews.
from sklearn.feature_extraction.text import TfidfVectorizer

# Missing overviews become empty strings so every row is text.
df2['overview'] = df2['overview'].fillna('')

# Vectorizer that ignores English stop words such as 'the' and 'a'.
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and produce the document-term matrix in one step.
tfidf_matrix = tfidf.fit_transform(df2['overview'])

# Shape of the resulting matrix.
tfidf_matrix.shape

In [233]:
# Pairwise dot products between the TF-IDF rows, used as the similarity matrix.
from sklearn.metrics.pairwise import linear_kernel

# With a single argument linear_kernel compares the matrix with itself.
cosine_sim = linear_kernel(tfidf_matrix)

In [234]:
# Lookup table of title / id / imdb_id. It keeps df2's row labels, which the
# recommendation functions use to pick a row of the similarity matrix.
indices = df2.loc[:, ["tittle", "id", "imdb_id"]].drop_duplicates()
indices.head()

In [235]:
# (rows, columns) of the lookup table after dropping duplicate rows.
indices.shape

In [236]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the movies most similar to the one called ``title``.

    Parameters
    ----------
    title : str
        Title to look up; compared for exact equality with ``indices['tittle']``.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix. Row ``i`` is assumed to belong to the movie
        at position ``i`` of ``df2`` (TODO confirm df2 keeps a 0..n-1 index).
        Defaults to the module-level ``cosine_sim`` that exists when this cell
        is executed.

    Returns
    -------
    pandas.DataFrame
        The ``tittle`` and ``imdb_id`` columns of at most ten rows of ``df2``,
        most similar first. The queried movie itself is never included.

    Raises
    ------
    IndexError
        If no row of ``indices`` has the given title.
    """
    matches = indices.index[indices['tittle'] == title].tolist()
    if not matches:
        raise IndexError(f"No movie titled {title!r} found in the lookup table")
    # If several movies share the title, the first match is used.
    idx = matches[0]

    # Rank every movie by its similarity to the query, highest first.
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # Remove the query by position instead of assuming it sorts first: a movie
    # whose feature row is all zeros has self-similarity 0, and an exact
    # duplicate can tie with it, so slicing off element 0 may drop another
    # movie and leave the query in its own recommendations.
    movie_indices = [position for position, _ in ranked if position != idx][:10]

    # Return the top 10 most similar movies
    return df2[['tittle', "imdb_id"]].iloc[movie_indices]

In [237]:
# Overview-based recommendations (default similarity matrix) for "Benji".
get_recommendations("Benji")

In [238]:
# Overview-based recommendations (default similarity matrix) for 'The Final Destination'.
get_recommendations('The Final Destination')

In [239]:
# Turn the stringified list columns back into Python objects.
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for column_name in features:
    # literal_eval parses the text as a Python literal without running code.
    df2[column_name] = df2[column_name].apply(literal_eval)

In [240]:
# Extract the director's name from a crew list; NaN when no entry has the 'Director' job
def get_director(x):
    """Return the 'name' of the first entry in x whose 'job' is 'Director'.

    x is an iterable of dict-like crew entries. When none of them is a
    director (or x is empty) the result is np.nan.
    """
    directors = (member['name'] for member in x if member['job'] == 'Director')
    # next() stops at the first hit and falls back to NaN when there is none.
    return next(directors, np.nan)

In [241]:
# Returns at most the first 3 names of a list (the whole list when it has 3 or fewer).
def get_list(x):
    """Collect the 'name' of each entry in x and keep at most the first three.

    Anything that is not a list (missing or malformed data) gives [].
    """
    if not isinstance(x, list):
        return []
    # Read every entry's name first, then truncate; slicing a shorter list
    # simply returns all of it.
    return [entry['name'] for entry in x][:3]

In [242]:
# Build the 'director' column from the crew lists, then reduce the cast,
# keywords and genres columns to short lists of names.
df2['director'] = df2['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for column_name in features:
    df2[column_name] = df2[column_name].apply(get_list)

In [243]:
# Show the derived feature columns for the first 3 films.
df2.loc[:, ['tittle', 'cast', 'director', 'keywords', 'genres']].head(3)

In [244]:
# Lower-case a name (or each name in a list) and remove its spaces
def clean_data(x):
    """Normalise names so that e.g. 'Tom Hanks' becomes the single token 'tomhanks'.

    A list is cleaned element by element, a string is cleaned directly, and
    any other value (such as the NaN of a missing director) becomes ''.
    """
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    return ''

In [245]:
# Run clean_data over each of the text feature columns.
features = ['cast', 'keywords', 'director', 'genres']

for column_name in features:
    df2[column_name] = df2[column_name].apply(clean_data)

In [246]:
def create_soup(x):
    """Concatenate keywords, cast, director and genres into one space-separated string.

    x['keywords'], x['cast'] and x['genres'] are iterables of strings;
    x['director'] is a single string.
    """
    pieces = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(pieces)
# One 'soup' string per movie, built row by row with create_soup().
df2 = df2.assign(soup=df2.apply(create_soup, axis=1))

In [247]:
# Token-count matrix over the 'soup' strings.
from sklearn.feature_extraction.text import CountVectorizer

# English stop words are dropped before counting.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df2['soup'])

In [248]:
# Cosine similarity between every pair of rows of count_matrix.
from sklearn.metrics.pairwise import cosine_similarity

# With a single argument cosine_similarity compares the matrix with itself.
cosine_sim2 = cosine_similarity(count_matrix)

In [249]:
# Renumber the rows 0..n-1 and rebuild the lookup table from the result.
# drop=True discards the old labels instead of inserting them as an extra
# 'index' column; without it a second run of this cell fails because that
# column already exists.
df2 = df2.reset_index(drop=True)
indices = df2[["tittle", "id","imdb_id"]].drop_duplicates()

In [317]:
# Recommendations from the keyword/cast/director/genre similarity matrix.
get_recommendations("X-Men: First Class", cosine_sim2)

In [312]:
# Recommendations from the keyword/cast/director/genre similarity matrix.
get_recommendations("Spider-Man 3", cosine_sim2)

In [253]:
# Save the lookup table; its row labels are written as the first CSV column.
indices.to_csv('results.csv')

In [254]:
# Read the saved lookup table back in.
out = pd.read_csv("./results.csv")

In [261]:
# Function that takes in an IMDb id as input and outputs most similar movies
def get_recommendations_id(imbdId, cosine_sim=cosine_sim):
    """Return the movies most similar to the one with the given IMDb id.

    Parameters
    ----------
    imbdId : str
        IMDb identifier; compared for exact equality with ``indices['imdb_id']``.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix. Row ``i`` is assumed to belong to the movie
        at position ``i`` of ``df2`` (TODO confirm df2 keeps a 0..n-1 index).
        Defaults to the module-level ``cosine_sim`` that exists when this cell
        is executed.

    Returns
    -------
    pandas.DataFrame
        The ``tittle`` and ``imdb_id`` columns of at most ten rows of ``df2``,
        most similar first. The queried movie itself is never included.

    Raises
    ------
    IndexError
        If no row of ``indices`` has the given IMDb id.
    """
    matches = indices.index[indices['imdb_id'] == imbdId].tolist()
    if not matches:
        raise IndexError(f"No movie with imdb_id {imbdId!r} found in the lookup table")
    # If the id occurs more than once, the first match is used.
    idx = matches[0]

    # Rank every movie by its similarity to the query, highest first.
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # Remove the query by position instead of assuming it sorts first: a movie
    # whose feature row is all zeros has self-similarity 0, and an exact
    # duplicate can tie with it, so slicing off element 0 may drop another
    # movie and leave the query in its own recommendations.
    movie_indices = [position for position, _ in ranked if position != idx][:10]

    # Return the top 10 most similar movies
    return df2[['tittle', "imdb_id"]].iloc[movie_indices]

In [1]:
# NOTE(review): this cell has execution count 1 and its recorded output is a
# NameError, so it looks like it was run before the cell defining
# get_recommendations_id; confirm by re-running the notebook top to bottom.
get_recommendations_id("tt0241527", cosine_sim2)

NameError: name 'get_recommendations_id' is not defined

In [272]:
# Reload the saved lookup table; index_col=False stops pandas from using the
# first column as the index.
out = pd.read_csv("./results.csv", index_col=False)
out.head(4)

In [293]:
# Keep only the title column and key the frame by IMDb id.
# Done as one chained expression returning a new frame. The earlier form
# renamed a column that was dropped on the next line, called
# reset_index(drop=True) without keeping its result, and then mutated a
# column slice in place.
out = out[["tittle", "imdb_id"]].set_index('imdb_id')

out.head(2)

In [294]:
import json
# orient='columns' serialises the frame as {"tittle": {<index label>: <title>, ...}},
# so the parsed object can be indexed by column name. orient='records' would
# give a JSON array instead, and indexing that with "tittle" raises TypeError.
js_index = out.to_json(orient = 'columns')
titles = json.loads(js_index)["tittle"]
# Invert the mapping to title -> index label; if a title occurs more than
# once, the last occurrence wins.
new_dict = {value: key for key, value in titles.items()}
# # new_dict
# with open('output.json', 'w') as outfile:
#     json.dump(new_dict, outfile)

In [299]:
import json
# Column-oriented JSON: {"tittle": {<index label>: <title>, ...}}
js_index = out.to_json(orient = 'columns')
titles = json.loads(js_index)["tittle"]
# Reverse lookup (title -> index label); built here but not written to disk.
new_dict = {title: label for label, title in titles.items()}
# Write the index-label -> title mapping to disk.
with open('output_id.json', 'w') as handle:
    json.dump(titles, handle)

# titles


In [300]:
import pickle

In [305]:
# Save the similarity matrix held in cosine_sim to disk ...
with open('cosine_sim.pkl', 'wb') as fp:
    pickle.dump(cosine_sim, fp)
# ... and read it straight back. pickle.load can execute code embedded in the
# file, so only load pickles produced by a trusted source such as this notebook.
with open('cosine_sim.pkl', 'rb') as fp:
    banana = pickle.load(fp)

In [306]:
# Query by IMDb id using the matrix that was reloaded from the pickle file.
get_recommendations_id("tt1074638", banana)

In [308]:
# First rows of the exported lookup frame.
out.head()