In [46]:
import json
from functools import partial

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [122]:
# Reading movie dataset from csv files
credit = pd.read_csv('../DataSet/tmdb_5000_credits.csv')
movie = pd.read_csv('../DataSet/tmdb_5000_movies.csv')

credit.columns = ['id', 'tittle', 'cast', 'crew']
all_movies = movie.merge(credit, on='id')  # merging movie and credit data set based on ID
del movie
del credit


In [48]:
# Change the given string format of release_date to datetime format
all_movies["release_date"] = pd.to_datetime(all_movies['release_date'])
all_movies['release_year'] = all_movies['release_date'].dt.year
all_movies['release_month'] = all_movies['release_date'].dt.month_name()
del all_movies["release_date"]

In [49]:
# Change all columns that have json string into json format and
# eliminate ID since ID is not necessary for users

json_columns = {'cast', 'crew', 'genres', 'keywords', 'production_countries',
                'production_companies', 'spoken_languages'}

In [126]:
for c in json_columns:
    all_movies[c] = all_movies[c].apply(json.loads)
    if c != "crew":  # We need other information more than just the name
        all_movies[c] = all_movies[c].apply(lambda row: [i["name"] for i in row])

In [133]:
# Create director writer and producer columns from crew column of the data set
def get_role(role, row):
    person_name = [i['name'] for i in row if i['job'] == role]
    person_name[0] if len(person_name) else np.nan
    return person_name[0] if len(person_name) else np.nan




In [128]:
all_movies["director"] = all_movies["crew"].apply(partial(get_role, "Director"))
all_movies["writer"] = all_movies["crew"].apply(partial(get_role, "Writer"))
all_movies["producer"] = all_movies["crew"].apply(partial(get_role, "Producer"))
del all_movies["crew"]
all_movies[["director", "writer", "producer"]]

Unnamed: 0,director,writer,producer
0,James Cameron,James Cameron,James Cameron
1,Gore Verbinski,,Jerry Bruckheimer
2,Sam Mendes,,Barbara Broccoli
3,Christopher Nolan,,Charles Roven
4,Andrew Stanton,,Colin Wilson
...,...,...,...
4798,Robert Rodriguez,,Robert Rodriguez
4799,Edward Burns,Edward Burns,Edward Burns
4800,Scott Smith,Martha Williamson,Harvey Kahn
4801,Daniel Hsia,Daniel Hsia,


In [53]:
# Create profit column
# all_movies["profit"] = all_movies["revenue"] - all_movies["budget"]

# Fill the missing values with the most frequent value using fillna method from pandas library
for col in ["runtime", "release_year", "release_month"]:
    all_movies[col] = all_movies[col].fillna(all_movies[col].mode().iloc[0])


In [54]:
all_movies['title']

0                                         Avatar
1       Pirates of the Caribbean: At World's End
2                                        Spectre
3                          The Dark Knight Rises
4                                    John Carter
                          ...                   
4798                                 El Mariachi
4799                                   Newlyweds
4800                   Signed, Sealed, Delivered
4801                            Shanghai Calling
4802                           My Date with Drew
Name: title, Length: 4803, dtype: object

In [55]:
all_movies[['title','cast']]

Unnamed: 0,title,cast
0,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weave..."
1,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley, ..."
2,Spectre,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R..."
3,The Dark Knight Rises,"[Christian Bale, Michael Caine, Gary Oldman, A..."
4,John Carter,"[Taylor Kitsch, Lynn Collins, Samantha Morton,..."
...,...,...
4798,El Mariachi,"[Carlos Gallardo, Jaime de Hoyos, Peter Marqua..."
4799,Newlyweds,"[Edward Burns, Kerry Bishé, Marsha Dietlein, C..."
4800,"Signed, Sealed, Delivered","[Eric Mabius, Kristin Booth, Crystal Lowe, Geo..."
4801,Shanghai Calling,"[Daniel Henney, Eliza Coupe, Bill Paxton, Alan..."


In [56]:
lemmatizer = WordNetLemmatizer()  # Performing a word formation process by usingWordNetLemmatizer from NLTK library

In [57]:
# Data pre-processing
def preprocess_sentences(text):
    text = text.lower()
    temp_sentences = []
    words = nltk.word_tokenize(text)
    tags = nltk.pos_tag(words)
    for i, word in enumerate(words):
        if tags[i][1] in VERB_CODES:
            lemma_sentences = lemmatizer.lemmatize(word, 'v')
        else:
            lemma_sentences = lemmatizer.lemmatize(word)
        if lemma_sentences not in stop_words and lemma_sentences.isalpha():
            temp_sentences.append(lemma_sentences)

    final_sentences = ' '.join(temp_sentences)
    final_sentences = final_sentences.replace("n't", " not")
    final_sentences = final_sentences.replace("'m", " am")
    final_sentences = final_sentences.replace("'s", " is")
    final_sentences = final_sentences.replace("'re", " are")
    final_sentences = final_sentences.replace("'ll", " will")
    final_sentences = final_sentences.replace("'ve", " have")
    final_sentences = final_sentences.replace("'d", " would")

    return final_sentences

In [58]:
# Pre-processing the movie overview(plot summary) by using NTLK processing techniques

movie_data['overview'] = movie_data['overview'].fillna('')
movie_data['overview']
stop_words = set(stopwords.words('english'))
VERB_CODES = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}
# stop_words = TfidfVectorizer(stop_words='english')
movie_data['overview_preprocessed'] = movie_data['overview'].apply(preprocess_sentences)
movie_data.head(2)


Unnamed: 0,title,overview,overview_preprocessed
0,Avatar,"In the 22nd century, a paraplegic Marine is di...",century paraplegic marine dispatch moon pandor...
1,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",captain barbossa long believe dead come back l...


In [33]:
# CosineSimilarity algorithm is used to calculate the similarity scores of movies
movie_data['overview_preprocessed'] = movie_data['overview_preprocessed'].fillna('')
# Vectorizing pre-processed the movie overview using TF-IDF Vecorization
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(movie_data['overview_preprocessed'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
indices = pd.Series(movie_data.index, index=movie_data['title']).drop_duplicates()
tfidf_matrix.shape

(4803, 16859)

In [34]:
# Cosine Similarity algorithm to find similar movies with additional features
features = ['cast', 'genres']
top_num = 5  # leave only top values for some features
for feature in features:
    movie_data[feature] = movie_data[feature].apply(lambda x: x[:top_num] if isinstance(x, list) else [])


In [35]:
# Remove spaces for some features
def data_cleaning(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    elif isinstance(x, str):
        return str.lower(x.replace(" ", ""))
    else:
        return ''

In [36]:
# Clean the data
features = ['cast', 'genres']

for feature in features:
    movie_data[feature] = movie_data[feature].apply(data_cleaning)


In [37]:
# Create the combined features column to the dataset
def combined_features(x):
    return ' '.join([' '.join(x[f]) if isinstance(x[f], list) else str(x[f]) for f in features])


In [38]:
movie_data["combined_features"] = movie_data.apply(combined_features, axis=1)
count = CountVectorizer(stop_words='english')  # CountVectorizer will be used to remove stop_words
count_vector = count.fit_transform(movie_data['combined_features'])
cosine_sim2 = cosine_similarity(count_vector, count_vector)
movie_df = movie_data.reset_index()
indices = pd.Series(movie_df.index, index=movie_df['title'])
count_vector.shape


(4803, 9685)

In [103]:
# Building Recommendation Function
def get_recommendations(title, cosine_sim=cosine_sim):
    try:
        idx = indices[title]
        sim_scores = list(enumerate(cosine_sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:10]
        movie_indices = [i[0] for i in sim_scores]
        movie_similarity = [i[1] for i in sim_scores]

        return pd.DataFrame(zip(movie_data['title'].iloc[movie_indices], movie_similarity),
                            columns=["Title", "Similarity Score"])
    except KeyError:
        print("Invalid Movie Name. Please Type the Correct Movie Name.")
        return pd.DataFrame()

In [101]:
get_recommendations("Spider-Man")

Unnamed: 0,Title,Similarity Score
0,Arachnophobia,0.256248
1,The Amazing Spider-Man,0.227041
2,Spider-Man 3,0.223592
3,The Amazing Spider-Man 2,0.214733
4,Spider-Man 2,0.203867


In [104]:
get_recommendations("Spirited Away",cosine_sim2)

Unnamed: 0,Title,Similarity Score
0,Pokémon: Spell of the Unknown,0.471405
1,The Polar Express,0.444444
2,How to Train Your Dragon,0.444444
3,Epic,0.444444
4,Arthur and the Invisibles,0.444444
5,Thunder and the House of Magic,0.444444
6,Return to Never Land,0.444444
7,Shrek Forever After,0.421637
8,Shrek the Third,0.421637


In [105]:
get_recommendations("The Avengers")

Unnamed: 0,Title,Similarity Score
0,Avengers: Age of Ultron,0.159883
1,Thank You for Smoking,0.117552
2,Timecop,0.115029
3,The Art of War,0.108845
4,This Thing of Ours,0.105487
5,Night at the Museum: Secret of the Tomb,0.104731
6,Wall Street: Money Never Sleeps,0.102638
7,Are We There Yet?,0.102191
8,The Corruptor,0.101023
