# IMDb Movies similarity from key words

In [1]:
# Import neccessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import table

In [2]:
# Import movies table and set the index
movies = pd.read_csv('../dataset/movie_info.csv')
#movies['genre'] = [genre.split("|") for genre in movies['genre']]
#movies['key words'] = [genre.split("|") for genre in movies['key words']]

In [3]:
movies.head(10)

Unnamed: 0,title,release year,genre,key words,plot,run time /min,number of votes,rating
0,Sharknado,2013,Action|Adventure|Comedy|Horror|Sci-Fi|Thriller,school-bus|chainsaw|psychotronic-film|hurrican...,"When a freak hurricane swamps Los Angeles, nat...",86,44265,3.3
1,The Normal Heart,2014,Drama|History|Romance,gay|gay-men's-health-crisis|hiv|aids-epidemic|...,A gay activist attempts to raise H.I.V. and A....,132,31793,7.9
2,The Sunset Limited,2011,Drama,minimal-cast|dialogue-between-two-characters|s...,"Through a chance encounter, two men of opposin...",91,26733,7.4
3,Temple Grandin,2010,Biography|Drama,cattle|animal-husbandry|livestock|autism|feedi...,"A biopic of Temple Grandin, an autistic woman ...",107,25551,8.3
4,You Don't Know Jack,2010,Biography|Drama,suicide|assisted-suicide|moral-dilemma|moralit...,A look at the life and work of doctor-assisted...,134,25404,7.6
5,Game Change,2012,Biography|Drama|History,american-politics|cell-phone|down-syndrome|pre...,Governor Sarah Palin of Alaska becomes Senator...,118,20092,7.4
6,Princess Protection Program,2009,Comedy|Drama|Family,princess|dictator|teenager|protection|louisian...,A princess whose country has been invaded goes...,90,20083,5.6
7,Mean Girls 2,2011,Comedy,sexiness|dream-girl|short-skirt|miniskirt|teen...,The Plastics are back in the long-awaited foll...,96,19579,4.1
8,The Wizard of Lies,2017,Biography|Crime|Drama,financial-fraud|financier|ponzi-scheme|fraudst...,"The fall of Bernie Madoff, whose Ponzi scheme ...",133,19471,6.8
9,Sharknado 2: The Second One,2014,Action|Adventure|Comedy|Horror|Sci-Fi|Thriller,second-part|john-f.-kennedy-international-airp...,Fin and April are on their way to New York Cit...,95,17067,4.0


For the time being, let's drop the plot column so that we can focus on calculating similarity that only uses keywords.

In [4]:
movies.drop(labels=['plot'], axis='columns', inplace=True) 

In [5]:
movies.head(3)

Unnamed: 0,title,release year,genre,key words,run time /min,number of votes,rating
0,Sharknado,2013,Action|Adventure|Comedy|Horror|Sci-Fi|Thriller,school-bus|chainsaw|psychotronic-film|hurrican...,86,44265,3.3
1,The Normal Heart,2014,Drama|History|Romance,gay|gay-men's-health-crisis|hiv|aids-epidemic|...,132,31793,7.9
2,The Sunset Limited,2011,Drama,minimal-cast|dialogue-between-two-characters|s...,91,26733,7.4


### Deep dive into keywords

The preprocessing steps for text are as follows:
1. Lowercase the words
2. Take .isalpha() words
3. Remove Stop Words
4. Lemmatize

In our case, we will lowercase the words although it's not really necessary since they look all lowercase. It will be done for certainty.  
We will not take only alpha words because most of the keywords are compound words created with dashes ("-") and taking only alpha words would thus result in us discarding most of the words.  
We will remove stop words for completeness and safety although these are keywords so none should be stopwords.  
We will not lemmatize since doing do changes the meaning of certain keywords. For example, "woods" which indicates the forest, becomes "wood" the material. Or "avengers" becomes "avenger". In both cases, the first words have a meaning that is more than just the plural of the second words. So we will not take this step.

Therefore in this case, we don't need to apply any of these steps to our data.

In [6]:
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.corpora.dictionary import Dictionary

In [7]:
movies['key words']

0       school-bus|chainsaw|psychotronic-film|hurrican...
1       gay|gay-men's-health-crisis|hiv|aids-epidemic|...
2       minimal-cast|dialogue-between-two-characters|s...
3       cattle|animal-husbandry|livestock|autism|feedi...
4       suicide|assisted-suicide|moral-dilemma|moralit...
                              ...                        
3162                                    psychotronic-film
3163                      adolf-hitler|top-secret-project
3164                               hamlet|down's-syndrome
3165            outer-space|docudrama|reenactment|ukraine
3166                                           highlander
Name: key words, Length: 3167, dtype: object

In [8]:
keywords_list = [keywords.split("|") for keywords in movies['key words']]
docs = []
for keywords in keywords_list: 
    doc = []
    for keyword in keywords:
        doc.append(keyword)
    docs = docs + doc
docs

['school-bus',
 'chainsaw',
 'psychotronic-film',
 'hurricane',
 'shark',
 'shark-feature',
 'cult-film',
 'disaster-film',
 'rappelling',
 'disaster',
 'los-angeles-california',
 'father-daughter-relationship',
 'sharksploitation',
 'killer-shark',
 'spoof',
 'horror-spoof',
 'gore',
 'blood-splatter',
 'bitten-hand',
 'eaten-alive',
 'cgi',
 'fisherman',
 'absurdism',
 'flood',
 'exploding-car',
 'scar',
 'flying-fish',
 'explosion',
 'storm',
 'propane-tank',
 'helicopter',
 'california',
 'father-daughter-estrangement',
 'tiger-shark',
 'beverly-hills-california',
 'storm-surge',
 'school-bus-driver',
 'hollywood-sign',
 'waterspout',
 'surfer',
 'beach',
 'shark-attack',
 'tornado',
 'hammerhead-shark',
 'creature-feature',
 'title-spoken-by-character',
 'barstool',
 'trash',
 'gay',
 "gay-men's-health-crisis",
 'hiv',
 'aids-epidemic',
 'new-york-city',
 'gay-writer',
 'fear-of-weakness',
 'playwright',
 'suicide-attempt',
 'gay-slur',
 'brother-brother-relationship',
 'gay-kiss'

In [9]:
# Create functions for making alpha, removing stop words, and lemmatizing
def make_alpha(doc):
    # Retain alphabetic words: alpha_only
    alpha_only = [t for t in doc if t.isalpha()]    
    return(alpha_only)
def remove_stops(doc):
    no_stops = [t for t in doc if t not in stopwords.words('english')]
    return(no_stops)
def lemmatize(doc):
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in doc]
    return(lemmatized)
def no_commas(doc):
    no_commas = [t for t in doc if t!=',']
    return(no_commas)

The code commented out below creates the processed_docs list which is what we use to find similarities.


In [10]:
lowercase_docs = [word_tokenize(doc.lower()) for doc in docs] #tokenize and lowercase

import csv

with open('lowercase.csv', 'w') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerows(lowercase_docs)
    csvFile.close()


with open('lowercase.csv', 'r') as f:
    reader = csv.reader(f)
    lowercase_docs = list(reader)

In [11]:
lowercase_and_no_stop_docs = [remove_stops(doc) for doc in lowercase_docs]

with open('lowercase_and_no_stops.csv', 'w') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerows(lowercase_and_no_stop_docs)
    csvFile.close()

In [12]:
lowercase_nostops_nocommas_docs = [no_commas(doc) for doc in lowercase_and_no_stop_docs]
processed_docs = lowercase_nostops_nocommas_docs

with open('processed_docs.csv', 'w') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerows(processed_docs)
    csvFile.close()

The code below is all the relevent code for the model in one cell.

In [13]:
with open('processed_docs.csv', 'r') as f:
    reader = csv.reader(f)
    processed_docs = list(reader)
    processed_docs = processed_docs[0::2] # get rid of empty lists

dictionary = Dictionary(processed_docs) # create a dictionary of words from our keywords

corpus = [dictionary.doc2bow(doc) for doc in processed_docs] #create corpus where the corpus is a bag of words for each document

from gensim.models.tfidfmodel import TfidfModel

tfidf = TfidfModel(corpus) #create tfidf model of the corpus

import gensim
from gensim.similarities import Similarity
from gensim.similarities import MatrixSimilarity

# Create the similarity data structure. This is the most important part where we get the similarities between the movies.
sims = MatrixSimilarity(tfidf[corpus], num_features=len(dictionary))

In [14]:
dictionary = Dictionary(processed_docs) # create a dictionary of words from our keywords

# Print out first 10 words:
for i in range(len(dictionary))[0:10]:
    print(i, dictionary[i])

0 school-bus
1 psychotronic-film
2 shark
3 cult-film
4 rappelling
5 los-angeles-california
6 sharksploitation
7 spoof
8 gore
9 bitten-hand


In [15]:
#create corpus where the corpus is a bag of words for each document
corpus = [dictionary.doc2bow(doc) for doc in processed_docs] 

In [16]:
# Create the defaultdict: total_word_count
# This dictionary contains every word ID and its corresponding number of times it appears in the corpus
from collections import defaultdict
import itertools
total_word_count = defaultdict(int)
for word_id, word_count in itertools.chain.from_iterable(corpus):
    total_word_count[word_id] += word_count

# Create a sorted list from the defaultdict: sorted_word_count
sorted_word_count = sorted(total_word_count.items(), key=lambda w: w[1], reverse=True)

# Print the top 20 words across all documents alongside the count
for word_id, word_count in sorted_word_count[:20]:
    print(dictionary.get(word_id), word_count)

christmas 133
bare-chested-male 124
f-rated 121
father-son-relationship 100
murder 96
female-nudity 84
party 79
boy 69
based-on-true-story 67
husband-wife-relationship 63
mother-daughter-relationship 63
gift 61
dinner 59
reenactment 59
sex-scene 58
male-nudity 56
female-protagonist 55
father-daughter-relationship 55
mother-son-relationship 54
brother-brother-relationship 49


Clearly a lot of christmas themed movies and relationships in these keywords.

##### 1st Model: Jaccard Similarity Based on Word Counts

Jaccard similarity(df) : intersection of 2 sets divided by the union of those sets. 

The idea of this model:  

***# of common keywords between two movies / # of unique keywords in the union of two movies’ keywords***
 
Then we rank the movies by their similarities and the user can query the top K results for each movie.

In [17]:
def get_jaccard_sim(str1, str2):
    a = set(str1.split('|'))
    b = set(str2.split('|'))
    c = a.intersection(b)
    return(float(len(c)) / (len(a) + len(b) - len(c)))

def keyword_string(movie):
    movie = movies[movies.title==movie]
    keyword_string = movie['key words'].iloc[0]
    
    return(keyword_string)

def get_jaccard_sim2(movie1, movie2):
    keywords1 = keyword_string(movie1)
    keywords2 = keyword_string(movie2)
    
    return(get_jaccard_sim(keywords1, keywords2))

In [18]:
def jaccard_recommender(movie_title, K=5):
    if (len(movies[movies['title']==movie_title])==0):
        print("Sorry, we don't have this movie in our database. But we will take it into consideration in the future, thank you!")
    else:
        movie = movies[movies.title==movie_title]
        keyword_string = movie['key words'].iloc[0]

        jaccards = []
        for movie in movies['key words']:
            jaccards.append(get_jaccard_sim(keyword_string, movie))
        jaccards = pd.Series(jaccards)
        jaccards_index = jaccards.nlargest(K+1).index
        matches = movies.loc[jaccards_index]
        for match,score in zip(matches['title'][1:],jaccards[jaccards_index][1:]) :
            print(match,score )

##### 2nd Model: Cosine Similarity Based on Word Counts

1. use CountVectorizer, Compute word counts for every movie’s keywords (word vectors)

2. Use scikit-learn library, Compute cosine similarity between any word vectors 

Like the 1st model, then we rank the movies by their similarities and the user can query the top K results for each movie.

In [19]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_cosine_sim(*strs):
    vectors = [t for t in get_vectors1(*strs)]
    return(cosine_similarity(vectors))

def get_vectors1(*strs):
    text = [t for t in strs]
    vectorizer = CountVectorizer(text)
    vectorizer.fit(text)
    return(vectorizer.transform(text).toarray())

def get_vectors2(text):
    vectorizer = CountVectorizer(text)
    X = vectorizer.fit_transform(text)
    return(X.toarray())

In [20]:
vectors = get_vectors2(movies['key words'].tolist())

In [21]:
def cosine_recommender(movie_title, K=5):
    if (len(movies[movies['title']==movie_title])==0):
        print("Sorry, we don't have this movie in our database. But we will take it into consideration in the future, thank you!")
    else:
        movie_index = movies[movies.title == movie_title].index.values[0]

        cosines = []
        for i in range(len(vectors)):
            vector_list = [vectors[movie_index], vectors[i]]
            cosines.append(cosine_similarity(vector_list)[0,1])

        cosines = pd.Series(cosines)
        index = cosines.nlargest(K+1).index

        matches = movies.loc[index]
        for match,score in zip(matches['title'][1:],cosines[index][1:]):
            print(match,score )

**Quick comparison for a movie based on different models**

Let's use 'Mean Girls 2' as an example:

In [22]:
movie_title = str(input("which movie you want to search? "))
K = int(input("How many most similarity movies you want to display? "))

jaccard_recommender(movie_title, K)

which movie you want to search? Mean Girls 2
How many most similarity movies you want to display? 10
Sorority Wars 0.23255813953488372
The Christmas Hope 0.1
Camp Rock 2: The Final Jam 0.0967741935483871
Five 0.09090909090909091
Jack Irish: Black Tide 0.06896551724137931
Wedding March 2: Resorting to Love 0.06896551724137931
Journey Into Dyslexia 0.06896551724137931
Tangled 0.06896551724137931
Petite fille 0.06896551724137931
Love at First Glance 0.06818181818181818


In [23]:
movie_title = str(input("which movie you want to search? "))
K = int(input("How many most similarity movies you want to display? "))

jaccard_recommender(movie_title, K)

which movie you want to search? Mean Girls 2
How many most similarity movies you want to display? 10
Sorority Wars 0.23255813953488372
The Christmas Hope 0.1
Camp Rock 2: The Final Jam 0.0967741935483871
Five 0.09090909090909091
Jack Irish: Black Tide 0.06896551724137931
Wedding March 2: Resorting to Love 0.06896551724137931
Journey Into Dyslexia 0.06896551724137931
Tangled 0.06896551724137931
Petite fille 0.06896551724137931
Love at First Glance 0.06818181818181818


Looks good. They all recommend similar appropriate movies for the same movie, with slight differences in recommendation.

What if we input some movie that doesn't exist in the dataset? say input "I am not a movie"

In [24]:
movie_title = str(input("which movie you want to search? "))
K = int(input("How many most similarity movies you want to display? "))

jaccard_recommender(movie_title, K)

which movie you want to search? I am not a movie
How many most similarity movies you want to display? 10
Sorry, we don't have this movie in our database. But we will take it into consideration in the future, thank you!


In [25]:
movie_title = str(input("which movie you want to search? "))
K = int(input("How many most similarity movies you want to display? "))

jaccard_recommender(movie_title, K)

which movie you want to search? I am not a movie
How many most similarity movies you want to display? 10
Sorry, we don't have this movie in our database. But we will take it into consideration in the future, thank you!
