# Movie Recommendation

## IMDB movie recommendation notebook

#### Author/s: Ruturaj Kiran Vaidya
(Add you name if you edit/improve)

In [41]:
# Imports

# Genral

import pandas as pd
import numpy as np
import random

# graphs

from plotly import graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

# Model
# sklearn
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering

In [2]:
train = pd.read_json("MovieDetails.json")
test = pd.read_json("TestMovieDetails.json")

In [3]:
train.keys()

Index(['Title', 'Year', 'Rated', 'Released', 'Runtime', 'Genre', 'Director',
       'Writer', 'Actors', 'Plot', 'Language', 'Country', 'Awards', 'Poster',
       'Ratings', 'Metascore', 'imdbRating', 'imdbVotes', 'imdbID', 'Type',
       'DVD', 'BoxOffice', 'Production', 'Website', 'Response', 'totalSeasons',
       'Season', 'Episode', 'seriesID'],
      dtype='object')

In [4]:
test.keys()

Index(['Title', 'Year', 'Rated', 'Released', 'Runtime', 'Genre', 'Director',
       'Writer', 'Actors', 'Plot', 'Language', 'Country', 'Awards', 'Poster',
       'Ratings', 'Metascore', 'imdbRating', 'imdbVotes', 'imdbID', 'Type',
       'DVD', 'BoxOffice', 'Production', 'Website', 'Response', 'totalSeasons',
       'Error', 'Season', 'Episode', 'seriesID'],
      dtype='object')

In [5]:
del test["Error"]

In [6]:
test.head()

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,...,Type,DVD,BoxOffice,Production,Website,Response,totalSeasons,Season,Episode,seriesID
0,The Guardian,2006,PG-13,29 Sep 2006,139 min,"Action, Adventure, Drama",Andrew Davis,Ron L. Brinkerhoff,"Kevin Costner, Ashton Kutcher, Sela Ward, Meli...",A high school swim champion with a troubled pa...,...,movie,23 Jan 2007,"$54,983,983",Buena Vista Pictures,,True,,,,
1,The Dresser,1983,PG,06 Dec 1983,118 min,Drama,Peter Yates,"Ronald Harwood (screenplay), Ronald Harwood (b...","Albert Finney, Tom Courtenay, Edward Fox, Zena...",An effeminate personal assistant of a deterior...,...,movie,06 Apr 2004,,Sony Pictures Entertainment,,True,,,,
2,Lackawanna Blues,2005,PG-13,12 Feb 2005,95 min,"Drama, Music",George C. Wolfe,"Ruben Santiago-Hudson (play), Ruben Santiago-H...","S. Epatha Merkerson, Marcus Carl Franklin, Yas...","In a story fueled by rhythm and blues, a young...",...,movie,23 Aug 2005,,HBO Films,,True,,,,
3,Darling Lili,1970,G,24 Jun 1970,136 min,"Comedy, Drama, Musical, Romance, War",Blake Edwards,"Blake Edwards, William Peter Blatty","Julie Andrews, Rock Hudson, Jeremy Kemp, Lance...","Set during World War I, this movie is a cute s...",...,movie,25 Oct 2005,,Paramount,,True,,,,
4,Acrobatty Bunny,1946,APPROVED,29 Jun 1946,8 min,"Family, Animation, Short",Robert McKimson,Warren Foster (story),Mel Blanc,When the circus arrives they put the lion's ca...,...,movie,,,,,True,,,,


In [7]:
train.head()

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,...,Type,DVD,BoxOffice,Production,Website,Response,totalSeasons,Season,Episode,seriesID
0,Futz,1969,NOT RATED,16 Nov 1969,92 min,Comedy,Tom O'Horgan,"Rochelle Owens (play), Joseph Stefano (screenp...","Seth Allen, John Bakos, Mari-Claire Charba, Pe...",Sexual satire about a young farmer who has had...,...,movie,,,,,True,,,,
1,Stanley & Iris,1990,PG-13,09 Feb 1990,104 min,"Drama, Romance",Martin Ritt,"Pat Barker (novel), Harriet Frank Jr. (screenp...","Jane Fonda, Robert De Niro, Swoosie Kurtz, Mar...",A struggling widow falls in love with an illit...,...,movie,13 Jan 2004,,MGM Home Entertainment,,True,,,,
2,Puss in Boots,1988,G,10 Jun 1988,96 min,"Fantasy, Musical, Family",Eugene Marner,"Charles Perrault (fairy tale ""Le chat botté""),...","Christopher Walken, Jason Connery, Carmela Mar...",A cat belonging to a poor miller's son thinks ...,...,movie,09 Aug 2005,,MGM,,True,,,,
3,#DUPE#,1972,,,,,,,,,...,movie,,,,,True,,,,
4,Who Done It?,1949,,03 Mar 1949,16 min,"Comedy, Short",Edward Bernds,"Edward Bernds (story), Edward Bernds (screenplay)","Shemp Howard, Larry Fine, Moe Howard, Christin...",The stooges are private detectives looking for...,...,movie,,,,,True,,,,


In [8]:
combined = pd.concat([train, test], ignore_index=True)
movies = combined[["imdbID", "Title", "Ratings", "imdbRating", "imdbVotes", "Language", "Genre"]].dropna()
movies.drop_duplicates(subset=['imdbID'], keep=False, inplace=True)

In [9]:
movies.tail()

Unnamed: 0,imdbID,Title,Ratings,imdbRating,imdbVotes,Language,Genre
7029,tt0258760,Lammbock,"[{'Source': 'Internet Movie Database', 'Value'...",7.3,10180,German,Comedy
7030,tt0146455,Babylon 5: A Call to Arms,"[{'Source': 'Internet Movie Database', 'Value'...",7.2,4976,English,"Action, Adventure, Drama, Sci-Fi"
7031,tt0432047,Sarkar,"[{'Source': 'Internet Movie Database', 'Value'...",7.7,14199,"Hindi, Marathi","Crime, Drama"
7032,tt0109515,Cyber Tracker,"[{'Source': 'Internet Movie Database', 'Value'...",3.5,971,English,"Action, Sci-Fi"
7033,tt0485161,Totally Awesome,"[{'Source': 'Internet Movie Database', 'Value'...",5.2,1625,English,Comedy


In [10]:
movies.shape

(6914, 7)

### Encoding genres

In [11]:
# split genres and printing
genres = set([j for i in [i.split(",") for i in movies['Genre']] for j in i])
print(genres)
print(len(genres))

{' Animation', 'Fantasy', 'Game-Show', 'Sci-Fi', ' Horror', 'Musical', 'Adult', 'War', 'Action', ' Drama', 'Music', ' Fantasy', 'Crime', ' Sci-Fi', ' Mystery', ' War', 'Drama', 'Horror', ' Talk-Show', ' Romance', 'Romance', ' Music', ' Film-Noir', ' Action', ' Game-Show', ' Biography', 'Short', ' Comedy', 'Family', 'Film-Noir', 'Adventure', 'Biography', 'News', ' Family', 'Documentary', ' Musical', ' History', 'Western', ' Reality-TV', ' Sport', ' Crime', ' Western', ' Documentary', 'Talk-Show', ' Adult', 'Reality-TV', 'Thriller', 'History', 'Sport', 'Mystery', 'Comedy', ' Adventure', 'Animation', ' News', ' Short', ' Thriller', 'N/A'}
57


In [12]:
# Add columns based on new generes
genres_dic = {j:[1 if j in i else 0 for i in movies["Genre"]] for j in genres}
# Plotting movies per genre
data = [go.Bar(
            x=list(genres_dic.keys()),
            y=[sum(i) for i in genres_dic.values()]
    )]
fig = go.Figure(data=data)
fig.update_traces(marker_color='rgb(158,202,225)', marker_line_color='rgb(8,48,107)',
                  marker_line_width=1.5, opacity=0.6)
fig.update_layout(
    title="Movies per Genre",
    xaxis_title="Genres",
    yaxis_title="Number of Movies",
)

iplot(fig)
# There is an N/A genre - removing
del genres_dic['N/A']

### Similarly encoding languages

In [13]:
# split genres and printing
languages = set([j for i in [i.split(",") for i in movies['Language']] for j in i])
print(len(languages))

187


In [14]:
# Add columns based on new generes
languages_dic = {j:[1 if j in i else 0 for i in movies["Language"]] for j in languages}
# Plotting movies per genre
data = [go.Bar(
            x=list(languages_dic.keys()),
            y=[sum(i) for i in languages_dic.values()]
    )]
fig = go.Figure(data=data)
fig.update_traces(marker_color='rgb(158,202,225)', marker_line_color='rgb(8,48,107)',
                  marker_line_width=1.5, opacity=0.6)
fig.update_layout(
    title="Movies per Language",
    xaxis_title="Languages",
    yaxis_title="Number of Movies",
)

iplot(fig)

### Above graph is interactive - and I know there are so many English movies!
### Hence we will not select languages for our analysis

In [15]:
# Now we will add encoded generes and languages
for k,v in genres_dic.items():
    movies[k]=v
#for k,v in languages_dic.items():
#    movies[k]=v
# Dropping rows with no genres listed
movies = movies[movies.Genre != "N/A"]
# We don't need genres column anymore
del movies["Genre"]
del movies["Language"]

# Also converting imdb votes to numeric
movies["imdbVotes"] = movies['imdbVotes'].str.replace(",", "").astype(int)

movies.tail()

Unnamed: 0,imdbID,Title,Ratings,imdbRating,imdbVotes,Animation,Fantasy,Game-Show,Sci-Fi,Horror,...,Thriller,History,Sport,Mystery,Comedy,Adventure,Animation.1,News,Short,Thriller.1
7029,tt0258760,Lammbock,"[{'Source': 'Internet Movie Database', 'Value'...",7.3,10180,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
7030,tt0146455,Babylon 5: A Call to Arms,"[{'Source': 'Internet Movie Database', 'Value'...",7.2,4976,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
7031,tt0432047,Sarkar,"[{'Source': 'Internet Movie Database', 'Value'...",7.7,14199,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7032,tt0109515,Cyber Tracker,"[{'Source': 'Internet Movie Database', 'Value'...",3.5,971,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7033,tt0485161,Totally Awesome,"[{'Source': 'Internet Movie Database', 'Value'...",5.2,1625,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


### Using K-means clustering

In [16]:
# Here I am selecting ratings, votes, genres and languages to train
X = movies.replace([np.inf, -np.inf], np.nan).dropna().iloc[:,3:].to_numpy()
# I am selecting 50 clusters based on number of languages, but this can be changed later
# I roughly selected the number of clusters, but this can be changed for e.g. based on genres
kmeans = KMeans(n_clusters=15, random_state=4).fit(X)

### Using DBSCAN

In [38]:
dbscan = DBSCAN(min_samples=2).fit(X)

In [46]:
print(np.unique(dbscan.labels_))
dbscan_dic = {i:[] for i in dbscan.labels_}
[dbscan_dic[i].append(list(movies["Title"])[c]) for c, i in enumerate(dbscan.labels_)]
print([len(i) for i in dbscan_dic.values()])

[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
 47]
[6482, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]


Looks like there are too many outlier movies for dbscan to work.

### Agglomerative Clustering

In [53]:
ac = AgglomerativeClustering(n_clusters = 25).fit(X)

In [54]:
print(np.unique(ac.labels_))
ac_dic = {i:[] for i in ac.labels_}
[ac_dic[i].append(list(movies["Title"])[c]) for c, i in enumerate(ac.labels_)]
print([len(i) for i in ac_dic.values()])

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24]
[4398, 1111, 209, 54, 73, 11, 344, 1, 168, 51, 11, 3, 25, 4, 26, 37, 15, 15, 7, 8, 1, 5, 1, 1, 1]


### Recommendation Function

In [17]:
comb_dic = {i:[] for i in kmeans.labels_}
[comb_dic[i].append(list(movies["Title"])[c]) for c, i in enumerate(kmeans.labels_)]
print([len(i) for i in comb_dic.values()])

[5433, 583, 211, 116, 30, 5, 55, 15, 74, 5, 35, 11, 4, 2, 1]


In [55]:
# Simplest function - note that it may give same value as recommendation as it randomly selects the movie
# This can be changed later
def movie_recommendation(x,model,dic):
    return random.choice(dic[model.labels_[list(movies["Title"]).index(x)]])

print(movie_recommendation("Totally Awesome",kmeans,comb_dic))
print(movie_recommendation("Totally Awesome",dbscan,dbscan_dic))
print(movie_recommendation("Totally Awesome",ac,ac_dic))

Buffalo Bill
Four Eyed Monsters
Central Airport


In [61]:
print(movie_recommendation("Star Wars: Episode VI - Return of the Jedi",kmeans,comb_dic))
print(movie_recommendation("Star Wars: Episode VI - Return of the Jedi",dbscan,dbscan_dic))
print(movie_recommendation("Star Wars: Episode VI - Return of the Jedi",ac,ac_dic))

A Beautiful Mind
Lammbock
Star Wars: Episode VI - Return of the Jedi


In [63]:
print(movie_recommendation("Good Will Hunting",kmeans,comb_dic))
print(movie_recommendation("Good Will Hunting",dbscan,dbscan_dic))
print(movie_recommendation("Good Will Hunting",ac,ac_dic))

Star Wars: Episode VI - Return of the Jedi
Britney & Kevin: Chaotic
A Beautiful Mind
