# Movie Recommendation

## IMDB movie recommendation notebook

#### Author/s: Ruturaj Kiran Vaidya
(Add you name if you edit/improve)

In [1]:
# Imports

# Genral

import pandas as pd
import numpy as np
import random

# graphs

from plotly import graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

# Model
# sklearn
from sklearn.cluster import KMeans

In [2]:
train = pd.read_json("TrainMovieDetails.json")
test = pd.read_json("TestMovieDetails.json")

In [3]:
train.keys()

Index(['Actors', 'Awards', 'BoxOffice', 'Country', 'DVD', 'Director',
       'Episode', 'Genre', 'Language', 'Metascore', 'Plot', 'Poster',
       'Production', 'Rated', 'Ratings', 'Released', 'Response', 'Runtime',
       'Season', 'Title', 'Type', 'Website', 'Writer', 'Year', 'imdbID',
       'imdbRating', 'imdbVotes', 'seriesID', 'totalSeasons'],
      dtype='object')

In [4]:
test.keys()

Index(['Actors', 'Awards', 'BoxOffice', 'Country', 'DVD', 'Director',
       'Episode', 'Error', 'Genre', 'Language', 'Metascore', 'Plot', 'Poster',
       'Production', 'Rated', 'Ratings', 'Released', 'Response', 'Runtime',
       'Season', 'Title', 'Type', 'Website', 'Writer', 'Year', 'imdbID',
       'imdbRating', 'imdbVotes', 'seriesID', 'totalSeasons'],
      dtype='object')

In [5]:
del test["Error"]

In [6]:
test.head()

Unnamed: 0,Actors,Awards,BoxOffice,Country,DVD,Director,Episode,Genre,Language,Metascore,...,Title,Type,Website,Writer,Year,imdbID,imdbRating,imdbVotes,seriesID,totalSeasons
0,"Kevin Costner, Ashton Kutcher, Sela Ward, Meli...",1 win & 4 nominations.,"$54,983,983",USA,23 Jan 2007,Andrew Davis,,"Action, Adventure, Drama",English,53.0,...,The Guardian,movie,,Ron L. Brinkerhoff,2006,tt0406816,6.9,84017,,
1,"Albert Finney, Tom Courtenay, Edward Fox, Zena...",Nominated for 5 Oscars. Another 5 wins & 12 no...,,UK,06 Apr 2004,Peter Yates,,Drama,English,,...,The Dresser,movie,,"Ronald Harwood (screenplay), Ronald Harwood (b...",1983,tt0085461,7.7,4285,,
2,"S. Epatha Merkerson, Marcus Carl Franklin, Yas...",Won 1 Golden Globe. Another 29 wins & 41 nomin...,,USA,23 Aug 2005,George C. Wolfe,,"Drama, Music",English,,...,Lackawanna Blues,movie,,"Ruben Santiago-Hudson (play), Ruben Santiago-H...",2005,tt0407936,7.9,2150,,
3,"Julie Andrews, Rock Hudson, Jeremy Kemp, Lance...",Nominated for 3 Oscars. Another 1 win & 4 nomi...,,USA,25 Oct 2005,Blake Edwards,,"Comedy, Drama, Musical, Romance, War","English, German",,...,Darling Lili,movie,,"Blake Edwards, William Peter Blatty",1970,tt0065611,6.1,1826,,
4,Mel Blanc,,,USA,,Robert McKimson,,"Family, Animation, Short",English,,...,Acrobatty Bunny,movie,,Warren Foster (story),1946,tt0038286,7.3,663,,


In [7]:
train.head()

Unnamed: 0,Actors,Awards,BoxOffice,Country,DVD,Director,Episode,Genre,Language,Metascore,...,Title,Type,Website,Writer,Year,imdbID,imdbRating,imdbVotes,seriesID,totalSeasons
0,"Seth Allen, John Bakos, Mari-Claire Charba, Pe...",,,USA,,Tom O'Horgan,,Comedy,English,,...,Futz,movie,,"Rochelle Owens (play), Joseph Stefano (screenp...",1969,tt0064354,4.8,50.0,,
1,"Jane Fonda, Robert De Niro, Swoosie Kurtz, Mar...",,,"USA, Canada",13 Jan 2004,Martin Ritt,,"Drama, Romance",English,,...,Stanley & Iris,movie,,"Pat Barker (novel), Harriet Frank Jr. (screenp...",1990,tt0100680,6.3,6502.0,,
2,"Christopher Walken, Jason Connery, Carmela Mar...",,,USA,09 Aug 2005,Eugene Marner,,"Fantasy, Musical, Family",English,,...,Puss in Boots,movie,,"Charles Perrault (fairy tale ""Le chat botté""),...",1988,tt0177606,6.4,537.0,,
3,,,,,,,,,,,...,#DUPE#,movie,,,1972,tt0068444,,,,
4,"Shemp Howard, Larry Fine, Moe Howard, Christin...",,,USA,,Edward Bernds,,"Comedy, Short",English,,...,Who Done It?,movie,,"Edward Bernds (story), Edward Bernds (screenplay)",1949,tt0042042,8.2,335.0,,


In [8]:
combined = pd.concat([train, test], ignore_index=True)
movies = combined[["imdbID", "Title", "Ratings", "imdbRating", "imdbVotes", "Language", "Genre"]].dropna()
movies.drop_duplicates(subset=['imdbID'], keep=False, inplace=True)

In [9]:
movies.tail()

Unnamed: 0,imdbID,Title,Ratings,imdbRating,imdbVotes,Language,Genre
7029,tt0258760,Lammbock,"[{'Source': 'Internet Movie Database', 'Value'...",7.3,10180,German,Comedy
7030,tt0146455,Babylon 5: A Call to Arms,"[{'Source': 'Internet Movie Database', 'Value'...",7.2,4976,English,"Action, Adventure, Drama, Sci-Fi"
7031,tt0432047,Sarkar,"[{'Source': 'Internet Movie Database', 'Value'...",7.7,14199,"Hindi, Marathi","Crime, Drama"
7032,tt0109515,Cyber Tracker,"[{'Source': 'Internet Movie Database', 'Value'...",3.5,971,English,"Action, Sci-Fi"
7033,tt0485161,Totally Awesome,"[{'Source': 'Internet Movie Database', 'Value'...",5.2,1625,English,Comedy


In [10]:
movies.shape

(6914, 7)

### Encoding genres

In [11]:
# split genres and printing
genres = set([j for i in [i.split(",") for i in movies['Genre']] for j in i])
print(genres)
print(len(genres))

{' Sci-Fi', ' Short', ' Biography', 'Horror', ' Mystery', ' Musical', ' Drama', ' Game-Show', 'Sci-Fi', ' Family', 'Family', 'Biography', 'Documentary', 'Sport', 'War', ' Adventure', ' Adult', ' History', ' Film-Noir', 'Game-Show', ' Crime', ' Reality-TV', ' Action', 'History', 'Adventure', ' Documentary', 'Musical', ' Romance', 'Short', 'Romance', 'Music', 'Fantasy', ' Fantasy', ' Music', ' Horror', ' Sport', 'Action', ' Talk-Show', 'Crime', 'Thriller', ' Thriller', ' War', 'Reality-TV', 'News', 'Talk-Show', 'Mystery', 'Adult', 'Animation', ' Western', 'Drama', 'N/A', 'Film-Noir', 'Western', ' News', ' Animation', 'Comedy', ' Comedy'}
57


In [12]:
# Add columns based on new generes
genres_dic = {j:[1 if j in i else 0 for i in movies["Genre"]] for j in genres}
# Plotting movies per genre
data = [go.Bar(
            x=list(genres_dic.keys()),
            y=[sum(i) for i in genres_dic.values()]
    )]
fig = go.Figure(data=data)
fig.update_traces(marker_color='rgb(158,202,225)', marker_line_color='rgb(8,48,107)',
                  marker_line_width=1.5, opacity=0.6)
fig.update_layout(
    title="Movies per Genre",
    xaxis_title="Genres",
    yaxis_title="Number of Movies",
)

iplot(fig)
# There is an N/A genre - removing
del genres_dic['N/A']

### Similarly encoding languages

In [13]:
# split genres and printing
languages = set([j for i in [i.split(",") for i in movies['Language']] for j in i])
print(len(languages))

187


In [14]:
# Add columns based on new generes
languages_dic = {j:[1 if j in i else 0 for i in movies["Language"]] for j in languages}
# Plotting movies per genre
data = [go.Bar(
            x=list(languages_dic.keys()),
            y=[sum(i) for i in languages_dic.values()]
    )]
fig = go.Figure(data=data)
fig.update_traces(marker_color='rgb(158,202,225)', marker_line_color='rgb(8,48,107)',
                  marker_line_width=1.5, opacity=0.6)
fig.update_layout(
    title="Movies per Language",
    xaxis_title="Languages",
    yaxis_title="Number of Movies",
)

iplot(fig)

### Above graph is interactive - and I know there are so many English movies!
### Hence we will not select languages for our analysis

In [15]:
# Now we will add encoded generes and languages
for k,v in genres_dic.items():
    movies[k]=v
#for k,v in languages_dic.items():
#    movies[k]=v
# Dropping rows with no genres listed
movies = movies[movies.Genre != "N/A"]
# We don't need genres column anymore
del movies["Genre"]
del movies["Language"]

# Also converting imdb votes to numeric
movies["imdbVotes"] = movies['imdbVotes'].str.replace(",", "").astype(int)

movies.tail()

Unnamed: 0,imdbID,Title,Ratings,imdbRating,imdbVotes,Sci-Fi,Short,Biography,Horror,Mystery,...,Adult,Animation,Western,Drama,Film-Noir,Western.1,News,Animation.1,Comedy,Comedy.1
7029,tt0258760,Lammbock,"[{'Source': 'Internet Movie Database', 'Value'...",7.3,10180,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
7030,tt0146455,Babylon 5: A Call to Arms,"[{'Source': 'Internet Movie Database', 'Value'...",7.2,4976,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
7031,tt0432047,Sarkar,"[{'Source': 'Internet Movie Database', 'Value'...",7.7,14199,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
7032,tt0109515,Cyber Tracker,"[{'Source': 'Internet Movie Database', 'Value'...",3.5,971,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7033,tt0485161,Totally Awesome,"[{'Source': 'Internet Movie Database', 'Value'...",5.2,1625,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


### Using K-means clustering

In [16]:
# Here I am selecting ratings, votes, genres and languages to train
X = movies.replace([np.inf, -np.inf], np.nan).dropna().iloc[:,3:].to_numpy()
# I am selecting 50 clusters based on number of languages, but this can be changed later
# I roughly selected the number of clusters, but this can be changed for e.g. based on genres
kmeans = KMeans(n_clusters=15, random_state=4).fit(X)

### Recommendation Function

In [17]:
comb_dic = {i:[] for i in kmeans.labels_}
[comb_dic[i].append(list(movies["Title"])[c]) for c, i in enumerate(kmeans.labels_)]
print([len(i) for i in comb_dic.values()])

[5439, 581, 219, 118, 26, 4, 86, 15, 40, 4, 28, 11, 2, 5, 2]


In [18]:
# Simplest function - note that it may give same value as recommendation as it randomly selects the movie
# This can be changed later
def movie_recommendation(x):
    return random.choice(comb_dic[kmeans.labels_[list(movies["Title"]).index(x)]])

movie_recommendation("Totally Awesome")

'Galaxies Are Colliding'