In [1]:
import pandas as pd
import numpy as np
from scipy.spatial import KDTree
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the movie metadata table from the CSV that sits next to the notebook.
data = pd.read_csv(filepath_or_buffer="movie_dataset.csv")

In [3]:
# Show the first record as a Series to see which columns the table has
# and what a typical value looks like.
data.iloc[0]

index                                                                   0
budget                                                          237000000
genres                           Action Adventure Fantasy Science Fiction
homepage                                      http://www.avatarmovie.com/
id                                                                  19995
keywords                culture clash future space war space colony so...
original_language                                                      en
original_title                                                     Avatar
overview                In the 22nd century, a paraplegic Marine is di...
popularity                                                        150.438
production_companies    [{"name": "Ingenious Film Partners", "id": 289...
production_countries    [{"iso_3166_1": "US", "name": "United States o...
release_date                                                   2009-12-10
revenue                               

In [4]:
# Columns cleaned with fillna below; a subset of them is later merged
# into the text used for similarity.
features = [
    "original_title",
    "cast",
    "keywords",
    "genres",
    "director",
    "release_date",
]

In [5]:
# Replace missing values in every feature column with an empty string so the
# columns can be concatenated as text later on.
data[features] = data[features].fillna('')

In [25]:
def combine_features(row):
    """Join a movie's keywords, cast, genres and director into one text blob.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name, e.g. the row Series that
        ``DataFrame.apply(..., axis=1)`` passes in. It must provide the
        ``keywords``, ``cast``, ``genres`` and ``director`` entries.

    Returns
    -------
    str
        The four fields, in that order, separated by single spaces.
        Missing values (``None`` / ``NaN``) contribute an empty string and
        non-string scalars are converted with ``str()``.

    Raises
    ------
    KeyError
        If one of the four fields is absent from ``row``. This is raised
        instead of being printed and swallowed, because a silently returned
        ``None`` would end up as a missing value in the combined column.
    """
    parts = []
    for field in ("keywords", "cast", "genres", "director"):
        value = row[field]
        if isinstance(value, str):
            parts.append(value)
        elif pd.api.types.is_scalar(value) and pd.isna(value):
            # A missing value cannot be concatenated with str; treat it as empty text.
            parts.append("")
        else:
            parts.append(str(value))
    return " ".join(parts)

# Build one combined text value per movie by calling combine_features on each row.
data["combined_features"] = data.apply(combine_features, axis="columns")

In [26]:
# Display the combined text column to check the concatenation.
data["combined_features"]

0       culture clash future space war space colony so...
1       ocean drug abuse exotic island east india trad...
2       spy based on novel secret agent sequel mi6 Dan...
3       dc comics crime fighter terrorist secret ident...
4       based on novel mars medallion space travel pri...
                              ...                        
4798    united states\u2013mexico barrier legs arms pa...
4799     Edward Burns Kerry Bish\u00e9 Marsha Dietlein...
4800    date love at first sight narration investigati...
4801     Daniel Henney Eliza Coupe Bill Paxton Alan Ru...
4802    obsession camcorder crush dream girl Drew Barr...
Name: combined_features, Length: 4803, dtype: object

In [29]:
# Director names as a raw array.
data["director"].values

array(['James Cameron', 'Gore Verbinski', 'Sam Mendes', ...,
       'Scott Smith', 'Daniel Hsia', 'Brian Herzlinger'], dtype=object)

In [16]:
# Display only the columns listed in `features`.
data[features]

Unnamed: 0,original_title,cast,keywords,genres,director,release_date
0,Avatar,Sam Worthington Zoe Saldana Sigourney Weaver S...,culture clash future space war space colony so...,Action Adventure Fantasy Science Fiction,James Cameron,2009-12-10
1,Pirates of the Caribbean: At World's End,Johnny Depp Orlando Bloom Keira Knightley Stel...,ocean drug abuse exotic island east india trad...,Adventure Fantasy Action,Gore Verbinski,2007-05-19
2,Spectre,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,spy based on novel secret agent sequel mi6,Action Adventure Crime,Sam Mendes,2015-10-26
3,The Dark Knight Rises,Christian Bale Michael Caine Gary Oldman Anne ...,dc comics crime fighter terrorist secret ident...,Action Crime Drama Thriller,Christopher Nolan,2012-07-16
4,John Carter,Taylor Kitsch Lynn Collins Samantha Morton Wil...,based on novel mars medallion space travel pri...,Action Adventure Science Fiction,Andrew Stanton,2012-03-07
...,...,...,...,...,...,...
4798,El Mariachi,Carlos Gallardo Jaime de Hoyos Peter Marquardt...,united states\u2013mexico barrier legs arms pa...,Action Crime Thriller,Robert Rodriguez,1992-09-04
4799,Newlyweds,Edward Burns Kerry Bish\u00e9 Marsha Dietlein ...,,Comedy Romance,Edward Burns,2011-12-26
4800,"Signed, Sealed, Delivered",Eric Mabius Kristin Booth Crystal Lowe Geoff G...,date love at first sight narration investigati...,Comedy Drama Romance TV Movie,Scott Smith,2013-10-13
4801,Shanghai Calling,Daniel Henney Eliza Coupe Bill Paxton Alan Ruc...,,,Daniel Hsia,2012-05-03


In [21]:
# Preview the first ten rows. head() is the idiomatic call and also works on
# frames shorter than ten rows, where positional indexing with range(10)
# would raise IndexError.
data.head(10)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton
5,5,258000000,Fantasy Action Adventure,http://www.sonypictures.com/movies/spider-man3/,559,dual identity amnesia sandstorm love of one's ...,en,Spider-Man 3,The seemingly invincible Spider-Man goes up ag...,115.699814,...,139.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,The battle within.,Spider-Man 3,5.9,3576,Tobey Maguire Kirsten Dunst James Franco Thoma...,"[{'name': 'Francine Maisler', 'gender': 1, 'de...",Sam Raimi
6,6,260000000,Animation Family,http://disney.go.com/disneypictures/tangled/,38757,hostage magic horse fairy tale musical,en,Tangled,When the kingdom's most wanted-and most charmi...,48.681969,...,100.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,They're taking adventure to new lengths.,Tangled,7.4,3330,Zachary Levi Mandy Moore Donna Murphy Ron Perl...,"[{'name': 'John Lasseter', 'gender': 2, 'depar...",Byron Howard
7,7,280000000,Action Adventure Science Fiction,http://marvel.com/movies/movie/193/avengers_ag...,99861,marvel comic sequel superhero based on comic b...,en,Avengers: Age of Ultron,When Tony Stark tries to jumpstart a dormant p...,134.279229,...,141.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,A New Age Has Come.,Avengers: Age of Ultron,7.3,6767,Robert Downey Jr. Chris Hemsworth Mark Ruffalo...,"[{'name': 'Danny Elfman', 'gender': 2, 'depart...",Joss Whedon
8,8,250000000,Adventure Fantasy Family,http://harrypotter.warnerbros.com/harrypottera...,767,witch magic broom school of witchcraft wizardry,en,Harry Potter and the Half-Blood Prince,"As Harry begins his sixth year at Hogwarts, he...",98.885637,...,153.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Dark Secrets Revealed,Harry Potter and the Half-Blood Prince,7.4,5293,Daniel Radcliffe Rupert Grint Emma Watson Tom ...,"[{'name': 'Bruno Delbonnel', 'gender': 0, 'dep...",David Yates
9,9,250000000,Action Adventure Fantasy,http://www.batmanvsupermandawnofjustice.com/,209112,dc comics vigilante superhero based on comic b...,en,Batman v Superman: Dawn of Justice,Fearing the actions of a god-like Super Hero l...,155.790452,...,151.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Justice or revenge,Batman v Superman: Dawn of Justice,5.7,7004,Ben Affleck Henry Cavill Gal Gadot Amy Adams J...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Zack Snyder


In [31]:
# NOTE(review): this cell fails with the TypeError shown in its output. The
# selected columns hold strings, and the tree construction tries to subtract
# them. The text has to be turned into numeric vectors (for example the
# CountVectorizer matrix built further down) before a KDTree can be built.
kdd = KDTree(data[features])

TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [32]:
# Two 100x100 frames of uniform random values in [0, 1).
# numpy is called through its own import: the `pd.np` alias is deprecated and
# no longer exists in pandas 2.x.
dataA = pd.DataFrame(np.random.rand(100, 100))
dataB = pd.DataFrame(np.random.rand(100, 100))

In [34]:
# NOTE(review): `kdB` is not assigned in any cell shown here. The output says
# it is a scipy KDTree, presumably built from `dataB` in a cell that was since
# removed. Confirm and restore that cell, otherwise this fails on a fresh kernel.
kdB

<scipy.spatial.kdtree.KDTree at 0x7f5ff2ad1760>

In [46]:
# Bag-of-words counts over the combined text of every movie.
cv = CountVectorizer()
count_matrix = cv.fit_transform(raw_documents=data["combined_features"])

# Pairwise cosine similarity between all rows of the count matrix.
cosine_sim = cosine_similarity(X=count_matrix)

In [62]:
# Convert the count matrix to an array via .toarray().
# NOTE(review): `t` is not referenced by any other cell shown here. Confirm it
# is still needed.
t = count_matrix.toarray()

In [72]:
# Number of distinct values in the director column (missing values counted too).
data["director"].nunique(dropna=False)

2350

In [76]:
# Number of distinct values in the keywords column (missing values counted too).
data["keywords"].nunique(dropna=False)

4220

In [79]:
# Films per director, most frequent first. The blank label groups the rows
# whose missing director was replaced with '' earlier in the notebook.
data["director"].value_counts()

                     30
Steven Spielberg     27
Woody Allen          21
Clint Eastwood       20
Martin Scorsese      20
                     ..
Moustapha Akkad       1
Emma-Kate Croghan     1
Eric Bugbee           1
Deon Taylor           1
David Boyd            1
Name: director, Length: 2350, dtype: int64