In [10]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

In [12]:
critics = {
    "Lisa Rose": {
        "Lady in the Water": 2.5,
        "Snakes on a Plane": 3.5,
        "Just My Luck": 3.0,
        "Superman Returns": 3.5,
        "You, Me and Dupree": 2.5,
        "The Night Listener": 3.0,
    },
    "Gene Seymour": {
        "Lady in the Water": 3.0,
        "Snakes on a Plane": 3.5,
        "Just My Luck": 1.5,
        "Superman Returns": 5.0,
        "The Night Listener": 3.0,
        "You, Me and Dupree": 3.5,
    },
    "Michael Phillips": {
        "Lady in the Water": 2.5,
        "Snakes on a Plane": 3.0,
        "Superman Returns": 3.5,
        "The Night Listener": 4.0,
    },
    "Claudia Puig": {
        "Snakes on a Plane": 3.5,
        "Just My Luck": 3.0,
        "The Night Listener": 4.5,
        "Superman Returns": 4.0,
        "You, Me and Dupree": 2.5,
    },
    "Mick LaSalle": {
        "Lady in the Water": 3.0,
        "Snakes on a Plane": 4.0,
        "Just My Luck": 2.0,
        "Superman Returns": 3.0,
        "The Night Listener": 3.0,
        "You, Me and Dupree": 2.0,
    },
    "Jack Matthews": {
        "Lady in the Water": 3.0,
        "Snakes on a Plane": 4.0,
        "The Night Listener": 3.0,
        "Superman Returns": 5.0,
        "You, Me and Dupree": 3.5,
    },
    "Toby": {
        "Snakes on a Plane": 4.5,
        "You, Me and Dupree": 1.0,
        "Superman Returns": 4.0,
    },
}

df = pd.DataFrame(critics)
df

Unnamed: 0,Lisa Rose,Gene Seymour,Michael Phillips,Claudia Puig,Mick LaSalle,Jack Matthews,Toby
Lady in the Water,2.5,3.0,2.5,,3.0,3.0,
Snakes on a Plane,3.5,3.5,3.0,3.5,4.0,4.0,4.5
Just My Luck,3.0,1.5,,3.0,2.0,,
Superman Returns,3.5,5.0,3.5,4.0,3.0,5.0,4.0
"You, Me and Dupree",2.5,3.5,,2.5,2.0,3.5,1.0
The Night Listener,3.0,3.0,4.0,4.5,3.0,3.0,


In [14]:
# NearestNeighbors complains when there is NA.
df.fillna(0, inplace=True)

In [48]:
nn = NearestNeighbors(n_neighbors=5, metric="cosine")
nn.fit(df)

In [49]:
df.iloc[2:3]

Unnamed: 0,Lisa Rose,Gene Seymour,Michael Phillips,Claudia Puig,Mick LaSalle,Jack Matthews,Toby
Just My Luck,3.0,1.5,0.0,3.0,2.0,0.0,0.0


In [52]:
distances, indices = nn.kneighbors(df.iloc[2:3], return_distance=True)

# Flatten the 2d matrix, and also remove the first item, which is itself with zero distance.
distances = distances.flatten()[1:]
indices = indices.flatten()[1:]
distances, indices

(array([0.21161357, 0.24014412, 0.29742666, 0.31977023]), array([5, 4, 1, 3]))

In [66]:
print(f"Similar movies to '{df.index[2]}':")
for i, (movie, score) in enumerate(zip(df.index[indices].values, distances)):
    print(f"{i+1}.", movie, score)

Similar movies to 'Just My Luck':
1. The Night Listener 0.21161356589624858
2. You, Me and Dupree 0.24014412394128826
3. Snakes on a Plane 0.29742665906692767
4. Superman Returns 0.3197702261832609


## Nearest Neigbors movie recommendations

In [68]:
from movie_helper.movie import load_movies

movies = load_movies()
movies.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,imdb_url,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [73]:
movie_genres = movies[
    [
        "unknown",
        "Action",
        "Adventure",
        "Animation",
        "Children's",
        "Comedy",
        "Crime",
        "Documentary",
        "Drama",
        "Fantasy",
        "Film-Noir",
        "Horror",
        "Musical",
        "Mystery",
        "Romance",
        "Sci-Fi",
        "Thriller",
        "War",
        "Western",
    ]
]
movie_genres.head()

Unnamed: 0,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [78]:
nn = NearestNeighbors(n_neighbors=10, metric="cosine")
nn.fit(movie_genres)

In [111]:
movies[movies.movie_title.str.match("star", case=False)].head(5)

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,imdb_url,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
49,50,Star Wars (1977),01-Jan-1977,,http://us.imdb.com/M/title-exact?Star%20Wars%2...,0,1,1,0,0,...,0,0,0,0,0,1,1,0,1,0
61,62,Stargate (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Stargate%20(1...,0,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0
221,222,Star Trek: First Contact (1996),22-Nov-1996,,http://us.imdb.com/M/title-exact?Star%20Trek:%...,0,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0
226,227,Star Trek VI: The Undiscovered Country (1991),01-Jan-1991,,http://us.imdb.com/M/title-exact?Star%20Trek%2...,0,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0
227,228,Star Trek: The Wrath of Khan (1982),01-Jan-1982,,http://us.imdb.com/M/title-exact?Star%20Trek:%...,0,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0


In [113]:
movie_index = 49
movies.iloc[movie_index].movie_title

'Star Wars (1977)'

In [114]:
distances, indices = nn.kneighbors(movie_genres.iloc[movie_index : movie_index + 1])
distances = distances.flatten()
indices = indices.flatten()

# Exclude the same id.
masks = np.where(indices != movie_index)
indices = indices[masks]
distances = distances[masks]

In [115]:
recs = movies.iloc[indices]
recs["scores"] = 1 / (1 + distances)
recs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recs["scores"] = 1 / (1 + distances)


Unnamed: 0,movie_id,movie_title,release_date,video_release_date,imdb_url,unknown,Action,Adventure,Animation,Children's,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,scores
180,181,Return of the Jedi (1983),14-Mar-1997,,http://us.imdb.com/M/title-exact?Return%20of%2...,0,1,1,0,0,...,0,0,0,0,1,1,0,1,0,1.0
171,172,"Empire Strikes Back, The (1980)",01-Jan-1980,,http://us.imdb.com/M/title-exact?Empire%20Stri...,0,1,1,0,0,...,0,0,0,0,1,1,0,1,0,0.919854
497,498,"African Queen, The (1951)",01-Jan-1951,,http://us.imdb.com/M/title-exact?African%20Que...,0,1,1,0,0,...,0,0,0,0,1,0,0,1,0,0.904508
270,271,Starship Troopers (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?Starship+Troo...,0,1,1,0,0,...,0,0,0,0,0,1,0,1,0,0.904508
221,222,Star Trek: First Contact (1996),22-Nov-1996,,http://us.imdb.com/M/title-exact?Star%20Trek:%...,0,1,1,0,0,...,0,0,0,0,0,1,0,0,0,0.816058
81,82,Jurassic Park (1993),01-Jan-1993,,http://us.imdb.com/M/title-exact?Jurassic%20Pa...,0,1,1,0,0,...,0,0,0,0,0,1,0,0,0,0.816058
372,373,Judge Dredd (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Judge%20Dredd...,0,1,1,0,0,...,0,0,0,0,0,1,0,0,0,0.816058
227,228,Star Trek: The Wrath of Khan (1982),01-Jan-1982,,http://us.imdb.com/M/title-exact?Star%20Trek:%...,0,1,1,0,0,...,0,0,0,0,0,1,0,0,0,0.816058
120,121,Independence Day (ID4) (1996),03-Jul-1996,,http://us.imdb.com/M/title-exact?Independence%...,0,1,0,0,0,...,0,0,0,0,0,1,0,1,0,0.816058
