In [1]:
!curl https://files.grouplens.org/datasets/movielens/ml-25m.zip -o ml-25m.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  249M  100  249M    0     0  68.9M      0  0:00:03  0:00:03 --:--:-- 68.9M


In [2]:
!unzip ml-25m.zip

Archive:  ml-25m.zip
   creating: ml-25m/
  inflating: ml-25m/tags.csv         
  inflating: ml-25m/links.csv        
  inflating: ml-25m/README.txt       
  inflating: ml-25m/ratings.csv      
  inflating: ml-25m/genome-tags.csv  
  inflating: ml-25m/genome-scores.csv  
  inflating: ml-25m/movies.csv       


In [3]:
import pandas as pd

movies = pd.read_csv("ml-25m/movies.csv")

In [4]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [5]:
import re

def clean_title(title):
  return re.sub("[^a-zA-Z0-9 ]", "", title)

In [6]:
movies["clean_title"] = movies["title"].apply(clean_title)

In [7]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2))

tfidf = vectorizer.fit_transform(movies["clean_title"])

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title):
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = movies.iloc[indices].iloc[::-1]
    
    return results

In [10]:
search("Toy Story")

Unnamed: 0,movieId,title,genres,clean_title
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
59767,201588,Toy Story 4 (2019),Adventure|Animation|Children|Comedy,Toy Story 4 2019
20497,106022,Toy Story of Terror (2013),Animation|Children|Comedy,Toy Story of Terror 2013


In [11]:
search("Marvel")

Unnamed: 0,movieId,title,genres,clean_title
17908,93520,Adventures of Captain Marvel (1941),Action|Adventure|Fantasy|Mystery|Sci-Fi,Adventures of Captain Marvel 1941
25066,122910,Captain Marvel (2018),Action|Adventure|Sci-Fi,Captain Marvel 2018
57082,195575,Marvel Mon Amour,Documentary,Marvel Mon Amour
62334,208787,Marvel Renaissance (2014),Documentary,Marvel Renaissance 2014
56403,194088,Marvel Rising: Secret Warriors (2018),Action|Animation|Comedy,Marvel Rising Secret Warriors 2018


In [12]:
movie_id = 89745
movie = movies[movies["movieId"] == movie_id]

In [13]:
ratings = pd.read_csv("ml-25m/ratings.csv")

In [14]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [15]:
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()

In [16]:
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

In [17]:
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

similar_user_recs = similar_user_recs[similar_user_recs > .10]

In [18]:
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

In [19]:
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [20]:
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

In [21]:
rec_percentages

Unnamed: 0,similar,all
89745,1.000000,0.040459
58559,0.573393,0.148256
59315,0.530649,0.054931
79132,0.519715,0.132987
2571,0.496687,0.247010
...,...,...
47610,0.103545,0.022770
780,0.103380,0.054723
88744,0.103048,0.010383
1258,0.101226,0.083887


In [22]:
def find_similar_movies(movie):
    results = search(movie)
    movie_id = results.iloc[0]["movieId"]
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

    similar_user_recs = similar_user_recs[similar_user_recs > .10]
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(5).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [23]:
find_similar_movies("Toy Story")

Unnamed: 0,score,title,genres
3021,18.841924,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
2264,8.210086,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy
2669,6.868954,"Iron Giant, The (1999)",Adventure|Animation|Children|Drama|Sci-Fi
14813,6.503216,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX
3650,6.272875,Chicken Run (2000),Animation|Children|Comedy


In [24]:
find_similar_movies("Jumanji")

Unnamed: 0,score,title,genres
1,57.008249,Jumanji (1995),Adventure|Children|Fantasy
156,18.757121,Casper (1995),Adventure|Children
313,14.88039,"Santa Clause, The (1994)",Comedy|Drama|Fantasy
578,9.382034,Home Alone (1990),Children|Comedy
495,8.71198,Mrs. Doubtfire (1993),Comedy|Drama
