# Pre-processing and Modeling

In [1]:
# import libraries
import pandas as pd
import numpy as np

In [2]:
# read the cleaned movies data set
movies = pd.read_csv('/Users/Atabay/Desktop/Movie_Recommendation_System_data/Data/m_data_cleaned.csv')

In [3]:
movies.head()

Unnamed: 0,budget,genres,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,runtime,spoken_languages,status,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,John Carter,6.1,2124


In [4]:
movies.shape

(4802, 17)

In [5]:
credits = pd.read_csv('/Users/Atabay/Desktop/Movie_Recommendation_System_data/Data/c_data_cleaned.csv')

In [6]:
credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


## Demographic Filtering

In the demographic filtering, I will use vote average as the only decisive factor to rank the movies. However, some movies have very low vote count which would make the ranking unfair. To solve the issue I will only include the movies with considerable vote avarage to my ranking.

I will use 99 percentile as my cut off for vote count because we have 44035 movies in our data and most of have very few votes which may cause ranking to be biased. 

In [7]:
m_90 = movies['vote_count'].quantile(0.90)
m_90

1839.2000000000044

In [8]:
r_90 = movies.copy().loc[movies['vote_count'] >= m_90]
r_90.shape

(481, 17)

We have 481 movies that have voting cote more than 1839. I will use only those movies in my ranking.

In [9]:
r_90 = r_90.sort_values('vote_average', ascending=False)

In [10]:
# Displaying top 20 movies
r_90[['title', 'vote_count', 'vote_average']].head(20)

Unnamed: 0,title,vote_count,vote_average
1881,The Shawshank Redemption,8205,8.5
3337,The Godfather,5893,8.4
2294,Spirited Away,3840,8.3
662,Fight Club,9413,8.3
1818,Schindler's List,4329,8.3
3865,Whiplash,4254,8.3
3232,Pulp Fiction,8428,8.3
2731,The Godfather: Part II,3338,8.3
4601,12 Angry Men,2078,8.2
690,The Green Mile,4048,8.2


This ranking can be used for simple movie recommendation to users. It is not influenced by any user preference or choice therefore It will be basic and same for everyone.

## Content Based Filtering

### a. Plot based recommender 

I will build a content based filtering using similarity scores of words in overview column

In [11]:
movies.overview.head()

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
2    A cryptic message from Bond’s past sends him o...
3    Following the death of District Attorney Harve...
4    John Carter is a war-weary, former military ca...
Name: overview, dtype: object

In [12]:
# Importing the neccessary library
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# Defining a tdif vectorizer and removing all the english words stop words
tfidf = TfidfVectorizer(stop_words='english')

In [14]:
#Constructing the required tfidf matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(movies['overview'])

In [15]:
tfidf_matrix.shape

(4802, 20980)

It seems that there are over 20980 words to describe 4802 movies

In [16]:
# Importing linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

We need to define a function taking movie title as an input and recommending 5 similar movies as an output

In [17]:
#Creating a reverse map of indexes and movie titles
indexes = pd.Series(movies.index, index=movies['title']).drop_duplicates()

In [18]:
indexes

title
Avatar                                         0
Pirates of the Caribbean: At World's End       1
Spectre                                        2
The Dark Knight Rises                          3
John Carter                                    4
                                            ... 
El Mariachi                                 4797
Newlyweds                                   4798
Signed, Sealed, Delivered                   4799
Shanghai Calling                            4800
My Date with Drew                           4801
Length: 4802, dtype: int64

#### Defining the recommendation function

In [94]:
# Function that takes in movie title as input and outputs the most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    # Index of the movie that matches the title
    index = indexes[title]

    # Pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[index]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 5 most similar movies
    sim_scores = sim_scores[1:6]

    # Get the movie indixes
    movie_indexes = [i[0] for i in sim_scores]

    # Return the top 5 most similar movies
    return movies['title'].iloc[movie_indexes]

In [20]:
get_recommendations('Avatar')

3604               Apollo 18
2130            The American
634               The Matrix
1341    The Inhabited Island
529         Tears of the Sun
Name: title, dtype: object

In [21]:
get_recommendations('Spirited Away')

1825            Jimmy Neutron: Boy Genius
4599                      51 Birch Street
131                               G-Force
2394                               Wolves
117     Charlie and the Chocolate Factory
Name: title, dtype: object

The recommendation system does a good job returning movies with similar plot descriptions. However, we can increase the quality of the recommendation system by adding more metadata.

### b. Adding More Data to Recommendation Engine

I will be using cast and crew columns to extract the 3 top actors, and director info. Also, I will be using genre info to improve my recommendation engine. 

In [22]:
credits.columns = ['id','title_credits','cast','crew']

In [23]:
credits.head()

Unnamed: 0,id,title_credits,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [24]:
# Merging movies and credits data
df = movies.merge(credits,on='id')

In [25]:
df.head()

Unnamed: 0,budget,genres,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,runtime,spoken_languages,status,title,vote_average,vote_count,title_credits,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Avatar,7.2,11800,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Pirates of the Caribbean: At World's End,6.9,4500,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,Spectre,6.3,4466,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Dark Knight Rises,7.6,9106,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,John Carter,6.1,2124,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


Our data is in the string form we need to convert it into usable structure.

In [26]:
# Reorganizing the string features 
from ast import literal_eval

features = ['cast', 'crew', 'genres']
for feature in features:
    df[feature] = df[feature].apply(literal_eval)

In [27]:
# From the crew column getting the director's name. If director is not found, return NaN
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan


In [28]:
# Returning the top 3 element from a list
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing data
    return []

In [29]:
# Get director from crew column and get top 3 elements of cast and genre columns
df['director'] = df['crew'].apply(get_director)
 
features = ['cast', 'genres']
for feature in features:
    df[feature] = df[feature].apply(get_list)

In [30]:
# Get an overview of the new features
df[['title', 'cast', 'director', 'genres']].head()

Unnamed: 0,title,cast,director,genres
0,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",James Cameron,"[Action, Adventure, Fantasy]"
1,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley]",Gore Verbinski,"[Adventure, Fantasy, Action]"
2,Spectre,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",Sam Mendes,"[Action, Adventure, Crime]"
3,The Dark Knight Rises,"[Christian Bale, Michael Caine, Gary Oldman]",Christopher Nolan,"[Action, Crime, Drama]"
4,John Carter,"[Taylor Kitsch, Lynn Collins, Samantha Morton]",Andrew Stanton,"[Action, Adventure, Science Fiction]"


We need to strip and lowercase names to avoid confusion between the values having the same first names.

In [31]:
# Striping blank spaces and converting to lower case 
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Checking if director exists, if not return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [32]:
# Apply clean_data function to your features.
features = ['cast', 'director', 'genres']

for feature in features:
    df[feature] = df[feature].apply(clean_data)

In [33]:
def movie_soup(x):
    return ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])
df['soup'] = df.apply(movie_soup, axis=1)

Applying Count Vectorizer instead of tfidf because word frequency is important for our engine. We don't need to penalize most frequent words.

In [34]:
# Import CountVectorizer and create the matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
c_matrix = count.fit_transform(df['soup'])

In [35]:
# Computing the Cosine Similarity matrix based on the c_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(c_matrix, c_matrix)

In [36]:
# Resetting index of our main DataFrame and creating reverse mapping
df = df.reset_index()
indixes = pd.Series(df.index, index=df['title'])

Comparing the two recommendation engine I created

In [37]:
get_recommendations('Avatar', cosine_sim)

3604               Apollo 18
2130            The American
634               The Matrix
1341    The Inhabited Island
529         Tears of the Sun
Name: title, dtype: object

In [38]:
get_recommendations('Avatar', cosine_sim2)

206                         Clash of the Titans
1      Pirates of the Caribbean: At World's End
5                                  Spider-Man 3
9            Batman v Superman: Dawn of Justice
10                             Superman Returns
Name: title, dtype: object

All five movie recommendations are different for the engines. Adding more metadata to our engine led to better results.

## Collabrative Filtering

Content based recommendation system has some disadvantages. It fails to capture users' taste and preference. Therefore, I will use collabrative filtering to build a recommendation engine considering user' preferences.

Collabrative filtering aims to predict how much a user likely to like a product by considering preferences of similar users.

I will not implement Collaborative Filtering from scratch. Instead, I will use the Surprise library that used  powerful algorithms like Singular Value Decomposition (SVD) to minimise RMSE (Root Mean Square Error) and give good recommendations.

SVD calculates similarty between both users and movies to give predictions which enables us to use advantages of both user and item based collabrative filtering.

In [39]:
pip install surprise

Note: you may need to restart the kernel to use updated packages.


In [40]:
# I will use Surprise library to apply SVD
from surprise import Dataset, SVD, Reader
from surprise.model_selection import cross_validate
reader = Reader()

In [41]:
# Because we need to have user id to make predictions, I will use a different dataset.
ratings = pd.read_csv('/Users/Atabay/Desktop/Movie_Recommendation_System_data/Raw_data/ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [49]:
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
svd = SVD()
cross_validate(svd, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8997  0.8933  0.9033  0.8976  0.8904  0.8969  0.0046  
MAE (testset)     0.6919  0.6874  0.6957  0.6901  0.6860  0.6902  0.0034  
Fit time          0.77    0.72    0.76    0.93    0.79    0.79    0.07    
Test time         0.44    0.14    0.20    0.12    0.19    0.22    0.11    


{'test_rmse': array([0.8997435 , 0.89331488, 0.90330984, 0.89764306, 0.89043149]),
 'test_mae': array([0.69191806, 0.68741211, 0.6957128 , 0.69008084, 0.68601642]),
 'fit_time': (0.7716090679168701,
  0.7156479358673096,
  0.7603578567504883,
  0.9322319030761719,
  0.7940728664398193),
 'test_time': (0.43981194496154785,
  0.13672089576721191,
  0.20443081855773926,
  0.12099099159240723,
  0.19324231147766113)}

Mean of RMSE is 0.89 is good enough for the engine.

In [50]:
# Training the dataset
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f9698f7abb0>

I want to add title of the movie to ratings table to be able to observe the results better.

In [52]:
user_rating=pd.merge(ratings,df,left_on='movieId',right_on='id',how='inner')
user_ratings_final=user_rating[['userId', 'movieId', 'rating','title']]
user_ratings=user_ratings_final.sort_values(by='userId')
user_ratings.head()

Unnamed: 0,userId,movieId,rating,title
0,1,2105,4.0,American Pie
47,1,2294,2.0,Jay and Silent Bob Strike Back
396,2,165,3.0,Back to the Future Part II
2126,2,480,4.0,Monsoon Wedding
2400,2,497,3.0,The Green Mile


I randomly picked a user to look at his/her past movie ratings and make predictions using collabrative filtering I created.

In [57]:
user_ratings[user_ratings['userId'] == 2]

Unnamed: 0,userId,movieId,rating,title
396,2,165,3.0,Back to the Future Part II
2126,2,480,4.0,Monsoon Wedding
2400,2,497,3.0,The Green Mile
2460,2,500,4.0,Reservoir Dogs
2613,2,508,4.0,Love Actually
2699,2,509,4.0,Notting Hill
2777,2,539,3.0,Psycho
100,2,62,3.0,2001: A Space Odyssey
2902,2,550,3.0,Fight Club
2954,2,586,3.0,Wag the Dog


User 2 mostly rates action movies. Let's give our recommendation engine a movie to predict based on his/her past ratings.

In [58]:
# Identifying the movie id to predict
movie=df['title']=='The Fifth Element'
df[movie][['title','id']]

Unnamed: 0,title,id
322,The Fifth Element,18


In [59]:
svd.predict(2, 322, 3)

Prediction(uid=2, iid=322, r_ui=3, est=3.7593585620966308, details={'was_impossible': False})

Estimated rating for The Fifth Element is 3.76.

In [66]:
movie=df['title']=='The Notebook'
df[movie][['title','id']]

Unnamed: 0,title,id
1559,The Notebook,11036


In [67]:
svd.predict(2, 11036, 3)

Prediction(uid=2, iid=11036, r_ui=3, est=3.4433906189646724, details={'was_impossible': False})

Estimated rating for The Notebook is 3.44.

User 2 rates Action movie The Fifth Element higher than Romance movie The Notebook which makes sense if we look at his/her rating history. 

One great feature of this recommender system is that it doesn't care what metadata movie contains. It works purely on the basis of an assigned movie ID and tries to predict ratings based on how the other users have predicted the movie.

## Hybrid Recommender

Now, I will build a recommendation system combining features of content based and collabrative filtering.


<b>Input:</b> User ID and the Title of a Movie

<b>Output</b>: Similar movies sorted on the basis of expected ratings by that particular user.

In [72]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [149]:
id_map = pd.read_csv('/Users/Atabay/Desktop/Movie_Recommendation_System_data/Raw_data/links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
id_map.columns = ['movieId', 'id']
id_map = id_map.merge(df[['title', 'id']], on='id').set_index('title')

In [150]:
id_map

Unnamed: 0_level_0,movieId,id
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Toy Story,1,862.0
GoldenEye,10,710.0
The American President,11,9087.0
Nixon,14,10858.0
Cutthroat Island,15,1408.0
...,...,...
The Maid's Room,160440,278348.0
The Legend of Tarzan,160563,258489.0
The Purge: Election Year,160565,316727.0
Nerve,160954,328387.0


In [97]:
indices_map = id_map.set_index('id')

In [147]:
indices_map

Unnamed: 0_level_0,movieId
id,Unnamed: 1_level_1
862.0,1
710.0,10
9087.0,11
10858.0,14
1408.0,15
...,...
278348.0,160440
258489.0,160563
316727.0,160565
328387.0,160954


In [116]:
id_map.loc['Toy Story']['id']

862.0

In [107]:
indexes

title
Avatar                                         0
Pirates of the Caribbean: At World's End       1
Spectre                                        2
The Dark Knight Rises                          3
John Carter                                    4
                                            ... 
El Mariachi                                 4797
Newlyweds                                   4798
Signed, Sealed, Delivered                   4799
Shanghai Calling                            4800
My Date with Drew                           4801
Length: 4802, dtype: int64

In [155]:
def hybrid(userId, title):
    idx = indexes[title]
    tmdbId = id_map.loc[title]['id']
    #print(idx)
    movie_id = id_map.loc[title]['movieId']
    
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = df.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'id']]
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, indices_map.loc[x]['movieId'], 3))
    movies = movies.sort_values('est', ascending=False)
    return movies.head(10)

In [156]:
hybrid(1, 'Avatar')

KeyError: 50357