# Importing the necessary libraries

In [229]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Reading the data

In [230]:
!unzip ml-latest-small.zip

Archive:  ml-latest-small.zip
replace ml-latest-small/links.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [231]:
# Load the four CSV tables of the extracted archive, one DataFrame per file.
links, movies, ratings, tags = (
    pd.read_csv(csv_path)
    for csv_path in (
        "ml-latest-small/links.csv",
        "ml-latest-small/movies.csv",
        "ml-latest-small/ratings.csv",
        "ml-latest-small/tags.csv",
    )
)

The links dataset is not useful for this task because it is just a mapping between the movie IDs used in different databases (MovieLens, IMDb and TMDb):

In [232]:
# Preview the first rows of the links table.
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [233]:
# Preview the first rows of the movies table (movieId, title, pipe-separated genres).
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [234]:
# Preview the first rows of the ratings table (one row per user/movie rating).
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [235]:
# Preview the first rows of the tags table (free-text tags attached by users to movies).
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


# Building a content-based recommender

# Extracting features from genres
The simplest way to extract features from the genres column is to use a TF-IDF vectorizer:

In [236]:
# A token is any run of letters, digits or hyphens, so the `|` separators in the
# genres column split tokens while hyphenated names stay whole.
# Raw string: `\-` inside a normal string literal is an invalid escape sequence
# (a SyntaxWarning on recent Python versions); the regex itself is unchanged.
genres_tfidf = TfidfVectorizer(token_pattern=r'[a-zA-Z0-9\-]+')

Some movies have the value *(no genres listed)*, which I want to replace with an empty string.

In [237]:
# Assign the result back to the column instead of calling an inplace method on the
# `movies['genres']` selection: that form is chained assignment, which pandas warns
# about and which does not update `movies` once Copy-on-Write is enabled.
movies['genres'] = movies['genres'].replace(to_replace="(no genres listed)", value="")

Let's fit the vectorizer on the genres and build the TF-IDF matrix:

In [238]:
# One TF-IDF row per movie, in the same row order as `movies`.
# `.toarray()` returns a plain 2-D ndarray; `.todense()` would return an np.matrix,
# which current scikit-learn estimators reject as input.
genres_tfidf_matrix = genres_tfidf.fit_transform(movies['genres']).toarray()

## Using NearestNeighbors to get similar items

In [246]:
# Nearest-neighbour index over the movie genre vectors; queries return 10 neighbours by default.
nbrs = NearestNeighbors(n_neighbors=10)
nbrs.fit(genres_tfidf_matrix)

## Creating user profiles

In [240]:
# Lookup table: movieId -> row label of that movie in `movies`.
# Duplicates are removed on the index (movieId), keeping the first occurrence.
# Series.drop_duplicates() only compares the values, which here are the row labels
# themselves, so it would never remove a repeated movieId; a repeated key would
# make a scalar lookup return a Series instead of a single row label.
movieId_indices = pd.Series(movies.index, index=movies['movieId'])
movieId_indices = movieId_indices[~movieId_indices.index.duplicated(keep='first')]
def get_movie_index(mid):
  """Look up ``mid`` in ``movieId_indices`` and return the result unchanged.

  ``mid`` is used as a key into the module-level ``movieId_indices`` Series,
  which maps a movieId to the row of that movie in ``movies``.
  """
  row_lookup = movieId_indices[mid]
  return row_lookup

I create the user profile by taking the weighted average of the movies the user has rated. The weights are the ratings and the features are the TF-IDF features of the movies:

In [241]:
def _weighted_genre_vector(rating_row):
  """Return the rated movie's flattened TF-IDF genre vector scaled by the rating."""
  matrix_row = get_movie_index(rating_row['movieId'])
  genre_vec = np.asarray(genres_tfidf_matrix[matrix_row]).flatten()
  return genre_vec * rating_row['rating']

# One rating-weighted genre vector per row of `ratings`.
ratings['weighted_vec'] = ratings.apply(_weighted_genre_vector, axis=1)

In [242]:
# Sum each user's rating-weighted genre vectors. `weighted_vec` holds one array per
# rating, so this relies on pandas adding the arrays of a group together.
user_profiles = ratings[['userId', 'weighted_vec']].groupby('userId').sum()
# Total of each user's ratings: the denominator of the weighted average.
user_rating_sum = ratings.groupby('userId')['rating'].sum()
# Both objects are indexed by userId, so the division is aligned per user.
user_profiles['weighted_vec'] /= user_rating_sum

## Getting the recommendations

I'll get the 10 movies nearest to the user's profile, drop the ones the user has already rated, and randomly sample 3 of the remaining ones:

In [248]:
def get_content_based_recom(uid, k=3):
  """Recommend up to ``k`` unseen movies close to a user's genre profile.

  The user's profile vector is looked up in ``user_profiles``, its nearest
  movies are fetched from ``nbrs``, movies the user has already rated are
  removed, and ``k`` of the remaining candidates are sampled at random without
  replacement.

  Args:
    uid: userId of the user, i.e. a label of the ``user_profiles`` index.
    k: maximum number of titles to return.

  Returns:
    A set of movie titles. It holds fewer than ``k`` titles (possibly none)
    when fewer unseen candidates are available.

  Raises:
    KeyError: if ``uid`` has no entry in ``user_profiles``.
  """
  # user_profiles is indexed by userId (it comes from groupby('userId')), so the
  # user must be selected by label; a positional lookup returns another user's row.
  user_prof = user_profiles.loc[uid]

  # Rows of `movies` the user has already rated. Collect the looked-up *values*:
  # `in` on a Series tests its index labels (movieIds here), not its values.
  rated_movie_ids = ratings.loc[ratings["userId"] == uid, "movieId"]
  seen_rows = set(np.asarray(get_movie_index(rated_movie_ids)).ravel().tolist())

  _, neigh = nbrs.kneighbors(np.asarray(user_prof['weighted_vec']).reshape(1, -1))
  neigh = np.array([i for i in neigh.flatten() if i not in seen_rows], dtype=int)
  if neigh.size == 0:
    # Every returned neighbour was already rated: nothing left to recommend.
    return set()

  # Sample without replacement so the same movie cannot be drawn twice, and never
  # ask for more items than there are candidates.
  recom_index = np.random.choice(neigh, size=min(k, neigh.size), replace=False)

  return set(movies.iloc[recom_index]['title'])

In [249]:
get_content_based_recom(1)

{'A Cosmic Christmas (1977)', 'Hyena Road', 'The Brand New Testament (2015)'}