In [1]:
!pip install surprise
!pip install sklearn

You should consider upgrading via the '/Users/kevindeloria/.pyenv/versions/3.9.0/bin/python3.9 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/kevindeloria/.pyenv/versions/3.9.0/bin/python3.9 -m pip install --upgrade pip' command.[0m


In [2]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

# Data Set

In [None]:
# Location of the ratings CSV used throughout this notebook.
RATINGS_URL = "https://s3-us-west-2.amazonaws.com/recommender-tutorial/ratings.csv"

# Read the ratings table and preview its first 20 rows.
ratings = pd.read_csv(RATINGS_URL)
ratings.head(20)

In [None]:
# Location of the movies CSV used throughout this notebook.
MOVIES_URL = "https://s3-us-west-2.amazonaws.com/recommender-tutorial/movies.csv"

# Read the movies table and preview its first 10 rows.
movies = pd.read_csv(MOVIES_URL)
movies.head(10)

# Exploratory Data Analysis

In [None]:
# Count how many ratings each user has submitted (one row per userId).
user_freq = ratings[['userId', 'movieId']].groupby('userId').count().reset_index()
user_freq.columns = ['userId', 'n_ratings']

sns.set_style("whitegrid")
fig, (ax_ratings, ax_users) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: how often each rating value occurs.
sns.countplot(x="rating", data=ratings, palette="viridis", ax=ax_ratings)
ax_ratings.set_title("Distribution of movie ratings")

# Right panel: density of the per-user rating counts, with the mean marked.
# `fill=True` replaces the deprecated `shade=True` keyword of kdeplot.
sns.kdeplot(user_freq['n_ratings'], fill=True, legend=False, ax=ax_users)
ax_users.axvline(user_freq['n_ratings'].mean(), color="k", linestyle="--")
ax_users.set_xlabel("# ratings per user")
ax_users.set_ylabel("density")
ax_users.set_title("Number of movies rated per user")

plt.show()

The rating distribution is left-skewed (most ratings are high), while the number of ratings per user is right-skewed (most users rate only a few movies). The data is sparse.

# Surprise! Training - 5 folds

In [None]:
# Keep only the user, item and rating columns, in that order.
rating_columns = ['userId', 'movieId', 'rating']
ratings = ratings.loc[:, rating_columns]

# Preview the first 30 rows of the trimmed table.
ratings.head(30)

In [None]:
from surprise.model_selection import cross_validate
from surprise import Reader, Dataset, SVD

reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

algo = SVD()

%time cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
# Fit the model on a trainset built from all of `data` (no hold-out);
# %time reports how long building the trainset and fitting take together.
%time algo.fit(data.build_full_trainset())

# Check User 1 / Movie 47

In [None]:
# Select the rating row(s) where user 1 rated movie 47.
is_user_1 = ratings['userId'].eq(1)
is_movie_47 = ratings['movieId'].eq(47)
g = ratings.loc[is_user_1 & is_movie_47]
g

In [None]:
uid = 1  # raw user id (as in the ratings file)
iid = 47  # raw item id (as in the ratings file)

# `g['rating']` is a Series, and calling float() directly on a Series is
# deprecated in recent pandas. Extract the scalar explicitly first.
actual_r = float(g['rating'].iloc[0])

# Predict the rating for this user/movie pair; passing the known rating as
# r_ui lets the verbose output show it next to the estimate.
pred = algo.predict(uid, iid, r_ui=actual_r, verbose=True)

# Addressing cold start

https://github.com/topspinj/recommender-tutorial/blob/master/part-2-cold-start-problem.ipynb

In [None]:
# Turn the pipe-delimited genre string of each movie into a list of genres.
# The isinstance guard makes the cell safe to re-run: values that are already
# lists (or otherwise not strings) are left untouched instead of raising
# AttributeError on `.split`.
movies['genres'] = movies['genres'].apply(
    lambda genres: genres.split("|") if isinstance(genres, str) else genres
)
movies.head()

In [None]:
from collections import Counter

# Placeholder label used for movies that have no genre information.
NO_GENRE_LABEL = '(no genres listed)'

# Tally how many movies carry each genre label.
genres_counts = Counter(genre for genres in movies['genres'] for genre in genres)
# Printed before the placeholder is removed below, so the total includes it if present.
print(f"There are {len(genres_counts)} genre labels.")

# Each `genres` cell is iterated as a collection of labels above, so comparing
# the column to a string with `!=` is True for every row and filters nothing.
# Test membership per row instead to drop movies tagged with the placeholder.
has_real_genre = movies['genres'].apply(lambda genres: NO_GENRE_LABEL not in genres)
movies = movies[has_real_genre]

# Drop the placeholder from the tally as well.
del genres_counts[NO_GENRE_LABEL]

genres_counts

In [None]:
# Build a two-column table (genre label, movie count) from the Counter,
# ordered from the most to the least frequent genre.
genre_table = pd.DataFrame([genres_counts]).T.reset_index()
genre_table.columns = ['genres', 'count']
genres_counts_df = genre_table.sort_values(by='count', ascending=False)

# Bar chart of movie counts per genre; labels are rotated to stay readable.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x='genres', y='count', data=genres_counts_df, palette='viridis', ax=ax)
plt.xticks(rotation=90)
plt.show()