
___
# Recommender Systems with Python


In [1]:
import numpy as np
import pandas as pd

## Get the Data

In [2]:
# The raw ratings file is tab-separated and has no header row, so the column
# labels are supplied explicitly.
column_names = [
    'user_id',
    'item_id',
    'rating',
    'timestamp',
]
df = pd.read_csv(
    'u.data',
    sep='\t',
    header=None,
    names=column_names,
)

In [None]:
# Peek at the first five rows of the ratings data.
df.head(n=5)

In [None]:
# Load the lookup table of movie titles and preview its first five rows.
movie_titles = pd.read_csv(filepath_or_buffer="Movie_Id_Titles")
movie_titles.head(n=5)

We can merge them together:

In [None]:
# Attach the movie-title information to every rating row by joining on the
# shared 'item_id' key.
# Only the original rating columns are taken from `df`, which makes this cell
# idempotent: re-running it would otherwise merge the already-merged frame
# again and produce suffixed duplicate columns (e.g. 'title_x' / 'title_y'),
# breaking every later cell that groups by 'title'.
df = pd.merge(df[column_names], movie_titles, on='item_id')
df.head()

# EDA


## Visualization Imports

In [11]:
# Plotting libraries used by the EDA cells that follow.
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's 'white' style for subsequent plots.
sns.set_style('white')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

Create a ratings DataFrame with the average rating and the number of ratings for each movie:

In [None]:
# Movies with the highest average rating (top five).
(
    df.groupby('title')['rating']
    .mean()
    .sort_values(ascending=False)
    .head()
)

In [None]:
# Movies with the largest number of ratings (top five).
(
    df.groupby('title')['rating']
    .count()
    .sort_values(ascending=False)
    .head()
)

In [None]:
# One row per title holding its mean rating; selecting with a list keeps the
# result a single-column DataFrame (column 'rating') rather than a Series.
ratings = df.groupby('title')[['rating']].mean()
ratings.head(n=5)

Now set the number of ratings column:

In [None]:
# Count the ratings per title and store them next to the mean rating; the
# counts are aligned to `ratings` on the shared 'title' index.
ratings['num of ratings'] = df.groupby('title')['rating'].count()
ratings.head(n=5)

Now a few histograms:

In [None]:
# Distribution of how many ratings each movie received.
plt.figure(figsize=(10,4))
ratings['num of ratings'].hist(bins=70)
# Label the figure so it can be read on its own when skimming the notebook;
# the trailing semicolon hides the text repr of the last call.
plt.xlabel('Number of ratings')
plt.ylabel('Number of movies')
plt.title('Distribution of the number of ratings per movie');

In [None]:
# Distribution of the average rating per movie.
plt.figure(figsize=(10,4))
ratings['rating'].hist(bins=70)
# Label the figure so it can be read on its own when skimming the notebook;
# the trailing semicolon hides the text repr of the last call.
plt.xlabel('Average rating')
plt.ylabel('Number of movies')
plt.title('Distribution of the average rating per movie');

In [None]:
# Average rating against number of ratings, one point per movie.
sns.jointplot(
    data=ratings,
    x='rating',
    y='num of ratings',
    alpha=0.5,
)

## Recommending Similar Movies

In [None]:
# User-by-movie matrix: one row per user, one column per title, each cell
# holding that user's rating (NaN where the user did not rate the movie).
moviemat = df.pivot_table(
    values='rating',
    index='user_id',
    columns='title',
)
moviemat.head(n=5)

Most-rated movies:

In [None]:
# The ten movies with the most ratings.
ratings.sort_values(by='num of ratings', ascending=False).head(n=10)

Let's choose two movies: Star Wars and Liar Liar.

In [None]:
# Quick look at the per-movie summary table.
ratings.head(n=5)

Now let's grab the user ratings for those two movies:

In [None]:
# Pull out the per-user rating column of each chosen movie.
starwars_user_ratings = moviemat.loc[:, 'Star Wars (1977)']
liarliar_user_ratings = moviemat.loc[:, 'Liar Liar (1997)']
starwars_user_ratings.head(n=5)

We can then use the `corrwith()` method to compute the correlation between every movie column in `moviemat` and the user-ratings Series of a chosen movie:

In [None]:
# Correlate every movie column of the user-by-movie matrix with the ratings
# of each chosen movie.
similar_to_starwars = moviemat.corrwith(other=starwars_user_ratings)
similar_to_liarliar = moviemat.corrwith(other=liarliar_user_ratings)

Let's clean this up by removing NaN values and using a DataFrame instead of a Series:

In [None]:
# Drop movies whose correlation is undefined (NaN) and turn the Series into a
# one-column DataFrame named 'Correlation'.
# `to_frame` labels the column regardless of the Series' own name, whereas
# `pd.DataFrame(series, columns=[...])` would yield an all-NaN column if the
# Series ever carried a name. Building a new object instead of dropping in
# place also keeps the cell safe to re-run.
corr_starwars = similar_to_starwars.dropna().to_frame('Correlation')
corr_starwars.head()

Now, if we sort the DataFrame by correlation, we should get the most similar movies. Note, however, that some of the results don't really make sense. This is because there are a lot of movies that were rated only once or a handful of times by users who also rated Star Wars, so their correlations rest on very little data.

In [None]:
# Ten movies most strongly correlated with Star Wars (unfiltered).
corr_starwars.sort_values(by='Correlation', ascending=False).head(n=10)

Let's fix this by keeping only movies that have more than 100 ratings (this threshold was chosen based on the histogram from earlier).

In [None]:
# Attach each movie's rating count so that thinly rated titles can be
# filtered out afterwards.
# The left side is restricted to the 'Correlation' column so the cell is
# idempotent: joining a frame that already contains 'num of ratings' raises
# "columns overlap but no suffix specified" on a second run.
corr_starwars = corr_starwars[['Correlation']].join(ratings['num of ratings'])
corr_starwars.head()

Now sort the values and notice how the titles make a lot more sense:

In [None]:
# Keep movies with more than 100 ratings, then list the five most strongly
# correlated with Star Wars.
(
    corr_starwars
    .loc[lambda frame: frame['num of ratings'] > 100]
    .sort_values(by='Correlation', ascending=False)
    .head()
)

Now the same for the comedy Liar Liar:

In [None]:
# Same pipeline as for Star Wars: drop undefined correlations, name the
# column 'Correlation', and attach each movie's rating count.
# `to_frame` labels the column regardless of the Series' own name (the
# DataFrame constructor with `columns=` would yield all-NaN for a named
# Series), and the chain builds a fresh frame instead of mutating in place,
# so the cell can be re-run safely.
corr_liarliar = (
    similar_to_liarliar
    .dropna()
    .to_frame('Correlation')
    .join(ratings['num of ratings'])
)
# Keep movies with more than 100 ratings and show the five best matches.
corr_liarliar[corr_liarliar['num of ratings']>100].sort_values('Correlation',ascending=False).head()