# Movie Recommender EDA with Smaller Dataset

author: Ben Sturm <br />
contact: bwsturm@gmail.com <br />
date: 6/16/2018

In [26]:
import pandas as pd
import numpy as np

### Data Extraction Steps

In [80]:
# Load the MovieLens 100K ratings table (columns: userId, movieId, rating, timestamp)
ratings_csv = '../../data/ml-latest-small/ratings.csv'
ratings = pd.read_csv(ratings_csv)

In [81]:
# Preview the first five rating rows
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


# Load the movie metadata table (columns: movieId, title, genres)
movies_csv = '../../data/ml-latest-small/movies.csv'
movies = pd.read_csv(movies_csv)

In [83]:
# Preview the last five movies
movies.tail(5)

Unnamed: 0,movieId,title,genres
9120,162672,Mohenjo Daro (2016),Adventure|Drama|Romance
9121,163056,Shin Godzilla (2016),Action|Adventure|Fantasy|Sci-Fi
9122,163949,The Beatles: Eight Days a Week - The Touring Y...,Documentary
9123,164977,The Gay Desperado (1936),Comedy
9124,164979,"Women of '69, Unboxed",Documentary


In [84]:
# Pivot the long ratings table into a ratings matrix Y of size nm x nu
#   rows    -> movieId (nm = number of movies)
#   columns -> userId  (nu = number of users)
# Movie/user pairs without a rating are filled with 0.

Y = ratings.pivot_table(values='rating', index='movieId', columns='userId').fillna(0)

In [85]:
# Preview the first five movie rows of the ratings matrix
Y.head(5)

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,4.0,0.0,...,0.0,4.0,3.5,0.0,0.0,0.0,0.0,0.0,4.0,5.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [86]:
# Build the indicator matrix R, where R(i,j) = 1 if user j gave a rating to movie i
# and 0 otherwise. R has the same shape, index and columns as Y.
#
# Unrated cells were filled with 0 when Y was built, so "rated" is simply Y > 0.
# A single vectorised comparison replaces the copy + in-place mask assignment.

R = (Y > 0).astype(int)
R.head()

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,1,0,1,0,...,0,1,1,0,0,0,0,0,1,1
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


What fraction of movies have 5 or more ratings?

In [87]:
# Y is laid out movies x users, so its shape unpacks to (nm, nu)
nm, nu = Y.shape

# Each row of R holds one 0/1 flag per user, so the row sum is the number of
# ratings that movie received
ratings_per_movie = R.sum(axis=1)

# Boolean masks (indexed by movieId) for movies with at least 5 / at least 10 ratings.
# Series.sum() counts the True entries without iterating element by element in Python.
idx_5 = ratings_per_movie >= 5
print('The fraction of movies with 5 or more unique ratings is {:.3f}'.format(idx_5.sum() / nm))
idx_10 = ratings_per_movie >= 10
print('The fraction of movies with 10 or more unique ratings is {:.3f}'.format(idx_10.sum() / nm))

The fraction of movies with 5 or more unique ratings is 0.386
The fraction of movies with 10 or more unique ratings is 0.248


I'm going to filter out the movies with fewer than 5 ratings.

In [95]:
# Keep only the movie rows flagged by idx_5 (movies with 5 or more ratings)
Y = Y.loc[idx_5]
Y.tail(5)

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
142488,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
146656,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
148626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
152077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
152081,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [96]:
# Apply the same idx_5 row filter to R so it stays aligned with Y
R = R.loc[idx_5]
R.tail(5)

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
142488,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
146656,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
148626,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
152077,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
152081,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


The next step is to convert Y and R into numpy arrays.  That is easy to do; however, the movieId labels are going to get lost in the conversion.  So, what I need to do is map each movieId to a new positional index and then make sure that same index is attached to my movies DataFrame.

In [97]:
# Look at the movies table again before attaching the new index
movies.tail(5)

Unnamed: 0,movieId,title,genres
9120,162672,Mohenjo Daro (2016),Adventure|Drama|Romance
9121,163056,Shin Godzilla (2016),Action|Adventure|Fantasy|Sci-Fi
9122,163949,The Beatles: Eight Days a Week - The Touring Y...,Documentary
9123,164977,The Gay Desperado (1936),Comedy
9124,164979,"Women of '69, Unboxed",Documentary


In [98]:
# Build a lookup table from movieId to its row position in Y:
# reset_index() adds an 'index' column holding 0..n-1 next to the movieId column
Movie_mapping_df = pd.DataFrame(Y.index).reset_index()

In [99]:
# Preview the last five rows of the movieId -> index mapping
Movie_mapping_df.tail(5)

Unnamed: 0,index,movieId
3491,3491,142488
3492,3492,146656
3493,3493,148626
3494,3494,152077
3495,3495,152081


Now I will add this mapping of movieId to index to my movies DataFrame

In [100]:
# Join on movieId (default inner join): only movies present in the mapping are
# kept, and each one gains the 'index' column from the mapping table
movies = movies.merge(Movie_mapping_df, on='movieId')
movies.tail(5)

Unnamed: 0,movieId,title,genres,index
3491,142488,Spotlight (2015),Thriller,3491
3492,146656,Creed (2015),Drama,3492
3493,148626,"Big Short, The (2015)",Drama,3493
3494,152077,10 Cloverfield Lane (2016),Thriller,3494
3495,152081,Zootopia (2016),Action|Adventure|Animation|Children|Comedy,3495


In [101]:
# Reorder the columns so the positional 'index' comes first
movies = movies.loc[:, ['index', 'movieId', 'title', 'genres']]
movies.head(5)

Unnamed: 0,index,movieId,title,genres
0,0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,2,Jumanji (1995),Adventure|Children|Fantasy
2,2,3,Grumpier Old Men (1995),Comedy|Romance
3,3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,4,5,Father of the Bride Part II (1995),Comedy


Okay, now it's time for me to convert Y and R into numpy arrays.

In [102]:
# Convert the ratings DataFrame to a plain numpy array (row/column labels are dropped).
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# to_numpy() is the supported replacement.
Y_new = Y.to_numpy()

In [103]:
# Convert the indicator DataFrame to a plain numpy array (row/column labels are dropped).
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# to_numpy() is the supported replacement.
R_new = R.to_numpy()

Now we can save our Y_new and R_new numpy arrays into a dictionary and then save that to disk for posterity. (Theta and X are not created in this notebook, so only Y and R are written here.)

In [104]:
# Bundle both arrays under the keys 'Y' and 'R' and write them to one .npz archive
outfile = 'Movie_data.npz'
dct = dict(Y=Y_new, R=R_new)

np.savez(outfile, **dct)

I'm also going to save the movies DataFrame into a csv-file.

In [105]:
# Write the movie lookup table to disk without the DataFrame's own row labels
movies.to_csv(path_or_buf='movie_ids.csv', index=False)

In [47]:
# Eyeball the converted ratings array (numpy abbreviates the repr of large arrays)
Y_new

array([[0., 0., 0., ..., 0., 4., 5.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [53]:
# Spot check: look up the movies row with label 1806
movies.loc[1806]

index                     1806
movieId                   2284
title      Bandit Queen (1994)
genres                   Drama
Name: 1806, dtype: object

In [58]:
# Spot check: how often does each rating value occur in positional row 1806 of Y?
# NOTE(review): the saved output shows a single non-zero rating for this row, which
# is inconsistent with the ">= 5 ratings" filter. This cell's execution count is
# lower than the filter cell's, so the output is presumably stale -- re-run on a
# fresh kernel to confirm.
Y.iloc[1806].value_counts()

0.0    670
5.0      1
Name: 2284, dtype: int64

In [61]:
# Recount the ratings per movie (row sums of R)
temp = R.sum(axis='columns')

In [70]:
# Count how many movies have 5 or more ratings
idx = temp.ge(5)
int(idx.sum())

3496