## Import packages

In [2]:
import os
import pandas as pd

## Download training data

In [3]:
! wget -q http://files.grouplens.org/datasets/movielens/ml-1m.zip
! unzip ml-1m.zip

Archive:  ml-1m.zip
   creating: ml-1m/
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         


## Define constants


In [4]:
# --- Input locations ---------------------------------------------------------
# BASE_DIR is the directory the MovieLens 1M archive was unzipped into;
# change it if the data lives somewhere other than the working directory.
BASE_DIR = '.'
MOVIELENS_DIR = BASE_DIR + '/ml-1m/'

# Names of the raw data files inside MOVIELENS_DIR.
USER_DATA_FILE = 'users.dat'
MOVIE_DATA_FILE = 'movies.dat'
RATING_DATA_FILE = 'ratings.dat'

# --- Output locations --------------------------------------------------------
# Names of the files the cleaned-up dataframes are exported to.
RATINGS_CSV_FILE = 'ml1m_ratings.csv'
USERS_CSV_FILE = 'ml1m_users.csv'
MOVIES_CSV_FILE = 'ml1m_movies.csv'

# --- Code tables -------------------------------------------------------------
# Numeric age code -> human-readable age bracket.
AGES = {
    1: "Under 18",
    18: "18-24",
    25: "25-34",
    35: "35-44",
    45: "45-49",
    50: "50-55",
    56: "56+",
}

# Numeric occupation code -> human-readable occupation.
OCCUPATIONS = {
    0: "other or not specified",
    1: "academic/educator",
    2: "artist",
    3: "clerical/admin",
    4: "college/grad student",
    5: "customer service",
    6: "doctor/health care",
    7: "executive/managerial",
    8: "farmer",
    9: "homemaker",
    10: "K-12 student",
    11: "lawyer",
    12: "programmer",
    13: "retired",
    14: "sales/marketing",
    15: "scientist",
    16: "self-employed",
    17: "technician/engineer",
    18: "tradesman/craftsman",
    19: "unemployed",
    20: "writer",
}

## Load MovieLens 1M data

The MovieLens 1M Dataset can be downloaded from http://files.grouplens.org/datasets/movielens/ml-1m.zip. We load the data about users, movies and ratings into dataframes. 

A few quirks in this dataset:

* The ids for users and movies are 1-based,
* not all movies have descriptions, and 
* not all movies are rated. 

To make it easy to use columns of the ratings dataframe directly as training inputs and outputs for the Keras model, we do the following:

* We set max_userid to the maximum user id in the ratings,
* we set max_movieid to the maximum movie id in the ratings, and
* we add columns (user_emb_id and movie_emb_id) whose values are the user or movie ids minus one, i.e. zero-based versions of the 1-based ids.

In [5]:
# Load the ratings file into a dataframe. The file has no header row, so the
# column names are supplied here. A multi-character separator such as '::' is
# interpreted by pandas as a regular expression, which needs the python engine.
ratings = pd.read_csv(
    os.path.join(MOVIELENS_DIR, RATING_DATA_FILE),
    sep='::',
    engine='python',
    encoding='latin-1',
    names=['userid', 'movieid', 'rating', 'timestamp'],
)

# Largest ids that occur in the ratings; later cells report coverage against
# these values.
max_userid = ratings['userid'].max()
max_movieid = ratings['movieid'].max()

# The raw ids are 1-based; the *_emb_id columns hold the same ids shifted to
# start at zero.
ratings['user_emb_id'] = ratings['userid'] - 1
ratings['movie_emb_id'] = ratings['movieid'] - 1

# A single-argument print(...) emits the same text on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print('{} ratings loaded'.format(len(ratings)))

# Export as tab-separated text. The dataframe index is written as well (pandas
# default), ending up as the first, unnamed column of the file.
ratings.to_csv(
    RATINGS_CSV_FILE,
    sep='\t',
    header=True,
    encoding='latin-1',
    columns=['userid', 'movieid', 'rating', 'timestamp', 'user_emb_id', 'movie_emb_id'],
)
print('Saved to {}'.format(RATINGS_CSV_FILE))

1000209 ratings loaded
Saved to ml1m_ratings.csv


In [6]:
# Load the users file into a dataframe. The file has no header row, so the
# column names are supplied here. A multi-character separator such as '::' is
# interpreted by pandas as a regular expression, which needs the python engine.
users = pd.read_csv(
    os.path.join(MOVIELENS_DIR, USER_DATA_FILE),
    sep='::',
    engine='python',
    encoding='latin-1',
    names=['userid', 'gender', 'age', 'occupation', 'zipcode'],
)

# Translate the numeric age and occupation codes into readable labels.
# Direct dict indexing is deliberate: a code missing from the lookup table
# raises KeyError instead of silently producing a missing value.
users['age_desc'] = users['age'].apply(lambda code: AGES[code])
users['occ_desc'] = users['occupation'].apply(lambda code: OCCUPATIONS[code])

# A single-argument print(...) emits the same text on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
# max_userid comes from the ratings cell, which must have been run first.
print('{} descriptions of {} users loaded.'.format(len(users), max_userid))

# Export as tab-separated text. The dataframe index is written as well (pandas
# default), ending up as the first, unnamed column of the file.
users.to_csv(
    USERS_CSV_FILE,
    sep='\t',
    header=True,
    encoding='latin-1',
    columns=['userid', 'gender', 'age', 'occupation', 'zipcode', 'age_desc', 'occ_desc'],
)
print('Saved to {}'.format(USERS_CSV_FILE))

6040 descriptions of 6040 users loaded.
Saved to ml1m_users.csv


In [7]:
# Load the movies file into a dataframe. The file has no header row, so the
# column names are supplied here. A multi-character separator such as '::' is
# interpreted by pandas as a regular expression, which needs the python engine.
movies = pd.read_csv(
    os.path.join(MOVIELENS_DIR, MOVIE_DATA_FILE),
    sep='::',
    engine='python',
    encoding='latin-1',
    names=['movieid', 'title', 'genre'],
)

# A single-argument print(...) emits the same text on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
# max_movieid comes from the ratings cell, which must have been run first.
print('{} descriptions of {} movies loaded.'.format(len(movies), max_movieid))

# Export as tab-separated text. The dataframe index is written as well (pandas
# default), ending up as the first, unnamed column of the file.
# encoding='latin-1' matches the ratings and users exports and the encoding
# the data was read with; without it the output encoding would depend on the
# Python version's default.
# NOTE(review): assumes downstream readers open this file as latin-1 - confirm.
movies.to_csv(
    MOVIES_CSV_FILE,
    sep='\t',
    header=True,
    encoding='latin-1',
    columns=['movieid', 'title', 'genre'],
)
print('Saved to {}'.format(MOVIES_CSV_FILE))

3883 descriptions of 3952 movies loaded.
Saved to ml1m_movies.csv


In [8]:
# Compare the number of distinct users that appear in the ratings with the
# largest user id. A single-argument print(...) emits the same text on
# Python 2 and Python 3; the bare print statement is a SyntaxError on Python 3.
print('{} of the {} users rate at least one movie.'.format(ratings['userid'].nunique(), max_userid))

6040 of the 6040 users rate at least one movie.


In [9]:
# Compare the number of distinct movies that appear in the ratings with the
# largest movie id. A single-argument print(...) emits the same text on
# Python 2 and Python 3; the bare print statement is a SyntaxError on Python 3.
print('{} of the {} movies are rated.'.format(ratings['movieid'].nunique(), max_movieid))

3706 of the 3952 movies are rated.
