In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw ratings table into a DataFrame.

# Source: rating.csv from https://www.kaggle.com/grouplens/movielens-20m-dataset

df = pd.read_csv('../data/rating.csv')
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [3]:
# (number of rows, number of columns) of the raw ratings table
df.shape

(20000263, 4)

In [4]:
# Count the missing (NaN) values in each column
df.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [5]:
# The timestamp column is not used by the algorithm, so drop it
df = df.drop(columns=['timestamp'])
df.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [6]:
# Order the ratings by user, then by movie within each user
df = df.sort_values(['userId', 'movieId'])
df.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


### do df.userId and df.movieId contain missing Ids?

In [7]:
# userId: smallest id, largest id and number of distinct ids
print('min user id is ', df['userId'].min())
print('max user id is ', df['userId'].max())

# total number of unique users
print('len of userId' , np.unique(df['userId']).size)

min user id is  1
max user id is  138493
len of userId 138493


In [8]:
# movieId: smallest id, largest id and number of distinct ids.
# The printed labels name the movie column (they previously said "user id" /
# "userId", copied from the user cell, although the values are movie statistics).
print('min movie id is ', np.min(df['movieId']))
print('max movie id is ', np.max(df['movieId']))

# find the total number of unique movies
print('len of movieId' , len(np.unique(df['movieId'])))

min user id is  1
max user id is  131262
len of userId 26744


userId does not contain missing ids (138493 unique values spanning 1 to 138493), whereas movieId does contain missing ids (only 26744 unique values spanning 1 to 131262).

for userId:
subtract 1 from each value so that the ids start at zero

for movieId:
remap the values to consecutive integers starting from zero

In [9]:
# Shift every user id down by one so that the ids start at 0
df['userId'] = df['userId'] - 1
df.head()

Unnamed: 0,userId,movieId,rating
0,0,2,3.5
1,0,29,3.5
2,0,32,3.5
3,0,47,3.5
4,0,50,3.5


In [10]:
# Distinct movie ids present in the data (np.unique returns them sorted)
unique_movies = np.unique(df['movieId'])
unique_movies

array([     1,      2,      3, ..., 131258, 131260, 131262])

In [11]:
def create_map(arr):
    """Map each value of ``arr`` to its position in ``arr``.

    If a value occurs more than once, the position of its last occurrence wins.
    """
    return {value: position for position, value in enumerate(arr)}


In [12]:
# np.unique has already sorted the ids, so enumerating them assigns
# consecutive new ids (0, 1, 2, ...) in the order of the old ids

# make the movie ids dense (no gaps)

# map of old movieId to new movieId
movies_map = create_map(unique_movies)
print(movies_map[1])

0


In [13]:
# Replace every original movie id with its new dense id
df['movieId'] = df['movieId'].map(movies_map)
df.head()

Unnamed: 0,userId,movieId,rating
0,0,1,3.5
1,0,28,3.5
2,0,31,3.5
3,0,46,3.5
4,0,49,3.5


In [14]:
# Integrity check on movieId: the ids should run from 0 to (number of unique movies - 1)
print('min movieId is' , df['movieId'].min())
print('max movieId is' , df['movieId'].max())

print(unique_movies.size)

min movieId is 0
max movieId is 26743
26744


## shrink the data set

In [15]:
# number of users to keep
N = 1000

# number of movies to keep
M = 1000

# the algorithm is expensive, O(N^2 M), so we keep only a small number of users and movies to reduce training time

Let's find the 1000 most popular users and the 1000 most popular movies.

In [16]:
# 1000 most popular users
from collections import Counter

In [17]:
# we want the most common for movie and users
def most_common(arr , n):
    """Return the ``n`` values that occur most often in ``arr``.

    The result is a list ordered from the most frequent value to the least
    frequent one; it is shorter than ``n`` when ``arr`` has fewer distinct values.
    """
    frequency = Counter(arr)
    return [value for value, _count in frequency.most_common(n)]

# ids of the N users and of the M movies that appear in the most rating rows
users_to_keep = most_common(df['userId'] , N)
movies_to_keep = most_common(df['movieId'] , M)

In [18]:
# Keep only the ratings whose user AND movie are both among the selected ids.
keep_mask = df.userId.isin(users_to_keep) & df.movieId.isin(movies_to_keep)
# .copy() makes the filtered result an independent frame, so the column
# re-assignments performed on it later do not operate on a slice of the
# original frame (which triggers pandas' SettingWithCopyWarning).
df = df[keep_mask].copy()
df.head()

Unnamed: 0,userId,movieId,rating
19846,155,0,5.0
19847,155,1,5.0
19848,155,2,2.0
19850,155,4,3.0
19851,155,5,4.0


In [19]:
# size of the data set after keeping only the selected users and movies
df.shape

(627113, 3)

In [20]:
# Order the remaining ratings by user, then by movie within each user
df = df.sort_values(['userId', 'movieId'])
df.head()

Unnamed: 0,userId,movieId,rating
19846,155,0,5.0
19847,155,1,5.0
19848,155,2,2.0
19850,155,4,3.0
19851,155,5,4.0


In [21]:
# Sorted distinct ids that survived the filtering step
unique_users = np.unique(df['userId'])
unique_movies = np.unique(df['movieId'])

In [22]:
# Build old-id -> new-id lookups so both id columns become dense, zero-based ranges again
user_map = create_map(unique_users)
movie_map = create_map(unique_movies)

# Apply the lookups to the two id columns
df['userId'] = df['userId'].map(user_map)
df['movieId'] = df['movieId'].map(movie_map)

df.head()


Unnamed: 0,userId,movieId,rating
19846,0,0,5.0
19847,0,1,5.0
19848,0,2,2.0
19850,0,3,3.0
19851,0,4,4.0


In [23]:
# data integrity for userId: min id, max id, number of distinct ids
print(df['userId'].min())
print(df['userId'].max())
print(np.unique(df['userId']).size)

# data integrity for movieId: min id, max id, number of distinct ids
print('\n' , df['movieId'].min())
print(df['movieId'].max())
print(np.unique(df['movieId']).size)



0
999
1000

 0
999
1000


In [24]:
from sklearn.model_selection import train_test_split

In [25]:
# 80/20 split with a fixed seed, stratified on userId
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['userId'],
)

In [26]:
# (rows, columns) of the train split and of the test split
train.shape , test.shape

((501690, 3), (125423, 3))

In [27]:
# Lookup tables filled from the training split.
# user id -> list of movie ids that user has rated
user2movie = {}
# movie id -> list of user ids that have rated that movie
movie2user = {}
# (user id, movie id) -> rating
usermovie2rating = {}

# The same three lookup tables, filled from the test split.
# user id -> list of movie ids that user has rated (test)
user2movie_test = {}
# movie id -> list of user ids that have rated that movie (test)
movie2user_test = {}
# (user id, movie id) -> rating (test); initialised once only
usermovie2rating_test = {}




In [28]:
# Index every training rating three ways: by user, by movie and by (user, movie) pair
for user_id, movie_id, rating in zip(train.userId, train.movieId, train.rating):
    # setdefault creates the list the first time a key is seen
    user2movie.setdefault(user_id, []).append(movie_id)
    movie2user.setdefault(movie_id, []).append(user_id)
    usermovie2rating[(user_id, movie_id)] = rating


In [29]:
# Index every test rating three ways: by user, by movie and by (user, movie) pair
for user_id, movie_id, rating in zip(test.userId, test.movieId, test.rating):
    # setdefault creates the list the first time a key is seen
    user2movie_test.setdefault(user_id, []).append(movie_id)
    movie2user_test.setdefault(movie_id, []).append(user_id)
    usermovie2rating_test[(user_id, movie_id)] = rating

In [30]:
import pickle

In [31]:
# Persist each lookup table to its own pickle file: train tables first, then test tables
pickle_targets = [
    ('../data/user2movie.pkl', user2movie),
    ('../data/movie2user.pkl', movie2user),
    ('../data/usermovie2rating.pkl', usermovie2rating),
    # test
    ('../data/user2movie_test.pkl', user2movie_test),
    ('../data/movie2user_test.pkl', movie2user_test),
    ('../data/usermovie2rating_test.pkl', usermovie2rating_test),
]

for pkl_path, lookup in pickle_targets:
  with open(pkl_path, 'wb') as f:
    pickle.dump(lookup, f)


Save the train and test DataFrames to .csv files as well, since these could also be useful.

In [32]:
# index=False: do not write the DataFrame's row index as an extra column
train.to_csv('../data/train.csv' , index= False)

In [33]:
# index=False: do not write the DataFrame's row index as an extra column
test.to_csv('../data/test.csv'  , index= False)