## Movie Recommendation System

In [5]:
from __future__ import print_function, division
from builtins import range, input

import pickle
import numpy as np
import pandas as pd
from collections import Counter

### Data Preprocessing
+ Note: user ids run sequentially from 1 to 138493 with no gaps. Movie ids are integers in the range 1..131262, but NOT all of them appear: only 26744 distinct movie ids are present.

In [2]:
# Load the raw ratings table; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='movielens-20m-dataset/rating.csv')

In [3]:
# Shift the user ids down by one so they run 0..N-1.
# NOTE: this subtraction is not idempotent -- re-running the cell without
# reloading `df` shifts the ids a second time.
df.userId = df.userId - 1

# Movie ids are sparse, so give every distinct movie id a dense index 0..M-1.
# The index order follows the set's iteration order (not sorted by movie id).
unique_movie_ids = set(df.movieId.values)
movie2idx = {movie_id: idx for idx, movie_id in enumerate(unique_movie_ids)}
count = len(movie2idx)  # number of distinct movies

# Add the dense index to the data frame.
# Series.map does the dictionary lookup on the single column that is needed,
# instead of building a row object for every rating as a row-wise
# df.apply(..., axis=1) would.
df['movie_idx'] = df.movieId.map(movie2idx)

df = df.drop(columns=['timestamp']) # we don't need timestamp data

In [4]:
# Persist the re-indexed ratings; index=False keeps the row index out of the file.
df.to_csv(path_or_buf='movielens-20m-dataset/edited_rating.csv', index=False)

### Shrinking the data
+ The full dataset is too large to run an O(n^2) algorithm on.
+ Select a subset of users and movies: the users who rated the most movies, and the movies that were rated by the most users.

In [7]:
# Reload the preprocessed ratings written earlier and report how many rows they hold.
df = pd.read_csv(filepath_or_buffer='movielens-20m-dataset/edited_rating.csv')
print("original dataframe size:", df.shape[0])

original dataframe size: 20000263


In [8]:
N = df.userId.max() + 1 # number of users
M = df.movie_idx.max() + 1 # number of movies

# How many ratings each user gave / each movie received.
user_ids_count = Counter(df.userId)
movie_ids_count = Counter(df.movie_idx)

# number of users and movies we would like to keep
n = 10000
m = 2000

# Keep the n most active users and the m most rated movies.
# The loop variables are deliberately NOT named `n`/`m`: reusing `m` inside the
# comprehension shadows the constant above, and under Python 2 (which this
# notebook still supports via __future__/builtins) it would overwrite it.
user_ids = [user_id for user_id, _count in user_ids_count.most_common(n)]
movie_ids = [movie_idx for movie_idx, _count in movie_ids_count.most_common(m)]

# make a copy, otherwise ids won't be overwritten
df_small = df[df.userId.isin(user_ids) & df.movie_idx.isin(movie_ids)].copy()

df_small.head()

Unnamed: 0,userId,movieId,rating,movie_idx
960,10,1,4.5,1
961,10,10,2.5,10
962,10,19,3.5,19
963,10,32,5.0,32
964,10,39,4.5,39


In [9]:
# need to remake user ids and movie ids since they are no longer sequential
new_user_id_map = {}
i = 0
for old in user_ids:
  new_user_id_map[old] = i
  i += 1
print("i:", i)

new_movie_id_map = {}
j = 0
for old in movie_ids:
  new_movie_id_map[old] = j
  j += 1
print("j:", j)

i: 10000
j: 2000


In [10]:
print("Setting new ids")
# Replace the old ids with the dense ones built above.
# Series.map looks each value up in the dictionary column-wise; this avoids
# constructing a row object per rating, which is what made the row-wise
# df.apply(..., axis=1) version slow.
df_small.loc[:, 'userId'] = df_small.userId.map(new_user_id_map)
df_small.loc[:, 'movie_idx'] = df_small.movie_idx.map(new_movie_id_map)

# Report the largest new ids and the size of the reduced frame.
print("max user id:", df_small.userId.max())
print("max movie id:", df_small.movie_idx.max())
print("small dataframe size:", len(df_small))

Setting new ids
max user id: 9999
max movie id: 1999


In [None]:
# Write the reduced ratings table to disk; index=False keeps the row index out of the file.
df_small.to_csv(path_or_buf='movielens-20m-dataset/small_rating.csv', index=False)