# Scratchpad for recommenders

In [70]:
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.utils import shuffle
from scipy.sparse import csr_matrix, save_npz, load_npz

In [20]:
df = pd.read_csv("data/rating.csv")
df.shape

(20000263, 4)

In [5]:
df.tail()

Unnamed: 0,userId,movieId,rating,timestamp
20000258,138493,68954,4.5,2009-11-13 15:42:00
20000259,138493,69526,4.5,2009-12-03 18:31:48
20000260,138493,69644,3.0,2009-12-07 18:10:57
20000261,138493,70286,5.0,2009-11-13 15:42:24
20000262,138493,71619,2.5,2009-10-17 20:25:36


In [15]:
# is userId sequential? Yes
len(set(df.userId))

138493

In [14]:
# is movieId sequential? no there are holes
print(len(set(df.movieId)))

print(sorted(set(df.movieId))[12200:12220])

26744
[55757, 55765, 55768, 55782, 55805, 55814, 55820, 55826, 55830, 55844, 55851, 55854, 55856, 55872, 55875, 55888, 55895, 55901, 55908, 55926]


In [9]:
df.rating.value_counts()

4.0    5561926
3.0    4291193
5.0    2898660
3.5    2200156
4.5    1534824
2.0    1430997
2.5     883398
1.0     680732
1.5     279252
0.5     239125
Name: rating, dtype: int64

### Preprocess

In [21]:
# prepare preprocessed dataset

# drop axis
df.drop(["timestamp"], axis=1, inplace=True)

# make userId 0-indexed
df['userId'] = df['userId'] - 1

# map movieId to a new continoues movie ID
oldToNewMap = {}
oldIds = set(df.movieId.values)
newId = 0
for oldId in oldIds:
    oldToNewMap[oldId] = newId
    newId += 1
    
df["newMovieId"] = df.movieId.apply(lambda x: oldToNewMap[x])

In [22]:
df.head()

Unnamed: 0,userId,movieId,rating,newMovieId
0,0,2,3.5,2
1,0,29,3.5,29
2,0,32,3.5,32
3,0,47,3.5,47
4,0,50,3.5,50


In [35]:
df.to_csv('data/preprocessed_rating.csv', index=False)

### Make small dataset for on-premise workload

In [3]:
preprocessed = pd.read_csv("data/preprocessed_rating.csv")
preprocessed.head()

Unnamed: 0,userId,movieId,rating,newMovieId
0,0,2,3.5,2
1,0,29,3.5,29
2,0,32,3.5,32
3,0,47,3.5,47
4,0,50,3.5,50


In [4]:
userIdCount = Counter(preprocessed.userId)
movieIdCount = Counter(preprocessed.newMovieId)

# number of users and movies to keep
n = 10000
m = 2000

topUserIds = [k for k, v in userIdCount.most_common(n)]
topMovieIds = [k for k, v in movieIdCount.most_common(m)]

preprocessed_small = preprocessed[preprocessed.userId.isin(topUserIds) & preprocessed.movieId.isin(topMovieIds)].copy()

In [5]:
preprocessed_small.shape

(2206428, 4)

In [6]:
# ensure sequential
def make_sequential(df, col, newCol):
    old_to_new_map = {}
    old_ids = set(df[col])
    new_id = 0
    for old_id in old_ids:
        old_to_new_map[old_id] = new_id
        new_id += 1
    df[newCol] = df[col].apply(lambda x: old_to_new_map[x])
    return df, old_to_new_map

preprocessed_small, user_id_map = make_sequential(preprocessed_small, 'userId', 'SmallUserId')
preprocessed_small, movie_id_map = make_sequential(preprocessed_small, 'newMovieId', 'SmallNewMovieId')

In [7]:
preprocessed_small.reset_index(drop=True, inplace=True)

In [10]:
print(preprocessed_small.SmallUserId.max())
print(preprocessed_small.SmallNewMovieId.max())

9999
1705


In [11]:
preprocessed_small.head()

Unnamed: 0,userId,movieId,rating,newMovieId,SmallUserId,SmallNewMovieId
0,10,1,4.5,1,5,0
1,10,10,2.5,10,5,8
2,10,19,3.5,19,5,16
3,10,32,5.0,32,5,27
4,10,39,4.5,39,5,30


In [29]:
preprocessed_small.to_csv('data/small_preprocessed_rating.csv', index=False)

### Preprocess to dictionary

In [30]:
small_preprocessed_rating = pd.read_csv('data/small_preprocessed_rating.csv')

In [31]:
small_preprocessed_rating.head()

Unnamed: 0,userId,movieId,rating,newMovieId,SmallUserId,SmallNewMovieId
0,10,1,4.5,1,5,0
1,10,10,2.5,10,5,8
2,10,19,3.5,19,5,16
3,10,32,5.0,32,5,27
4,10,39,4.5,39,5,30


In [35]:
# split into train and test
small_preprocessed_rating = shuffle(small_preprocessed_rating)
cutoff = int(0.8*len(small_preprocessed_rating))
df_train = small_preprocessed_rating.iloc[:cutoff]
df_test = small_preprocessed_rating.iloc[cutoff:]

In [39]:
# Training

user_to_movie = {}
movie_to_user = {}
userMovie_to_rating = {}

def initialize_train_dicts(row, user_id_col, movie_id_col):
    user_id = row[user_id_col]
    movie_id = row[movie_id_col]
    rating = row['rating']
    
    if user_id not in user_to_movie:
        user_to_movie[user_id] = [movie_id]
    else:
        user_to_movie[user_id].append(movie_id)
    
    if movie_id not in movie_to_user:
        movie_to_user[movie_id] = [user_id]
    else:
        movie_to_user[movie_id].append(user_id)
    
    userMovie_to_rating[(user_id, movie_id)] = rating
    return 

_ = df_train.apply(lambda row: initialize_train_dicts(row, 'SmallUserId', 'SmallNewMovieId'), axis=1)

In [42]:
# Testing

userMovie_to_rating_test = {}

def initialize_test_dicts(row, user_id_col, movie_id_col):
    user_id = row[user_id_col]
    movie_id = row[movie_id_col]
    rating = row['rating']
    
    userMovie_to_rating_test[(user_id, movie_id)] = rating
    return 

_ = df_test.apply(lambda row: initialize_test_dicts(row, 'SmallUserId', 'SmallNewMovieId'), axis=1)

In [44]:
# save to pickled files
with open('data/user_to_movie.json', 'wb') as f:
    pickle.dump(user_to_movie, f)
    
with open('data/movie_to_user.json', 'wb') as f:
    pickle.dump(movie_to_user, f)
    
with open('data/userMovie_to_rating.json', 'wb') as f:
    pickle.dump(userMovie_to_rating, f)
    
with open('data/userMovie_to_rating_test.json', 'wb') as f:
    pickle.dump(userMovie_to_rating_test, f)

### Preprocess to Sparse Matrix

In [46]:
small_preprocessed_rating = pd.read_csv('data/small_preprocessed_rating.csv')
small_preprocessed_rating.head()

Unnamed: 0,userId,movieId,rating,newMovieId,SmallUserId,SmallNewMovieId
0,10,1,4.5,1,5,0
1,10,10,2.5,10,5,8
2,10,19,3.5,19,5,16
3,10,32,5.0,32,5,27
4,10,39,4.5,39,5,30


In [65]:
# Split into Train and Test

N = df_train.SmallUserId.max() + 1
M = df_train.SmallNewMovieId.max() + 1

small_preprocessed_rating = shuffle(small_preprocessed_rating)
cutoff = int(0.8*len(small_preprocessed_rating))
df_train = small_preprocessed_rating.iloc[:cutoff]
df_test = small_preprocessed_rating.iloc[cutoff:]

In [66]:
# Training Sparse Matrix
small_user_ids = df_train.SmallUserId.values
small_movie_ids = df_train.SmallNewMovieId.values
small_ratings = df_train.rating.values
user_movie_matrix = csr_matrix((small_ratings, (small_user_ids, small_movie_ids)), shape=(N, M))

In [67]:
# Testing Sparse Matrix
small_user_ids_test = df_test.SmallUserId.values
small_movie_ids_test = df_test.SmallNewMovieId.values
small_ratings_test = df_test.rating.values
user_movie_matrix_test = csr_matrix((small_ratings_test, (small_user_ids_test, small_movie_ids_test)), shape=(N, M))

In [71]:
# save sparse matrices

save_npz('data/user_movie_matrix_train.npz', user_movie_matrix)

save_npz('data/user_movie_matrix_test.npz', user_movie_matrix_test)