In [None]:
import os
import boto3
import sagemaker
import numpy as np
import pandas as pd

## Data Cleaning

In [None]:
# Read the raw rating data (';'-delimited CSV).
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in pandas 2.0.
# `on_bad_lines='warn'` is its replacement with the same effect: malformed rows
# are dropped and a warning is emitted for each of them.
fpath = 'BX-Book-Ratings.csv'
df = pd.read_csv(fpath, delimiter=";", on_bad_lines='warn')

In [None]:
# keep only the rows whose ISBN consists entirely of numeric characters
# (this is what removes the ISBNs containing 'X')
df = df.loc[df['ISBN'].apply(lambda isbn: isbn.isnumeric())]

In [None]:
# count the distinct ISBNs rated by each user (column 'Freq', indexed by 'User-ID')
# and keep only the users with at least 20 of them
valid = df.groupby('User-ID')['ISBN'].nunique().to_frame(name='Freq')
valid = valid.loc[valid['Freq'] >= 20]

In [None]:
# Reduce `valid` to a single 'User-ID' column holding the first 2000 qualifying users.
# Written as one expression without in-place column assignment, so the cell can be
# re-run safely: the user ids are moved out of the index by reset_index(), the
# 'Freq' helper column is left behind by the column selection, and head(2000)
# keeps the first 2000 rows.
valid = valid.reset_index()[['User-ID']].head(2000)

In [None]:
# inner-join the full rating frame with the selected users, so that only the
# ISBNs and individual ratings of those users remain
new_df = df.merge(valid, how='inner')

In [None]:
# fix labels: replace the hyphenated column names with camelCase ones
new_df = new_df.rename(columns={"User-ID": "userId", "Book-Rating": "bookRating"})

In [None]:
# fix type: cast every column to 64-bit integers, then display the resulting dtypes
# NOTE(review): the split and negative-sampling cells further down read `new_df`,
# not `final_df` — confirm which frame is meant to be used there
final_df = new_df.astype(np.int64)
final_df.dtypes

## Training and Testing Sets, Negative Sampling

In [None]:
def train_test_split(df, holdout_num):
    """Split a rating frame into training and testing parts, per user.

    The first `holdout_num` rows of every user, taken in the frame's current
    row order (the frame is not sorted first), form the test set. All
    remaining rows form the training set. `df` itself is not modified.

    @param df: dataframe with at least the columns 'userId' and 'ISBN'
    @param holdout_num: number of rows to be held out for each user

    @return df_train: training data, re-indexed from 0
    @return df_test: testing data, keeping the row labels of `df`

    @raise AssertionError: if the two parts do not add up to len(df), e.g.
        when a held-out (userId, ISBN) pair also occurs in a row that was
        not held out
    """
    pair_cols = ['userId', 'ISBN']

    # test set: the first `holdout_num` rows of each user
    df_test = df.groupby('userId').head(holdout_num)

    # train set: left-join a marker column onto the held-out pairs; rows that
    # did not match keep NaN in 'remove' and are the ones we retain
    flagged = df.merge(
        df_test[pair_cols].assign(remove=1),
        on=pair_cols,
        how='left'
    )
    df_train = (
        flagged[flagged['remove'] != 1]
        # keyword form: the positional axis argument of drop() was removed in pandas 2.0
        .drop(columns='remove')
        .reset_index(drop=True)
    )

    # sanity check to make sure we're not duplicating/losing data
    assert len(df) == len(df_train) + len(df_test), (
        'train/test split duplicated or lost rows'
    )

    return df_train, df_test

In [87]:
# hold out 10 ratings per user: every retained user has at least 20 ratings,
# so at least 10 remain for training
# NOTE(review): this splits `new_df`; the int64-cast `final_df` is not used here — confirm
df_train, df_test = train_test_split(df=new_df, holdout_num=10)

In [None]:
def negative_sampling(user_ids, book_ids, items, n_neg, seed=None):
    """Create n_neg negative labels for every distinct positive (user, book) pair.

    A negative label is a (user, book) pair that does not occur among the
    positive pairs; its book is drawn at random from `items` and redrawn
    until the pair is not a positive one.

    @param user_ids: sequence of user ids, aligned element-wise with book_ids
    @param book_ids: sequence of book ids, aligned element-wise with user_ids
    @param items: unique list of book ids to draw negatives from
    @param n_neg: number of negative labels to sample per positive pair
    @param seed: optional seed for a private random generator; when None
        (the default) the global numpy random state is used

    @return df_neg: negative sample dataframe with the columns
        userId, ISBN and bookRating (always 0)

    @raise ValueError: if some user already has a positive pair for every
        entry of `items`, so that no negative could ever be drawn for them

    """
    # np.random and RandomState both expose .choice(), so the loop below is
    # the same whether or not a seed was given
    rng = np.random if seed is None else np.random.RandomState(seed)

    # de-duplicate the positive pairs while keeping first-occurrence order,
    # so that the iteration order does not depend on set hashing
    positives = dict.fromkeys(zip(user_ids, book_ids))

    if n_neg > 0:
        # guard against an endless resampling loop: a user who already has
        # every candidate item as a positive leaves nothing to sample
        item_pool = set(items)
        taken = {}
        for u, i in positives:
            if i in item_pool:
                taken[u] = taken.get(u, 0) + 1
        exhausted = [u for u, count in taken.items() if count >= len(item_pool)]
        if exhausted:
            raise ValueError(
                f'no negative item available for user(s) {exhausted[:5]}: '
                'every candidate item is already a positive label'
            )

    neg = []
    # for every positive label case
    for u, _pos_item in positives:
        # generate n_neg negative labels
        for _ in range(n_neg):
            j = rng.choice(items)
            # if the randomly sampled book exists for that user, resample
            while (u, j) in positives:
                j = rng.choice(items)
            neg.append([u, j, 0])

    # convert to pandas dataframe for concatenation later
    df_neg = pd.DataFrame(neg, columns=['userId', 'ISBN', 'bookRating'])

    return df_neg

In [None]:
# draw 2 negative samples for every positive (user, book) pair of the training set;
# the candidate books are all distinct ISBNs found in new_df
neg_train = negative_sampling(
    df_train['userId'].values,
    df_train['ISBN'].values,
    items=new_df['ISBN'].unique(),
    n_neg=2,
)

In [None]:
# report how many negative rows were generated (thousands-separated)
print(f'created {len(neg_train):,} negative samples')

In [None]:
# label the observed interactions as positives (1); the sampled negatives already carry 0
# NOTE: this cell rebinds df_train / df_test, so run it only once per kernel session —
# a second run would relabel the appended negatives as positives and append them again
df_train = df_train.loc[:, ['userId', 'ISBN']].assign(bookRating=1)
df_test = df_test.loc[:, ['userId', 'ISBN']].assign(bookRating=1)

# training set = positives followed by the negative samples, re-indexed from 0
df_train = pd.concat((df_train, neg_train), ignore_index=True)

In [None]:
def get_unique_count(df):
    """Return the number of distinct userId values and distinct ISBN values in df, as a pair."""
    n_users = df['userId'].nunique()
    n_books = df['ISBN'].nunique()
    return n_users, n_books

In [None]:
# (distinct users, distinct books) contained in each split
print(f'training set shape {get_unique_count(df_train)}')
print(f'testing set shape {get_unique_count(df_test)}')

In [None]:
# number of distinct users and distinct items (books) present in the training set
n_user, n_item = get_unique_count(df_train)

print(f"number of unique users {n_user}")
print(f"number of unique items {n_item}")

## Save Data

In [None]:
# save the variables for the model training notebook
# -----
# %store is an IPython line magic that persists a variable across kernels;
# the other notebook can read it back with `%store -r`
%store n_user
%store n_item

In [None]:
# look up the region name of the current boto3 session and report it
session = boto3.Session()
region = session.region_name
print(f'currently in {region}')

In [None]:
# use the default sagemaker s3 bucket to store processed data
# here we figure out what that default bucket name is;
# `sagemaker_session` is reused by the upload step in the save cell below
sagemaker_session = sagemaker.Session()
bucket_name = sagemaker_session.default_bucket()
print(bucket_name)  # bucket name format: "sagemaker-{region}-{aws_account_id}"

In [None]:
# save data locally first
dest = 'data/s3'
train_path = os.path.join(dest, 'train.npy')
test_path = os.path.join(dest, 'test.npy')

# create the target directory together with its parent; unlike `!mkdir`, this
# works when 'data' does not exist yet and is a no-op when the cell is re-run
os.makedirs(dest, exist_ok=True)

# np.save(..., allow_pickle=False) refuses object arrays, which is what `.values`
# yields while the ISBN column still holds strings; cast every column to int64
# first (the same conversion the "fix type" cell applies to build final_df)
np.save(train_path, df_train.astype('int64').values, allow_pickle=False)
np.save(test_path, df_test.astype('int64').values, allow_pickle=False)

# upload to S3 bucket (see the bucket name above)
sagemaker_session.upload_data(train_path, key_prefix='data')
sagemaker_session.upload_data(test_path, key_prefix='data')