In [1]:
import program.app_modules as rs
import os
import pandas as pd
import numpy as np

# Source archive for the review data. To work with another product category,
# comment out the active line and enable one of the alternatives.
# url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Wireless_v1_00.tsv.gz"
url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Books_v1_00.tsv.gz"
# url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Musical_Instruments_v1_00.tsv.gz"

# The archive's file name is whatever follows the last "/" of the URL.
filename = url.rsplit('/', 1)[-1]

# Data lives in <current working directory>/program/data/<filename>.
workingdir = os.path.abspath(os.curdir)
datadir = "program/data"
filepath_tsvgz = os.path.join(workingdir, datadir, filename)

# Same path with the trailing ".gz" (3 characters) removed.
filepath_tsv = filepath_tsvgz[:-len(".gz")]

print("creating dataframe")


def conv(val):
    """Convert a raw ``star_rating`` field to ``np.int8``.

    Passed to ``pandas.read_csv`` as the converter for the ``star_rating``
    column, so it is called once per row with the raw cell text.

    Parameters
    ----------
    val : object
        Raw cell value (normally a string).

    Returns
    -------
    np.int8
        The parsed rating, or ``np.int8(0)`` when the value is empty/falsy
        or cannot be interpreted as an integer.
    """
    # Always hand back the same type so the resulting column is homogeneous.
    if not val:
        return np.int8(0)
    try:
        return np.int8(val)
    except (ValueError, TypeError, OverflowError):
        # Malformed rating: fall back to 0 rather than aborting the load.
        # Only conversion errors are caught; KeyboardInterrupt etc. propagate.
        return np.int8(0)

# Only these three columns are read from the TSV archive.
usecols = ['customer_id', 'product_id', 'star_rating']
# NOTE: `dtype` is defined but not passed to read_csv below; `star_rating`
# is parsed by the `conv` converter instead.
dtype = {'customer_id': str, 'product_id': str, 'star_rating': int}

# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in
# pandas 2.0; on newer pandas use `on_bad_lines='skip'` instead.
df = pd.read_csv(
    filepath_tsvgz,
    delimiter='\t',
    encoding="utf-8",
    error_bad_lines=False,  # skip malformed lines instead of raising
    compression='gzip',
    usecols=usecols,
    converters={'star_rating': conv},
)

# Clean dataframe: keep only rows whose rating is one of 0..5.
df = df[df['star_rating'].isin([0, 1, 2, 3, 4, 5])]
# astype returns a new frame, so the result must be assigned back for the
# int8 downcast to take effect.
df = df.astype({'star_rating': np.int8})

print("created successfully")
print(df.head())

creating dataframe
created successfully
   customer_id  product_id  star_rating
0     25933450  0439873800            5
1      1801372  1623953553            5
2      5782091  142151981X            5
3     32715830  014241543X            5
4     14005703  1604600527            5


In [32]:
# Clean dataframe
#df = df[df['star_rating'].isin([0,1,2,3,4,5])]
#df.astype({'star_rating':np.int8})
#df.astype({'customer_id':np.str})
#df.astype({'product_id':np.str})


from numpy.lib.format import open_memmap

filename = "temp_np.npy"

# Create (or overwrite) a .npy-backed memory map of 10**6 unsigned bytes,
# i.e. about 1 MB of array data.
x5 = open_memmap(
    filename,
    mode="w+",
    dtype=np.ubyte,
    shape=(10**6,),
)
print("done")

OSError: [Errno 22] Invalid argument: 'temp_np.npy'

In [26]:
# Copy the customer_id column element-wise into the pre-allocated array `x1`.
# NOTE(review): `x1` is not defined in any cell visible here -- presumably it
# was created by an earlier, since-removed cell; confirm before re-running.
# The recorded ValueError shows the target held 1,000,000 elements while
# df['customer_id'] has 10,237,176 rows, so the slice assignment cannot broadcast.
x1[:] = df['customer_id']

ValueError: could not broadcast input array from shape (10237176) into shape (1000000)

In [None]:
from numpy.lib.format import open_memmap

# Back `df_mem` with a writable on-disk memory map of 10**10 float32 values.
# NOTE: at 4 bytes per element this asks for roughly 40 GB of backing file.
df_mem = np.memmap(
    filename,
    dtype='float32',
    mode='w+',
    shape=(10**10,),
)


In [2]:
from scipy.sparse import csr_matrix, save_npz, load_npz # Sparse matrix
from sklearn.neighbors import NearestNeighbors

# Calculate total number of products and customers (for range calculation)
print("success 1")
# A single np.unique call per column yields both the sorted distinct IDs and,
# for every row, the position of its ID within that sorted array. This avoids
# recomputing the unique set several times and looping over rows in Python.
prod_ids, prod_inverse = np.unique(df["product_id"].values, return_inverse=True)
cust_ids, cust_inverse = np.unique(df["customer_id"].values, return_inverse=True)
prodNo = len(prod_ids)  # Number of products
custNo = len(cust_ids)  # Number of customers
print("Number of unique products: {}".format(prodNo))
print("Number of unique customers: {}".format(custNo))

# Get unique product and customer IDs and index them
# (ID -> position in the sorted unique array, and the reverse lookup).
print("success 2")
prodUnique_indexed = dict(zip(prod_ids, range(prodNo)))
custUnique_indexed = dict(zip(cust_ids, range(custNo)))
prodUnique_reverseIndexed = dict(enumerate(prod_ids))
custUnique_reverseIndexed = dict(enumerate(cust_ids))
print("success 2.1")

# Per-row indices, in dataframe row order (kept as plain lists of ints).
print("success 3")
custIndex = cust_inverse.tolist()
prodIndex = prod_inverse.tolist()
print("success 3.1")

# Create indexed matrix
print("success 4")
df_custIndex = pd.DataFrame(custIndex, columns=["custIndex"])
df_prodIndex = pd.DataFrame(prodIndex, columns=["prodIndex"])
# Build the frame positionally on df's own index. Inserting the RangeIndex-ed
# frames above into a frame that carries df's (possibly filtered, gapped)
# index would align on labels and could misplace values or introduce NaN.
df_indexed = pd.DataFrame(
    {
        "custIndex": custIndex,
        "prodIndex": prodIndex,
        "star_rating": df["star_rating"].values,
    },
    index=df.index,
    columns=["custIndex", "prodIndex", "star_rating"],
)
print("success 4.1")

# Create the sparse matrix based on a matrix with dimensions:
# (no. of unique products x no. of unique customers).
# Description from official scipy documentation:
# csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])
# where data, row_ind and col_ind satisfy the relationship a[row_ind[k], col_ind[k]] = data[k].
# NOTE(review): entries sharing the same (product, customer) pair are summed
# by this constructor -- confirm that is the intended handling of repeat reviews.

print("...creating sparse matrix...")
df_csr = csr_matrix(
    (df_indexed["star_rating"].values, (prod_inverse, cust_inverse)),
    shape=(prodNo, custNo),
)

# NOTE(review): no KNN model is fitted anywhere in this cell; this message is
# only assigned here, not printed.
msg = "\nFitted the KNN model successfully."

success 1
Number of unique products: 2264791
Number of unique customers: 4608176
success 2
success 2.1
success 3
success 3.1
success 4
success 4.1
...creating sparse matrix...
