In [1]:
import program.app_modules as rs
import os
import pandas as pd
import numpy as np

# url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Wireless_v1_00.tsv.gz"
url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Books_v1_00.tsv.gz"
# url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Musical_Instruments_v1_00.tsv.gz"

filename = url.split('/')[-1]
workingdir = os.path.abspath('')
datadir = "program/data" # Directory where data is stored
filepath_tsvgz = os.path.join(workingdir, datadir, filename)
filepath_tsv = filepath_tsvgz[0:-3]

print("creating dataframe")

usecols = ['customer_id','product_id','star_rating',]

df = pd.read_csv(filepath_tsvgz,\
                delimiter='\t',\
                encoding="utf-8",\
                error_bad_lines=False,\
                compression='gzip',\
                # dtype=dtype,\
                usecols=usecols)
# Clean dataframe
df = df[df['star_rating'].isin([0,1,2,3,4,5])]
df.astype({'star_rating':np.int8})
df.astype({'customer_id':np.int8})
df.astype({'star_rating':np.int8})

print("created successfully")
print(df.head())

creating dataframe


  interactivity=interactivity, compiler=compiler, result=result)


created successfully
        customer_id  product_id star_rating
131072     52158092  1577311310           5
131073      2401115  B004S7VSQ6           5
131074     11978442  0547053649           5
131075     35344608  1846703395           5
131076     42078536  1508598592           5


In [8]:
# Clean dataframe
df = df[df['star_rating'].isin([0,1,2,3,4,5])]
df.astype({'star_rating':int})
df.astype({'customer_id':str})
df.astype({'product_id':str})

Unnamed: 0,customer_id,product_id,star_rating
131072,52158092,1577311310,5
131073,2401115,B004S7VSQ6,5
131074,11978442,0547053649,5
131075,35344608,1846703395,5
131076,42078536,1508598592,5
131077,42865177,1476728747,5
131078,11954372,1416592377,5
131079,52632594,178208259X,5
131080,28972524,0385337612,5
131081,10436571,088368280X,5


In [9]:
from scipy.sparse import csr_matrix, save_npz, load_npz # Sparse matrix
from sklearn.neighbors import NearestNeighbors

# Calculate total number of products and customers (for range calculation)
print("success 1")
prodNo = len(set(df["product_id"])) # Number of products
custNo = len(set(df["customer_id"])) # Number of customers
print("Number of unique products: {}".format(prodNo))
print("Number of unique customers: {}".format(custNo))

# Get unique product and customer IDs and index them
print("success 2")
prodUnique_indexed = dict(zip(np.unique(df["product_id"]), list(range(prodNo))))
custUnique_indexed = dict(zip(np.unique(df["customer_id"]), list(range(custNo))))
prodUnique_reverseIndexed = dict(zip(list(range(prodNo)), np.unique(df["product_id"])))
custUnique_reverseIndexed = dict(zip(list(range(custNo)), np.unique(df["customer_id"])))

# Finally go through each row of dataframe and assign index 
print("success 3")
custIndex = [custUnique_indexed[i] for i in df["customer_id"]]
prodIndex = [prodUnique_indexed[i] for i in df["product_id"]]

# Create indexed matrix
print("success 4")
df_custIndex = pd.DataFrame(custIndex, columns=["custIndex"])
df_prodIndex = pd.DataFrame(prodIndex, columns=["prodIndex"])
df_indexed = pd.DataFrame(df["star_rating"], columns=["star_rating"])
df_indexed.insert(0, "prodIndex", df_prodIndex)
df_indexed.insert(0, "custIndex", df_custIndex)


# Create the sparse matrix based on a matrix with dimensions:
# (no. of unique products x no. of unique customres).
# Description from official scipy documentation:
# csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])
# where data, row_ind and col_ind satisfy the relationship a[row_ind[k], col_ind[k]] = data[k].

print("...creating sparse matrix...")
df_csr = csr_matrix((df_indexed["star_rating"], (prodIndex, custIndex)), shape=(prodNo, custNo))

msg = "\nFitted the KNN model successfully."

success 1
Number of unique products: 2015314
Number of unique customers: 3910621
success 2
success 3
success 4
...creating sparse matrix...


TypeError: no supported conversion for types: (dtype('O'),)