# Amazon Reviews data
#### Reviews grouped into 35 categories (span: May 1996 to Sept. 2023)
### Kingsley Kometa

In [59]:
import pandas as pd
import re
import json
import random
from time import time
import requests
from bs4 import BeautifulSoup
from os.path import join, getsize
from os import listdir

### Scrabe reviews urls

In [60]:
url = 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/review_categories'
response = requests.get(url)
response.status_code

200

In [61]:
soup = BeautifulSoup(response.text, 'html.parser')

In [62]:
# Extract urls corresponding to review data files
filenames = [join(url, a.text) for a in soup.find_all('a') \
                                if a.text.find('.jsonl.gz')!=-1]
print(len(filenames))

34


In [63]:
# Save this urls in a text file: reviews.txt
if 'amazonReviews' not in listdir():
    !mkdir 'amazonReviews'

def createTextFile(filename, fromlist, DIR='amazonReviews'):
    if not filename in listdir(DIR):
        path_to_filename = join(DIR, filename)
        for file in fromlist:
            with open(path_to_filename, 'a') as fp:
                fp.write(file+'\n')

createTextFile('reviews.txt', filenames)

# # Inside the path amazonReviews on terminal
# wget -i reviews.txt
# to download listed of files from urls

### Dowload files from urls

In [64]:
def copyInfo(info=''):
    with open('amazonReviews/info.txt', 'a') as fp:
        fp.write(info+'\n')

LIMIT = 100  # Assumed limit (G) of available space (before any data is copied)

In [65]:
def download(url, file_limit=5, LIMIT=100):
    """ Download file from url, transfer it into hadoop cluster """
    # extract file handle from url
    url = url.split('\n')[0] # remove end-of-line token
    filename = url.split('/')[-1]

    # check if file is already on the cluster
    found = !hdfs dfs -find /user/ubuntu/project -iname "{filename}"
    if len(found):
        print(f"File {found[0]} already exists!")
        return False
    
    # Check disk usage and abort if LIMIT has been exceeded
    used = !hdfs dfs -du -s /user/ubuntu/project/
    total_usage = (float(used[0].split()[1])) / 1e9 
    if total_usage > LIMIT:
        info = f"File {filename} NOT COPIED!\n" \
            + f"Disk usage limit {LIMIT}G already exceeded by {total_usage-LIMIT:.4f} G!"
        print(info)
        copyInfo(info)
        return False
    
    # Download file from web
    !wget -Q5010m "{url}"  # download limit at about 5G

    if getsize(filename) > (file_limit+0.001)*1e9:
        # remove files bigger than file_limit e.g. 5.0G
        copyInfo(f"File {filename} ({getsize(filename)/1e9:.2f}G) NOT COPIED!")
        !rm "{filename}"
        return False
    else:
        # move downloaded file from local namenode into hadoop cluster
        !hdfs dfs -moveFromLocal "{filename}" /user/ubuntu/project/
        return True


In [66]:
REPEAT_DOWNLOAD = False  # Must be intentional to change this flag (it takes hours to download!)
if REPEAT_DOWNLOAD:
    num_files = 0
    num_downloads = 0
    duration = time()
    with open('amazonReviews/reviews.txt', 'r') as fp:
        while True:
            url = fp.readline()
            if not url: 
                break
            # extract file handle from url
            file = (url.split('\n')[0]).split('/')[-1]
            # check if file is already on the cluster
            found = !hdfs dfs -find /user/ubuntu/project -iname "{file}"
            if len(found):
                print(f"File {found[0]} already exists!")
                continue
            else:
                # download and move file into cluster
                taken = download(url)
                if taken:
                    num_files += 1
                num_downloads += 1
            print('='*33, f"\n\tCopied.. {num_files}/{num_downloads}\n", '-'*33)
    duration = time() - duration

    print(f"Number of review categories/files: {num_files}")
    print(f"Donwload duration (raw data from web): {duration/60:.2f} minutes")
    copyInfo(f"Number of review categories/files: {num_files} (2.5G-3G)")
    copyInfo(f"Donwload duration (raw data from web [2.5G-3G]): {duration/60:.2f} minutes")

In [67]:
# Exception for books category
url = 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/review_categories/Books.jsonl.gz'
download(url, file_limit=6.5)


File /user/ubuntu/project/Books.jsonl.gz already exists!


False

In [68]:
# Total disk space occupied [size, replicas]
!hdfs dfs -du -s project

48103501477  96207002954  project


In [69]:
# Disk space occupied by each file [size, replicas]
!hdfs dfs -du -h project

90.9 M   181.9 M  project/All_Beauty.jsonl.gz
281.9 M  563.8 M  project/Amazon_Fashion.jsonl.gz
260.4 M  520.8 M  project/Appliances.jsonl.gz
993.8 M  1.9 G    project/Arts_Crafts_and_Sewing.jsonl.gz
2.2 G    4.3 G    project/Automotive.jsonl.gz
807.7 M  1.6 G    project/Baby_Products.jsonl.gz
2.8 G    5.6 G    project/Beauty_and_Personal_Care.jsonl.gz
5.8 G    11.6 G   project/Books.jsonl.gz
986.3 M  1.9 G    project/CDs_and_Vinyl.jsonl.gz
2.4 G    4.8 G    project/Cell_Phones_and_Accessories.jsonl.gz
25.0 M   50.0 M   project/Digital_Music.jsonl.gz
1.3 G    2.7 G    project/Electronics.jsonl.gz
12.0 M   24.0 M   project/Gift_Cards.jsonl.gz
1.5 G    3.1 G    project/Grocery_and_Gourmet_Food.jsonl.gz
75.0 M   150.0 M  project/Handmade_Products.jsonl.gz
3.0 G    6.0 G    project/Health_and_Household.jsonl.gz
66.9 M   133.9 M  project/Health_and_Personal_Care.jsonl.gz
651.5 M  1.3 G    project/Industrial_and_Scientific.jsonl.gz
4.3 G    8.6 G    project/Kindle_Store.jsonl.gz
9.6 M    19.

In [73]:
# Verify which urls have not been downloaded
with open('amazonReviews/reviews.txt', 'r') as fp:
    urls = fp.readlines()

urls = (''.join(urls)).split('\n')
urls = [url for url in urls if len(url.strip())]
total_num_urls = len(urls)
remain = 0
print("Files not copied:\n{}".format('='*33))
for url in urls:
    file = url.split('/')[-1]
    found = !hdfs dfs -find /user/ubuntu/project -iname "{file}"
    if not len(found):
        print(file)
        remain += 1
print('-'*33)
print(f"{remain}/{total_num_urls}")

Files not copied:
Clothing_Shoes_and_Jewelry.jsonl.gz
Home_and_Kitchen.jsonl.gz
Unknown.jsonl.gz
---------------------------------
3/34


### Scrape metadata urls

In [21]:
# For metedata
url2 = 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/meta_categories'
response = requests.get(url2)
response.status_code, response.ok

(200, True)

In [22]:
soup = BeautifulSoup(response.text, 'html.parser')

In [23]:
# Extract urls corresponding to review data files
filenames = [join(url2, a.text) for a in soup.find_all('a') \
                                if a.text.find('.jsonl.gz')!=-1]
print(filenames)
print(len(filenames))

['https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/meta_categories/meta_All_Beauty.jsonl.gz', 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/meta_categories/meta_Amazon_Fashion.jsonl.gz', 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/meta_categories/meta_Appliances.jsonl.gz', 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/meta_categories/meta_Arts_Crafts_and_Sewing.jsonl.gz', 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/meta_categories/meta_Automotive.jsonl.gz', 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/meta_categories/meta_Baby_Products.jsonl.gz', 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/meta_categories/meta_Beauty_and_Personal_Care.jsonl.gz', 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/meta_categories/meta_Books.jsonl.gz', 'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/meta_categories/meta_CDs_and_Viny

In [24]:
# Save this urls in a text file: meta_reviews.txt
if 'amazonReviews' not in listdir():
    !mkdir 'amazonReviews'

createTextFile('meta_reviews.txt', filenames)

### A single file

In [25]:
with open('amazonReviews/reviews.txt', 'r') as fp:
    urls = fp.readlines()

urls = (''.join(urls)).split('\n')
len(urls)

35

In [50]:
# Estimate storage requirement of a single file
l = random.randint(0, len(urls)-1)
print(f'url: {urls[l]}')
num_records = 0
store_size = 0
mega = 1000000

with pd.read_json(urls[l], lines=True, chunksize=100000) as reader:
    for chunk in reader:
        num_records = num_records + chunk.size
        chunk_store = chunk.__sizeof__()
        store_size = store_size + chunk_store
        print("chunk storage: {:,.2f} MB".format(chunk_store / mega))

print("shape: ({:,}, {})\nstorage requirements: {:,.2f} MB" \
      .format(num_records, chunk.shape[1], store_size/mega))

chunk.head()

url: https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/raw/review_categories/Appliances.jsonl.gz
chunk storage: 0.07 MB
chunk storage: 0.08 MB
chunk storage: 0.07 MB
chunk storage: 0.06 MB
chunk storage: 0.06 MB
chunk storage: 0.08 MB
chunk storage: 0.06 MB
chunk storage: 0.08 MB
chunk storage: 0.12 MB
chunk storage: 0.09 MB
chunk storage: 0.09 MB
chunk storage: 0.08 MB
chunk storage: 0.07 MB
chunk storage: 0.09 MB
chunk storage: 0.09 MB
chunk storage: 0.07 MB
chunk storage: 0.08 MB
chunk storage: 0.07 MB
chunk storage: 0.07 MB
chunk storage: 0.06 MB
chunk storage: 0.07 MB
chunk storage: 0.07 MB
chunk storage: 0.06 MB
chunk storage: 0.06 MB
chunk storage: 0.07 MB
chunk storage: 0.07 MB
chunk storage: 0.06 MB
chunk storage: 0.06 MB
chunk storage: 0.08 MB
chunk storage: 0.06 MB
chunk storage: 0.08 MB
chunk storage: 0.06 MB
chunk storage: 0.07 MB
chunk storage: 0.08 MB
chunk storage: 0.08 MB
chunk storage: 0.09 MB
chunk storage: 0.09 MB
chunk storage: 0.07 MB
chunk storage: 0.0

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
2128600,5,Accurate description,As described,[],B097948QRP,B097948QRP,AG6IN4MOTWF3743PKIPHYA2S7GXA,2021-08-02 15:33:52.936,0,True
2128601,3,Not compatible with Nespresso U Machine,I have tried multiple times with different reu...,[],B072QVZYDD,B0C6XG2JSG,AHVKX5FONDMQVOA7XLMPAH6EGZ2Q,2021-01-17 19:39:04.350,0,True
2128602,5,Works with Sears Kenmore model 36275585891,Exact fit for Sears Kenmore model 36275585891....,[],B07QKBMPG2,B07QKBMPG2,AEYETSNK5VL6ZSLN32EE6VCOAYFA,2020-01-02 17:38:56.721,1,True
2128603,5,Perfect little ice maker!,Love this!! It doesn’t keep the ice cold but t...,[],B07H7SGQ52,B07H7SGQ52,AHIJLNIXWVFQFWJV3OGGQOHONGMQ,2020-01-29 03:14:33.035,0,True
2128604,1,One Star,Did not work. Not the same model as the OEM we...,[],B011LZO5QW,B0C5BHXVH1,AGIOA4MS625NBQAUNM4PMJJX3Q7Q,2016-02-01 19:08:56.000,1,True
