In [None]:
from torch.utils.data import Dataset, DataLoader
import requests
from os.path import isdir
import os
import gzip
import shutil
import json
import pandas as pd

# Product categories accepted by the dataset class in this cell. A category
# name becomes a file stem by dropping commas and replacing spaces with
# underscores (e.g. 'Movies and TV' -> 'Movies_and_TV').
categories = [
    # 'Books',  # left disabled, as in the original list
    'Electronics', 'Movies and TV', 'CDs and Vinyl',
    'Clothing, Shoes and Jewelry', 'Home and Kitchen', 'Kindle Store',
    'Sports and Outdoors', 'Cell Phones and Accessories',
    'Health and Personal Care', 'Toys and Games', 'Video Games',
    'Tools and Home Improvement', 'Beauty', 'Apps for Android',
    'Office Products', 'Pet Supplies', 'Automotive',
    'Grocery and Gourmet Food', 'Patio, Lawn and Garden', 'Baby',
    'Digital Music', 'Musical Instruments', 'Amazon Instant Video',
]

# Dataset variants that can be requested; the first entry is the default mode
# of the dataset class.
data_modes = ['5-core', 'ratings only']

class AmazonReveiws(Dataset):
    """Download and cache one category of the SNAP Amazon review files.

    On construction the class validates its arguments, makes sure the
    directory ``<root_dir>/<category>/<data_mode>`` exists and, unless the
    cached pickle is already there, downloads the gzipped JSON-lines file,
    decompresses it next to the archive and stores the ``helpful``,
    ``overall`` and ``reviewText`` columns as an xz-compressed pickle.

    NOTE: the class does not define ``__len__`` / ``__getitem__`` yet, and
    ``transform`` / ``split`` are only stored on the instance; nothing in
    this class reads them.
    """

    # Columns kept from every raw review record.
    _KEYS = ['helpful', 'overall', 'reviewText']

    # pandas forwards the extra keys to ``lzma.LZMAFile``, which takes an
    # integer ``preset`` (0-9) -- it has no ``compresslevel`` argument.
    _PICKLE_COMPRESSION = {'method': 'xz', 'preset': 9}

    _BASE_URL = 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles'

    def __init__(self, category, root_dir='.', data_mode='5-core' ,transform=None, split=(80,10,10)):
        """
        Args:
            category (string): One of the names in the module-level
                ``categories`` list.
            root_dir (string): Directory under which the per-category data
                directories are created. Created if it does not exist.
            data_mode (string): One of the module-level ``data_modes``.
            transform (callable, optional): Stored as ``self.transform``;
                not applied anywhere in this class yet.
            split (tuple): Stored as ``self.split``; not used anywhere in
                this class yet.

        Raises:
            ValueError: If ``category`` or ``data_mode`` is not recognised.
        """
        # Validate first so that a bad argument does not leave directories behind.
        if category not in categories:
            raise ValueError(f'{category} is not a valid category')
        if data_mode not in data_modes:
            raise ValueError(f'{data_mode} is not a valid mode \n valid modes are {data_modes[0]} & {data_modes[1]}')

        self.root_dir = root_dir
        # 'Clothing, Shoes and Jewelry' -> 'Clothing_Shoes_and_Jewelry'
        self.category = category.replace(',','').replace(' ', '_')
        self.data_mode = data_mode
        self.transform = transform
        self.split = split

        self.__data_dir_base = os.path.join(self.root_dir, self.category, self.data_mode)
        # Also creates root_dir when it is missing.
        os.makedirs(self.__data_dir_base, exist_ok=True)

        # The pickle is the last artifact written, so its presence (rather
        # than the mere existence of the directory) marks a completed
        # download; an interrupted run is retried on the next construction.
        self.pickle_path = os.path.join(self.__data_dir_base, f'{self._file_stem()}.pickle')
        if not os.path.isfile(self.pickle_path):
            self.__download_data()

    def _file_stem(self):
        """Return the remote/local file name without extension for this instance."""
        if self.data_mode == '5-core':
            return f'reviews_{self.category}_5'
        return f'reviews_{self.category}'

    @staticmethod
    def _load_reviews(json_path, keys):
        """Parse a JSON-lines file and return a DataFrame restricted to ``keys``.

        Blank lines are skipped. Only the requested keys are kept per record
        so the full records never have to be held in memory at once.

        Raises:
            KeyError: If one of ``keys`` is absent from every record.
        """
        records = []
        with open(json_path, 'r', encoding='utf-8') as json_file:
            for line in json_file:
                if not line.strip():
                    continue
                record = json.loads(line)
                records.append({key: record[key] for key in keys if key in record})
        return pd.DataFrame(records)[keys]

    def __download_data(self):
        """Fetch, decompress and cache the review file for this instance.

        Writes ``<stem>.json.gz``, ``<stem>.json`` and ``<stem>.pickle`` into
        the instance's data directory. The pickle is xz-compressed but keeps
        a ``.pickle`` extension, so it must be read back with
        ``compression='xz'``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        stem = self._file_stem()
        gz_path = os.path.join(self.__data_dir_base, f'{stem}.json.gz')
        json_path = os.path.join(self.__data_dir_base, f'{stem}.json')
        url = f'{self._BASE_URL}/{stem}.json.gz'

        # Stream to disk so large archives are not held in memory, and fail
        # loudly instead of saving an HTML error page as a .gz file.
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()
            with open(gz_path, 'wb') as gz_file:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    if chunk:
                        gz_file.write(chunk)

        with gzip.open(gz_path, 'rb') as f_in, open(json_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

        reviews = self._load_reviews(json_path, self._KEYS)
        reviews.to_pickle(self.pickle_path, compression=self._PICKLE_COMPRESSION)






if __name__ == '__main__':
    # Constructing the dataset triggers its download/cache step, so this loop
    # fetches every enabled category in turn.
    for category_name in categories:
        AmazonReveiws(category_name)
    print('test')

KeyboardInterrupt: ignored

In [None]:
import json

# Read the decompressed review file (one JSON object per line). The `with`
# block closes the handle once the lines have been read.
with open('./Patio_Lawn_and_Garden/5-core/reviews_Patio_Lawn_and_Garden_5.json', 'r') as file1:
    lines = file1.readlines()

# Parse each line into a dict; `data` is turned into a DataFrame in the next cell.
data = [json.loads(line) for line in lines]

# Report the record count once instead of printing a counter for every line.
print(len(data))

In [None]:
import pandas as pd

# Build a DataFrame from the list of parsed review dicts. Note that this
# rebinds `data` from a list to a DataFrame.
data = pd.DataFrame(data)

In [None]:
# Preview the first rows of the review frame as a quick sanity check.
data.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A1JZFGZEZVWQPY,B00002N674,"Carter H ""1amazonreviewer@gmail . com""","[4, 4]",Good USA company that stands behind their prod...,4.0,Great Hoses,1308614400,"06 21, 2011"
1,A32JCI4AK2JTTG,B00002N674,"Darryl Bennett ""Fuzzy342""","[0, 0]",This is a high quality 8 ply hose. I have had ...,5.0,Gilmour 10-58050 8-ply Flexogen Hose 5/8-Inch ...,1402272000,"06 9, 2014"
2,A3N0P5AAMP6XD2,B00002N674,H B,"[2, 3]",It's probably one of the best hoses I've ever ...,4.0,Very satisfied!,1336176000,"05 5, 2012"
3,A2QK7UNJ857YG,B00002N674,Jason,"[0, 0]",I probably should have bought something a bit ...,5.0,Very high quality,1373846400,"07 15, 2013"
4,AS0CYBAN6EM06,B00002N674,jimmy,"[1, 1]",I bought three of these 5/8-inch Flexogen hose...,5.0,Good Hoses,1375660800,"08 5, 2013"


In [None]:
# Recursively delete the Electronics/ directory relative to the working
# directory -- the location the dataset class uses for the 'Electronics'
# category when root_dir is '.'.
!rm -r Electronics/