In [0]:
#!pip install fastai

import numpy as np
import pandas as pd
import requests
#import json
import ast
import os

#from fastai.vision import *

## Data file (citation)
Data from Dr. McAuley at UCSD. Citations:

- R. He, J. McAuley. *Modeling the visual evolution of fashion trends with one-class collaborative filtering*. WWW, 2016.
- J. McAuley, C. Targett, J. Shi, A. van den Hengel. *Image-based recommendations on styles and substitutes*. SIGIR, 2015.


In [0]:
# Starting point for the data file: the split index and the file name derived from it
split_num = 1
file_name = 'metadata_split_{}'.format(split_num)

## Important: Set variable 'local' to True if running on local machine, False if in cloud

In [0]:
import sys

# Detect the runtime automatically instead of editing this flag by hand.
# A Colab kernel has the `google.colab` package loaded in sys.modules, so its
# absence means the notebook is running on a local machine.
# Assign True/False explicitly here if the detection is ever wrong.
local = 'google.colab' not in sys.modules

### Connect to gs://platform-ai-research/datasets/ProductNet
This is the data bucket - read/write all data files to this location

In [4]:
# Will need to login to access the bucket. The following lines of code do just that
if (not local):
    from google.colab import auth
    auth.authenticate_user()
    
    # List out all the files in the directory (Make sure that we are logged in)
    gs_path = 'platform-ai-research/datasets/ProductNet/'
    !gsutil ls gs://{gs_path}


CommandException: "ls" command does not support "file://" URLs. Did you mean to use a gs:// URL?


In [0]:
# Bucket-relative path of the data directory (no 'gs://' scheme; callers prepend it)
gs_path = 'platform-ai-research/datasets/ProductNet/'

In [7]:
# List the bucket directory; IPython substitutes {gs_path} before running the shell command
!gsutil ls gs://{gs_path}

gs://platform-ai-research/datasets/ProductNet/metadata_0to500k.json
gs://platform-ai-research/datasets/ProductNet/metadata_split_1
gs://platform-ai-research/datasets/ProductNet/metadata_split_10
gs://platform-ai-research/datasets/ProductNet/metadata_split_11
gs://platform-ai-research/datasets/ProductNet/metadata_split_12
gs://platform-ai-research/datasets/ProductNet/metadata_split_13
gs://platform-ai-research/datasets/ProductNet/metadata_split_14
gs://platform-ai-research/datasets/ProductNet/metadata_split_15
gs://platform-ai-research/datasets/ProductNet/metadata_split_16
gs://platform-ai-research/datasets/ProductNet/metadata_split_17
gs://platform-ai-research/datasets/ProductNet/metadata_split_18
gs://platform-ai-research/datasets/ProductNet/metadata_split_19
gs://platform-ai-research/datasets/ProductNet/metadata_split_2
gs://platform-ai-research/datasets/ProductNet/metadata_split_3
gs://platform-ai-research/datasets/ProductNet/metadata_split_4
gs://platform-ai-research/datasets/Produ

### Define a few functions
These are used to clean up the data file

In [0]:
# The categories column is a nested list. We need to flatten it
def custom_flat(entry):
    """Flatten one level of nesting in a categories entry.

    Args:
        entry: an iterable (list, array, ...) whose items are either plain
            values or lists of values.

    Returns:
        A new list in which every list item has been expanded in place and
        every non-list item is kept as-is. An empty entry yields ``[]``.
    """
    flat = []
    for item in entry:
        # Decide per item rather than from the first item only, so an empty
        # entry or a mix of lists and plain values is handled correctly.
        if isinstance(item, list):
            flat.extend(item)
        else:
            flat.append(item)
    return flat

def create_labels(labels):
    """Return the given strings joined into one string, separated by ', '."""
    return ', '.join(labels)

def remove_category(df, category, regex=False):
    """Drop every row whose ``categories`` string contains ``category``.

    Prints the product count and the number of unique category strings
    before and after the removal.

    Args:
        df: DataFrame with a string column named ``categories``.
        category: text to look for in ``categories``.
        regex: when False (default) ``category`` is matched literally, so
            names containing regex metacharacters (``+``, ``.``, ``(``, ...)
            are safe. Pass True to treat ``category`` as a regular expression.

    Returns:
        A filtered DataFrame; the input frame is not modified.
    """
    print('Current number of Products in dataset =', len(df))
    print('Current number of Unique categories =', len(df.categories.unique()))
    print('***************Removing Category:', category)
    # na=False keeps rows with a missing categories value and guarantees a
    # boolean mask, which the ~ negation below requires.
    matches = df.categories.str.contains(category, regex=regex, na=False)
    df = df[~matches]
    print('Remaining number of Products in dataset =', len(df))
    print('Remaining number of Unique categories =', len(df.categories.unique()))
    return df

### Read in the data

In [57]:
# Read in the data from multiple files, and build the df
main_df = pd.DataFrame()                       # df built by concatenating data from multiple files
df = pd.DataFrame()

for split_num in range(1,7,1):
    data = []
    file_name = 'metadata_split_' + str(split_num)
    print('Reading file................', file_name)
    
    # copy datafile from gs bucket if executing in cloud and file not copied already
    exists = os.path.isfile(file_name)
    if (not exists):
        if (not local):
            # Copy the datafile to the Colab local dir
            remote_file = gs_path + file_name
            !gsutil cp gs://{remote_file} {file_name}
        else: 
            print('File Does Not Exist')
            break
        
    for line in open(file_name, 'r'):          # file_name is defined above
        #data.append(json.loads(line))         # this works for JSON, but our datafiles are not strictly json
        data.append(ast.literal_eval(line))    # JSON but single quotes instead of double
    
    # Convert to dataframe
    df = pd.DataFrame.from_dict(data)

    # Remove columns that we do not need
    df.drop(['brand', 'price', 'related', 'salesRank'], axis=1, inplace=True)

    # Fill in the NaN with empty string
    df.fillna(value='', axis='columns', inplace=True)

    # Flatten the categories list of list so it becomes easier to search
    df.categories = df.categories.apply(np.ravel)
    df.categories = df.categories.apply(custom_flat)
    df.categories = df.categories.apply(create_labels)

    # Remove some products from the dataset
    df = df[~df.categories.str.contains('Books')]
    df = df[~df.categories.str.contains('CDs & Vinyl')]
    df = df[~df.categories.str.contains('Software')]
    df = df[~df.categories.str.contains('Amazon')]
    df = df[~df.categories.str.contains('Movies')]
    df = df[~df.categories.str.contains('Video Games')]

    # Print out a few interesting details
    print('Number of Products in dataset =', len(df))
    print('Number of Unique categories =', len(df.categories.unique()))
    #print('Unique categories are: \n', df.categories.unique())
    
    main_df = main_df.append(df, ignore_index = True)
    df = pd.DataFrame()

print('\nCombined df:')
print('Number of Products in dataset =', len(main_df))
print('Number of Unique categories =', len(main_df.categories.unique()))

Reading file................ metadata_split_1
Number of Products in dataset = 810
Number of Unique categories = 153
Reading file................ metadata_split_2
Number of Products in dataset = 1484
Number of Unique categories = 186
Reading file................ metadata_split_3
Number of Products in dataset = 2606
Number of Unique categories = 326
Reading file................ metadata_split_4
Number of Products in dataset = 21543
Number of Unique categories = 2295
Reading file................ metadata_split_5
Number of Products in dataset = 265682
Number of Unique categories = 14267
Reading file................ metadata_split_6
Number of Products in dataset = 375271
Number of Unique categories = 19895

Combined df:
Number of Products in dataset = 667396
Number of Unique categories = 23624


In [65]:
# Use the following to remove further categories from the dataset.
# Rows whose `categories` string contains 'Kindle' are dropped and main_df is rebound to the result.
main_df = remove_category(main_df, 'Kindle')

Current number of Products in dataset = 667396
Current number of Unique categories = 23624
***************Removing Category: Kindle
Remaining number of Products in dataset = 667271
Remaining number of Unique categories = 23562


In [0]:
# Create a label column: the text before the first comma of `categories`.
# `n` is passed by keyword because pandas 2.x no longer accepts it positionally.
# NOTE(review): a category name that itself contains a comma (e.g. 'A, B & C')
# is cut at that comma, so its label becomes just 'A'.
main_df['label'] = main_df.categories.str.split(',', n=1).str[0]

# Fixing some issues: one label contains the undecoded HTML entity '&#233;'.
# The result is assigned back to the column instead of using inplace=True on
# main_df.label, which would modify only a temporary under pandas Copy-on-Write.
main_df['label'] = main_df['label'].replace(to_replace='Furniture & D&#233;cor', value='Furniture & Decor')

In [68]:
# Display the distinct values of the label column
main_df.label.unique()

array(['Clothing', 'Sports & Outdoors', 'Toys & Games', '',
       'Musical Instruments', 'Tools & Home Improvement',
       'Home & Kitchen', 'Health & Personal Care',
       'Cell Phones & Accessories', 'Office Products', 'Electronics',
       'Office & School Supplies', 'Baby', 'Beauty', 'Automotive', 'Arts',
       'Computers', 'All Electronics', 'Pet Supplies',
       'Grocery & Gourmet Food', 'Kitchen & Dining',
       'Industrial & Scientific', 'Appliances', 'All Beauty',
       'Camera & Photo', 'Patio', 'Home Improvement', 'Baby Products',
       'Digital Music', 'MP3 Players & Accessories', 'Car Electronics',
       'Collectibles & Fine Art', 'Purchase Circles', 'GPS & Navigation',
       'Luxury Beauty', 'Magazine Subscriptions', 'Furniture & Decor',
       'Gift Cards Store', 'Sports Collectibles'], dtype=object)

In [69]:
# Write df as a csv file
file_name = 'df_from_splits_1thru6'
main_df.to_csv(file_name)

# Copy file to bucket if not running locally
remote_file = gs_path + file_name
if (not local):
    !gsutil cp {file_name} gs://{remote_file}

Copying file://df_from_splits_1thru6 [Content-Type=application/octet-stream]...
/ [0 files][    0.0 B/402.4 MiB]                                                ==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

|
Operation completed over 1 objects/402.4 MiB.                                    


In [70]:
# List the bucket directory again to check that the uploaded file is there
!gsutil ls gs://{gs_path}

gs://platform-ai-research/datasets/ProductNet/df_from_splits_1thru6
gs://platform-ai-research/datasets/ProductNet/metadata_0to500k.json
gs://platform-ai-research/datasets/ProductNet/metadata_split_1
gs://platform-ai-research/datasets/ProductNet/metadata_split_10
gs://platform-ai-research/datasets/ProductNet/metadata_split_10_to_13
gs://platform-ai-research/datasets/ProductNet/metadata_split_11
gs://platform-ai-research/datasets/ProductNet/metadata_split_12
gs://platform-ai-research/datasets/ProductNet/metadata_split_13
gs://platform-ai-research/datasets/ProductNet/metadata_split_14
gs://platform-ai-research/datasets/ProductNet/metadata_split_14_to_19
gs://platform-ai-research/datasets/ProductNet/metadata_split_15
gs://platform-ai-research/datasets/ProductNet/metadata_split_16
gs://platform-ai-research/datasets/ProductNet/metadata_split_17
gs://platform-ai-research/datasets/ProductNet/metadata_split_18
gs://platform-ai-research/datasets/ProductNet/metadata_split_19
gs://platform-ai-rese

In [71]:
# Summarise the combined frame: columns, non-null counts, dtypes and memory usage
main_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 667271 entries, 0 to 667395
Data columns (total 6 columns):
asin           667271 non-null object
categories     667271 non-null object
description    667271 non-null object
imUrl          667271 non-null object
title          667271 non-null object
label          667271 non-null object
dtypes: object(6)
memory usage: 35.6+ MB


In [73]:
# Number of distinct values in the label column
len(main_df.label.unique())

39