In [0]:
#!pip install fastai

import numpy as np
import pandas as pd
import requests
#import json
import ast
import os

#from fastai.vision import *

## Data file (citation)
Data from Dr. McAuley at UCSD. Citations:

- R. He, J. McAuley. Modeling the visual evolution of fashion trends with one-class collaborative filtering. WWW, 2016
- J. McAuley, C. Targett, J. Shi, A. van den Hengel. Image-based recommendations on styles and substitutes. SIGIR, 2015


In [0]:
# Default split to work with; the data file name is derived from its number
split_num = 1
file_name = 'metadata_split_{}'.format(split_num)

## Important: `local` must be `True` when running on a local machine and `False` when running in the cloud

In [0]:
# Detect the runtime instead of hand-editing this flag. The cloud-only cell
# below imports google.colab, so failing to import that package is taken to
# mean the notebook is running on a local machine.
try:
    import google.colab  # noqa: F401  (imported only to probe availability)
    local = False
except ImportError:
    local = True

### Connect to gs://platform-ai-research/datasets/ProductNet
This is the data bucket - read/write all data files to this location

In [4]:
# Will need to login to access the bucket. The following lines of code do just that
if (not local):
    from google.colab import auth
    auth.authenticate_user()
    
    # List out all the files in the directory (Make sure that we are logged in)
    gs_path = 'platform-ai-research/datasets/ProductNet/'
    !gsutil ls gs://{gs_path}


CommandException: "ls" command does not support "file://" URLs. Did you mean to use a gs:// URL?


In [0]:
# Bucket path (no gs:// scheme) used by the gsutil commands in this notebook
gs_path = 'platform-ai-research/datasets/ProductNet/'

In [7]:
# List the bucket contents; gs_path is interpolated into the shell command
!gsutil ls gs://{gs_path}

gs://platform-ai-research/datasets/ProductNet/metadata_0to500k.json
gs://platform-ai-research/datasets/ProductNet/metadata_split_1
gs://platform-ai-research/datasets/ProductNet/metadata_split_10
gs://platform-ai-research/datasets/ProductNet/metadata_split_11
gs://platform-ai-research/datasets/ProductNet/metadata_split_12
gs://platform-ai-research/datasets/ProductNet/metadata_split_13
gs://platform-ai-research/datasets/ProductNet/metadata_split_14
gs://platform-ai-research/datasets/ProductNet/metadata_split_15
gs://platform-ai-research/datasets/ProductNet/metadata_split_16
gs://platform-ai-research/datasets/ProductNet/metadata_split_17
gs://platform-ai-research/datasets/ProductNet/metadata_split_18
gs://platform-ai-research/datasets/ProductNet/metadata_split_19
gs://platform-ai-research/datasets/ProductNet/metadata_split_2
gs://platform-ai-research/datasets/ProductNet/metadata_split_3
gs://platform-ai-research/datasets/ProductNet/metadata_split_4
gs://platform-ai-research/datasets/Produ

### Define a few functions
These are used to clean up the data file

In [0]:
# The categories column is a nested list. We need to flatten it
def custom_flat(entry):
    """Flatten one level of nesting in a categories entry.

    Args:
        entry: A sequence whose items are either category names or lists
            (or tuples) of category names. May be empty.

    Returns:
        list: Every category name in order of appearance; ``[]`` when the
        entry is empty.
    """
    flat = []
    for item in entry:
        # Decide per item rather than from entry[0] alone: an empty entry no
        # longer raises IndexError, and a bare name that follows a nested
        # list is kept whole instead of being split into characters.
        if isinstance(item, (list, tuple)):
            flat.extend(item)
        else:
            flat.append(item)
    return flat

def create_labels(labels):
    """Join the given label strings into a single ', '-separated string."""
    return ', '.join(labels)

def remove_category(df, category, regex=False):
    """Return the rows of ``df`` whose ``categories`` do not contain ``category``.

    Prints the product count and unique-category count before and after the
    filter. The frame passed in is not modified.

    Args:
        df: DataFrame with a string ``categories`` column.
        category: Text to search for inside each ``categories`` value.
        regex: When False (the default) ``category`` is matched literally, so
            names containing regex metacharacters such as ``(``, ``+`` or
            ``.`` match themselves. Pass True to treat ``category`` as a
            regular expression.

    Returns:
        DataFrame: The rows whose ``categories`` value does not contain
        ``category``. Rows with a missing ``categories`` value are kept.
    """
    print('Current number of Products in dataset =', len(df))
    print('Current number of Unique categories =', len(df.categories.unique()))
    print('***************Removing Category:', category)
    # na=False keeps the mask strictly boolean so that `~` is always valid
    is_match = df.categories.str.contains(category, regex=regex, na=False)
    df = df[~is_match]
    print('Remaining number of Products in dataset =', len(df))
    print('Remaining number of Unique categories =', len(df.categories.unique()))
    return df

### Read in the data

In [18]:
# Read in the data from multiple files, and build the df
main_df = pd.DataFrame()                       # df built by concatenating data from multiple files
df = pd.DataFrame()

for split_num in range(1,4,1):
    data = []
    file_name = 'metadata_split_' + str(split_num)
    print('Reading file................', file_name)
    
    # copy datafile from gs bucket if executing in cloud and file not copied already
    exists = os.path.isfile(file_name)
    if (not exists):
        if (not local):
            # Copy the datafile to the Colab local dir
            remote_file = gs_path + file_name
            !gsutil cp gs://{remote_file} {file_name}
        else: 
            print('File Does Not Exist')
            break
        
    for line in open(file_name, 'r'):          # file_name is defined above
        #data.append(json.loads(line))         # this works for JSON, but our datafiles are not strictly json
        data.append(ast.literal_eval(line))    # JSON but single quotes instead of double
    
    # Convert to dataframe
    df = pd.DataFrame.from_dict(data)

    # Remove columns that we do not need
    df.drop(['brand', 'price', 'related', 'salesRank'], axis=1, inplace=True)

    # Fill in the NaN with empty string
    df.fillna(value='', axis='columns', inplace=True)

    # Flatten the categories list of list so it becomes easier to search
    df.categories = df.categories.apply(np.ravel)
    df.categories = df.categories.apply(custom_flat)
    df.categories = df.categories.apply(create_labels)

    # Remove all Books from the dataset
    df = df[~df.categories.str.contains('Books')]

    # Print out a few interesting details
    print('Number of Products in dataset =', len(df))
    print('Number of Unique categories =', len(df.categories.unique()))
    #print('Unique categories are: \n', df.categories.unique())
    
    main_df = main_df.append(df, ignore_index = True)
    df = pd.DataFrame()

print('\nCombined df:')
print('Number of Products in dataset =', len(main_df))
print('Number of Unique categories =', len(main_df.categories.unique()))

Reading file................ metadata_split_1
Number of Products in dataset = 1352
Number of Unique categories = 212
Reading file................ metadata_split_2
Copying gs://platform-ai-research/datasets/ProductNet/metadata_split_2...
\ [1 files][478.3 MiB/478.3 MiB]                                                
Operation completed over 1 objects/478.3 MiB.                                    
Number of Products in dataset = 6524
Number of Unique categories = 338
Reading file................ metadata_split_3
Copying gs://platform-ai-research/datasets/ProductNet/metadata_split_3...
- [1 files][489.5 MiB/489.5 MiB]                                                
Operation completed over 1 objects/489.5 MiB.                                    
Number of Products in dataset = 5933
Number of Unique categories = 524

Combined df:
Number of Products in dataset = 13809
Number of Unique categories = 814


In [12]:
# Column/dtype summary of the combined frame.
# NOTE(review): the saved output reports 1352 rows while the loading cell
# reports 13809 — this cell (In [12]) appears to predate the latest load
# (In [18]); re-run it after loading.
main_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1352 entries, 0 to 1351
Data columns (total 5 columns):
asin           1352 non-null object
categories     1352 non-null object
description    1352 non-null object
imUrl          1352 non-null object
title          1352 non-null object
dtypes: object(5)
memory usage: 52.9+ KB


In [19]:
# Number of distinct category strings in the combined frame
len(main_df.categories.unique())

814

In [20]:
# Start filtering from the combined frame: drop rows whose categories contain 'Software'
df = remove_category(main_df, 'Software')

Current number of Products in dataset = 13809
Current number of Unique categories = 814
***************Removing Category: Software
Remaining number of Products in dataset = 13573
Remaining number of Unique categories = 753


In [21]:
# Continue on the already-filtered frame: drop rows whose categories contain 'CDs & Vinyl'
df = remove_category(df, 'CDs & Vinyl')

Current number of Products in dataset = 13573
Current number of Unique categories = 753
***************Removing Category: CDs & Vinyl
Remaining number of Products in dataset = 12907
Remaining number of Unique categories = 524


In [22]:
# Continue on the already-filtered frame: drop rows whose categories contain 'Video Games'
df = remove_category(df, 'Video Games')

Current number of Products in dataset = 12907
Current number of Unique categories = 524
***************Removing Category: Video Games
Remaining number of Products in dataset = 12871
Remaining number of Unique categories = 519


In [23]:
# Continue on the already-filtered frame: drop rows whose categories contain 'Movies'
# (substring match, so any category string that includes 'Movies' is removed)
df = remove_category(df, 'Movies')

Current number of Products in dataset = 12871
Current number of Unique categories = 519
***************Removing Category: Movies
Remaining number of Products in dataset = 4900
Remaining number of Unique categories = 487
