In [1]:
import joblib,re,os
import numpy as np
import pandas as pd

# read names 

put `Amazon_RawData/titles.txt` in dataframe with product id

In [2]:
with open('./Amazon_RawData/titles.txt','r',encoding = "ISO-8859-1") as f:
    lines = f.read().splitlines()

In [3]:
data = []
invalid = []
pattern = re.compile("[^A-Z0-9]+")
for i,line in enumerate(lines):
    if len(line)<11 or line[10]!=' ':
        invalid.append(i)
        continue
    id = line[:10]
    if pattern.search(id):
        invalid.append(i)
        continue
    else:
        data.append({'id':id,'name':line[11:]})
names = pd.DataFrame.from_dict(data)

In [4]:
# make sure id is unique
names.id.is_unique

True

In [5]:
names.describe()

Unnamed: 0,id,name
count,1720286,1720286.0
unique,1720286,1495904.0
top,B000N7Q7R0,
freq,1,1270.0


# read descriptions

In [6]:
with open('./Amazon_RawData/descriptions.txt','r',encoding = "ISO-8859-1") as f:
    lines = f.read().splitlines()

In [7]:
data = []
for i,line in enumerate(lines):
    if not line:
        continue
    if line[:18]=='product/productId:':
        _,id = line.split(' ')
        data.append({'id':id})
    elif line[:20] == 'product/description:':
        _,description = line.split(' ',1)
        data[-1]['description'] = description
    else:
        data[-1]['description'] = data[-1]['description']+line

In [8]:
descriptions = pd.DataFrame.from_dict(data)
descriptions.id.is_unique

True

In [9]:
descriptions.describe()

Unnamed: 0,description,id
count,1495010,1495010
unique,1204096,1495010
top,All products are BRAND NEW and factory sealed....,140433872
freq,6710,1


# read categories

In [10]:
with open('./Amazon_RawData/categories.txt','r',encoding = "ISO-8859-1") as f:
    lines = f.read().splitlines()

In [11]:
data = []
invalid = []
for i,line in enumerate(lines):
    if line[0]!=' ':
        data.append({'id':line,'categories':[]})
    elif lines[2][:2]=='  ':
        cats = [t.strip().lower() for t in line.split(',')]
        data[-1]['categories'] = data[-1]['categories']+ cats
    else:
        invalid.append[i]

In [12]:
categories = pd.DataFrame.from_dict(data)
categories.id.is_unique

True

In [13]:
categories['categories'] = categories['categories'].apply(lambda x:set(x))

In [14]:
all_cat = set([t for cat in categories.categories.to_list() for t in cat])

In [15]:
with open('./AmazonCat-13K_mappings/AmazonCat-13K_label_map.txt','r',encoding = "ISO-8859-1") as f:
    cats = f.read().splitlines()

In [16]:
if len(cats)==len(set(cats)):
    cats = set([cat.strip().lower() for cat in cats])
else:
    print('NO!')

In [17]:
# all categories has samples
cats-all_cat

set()

In [18]:
categories['categories'] = categories['categories'].apply(lambda x:x&cats)

In [19]:
categories.head()

Unnamed: 0,categories,id
0,"{tv, movies & tv, music, classical}",B0027DQHA0
1,"{literature & fiction, short stories, science ...",0756400120
2,"{blues, pop, music, r&b}",B0000012D5
3,"{business & investing, books, business life, m...",B00024YAOQ
4,{books},068413263X


# check id

In [25]:
pattern = re.compile("[^A-Z0-9]+")
for id in [ids for df in [names,descriptions,categories,] for ids in  df.id.to_list()]:
    if pattern.search(id) or len(id)!=10:
        print('invalid id: {}'.format(id))

# extract train/test id
from `AmazonCat-13K_mappings\AmazonCat-13K_test_map.txt`

In [57]:
traindf = pd.read_csv(
    "AmazonCat-13K_mappings/AmazonCat-13K_train_map.txt",
    sep=r'->',
    header=None,
    names=['id','title_mappings'],
)
traindf = traindf.drop(columns = 'title_mappings')
traindf.id.is_unique

  """


True

In [58]:
testdf = pd.read_csv(
    "AmazonCat-13K_mappings/AmazonCat-13K_test_map.txt",
    sep=r'->',
    header=None,
    names=['id','title_mappings'],
)
testdf = testdf.drop(columns = 'title_mappings')
testdf.id.is_unique

  """


True

In [59]:
# no duplicated ids
trainid = set(traindf.id.to_list())
testid = set(testdf.id.to_list())
testid&trainid

set()

In [61]:
# test train id are subsets of df
(testid.union(trainid)).issubset(set(categories.id.to_list()))

# create raw train test set

In [64]:
names = names.set_index('id')
descriptions = descriptions.set_index('id')
categories = categories.set_index('id')
traindf = traindf.set_index('id')
testdf = testdf.set_index('id')

In [65]:
traindf['test/train']='train'
testdf['test/train']='test'

In [71]:
df = pd.concat([traindf,testdf])

In [72]:
df = df.merge(categories,on='id',how='left').merge(names,on='id',how='left').merge(descriptions,on='id',how='left')

In [73]:
# no empty cat 
print(df['categories'].isnull().values.any())
print(df['description'].isnull().values.any())
print(df['name'].isnull().values.any())

False
False
True


# text cleaning

In [74]:
df.head()

Unnamed: 0_level_0,test/train,categories,name,description
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0873386523,train,"{19th century, history, books, state & local, ...",New Englanders on the Ohio Frontier: Migration...,Virginia E. McCormick served on the faculties ...
B00007GUH9,train,"{music, world music}",Le Petit Prince,Deluxe Double Disc Musical Spectacular that Co...
B0002WSQI4,train,"{home & kitchen, specialty appliances, small a...",Nesco FS-120T American Harvest Food Slicer wit...,Heavy duty construction plus innovative design...
B00005NTSR,train,"{pop, music, world music}",Hungarian Music,All products are BRAND NEW and factory sealed....
B000JFHMR0,train,"{tools & equipment, diagnostic & test tools, a...",ESI 585K Deluxe Automotive DMM,The Deluxe Automotive DMM is a professional gr...


## read features

In [75]:
with open('./AmazonCat-13K_mappings/AmazonCat-13K_feature_map.txt','r',encoding = "ISO-8859-1") as f:
    lines = f.read().splitlines()

In [79]:
pattern = re.compile("[A-Z]+")
for line in lines:
    if pattern.search(line):
        print('uppercase detected: '.format(line))

In [87]:
token_pattern= re.compile('(?u)\\b\\w+\\b')
def get_line(x):
    # clean text and tockenize
    tokenized = token_pattern.findall(x['description'].lower())
    # add label
    labels = ['__label__'+cat.lower().replace(" ", "_") for cat in list(x['categories'])]
    line = labels + tokenized
    line = ' '.join(line)
    return line

In [88]:
df['text'] = df.apply(get_line,axis=1)

In [89]:
df.head()

Unnamed: 0_level_0,test/train,categories,name,description,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0873386523,train,"{19th century, history, books, state & local, ...",New Englanders on the Ohio Frontier: Migration...,Virginia E. McCormick served on the faculties ...,__label__19th_century __label__history __label...
B00007GUH9,train,"{music, world music}",Le Petit Prince,Deluxe Double Disc Musical Spectacular that Co...,__label__music __label__world_music deluxe dou...
B0002WSQI4,train,"{home & kitchen, specialty appliances, small a...",Nesco FS-120T American Harvest Food Slicer wit...,Heavy duty construction plus innovative design...,__label__home_&_kitchen __label__specialty_app...
B00005NTSR,train,"{pop, music, world music}",Hungarian Music,All products are BRAND NEW and factory sealed....,__label__pop __label__music __label__world_mus...
B000JFHMR0,train,"{tools & equipment, diagnostic & test tools, a...",ESI 585K Deluxe Automotive DMM,The Deluxe Automotive DMM is a professional gr...,__label__tools_&_equipment __label__diagnostic...


# save for FastText

In [91]:
train_df = df[df['test/train']=='train']
test_df = df[df['test/train']=='test']

In [93]:
OUTDIR='data'
if not os.path.exists(OUTDIR):
    os.mkdir(OUTDIR)
np.savetxt(r'{}/amazon.train.txt'.format(OUTDIR), train_df.text, fmt='%s')
np.savetxt(r'{}/amazon.test.txt'.format(OUTDIR), test_df.text, fmt='%s')