In [88]:
import pandas as pd
import json
from nltk.stem.wordnet import WordNetLemmatizer
from random import random, seed
from collections import Counter

In [77]:
# Seed Python's `random` module so the random() draws used below repeat across runs.
seed(5)

In [78]:
# Probabilities steering the random augmentation of category names below.
# Chance that a name is lower-cased and lemmatized (1.0 means always, because random() < 1.0).
stem_prob = 1.0
# Chance that an all-lowercase name gets its first letter capitalised.
capitalise_prob = 0.3
# Chance that a name is converted to upper case.
upper_prob = 0.2

### Flipkart dataset

In [79]:
# Flipkart product CSV; the path is relative to the current working directory.
path = 'flipkart/flipkart_com-ecommerce_sample.csv'

In [80]:
# NOTE: despite the variable name, this is a WordNet lemmatizer, not a stemmer.
stemmer = WordNetLemmatizer()

In [81]:
# Load the Flipkart CSV into a DataFrame.
df = pd.read_csv(path)

In [82]:
# Accumulator for the augmented category names. The loop cell below extends this
# list, so re-run this cell first when re-running the loop; otherwise every name
# is appended a second time.
products = []

In [83]:
# Split each distinct category tree on '>>' and keep every level except the last.
# Apply the lemmatizer and case variations probabilistically.
# NOTE: the three passes are deliberately kept separate so that random() is
# consumed in a fixed order, which keeps seeded runs repeatable.
for raw_tree in df['product_category_tree'].unique().tolist():
    levels = json.loads(raw_tree)[0].split('>>')
    names = [level.strip() for level in levels[:-1]]

    # Pass 1: lemmatize the lower-cased name with probability stem_prob.
    lemmatized = []
    for name in names:
        if random() < stem_prob:
            lemmatized.append(stemmer.lemmatize(name.lower()))
        else:
            lemmatized.append(name)

    # Pass 2: capitalise all-lowercase names with probability capitalise_prob.
    # random() is only drawn when the name is all lowercase (short-circuit `and`).
    capitalised = [
        name.capitalize() if name.islower() and random() < capitalise_prob else name
        for name in lemmatized
    ]

    # Pass 3: upper-case with probability upper_prob and collect the result.
    for name in capitalised:
        products.append(name.upper() if random() < upper_prob else name)

In [73]:
# Spot check: see what the lemmatizer returns for a plural noun.
stemmer.lemmatize('shorts')

'short'

In [90]:
# Tally how often each generated name occurs.
# NOTE(review): `freq` is not read by any later cell visible here.
freq = Counter(products)

In [93]:
# Persist the distinct product names, one per line.
# The names are sorted because iterating a set of strings yields an order that
# changes between interpreter runs (hash randomization); without sorting the
# file would differ on every run even though the random seed is fixed.
# UTF-8 is requested explicitly so that names containing characters outside
# the platform's default code page cannot make the write fail.
with open('products.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(sorted(set(products))) + '\n')

### UN CPC (Central Product Classification) dataset

In [None]:
from io import StringIO

In [109]:
# Read the CPC structure file as text, silently dropping bytes that cannot be
# decoded, then wrap the text in an in-memory buffer for pandas to parse.
with open('flipkart/CPC_Ver_2_1_english_structure.txt', errors='ignore') as cpc_file:
    raw_text = cpc_file.read()
fo = StringIO(raw_text)

In [110]:
# Parse the buffered text as CSV.
# NOTE: this rebinds `df`, replacing the Flipkart frame loaded earlier.
df = pd.read_csv(fo )

In [122]:
# Show the rows whose CPC21code is at least 1000 (floor division by 1000 is positive).
df[df['CPC21code']//1000>0]

Unnamed: 0,CPC21code,CPC21title
4,1111,"Wheat, seed"
5,1112,"Wheat, other"
7,1121,"Maize (corn), seed"
8,1122,"Maize (corn), other"
10,1131,"Rice, seed"
11,1132,"Rice paddy, other (not husked)"
13,1141,"Sorghum, seed"
14,1142,"Sorghum, other"
16,1151,"Barley, seed"
17,1152,"Barley, other"


In [137]:
# Distinct CPC titles as a plain Python list.
items = df['CPC21title'].unique().tolist()

In [151]:
# Prune product specifications

# Titles with this many whitespace-separated tokens or more are dropped.
token_threshold = 5

# For the remaining short titles keep only the text before the first comma
# (e.g. 'Wheat, seed' -> 'Wheat'); titles without a comma are kept whole.
# Duplicates produced by the truncation are not removed.
prods = [
    item.partition(',')[0]
    for item in items
    if len(item.split()) < token_threshold
]
        

In [153]:
# Display the pruned names for inspection.
prods

['Cereals',
 'Wheat',
 'Wheat',
 'Wheat',
 'Maize (corn)',
 'Maize (corn)',
 'Maize (corn)',
 'Rice',
 'Rice',
 'Sorghum',
 'Sorghum',
 'Sorghum',
 'Barley',
 'Barley',
 'Barley',
 'Rye',
 'Rye',
 'Rye',
 'Oats',
 'Oats',
 'Oats',
 'Millet',
 'Millet',
 'Millet',
 'Other cereals',
 'Triticale',
 'Buckwheat',
 'Fonio',
 'Quinoa',
 'Canary seed',
 'Other cereals n.e.c.',
 'Vegetables',
 'Leafy or stem vegetables',
 'Asparagus',
 'Cabbages',
 'Cauliflowers and broccoli',
 'Lettuce and chicory',
 'Spinach',
 'Artichokes',
 'Melons',
 'Watermelons',
 'Cantaloupes and other melons',
 'Fruit-bearing vegetables',
 'Cucumbers and gherkins',
 'Eggplants (aubergines)',
 'Tomatoes',
 'Pumpkins',
 'Other fruit-bearing vegetables',
 'Green leguminous vegetables',
 'Beans',
 'Peas',
 'Other green leguminous vegetables',
 'Carrots and turnips',
 'Green garlic',
 'Onions',
 'Mushrooms and truffles',
 'Vegetables',
 'Fruits and nuts',
 'Tropical and subtropical fruits',
 'Avocados',
 'Bananas',
 'Planta

In [154]:
# Number of names kept; repeated names are counted each time they occur.
len(prods)

1058

In [156]:
# Write every pruned name on its own line (repeats included). print() appends
# the single trailing newline after the newline-joined text.
with open('un_products_list.txt', 'w') as out_file:
    print('\n'.join(prods), file=out_file)