In [1]:
import os
import argparse
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import csv

# Useful if you want to perform stemming.
import nltk

In [2]:
from nltk.stem.snowball import SnowballStemmer

In [3]:
# English Snowball stemmer, created once and reused by the query-normalisation cell below.
snowball = SnowballStemmer("english")

In [5]:
# Category taxonomy XML; parsed below to map each category id to its parent id.
categories_file_name = r'/workspace/datasets/product_data/categories/categories_0001_abcat0010000_to_pcmcat99300050000.xml'

# CSV of queries; only its 'category' and 'query' columns are used below.
queries_file_name = r'/workspace/datasets/train.csv'
# Destination of the "__label__<category> <query>" lines written by the last cell.
output_file_name = r'/workspace/datasets/labeled_query_data.txt'
# Categories whose total query count is below this threshold are replaced by their parent.
min_queries = 1000
# Id of the taxonomy root; it is recorded below as its own parent.
root_category_id = 'cat00000'

In [6]:
tree = ET.parse(categories_file_name)
root = tree.getroot()

# Walk the category XML and record, for every category, the id of its direct parent.
# Each entry carries a <path> listing its ancestors in order; the last path element is
# the category itself and the one before it is its parent.
# (The previous version also bound an unused local named `id`, shadowing the builtin.)
categories = []
parents = []
for child in root:
    cat_path_ids = [cat.find('id').text for cat in child.find('path')]
    leaf_id = cat_path_ids[-1]
    # The root is skipped here and appended once after the loop.
    if leaf_id != root_category_id:
        categories.append(leaf_id)
        parents.append(cat_path_ids[-2])

# The root is its own parent, so rolling up past the top of the tree is a no-op.
categories.append(root_category_id)
parents.append(root_category_id)

parents_df = pd.DataFrame({'category': categories, 'parent': parents})

In [7]:
# Read only the two columns this notebook uses; `usecols` keeps the remaining CSV
# columns from being loaded into memory. The trailing selection pins the column order.
df = pd.read_csv(queries_file_name, usecols=['category', 'query'])[['category', 'query']]

In [8]:
# Keep only queries whose category exists in the taxonomy. `.copy()` makes the result an
# independent frame, so the column assignments in later cells do not operate on a slice
# of the original frame (which is what triggers pandas' SettingWithCopyWarning).
df = df[df['category'].isin(categories)].copy()

In [9]:
# Number of query rows currently in `df`.
len(df)

1854998

In [10]:
def stem_query(query):
    """Stem every whitespace-separated token of `query` and re-join with single spaces.

    `SnowballStemmer.stem` works on ONE word: given a whole multi-word query it treats
    the string as a single token, so only the end of the string is stemmed. Splitting
    first stems each word. Splitting and re-joining also trims the query and collapses
    any run of whitespace, including whitespace introduced by punctuation removal.
    Non-string values (e.g. NaN for a missing query) are returned unchanged.
    """
    if not isinstance(query, str):
        return query
    return ' '.join(snowball.stem(token) for token in query.split())


# Lower-case, turn every non-alphanumeric character into a space, then stem per token.
df['query'] = df['query'].str.lower()
df['query'] = df['query'].str.replace(r"[^a-zA-Z0-9\s]", " ", regex=True)
# Series.map avoids building a row object per record, unlike DataFrame.apply(axis=1).
df['query'] = df['query'].map(stem_query)
# Every raw row counts as one query occurrence; later cells sum this column.
df['count'] = 1

In [11]:
# Parent lookup table: indexed by category id (sorted), with a single 'parent' column.
category_lookup = parents_df.set_index('category').sort_index()

In [39]:
def new_cats_for(cats_with_lowq):
    """Return a row mapper that rolls low-volume categories up to their parent.

    `cats_with_lowq` is the collection of category ids to replace. The returned
    callable takes a row (anything indexable by 'category'): if the row's category is
    in `cats_with_lowq` it returns that category's 'parent' from the module-level
    `category_lookup`; otherwise it returns the category unchanged.
    """
    def find_parent(row):
        current = row['category']
        # Categories not flagged as low-volume pass through untouched.
        if current not in cats_with_lowq:
            return current
        return category_lookup.loc[current]['parent']
    return find_parent

In [45]:
# Report the current number of rows and of distinct categories in `df`.
print('initial total rows: ', len(df))
print('initial unique cats: ', len(df['category'].unique()))

initial total rows:  1854998
initial unique cats:  605


In [17]:
# Sum of the 'count' column per category, as a one-column frame indexed by category.
df_gbc = df.groupby('category')[['count']].sum()

In [18]:
# Ids of the categories whose summed count is below `min_queries`.
is_low_volume = df_gbc['count'] < min_queries
cats_with_lowq = df_gbc.loc[is_low_volume].index

In [40]:
# Row by row: low-volume categories are replaced by their parent, the rest are kept.
rollup_row = new_cats_for(cats_with_lowq)
new_categories = df.apply(rollup_row, axis=1)

In [42]:
# Show only the rows where the current category ('self') differs from the rolled-up one ('other').
df['category'].compare(new_categories)

Unnamed: 0,self,other
10,pcmcat138100050040,abcat0515012
16,abcat0410020,pcmcat156200050014
23,abcat0515022,abcat0515020
24,abcat0515023,abcat0515020
31,pcmcat166600050000,abcat0107015
...,...,...
1865196,pcmcat226200050026,pcmcat226200050017
1865198,pcmcat191200050015,abcat0811002
1865201,pcmcat220700050010,abcat0103000
1865217,abcat0507011,abcat0507000


In [43]:
# Number of rows in `df`.
len(df)

1854998

In [44]:
# Overwrite the category column with the rolled-up categories.
df['category'] = new_categories

In [41]:
# Display the rolled-up category series.
new_categories

0                abcat0101001
1                abcat0101001
2          pcmcat193100050014
3                abcat0101001
4                abcat0101005
                  ...        
1865264    pcmcat247400050000
1865265    pcmcat218000050000
1865266    pcmcat248500050020
1865267    pcmcat209000050008
1865268    pcmcat182300050008
Length: 1854998, dtype: object

In [None]:
# `low_counts` is not defined in any visible cell, and `Index.sort` was referenced
# without being called (pandas indexes cannot be sorted in place anyway).
# `cats_with_lowq` already holds the low-volume category ids, so sort that instead.
si = cats_with_lowq.sort_values()

In [None]:
# Membership must be tested against the index of low-volume category ids itself; the
# previous expression tested against an uncalled method of an undefined `low_counts`.
'abcat0100000' in cats_with_lowq

In [None]:
# Rows whose category is low-volume. `low_counts` is not defined in any visible cell,
# so the ids come from `cats_with_lowq`. `.copy()` yields an independent frame because
# a later cell assigns into `df_q`.
df_q = df[df['category'].isin(cats_with_lowq)].copy()

In [None]:
# Same roll-up as the inline lambda it replaces, but built through `new_cats_for` and
# keyed on `cats_with_lowq` (`low_counts` is not defined in any visible cell).
new_cats = df.apply(new_cats_for(cats_with_lowq), axis=1)

In [None]:
# `new_cats` covers every row of `df`, while `df_q` is a filtered subset; restrict
# `new_cats` to the rows of `df_q` first, otherwise the result is False purely because
# the two indexes differ.
df_q['category'].equals(new_cats.loc[df_q.index])

In [None]:
# Series.compare requires identically labelled operands, so align `new_cats` to the
# rows of the filtered `df_q` before comparing.
df_q['category'].compare(new_cats.loc[df_q.index])

In [None]:
# Length of the rolled-up series.
len(new_cats)

In [None]:
# Number of rows in `df`, for comparison with the length of `new_cats`.
len(df)

In [None]:
# Overwrite the category column with the values in `new_cats`.
df['category'] = new_cats

In [None]:
# NOTE(review): `new_cats` is computed from all of `df`, whereas `df_q` is a filtered
# subset; pandas aligns this assignment on the index — confirm that is intended.
df_q['category'] = new_cats

In [None]:
# Display the rows ordered by descending 'count'; the sorted frame is not stored.
df.sort_values('count', ascending=False)

In [None]:
# Report the current number of rows and of distinct categories in `df`.
print('total rows: ', len(df))
print('unique cats: ', len(df['category'].unique()))

In [None]:
# `find_parent` only exists as a closure inside `new_cats_for`, so referencing it here
# raises NameError. Build the row mapper through the factory, using per-category
# totals recomputed from the current `df`.
queries_per_category = df.groupby('category')['count'].sum()
low_volume_ids = queries_per_category[queries_per_category < min_queries].index
new_categories = df.apply(new_cats_for(low_volume_ids), axis=1)

In [None]:
# Show only the rows whose category would change in this roll-up pass.
df['category'].compare(new_categories)

In [None]:
# Overwrite the category column with the rolled-up categories.
df['category'] = new_categories

In [None]:
# Peek at the first rows of `df`.
df.head()

### 2nd iteration

In [None]:
# Collapse duplicate (category, query) pairs into one row each, summing their counts.
df = df.groupby(['category', 'query'])['count'].sum().reset_index()

In [None]:
# Display the rows ordered by descending 'count'; the sorted frame is not stored.
df.sort_values('count', ascending=False)

In [None]:
# Report the current number of rows and of distinct categories in `df`.
print('total rows: ', len(df))
print('unique cats: ', len(df['category'].unique()))

In [None]:
# `find_parent` only exists as a closure inside `new_cats_for`, so referencing it here
# raises NameError. Build the row mapper through the factory, using per-category
# totals recomputed from the current (aggregated) `df`.
queries_per_category = df.groupby('category')['count'].sum()
low_volume_ids = queries_per_category[queries_per_category < min_queries].index
new_categories = df.apply(new_cats_for(low_volume_ids), axis=1)

In [None]:
# Show only the rows whose category would change in this roll-up pass.
df['category'].compare(new_categories)

In [None]:
# Overwrite the category column with the rolled-up categories.
df['category'] = new_categories

In [None]:
# Peek at the first rows of `df`.
df.head()

In [None]:

print('initial total rows: ', len(df))
print('initial unique cats: ', len(df['category'].unique()))

# Repeatedly merge duplicate (category, query) rows and roll categories that are still
# below `min_queries` up into their parents, until a pass changes no category.
complete = False
while not complete:
    df = df.groupby(['category', 'query'], as_index=False)['count'].sum()
    print('total rows: ', len(df))
    print('unique cats: ', len(df['category'].unique()))
    # The low-volume set has to be recomputed on every pass: rolling children up
    # changes the totals of their parents.
    queries_per_category = df.groupby('category')['count'].sum()
    cats_with_lowq = queries_per_category[queries_per_category < min_queries].index
    # `find_parent` only exists as a closure inside `new_cats_for`; referencing it
    # directly raised NameError, so build the row mapper through the factory.
    new_categories = df.apply(new_cats_for(cats_with_lowq), axis=1)
    # The root is recorded as its own parent, so a pass that only "moves" the root
    # leaves the column unchanged and ends the loop.
    complete = df['category'].equals(new_categories)
    df['category'] = new_categories

# Keep only rows whose category is in the taxonomy. Filtering first and taking a copy
# means the derived columns below are added to an independent frame rather than to a
# slice of the previous one.
df = df[df['category'].isin(categories)].copy()

# Create labels in fastText format.
df['label'] = '__label__' + df['category']

# Output labeled query data as one "<label> <query>" line per row. The '|' separator is
# never emitted because only a single column is written; QUOTE_NONE keeps the line unquoted.
df['output'] = df['label'] + ' ' + df['query']
df[['output']].to_csv(output_file_name, header=False, sep='|', escapechar='\\', quoting=csv.QUOTE_NONE, index=False)
