In [286]:
import pandas as pd
import numpy as np
data = pd.read_csv("training_data_DS_Specialist.csv")
data["raw_labels"] = data["labels"].apply(lambda x: x)
data["labels"] = data["labels"].apply(lambda x: eval(x))

In [287]:
all_categories = data["labels"].apply(lambda x: '_'.join(x))
max_heirarchy = np.max(data["labels"].apply(lambda x: len(x)).values)

In [289]:
domains = [ d for d, c in unique_labels[0].items() if c >= 50]
print(domains)

['fmcg', 'fmcd', 'automobiles and accessories', 'pharmaceuticals and medical supplies', 'stationery', 'agriculture']


In [290]:
data['raw_labels'] = data.labels.apply(lambda x : ' > '.join(x))
data.head()

Unnamed: 0,titles,labels,raw_labels
0,"patanjali honey, 250 gm bottle","[fmcg, branded grocery]",fmcg > branded grocery
1,cot mate imported - cotton pads square with no...,"[fmcg, home care, insect repellent, pest repel...",fmcg > home care > insect repellent > pest rep...
2,soft drink sprite,"[fmcg, beverages, soft drinks, cold drinks, ae...",fmcg > beverages > soft drinks > cold drinks >...
3,black pepper - get natures best - 50 g,"[fmcg, branded grocery]",fmcg > branded grocery
4,kamjove press art tea cup - tea culture - 1 u,"[fmcg, branded grocery, instant food]",fmcg > branded grocery > instant food


In [293]:
# default_depth = max_heirarchy
default_depth = 2
min_samples = 50


In [294]:
from collections import Counter

def depth(field, n, sep=' > '):
    if n <= 0:
        return field
    return sep.join(field.split(sep, n)[: n])

categories = Counter(
    depth(x, default_depth)
    for x in structured_df.raw_labels.values.tolist()
)


categories_filter = {}
for x in categories:
    if any([x.startswith(j) for j in domains]) or not domains:
        if categories[x] > min_samples:
            categories_filter[x] = categories[x]
            
            
categories_dict = {}

for cat in sorted(categories_filter):
    # noinspection PyRedeclaration
    parent = categories_dict

    for i in cat.split(' > '):
        parent = parent.setdefault(i, {})


def pretty(d, indent=0):
    for key, value in d.items():
        print(u'{} {} ({})'.format('    ' * indent, key, len(value)))
        pretty(value, indent + 1)

        
class_hierarchy = {}
class_hierarchy['ROOT'] = list(categories_dict)

def t(d):
    for k in d.keys():
        class_hierarchy[k] = list(d[k])
        t(d[k])
t(categories_dict)        
pretty(categories_dict)            

 agriculture (1)
     agricultural products (0)
 automobiles and accessories (1)
     car accessories (0)
 fmcd (2)
     cold storage (0)
     kitchenware (0)
 fmcg (16)
     baby care (0)
     bakery (0)
     beverages (0)
     branded grocery (0)
     dairy (0)
     dried fruits & nuts (0)
     fresh food non-veg (0)
     frozen food (0)
     fruits & vegetables (0)
     healthcare (0)
     home care (0)
     personal care (0)
     pet care (0)
     spices (0)
     staples (0)
     sweets & confectionery (0)
 pharmaceuticals and medical supplies (1)
     oral care (0)
 stationery (0)


In [295]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_predict
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import wordpunct_tokenize
from nltk.corpus import wordnet as wn
from functools import lru_cache
from nltk.tag.perceptron import PerceptronTagger
import matplotlib.pyplot as plt

In [296]:
# Initiate lemmatizer
wnl = WordNetLemmatizer()

# Load tagger pickle
tagger = PerceptronTagger()

# Lookup if tag is noun, verb, adverb or an adjective
tags = {'N': wn.NOUN, 'V': wn.VERB, 'R': wn.ADV, 'J': wn.ADJ}

# Memoization of POS tagging and Lemmatizer
lemmatize_mem = lru_cache(maxsize=10000)(wnl.lemmatize)
tagger_mem = lru_cache(maxsize=10000)(tagger.tag)


def tokenizer(text):
    for token in wordpunct_tokenize(text):
        if token not in ENGLISH_STOP_WORDS:
            tag = tagger_mem(frozenset({token}))
            yield lemmatize_mem(token, tags.get(tag[0][1],  wn.NOUN))

In [297]:
featurizer = Pipeline([
    ('vectorizer', TfidfVectorizer(
        tokenizer=tokenizer,
        ngram_range=(1, 2),
        stop_words=ENGLISH_STOP_WORDS,
        sublinear_tf=True,
        min_df=0.00009
    ))  
])
features = featurizer.fit_transform(data['titles'])
dense_features = features.toarray()

In [298]:
import pickle
pickle.dump(featurizer, open("terms_vectorizer", "wb"))

In [300]:
featurizer = pickle.load(open("terms_vectorizer", "rb"))

In [308]:
train_df = data.sample(frac=1)[0:10000]
train_df

Unnamed: 0,titles,labels,raw_labels
60315,schezwan instant noodles - chings - 300 g,"[fmcg, branded grocery, instant food]",fmcg > branded grocery > instant food
536873,meat masala - sanjeev kapoors khazana - 50 g,"[fmcg, branded grocery]",fmcg > branded grocery
371493,instant dosa breakfast mix - mtr - 500 g,"[fmcg, branded grocery]",fmcg > branded grocery
222059,"dove bathing bar - cream beauty, 100 gm ( pack...","[fmcg, personal care, cosmetics, make-up]",fmcg > personal care > cosmetics > make-up
184180,sugar strands - ccds - 125 g,"[fmcg, sweets & confectionery]",fmcg > sweets & confectionery
228101,blue cocktail fruit mix - apple & peach - fies...,"[fmcg, branded grocery, snacks]",fmcg > branded grocery > snacks
85961,salt plus - saffola - 1 kg,"[fmcg, branded grocery]",fmcg > branded grocery
156201,abbies piri piri extra hot sauce 155ml,"[fmcg, branded grocery, pickles]",fmcg > branded grocery > pickles
178516,fenugreek/methi seeds - organic - conscious fo...,"[fmcg, branded grocery]",fmcg > branded grocery
495093,tikka masala gravy - world chef - 390 g,"[fmcg, branded grocery]",fmcg > branded grocery


In [None]:
i = 0
models = {}
feature_selectors = {}
for k in class_hierarchy.keys():
    print(k)
    v = class_hierarchy[k]
    X_all = []
    y_all = []
    for classs in v:
        print("Processing",classs)
        temp_df = train_df[train_df['raw_labels'].str.contains(classs)][['titles']]
        temp_df['features'] = temp_df['titles'].apply(lambda x: featurizer.transform([x]).toarray()[0])
     
        X_all.extend(temp_df['features'].values)
        y_all.extend([classs] * temp_df.shape[0])
        
        assert len(X_all) == len(y_all), "Error, dimension mismatch"
        print("Adding ", temp_df.shape[0],  " records")
           
        print("Total Records updated to : ", len(X_all))

    if(len(X_all) > 50) and np.unique(y_all).shape[0] > 1:  
        print("Fitting the model")
        from sklearn.utils import class_weight
        from sklearn.model_selection import train_test_split
        from sklearn.model_selection import GridSearchCV
        from sklearn.svm import SVC
        from sklearn.feature_selection import SelectKBest, chi2
        assert len(X_all) == len(y_all), "Dimension of X and Y not same"    
        

        class_weights = class_weight.compute_class_weight('balanced', np.unique(y_all), y_all)
        class_weights= dict(list(zip(np.unique(y_all), class_weights)))
        for n in np.arange(100,9999,100):
            ch2 = SelectKBest(chi2, k=n)
            X_selected = ch2.fit_transform(X_all, y_all)
            feature_selectors[k] = ch2
        
    #    Too Slow on the laptop and colab
    #     classifier = SVC()
    #     grid_param = {'C' : [0.01, 0.1, 1, 10],
    #                   'kernel': ('rbf', 'linear'), 
    #                   'class_weight': [class_weights],
    #                  'probability'=[True]}
        classifier = SGDClassifier()

        grid_param = {
            'loss':['log'],
            'class_weight': [class_weights],
            'alpha': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
            'n_jobs': [-1]
        }
        gd_sr = GridSearchCV(estimator=classifier,
                         param_grid=grid_param,
                         scoring='accuracy',
                         cv=5,
                         n_jobs=-1)
        gd_sr.fit(X_selected, y_all)

        models[k] = gd_sr.best_estimator_ 
        print("Completed")
    
    i+=1
    if i >3 :
        break    

ROOT
Processing agriculture
Adding  3  records
Total Records updated to :  3
Processing automobiles and accessories
Adding  3  records
Total Records updated to :  6
Processing fmcd
Adding  20  records
Total Records updated to :  26
Processing fmcg
Adding  9967  records
Total Records updated to :  9993
Processing pharmaceuticals and medical supplies
Adding  3  records
Total Records updated to :  9996
Processing stationery
Adding  4  records
Total Records updated to :  10000
Fitting the model


In [None]:
def predict_hierarchy(title):
    def predict_by_key(k):

        if k in feature_selectors.keys():
            f = feature_selectors[k].transform(featurizer.transform([title]).toarray())
            label = models[k].predict(f)
            probability = models[k].predict_proba(f)
            return label[0], np.max(probability)

    final_label = []  
    label_probabs = []
    
    def pred(key):
        if predict_by_key(key):
            if len(class_hierarchy[key]) > 0:
                label, conf = predict_by_key(key)
            else:
                label, conf = key, 1

            final_label.append(label)
                label_probabs.append(str(conf))
                pred(label)    
            

    pred('ROOT')  
    return ' > '.join(final_label), ' > '.join(label_probabs)

In [305]:
predict_hierarchy('patanjali honey, 250 gm bottle')

('fmcg', '0.8830679493549239')