# References & starter code


https://github.com/dmlc/xgboost/blob/master/demo/guide-python/basic_walkthrough.py

In [1]:
import numpy as np

import xgboost as xgb

import pandas as pd
from tqdm import tqdm

from scipy import sparse
from scipy import stats
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder 

In [2]:
xgb.__version__


'1.3.3'

In [3]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

from nltk.corpus import stopwords
stop = stopwords.words('english')
import re

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
ProgressBar().register()
import pickle

In [5]:
result_dd = dd.read_parquet('data/full_cleaned')

In [6]:
all_labels = result_dd.label.unique()
print('Number of classes : ', len(all_labels))

[########################################] | 100% Completed |  4.6s
Number of classes :  4137


In [7]:
len(result_dd)

[########################################] | 100% Completed |  4.4s


1982138

In [8]:
result = result_dd.compute()

[########################################] | 100% Completed |  4.3s


# Lemmatizer


In [9]:
lemmatizer = WordNetLemmatizer()

# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    """Map an NLTK POS tag to the matching WordNet POS constant.

    Only the start of the tag is inspected: 'J' -> wordnet.ADJ,
    'V' -> wordnet.VERB, 'N' -> wordnet.NOUN, 'R' -> wordnet.ADV.
    Any other tag yields None.
    """
    prefix_to_pos = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, pos_name in prefix_to_pos:
        if nltk_tag.startswith(prefix):
            return getattr(wordnet, pos_name)
    return None

from nltk.tokenize import RegexpTokenizer
tok = RegexpTokenizer(r'\w+')
    
def clean_sentence(sentence):
    """Normalise a description into a space-joined bag of raw and lemmatized tokens.

    Steps: strip digit runs, lowercase, tokenize with `tok`, POS-tag the tokens,
    and lemmatize every token that has a WordNet POS. The result is the raw
    tokens followed by the lemmas, joined with single spaces. Stop words in
    either list are replaced by empty strings, not removed, so the output can
    contain consecutive spaces.
    """
    without_digits = re.sub(r'\d+', '', sentence)
    tokens = tok.tokenize(without_digits.lower())

    lemmas = []
    for token, pos in nltk.pos_tag(tokens):
        wn_pos = nltk_tag_to_wordnet_tag(pos)
        # Tokens without a WordNet POS are kept unchanged.
        lemmas.append(token if wn_pos is None else lemmatizer.lemmatize(token, wn_pos))

    def blank_stopwords(words):
        # Stop words become empty strings rather than being dropped.
        return ["" if word in stop else word for word in words]

    return " ".join(blank_stopwords(tokens) + blank_stopwords(lemmas))

# Read the HTS descriptions, process


In [10]:
hts = pd.read_csv("./hts_train.csv", dtype={'hs': str, 'desc' : str})

In [11]:
hts['clean'] = hts.desc.apply(lambda x : clean_sentence(x))
hts = hts.rename({'hs' : 'label', 'clean' : 'text'}, axis=1)[['label', 'text']]

In [12]:
hts.head(1)

Unnamed: 0,label,text
0,10121,live horses asses mules hinnies horses purebr...


# Read the NACIS->HTS examples, process


In [13]:
nacis = pd.read_csv("commodity_hts_extract.csv", dtype={'hts6': str, 'description_long' : str})

In [14]:
nacis['clean'] = nacis.description_long.apply(lambda x : clean_sentence(x))

In [15]:
nacis = nacis.rename({'hts6' : 'label', 'clean' : 'text'}, axis=1)[['label', 'text']]

In [16]:
nacis.head(1)

Unnamed: 0,label,text
0,910211,batteries wrist watches battery powered mech...


# Sample the full data, 4137 classes. Create a subset of up to 600 examples per class (`count = 600`), split 80/20 into train and validation. Save it off

In [17]:
all_train_df = []
all_valid_df = []
count = 600          # maximum number of rows sampled per class
recompute = False    # True: rebuild the samples and overwrite the cached pickles
seed = 42            # fixed seed so a recompute reproduces the same sample and split

if recompute:
    for c in tqdm(all_labels):
        df = result[result.label == c]
        # Cap each class at `count` rows; classes with fewer rows are kept whole.
        df_sampled = df.sample(frac=min(count/len(df), 1), random_state=seed)
        df_hts = hts[hts.label == c]
        df_nacis = nacis[nacis.label == c]

        # 80/20 split of the sampled rows for this class.
        train_df = df_sampled.sample(frac=0.8, random_state=seed)
        valid_df = df_sampled.drop(train_df.index)

        # The HTS and NACIS description rows for the class go to training only.
        all_train_df.append(train_df)
        all_train_df.append(df_hts)
        all_train_df.append(df_nacis)

        all_valid_df.append(valid_df)

    with open('saved_v3/all_train_df_600.pkl', 'wb') as f: pickle.dump(all_train_df, f)
    with open('saved_v3/all_valid_df_600.pkl', 'wb') as f: pickle.dump(all_valid_df, f)
else:
    # NOTE: pickle.load can execute arbitrary code; only load caches written by this notebook.
    with open('saved_v3/all_train_df_600.pkl', 'rb') as f: all_train_df = pickle.load(f)
    with open('saved_v3/all_valid_df_600.pkl', 'rb') as f: all_valid_df = pickle.load(f)

train_df  = pd.concat(all_train_df)
valid_df  = pd.concat(all_valid_df)

[########################################] | 100% Completed |  4.4s


  0%|          | 0/4137 [00:00<?, ?it/s]

[########################################] | 100% Completed |  4.3s


In [18]:
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
wn_lemmas = set(wordnet.words())
print(len(wn_lemmas))
'threadmill' in wn_lemmas

147306


False

In [75]:
#import spacy

# nlp = spacy.load('en_core_web_md')
# list(nlp.vocab.strings)

In [60]:
[w for w in word_tokenize('Hello this is cool xxx1'.lower()) if w in wn_lemmas]

['hello', 'cool']

In [18]:
# Combine the train and validation samples into one frame.
all_data_1 = pd.concat([train_df, valid_df])
# NOTE(review): DataFrame.apply with an identity lambda returns the whole frame, not a
# single column, so this assignment looks unfinished (pandas typically rejects assigning
# a multi-column frame to one column) -- confirm what 'token_present' was meant to hold.
all_data_1['token_present'] = all_data_1.apply(lambda x : x)
#nacis['clean'] = nacis.description_long.apply(lambda x : clean_sentence(x))

In [19]:
len(valid_df['text']), len(train_df['text']), len(train_df['text'])/50000

(286546, 1171158, 23.42316)

# Construct count vectorizer with HTS, NACIS keywords

In [20]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
count_vector = CountVectorizer(max_features=30000)
count_vector.fit(list(hts['text']) + list(nacis['text']))

CountVectorizer(max_features=30000)

In [21]:
test = count_vector.transform(list(hts['text']) + list(nacis['text']))
test.shape

(34707, 14588)

In [22]:
X_train_counts = count_vector.transform(list(train_df['text']))
X_train_counts.shape

(1171158, 14588)

In [23]:
label_enc = LabelEncoder() 
label_enc.fit(result['label']) 
y_train = np.expand_dims(np.array(label_enc.transform(train_df['label'])), 1)

In [57]:
X_train_counts.shape, y_train.shape

((1171158, 14588), (1171158, 1))

In [58]:
#pickle.dump(count_vector, open("saved_v3/count_vector.pickle", "wb"))

# Batch logic for GBT


In [59]:
def batch(csr, y, rows, random_row_array, n=1):
    """Yield (dense features, labels) mini-batches in the order given by random_row_array.

    csr              -- matrix indexable by an array of row positions, with .todense()
    y                -- label array indexable by the same row positions
    rows             -- only its length is used; it bounds how many positions are consumed
    random_row_array -- row positions to visit, in visiting order
    n                -- batch size; the final batch may be smaller
    """
    total = len(rows)
    for lo in range(0, total, n):
        hi = min(lo + n, total)
        picked = random_row_array[lo:hi]
        yield (csr[picked].todense(), y[picked])

# Enable batching

In [60]:
from datetime import datetime
from pytz import timezone, utc

def get_pst_time():
    """Return the current time in the 'US/Pacific' zone as 'MM/DD/YYYY HH:MM:SS:<zone name>'."""
    pacific_now = datetime.now(tz=utc).astimezone(timezone('US/Pacific'))
    return pacific_now.strftime('%m/%d/%Y %H:%M:%S:%Z')

In [61]:
class make_model():
    """Train an XGBoost booster incrementally on shuffled mini-batches.

    param      -- XGBoost parameter dict; it is read but never modified by fit()
    lr         -- sequence of learning rates, one entry consumed per mini-batch
                  across all rounds (an IndexError is raised if it runs out)
    num_round  -- number of full passes over the data
    batch_size -- number of rows per mini-batch

    After fit() the trained booster is available as `self.model`.
    """
    def __init__(self, param, lr, num_round = 5, batch_size=1000):
        self.param     = param
        self.num_round = num_round
        self.batch_size = batch_size
        self.lr = lr

    def fit(self, csr, y_val):
        """Run `num_round` passes over `csr`/`y_val`, one boosting step per mini-batch.

        Rows are reshuffled at the start of every round. After each round the
        booster is saved to 'saved_v3/xgb_model_<round>'.

        Raises ValueError if `csr` has no rows.
        """
        n_rows = csr.shape[0]
        if n_rows == 0:
            raise ValueError("cannot fit on an empty matrix")

        rows = range(0, n_rows)
        # Ceil division so the final, partial batch is counted in the progress bar.
        n_batches = -(-n_rows // self.batch_size)

        model = None
        iteration = 0
        print("Will run for {} rounds".format(self.num_round))
        for n_round in range(0, self.num_round):
            random_row_array = np.random.choice(np.arange(n_rows), n_rows, replace=False)
            with tqdm(total=n_batches) as progress_bar:
                for x, y in batch(csr, y_val, rows, random_row_array, self.batch_size):
                    dtrain = xgb.DMatrix(x, y)
                    watchlist = [(dtrain, 'train')]

                    # The booster is created once, from the very first mini-batch.
                    if model is None:
                        model = xgb.Booster(self.param, [dtrain])

                    eta = self.lr[iteration]
                    # Per-step copy: the caller's parameter dict must not be mutated.
                    step_param = {**self.param, 'eta': eta}
                    print('{} : Round = {}, Iteration = {}, lr = {}'.format(get_pst_time(), n_round, iteration, eta))

                    model = xgb.train(step_param, dtrain, num_boost_round=1, xgb_model=model, evals=watchlist)
                    iteration = iteration + 1
                    progress_bar.update(1)

            name = 'saved_v3/xgb_model_{}'.format(n_round)
            print("saving model: ", name)
            model.save_model(name)

        self.model  = model

# Carefully adjust Learning rate for each iteration so that training converges (else it won't work)

In [62]:
# Learning-rate schedule: make_model.fit consumes one entry per mini-batch, across all rounds.
# As written: 24 entries at 0.45, 24 at 0.3, 46 at 0.2, 92 at 0.1, then 1000 at 0.05.
# NOTE(review): 1171158 training rows at batch_size=50000 give 24 batches per round, so the
# 23-based multipliers do not line up with round boundaries -- confirm this is intended.
#lr =  [0.3]*100 + [0.1] * 20 + [0.05]*100 
lr =  [0.45]*24*1 + [0.3]*24*1 + [0.2] * 23 * 2 + [0.1] * 23 * 4 +  [0.05]* 10 * 100

# Booster parameters; 'num_class' is the number of distinct labels and 'eta' is
# replaced per mini-batch from `lr` during fit.
parameters = {'max_depth':5, 'objective':'multi:softprob', 'subsample':0.8, 
            'colsample_bytree':0.8, 'eta': 0.3, 'min_child_weight':0.1,
            'tree_method':'gpu_hist', 'gpu_id': 0, 'num_class' : len(all_labels)
            }

model = make_model(parameters, lr, num_round=8, batch_size=50000) 
# Training is left disabled here; a previously saved booster is loaded in a later cell.
#model.fit(X_train_counts, y_train)

[########################################] | 100% Completed |  4.8s


# Load the saved, trained model for further experiments


In [63]:
model.model = xgb.Booster(model_file='saved_v3/xgb_model_7')

# Do prediction in batches, else it will crash with out of memory errors (OOM)

In [31]:
def batch(lst, n=1):
    """Yield consecutive slices of `lst` holding at most `n` items each.

    NOTE: this re-uses the name of the earlier 4-argument `batch` that
    make_model.fit calls; re-run that earlier cell before training again.
    """
    total = len(lst)
    yield from (lst[slice(lo, min(lo + n, total))] for lo in range(0, total, n))

def do_predict_batch(input):
    """Predict the five most probable labels for every text, 2000 texts at a time.

    input -- list of cleaned text strings

    Each batch is vectorised with `count_vector`, scored by `model.model`, and
    the class indices are ranked by descending probability before being mapped
    back to labels with `label_enc`.

    Returns a 5-tuple of lists (first, second, third, fourth, fifth); element i
    of each list is the label at that rank for input[i].
    """
    top_k = 5
    ranked = [[] for _ in range(top_k)]

    # Progress is measured against the texts actually passed in.
    with tqdm(total=len(input)) as progress_bar:
        for x in batch(input, 2000):
            tmp_valid_counts = count_vector.transform(x)
            tmp_predict_da = model.model.predict(xgb.DMatrix(tmp_valid_counts.todense()))
            # Negate so argsort orders columns from most to least probable.
            sorted_idx = np.argsort(-tmp_predict_da)
            for rank, bucket in enumerate(ranked):
                bucket.extend(label_enc.inverse_transform(list(sorted_idx[:, rank])))
            # The last batch may be short, so advance by its real size.
            progress_bar.update(len(x))
    return tuple(ranked)

y1, y2, y3, y4, y5 = do_predict_batch(list(valid_df['text']))

  0%|          | 0/286546 [00:00<?, ?it/s]

# Calculate accuracy for top 5 classes (For the big model)


In [32]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, plot_confusion_matrix, accuracy_score

# Score each ranked prediction list against the true validation labels;
# the sum of the five scores is the top-5 accuracy.
true_labels = list(valid_df['label'])
a1, a2, a3, a4, a5 = [accuracy_score(true_labels, ranked) for ranked in (y1, y2, y3, y4, y5)]

print('Accuracy:', a1, a2, a3, a4, a5)
print('Total:', a1+a2+a3+a4+a5)

Accuracy: 0.5924040119212971 0.09369874295924564 0.03737619788794818 0.022317533659517146 0.015205237553481814
Total: 0.7610017239814899


# Lookup number of classes per chapter and generate masks for each chapter
**Masks are used to mask off the probability of classes from other chapters while computing per-chapter accuracy**

In [199]:
# One row per encoded class: its encoder index, its chapter (first two
# characters of the label) and the label itself.
all_class_list = list(label_enc.classes_)
lookup = [{'Index': i, 'chapter': l[:2], 'label': l} for i, l in enumerate(all_class_list)]

grouped_lookup = pd.DataFrame(lookup)

# Per chapter, a 0/1 vector over all classes that is 1 exactly at the classes
# belonging to that chapter.
# Group by the column name (not a one-element list) so the keys are plain
# chapter strings on every pandas version; a list key yields 1-tuples on
# pandas >= 2.0, which would break the dictionary lookups below.
lookup_with_mask = {}
for c, df in grouped_lookup.groupby('chapter'):
    mask = np.zeros(len(all_class_list), dtype=int)
    mask[df['Index'].to_numpy()] = 1
    lookup_with_mask[c] = mask

grouped_lookup['mask'] = grouped_lookup.chapter.apply(lambda x : lookup_with_mask[x])


# Give label as key, give the mask back
label_2_mask    = dict(zip(grouped_lookup['label'], grouped_lookup['mask']))

# Give the label as key, get the corresponding chapter back
label_2_chapter = dict(zip(grouped_lookup['label'], grouped_lookup['chapter']))

grouped_lookup.head()

Unnamed: 0,Index,chapter,label,mask
0,0,9,90111,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,1,9,90112,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,2,9,90121,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,3,9,90122,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,4,9,90190,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


# Find out number of classes available per chapter. 

In [200]:
# Give chapter as key, give number of classes in that chapter back
# Row counts per chapter; the 'mask' column's count is the number of classes in the chapter.
per_chapter_labels_ser = grouped_lookup.groupby(['chapter']).count()
per_chapter_labels = dict(zip(per_chapter_labels_ser.index, per_chapter_labels_ser['mask']))

# Do the predictions.

* While computing the prediction for a sample from a chapter, mask off the probability of classes from other chapters by setting it to zero before ranking (argsort)
* Stack the per-sample masks for each batch so the dimensions match the probability array
* Multiply element-wise with the original probability array

In [116]:
def batch_1(lst, labels, n=1):
    """Yield aligned (items, labels) slices of at most `n` entries each.

    The slice bounds are derived from len(lst) and applied to both sequences.
    """
    total = len(lst)
    for lo in range(0, total, n):
        window = slice(lo, min(lo + n, total))
        yield (lst[window], labels[window])

def do_predict_batch(data, labels):
    """Predict the five most probable labels per text, restricted to the true label's chapter.

    data   -- list of cleaned text strings
    labels -- true label of each text, aligned with `data`

    For every sample the predicted probabilities are multiplied by the 0/1
    chapter mask looked up from its true label (`label_2_mask`), so classes of
    other chapters score zero before ranking. Because the mask comes from the
    true label, the result measures within-chapter ranking only.

    Returns a 5-tuple of lists (first, second, third, fourth, fifth); element i
    of each list is the label at that rank for data[i].
    """
    top_k = 5
    ranked = [[] for _ in range(top_k)]

    # Progress is measured against the texts actually passed in.
    with tqdm(total=len(data)) as progress_bar:
        # `batch_labels` keeps the per-batch slice distinct from the `labels` argument.
        for x, batch_labels in batch_1(data, labels, 1000):
            tmp_valid_counts = count_vector.transform(x)

            raw_pred = model.model.predict(xgb.DMatrix(tmp_valid_counts.todense()))

            # Mask the probabilities from other chapters: one mask row per sample.
            full_mask = np.vstack([label_2_mask[label] for label in batch_labels])
            tmp_predict_da = np.multiply(full_mask, raw_pred)

            # Negate so argsort orders columns from most to least probable.
            sorted_idx = np.argsort(-tmp_predict_da)
            for rank, bucket in enumerate(ranked):
                bucket.extend(label_enc.inverse_transform(list(sorted_idx[:, rank])))
            # The last batch may be short, so advance by its real size.
            progress_bar.update(len(x))
    return tuple(ranked)

y1, y2, y3, y4, y5 = do_predict_batch(list(valid_df['text']), list(valid_df['label']))

  0%|          | 0/286546 [00:00<?, ?it/s]

# Set the predictions and labels into a single dataframe

* Save the dataframe to a pickle, as the computation is expensive in terms of time

In [119]:
final_df = pd.DataFrame(list(zip(list(valid_df['label']), y1, y2, y3, y4, y5)), 
               columns =['label', 'first', 'second', 'third', 'fourth', 'fifth']) 

In [125]:
# Un-comment the next line to refresh the cache from the freshly computed final_df.
#final_df.to_pickle("./final_df.pkl")
# Reloading replaces the in-memory final_df built in the previous cell with the cached copy.
# NOTE: read_pickle unpickles the file, which can execute arbitrary code -- only load a
# cache written by this notebook.
final_df = pd.read_pickle("./final_df.pkl")
final_df.shape

(286546, 6)

In [202]:
# Add chapter name
final_df['chapter'] = final_df.label.apply(lambda x : label_2_chapter[x])
final_df.head()

Unnamed: 0,label,first,second,third,fourth,fifth,chapter
0,90111,90111,90122,90190,90112,90121,9
1,90111,90111,90122,90190,90112,90220,9
2,90111,90190,90121,90111,90122,90112,9
3,90111,90111,90190,90121,90112,90620,9
4,90111,90111,90122,90190,90121,90812,9


# Compute the per chapter accuracy and display as dataframe

* Use group by per chapter to compute chapter specific accuracy

In [157]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, plot_confusion_matrix, accuracy_score


def calculate_accuracy(x) :
    """Return per-rank accuracies and their sum for one group of predictions.

    x -- mapping/frame with the columns 'label', 'first', 'second', 'third',
         'fourth' and 'fifth'

    Returns a 6-tuple: accuracy of each of the five ranked prediction columns
    against 'label', followed by the sum of those five values.
    """
    truth = list(x['label'])
    rank_columns = ('first', 'second', 'third', 'fourth', 'fifth')
    per_rank = tuple(accuracy_score(truth, x[column]) for column in rank_columns)
    return per_rank + (sum(per_rank),)

k = final_df.groupby(['chapter']).apply(lambda x : calculate_accuracy(x))

In [197]:
# Expand the per-chapter accuracy tuples in `k` into one column per rank plus the total.
# NOTE(review): the fourth-rank column is named 'forth' here while final_df uses 'fourth';
# renaming it would change the displayed table, so confirm before aligning the spelling.
per_chapter_df = (pd.DataFrame(k.values.tolist(), index=k.index)
                    .reset_index(level=0)
                    .rename({0 : 'first', 1 : 'second', 2 : 'third', 3 : 'forth', 4 : 'fifth', 5 : 'total'}, axis=1)
                 )
# Attach the number of classes in each chapter, looked up from per_chapter_labels.
per_chapter_df['class per chapter']  = per_chapter_df.chapter.apply(lambda x : per_chapter_labels[x])

In [230]:
pd.set_option('display.max_rows',100)
per_chapter_df.style.apply(lambda x : ['background-color: yellow' if v else '' for v in (x == 5)])

Unnamed: 0,chapter,first,second,third,forth,fifth,total,class per chapter
0,9,0.784533,0.095947,0.03598,0.02316,0.008685,0.948304,39
1,10,0.912639,0.048327,0.011152,0.004647,0.003717,0.980483,23
2,11,0.831361,0.068047,0.023669,0.010355,0.005424,0.938856,27
3,12,0.836148,0.063441,0.02419,0.010497,0.007303,0.941579,47
4,13,0.844221,0.072027,0.030151,0.025126,0.011725,0.98325,11
5,14,0.922857,0.042857,0.031429,0.0,0.002857,1.0,5
6,15,0.849427,0.047872,0.018412,0.011457,0.009002,0.93617,48
7,16,0.848131,0.065888,0.016822,0.011682,0.008411,0.950935,39
8,17,0.760207,0.100454,0.053791,0.033701,0.012962,0.961115,17
9,18,0.802126,0.094031,0.035977,0.029436,0.011447,0.973017,11
