## References & starter code

https://github.com/dmlc/xgboost/blob/master/demo/guide-python/basic_walkthrough.py

In [2]:
import numpy as np

import xgboost as xgb

import pandas as pd
from tqdm import tqdm

from scipy import sparse
from scipy import stats
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder 

In [3]:
xgb.__version__

'1.3.3'

In [1]:
!ls -d /data/work/shajikk/0308/

/data/work/shajikk/0308/


In [4]:
import nltk
# Fetch the NLTK resources used below: WordNet, the POS-tagger model and the stop-word lists.
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

from nltk.corpus import stopwords
# English stop-word list; clean_sentence() blanks out every token found in it.
stop = stopwords.words('english')
import re

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
# Register a dask progress bar for the session; presumably the source of the
# "[####...] | 100% Completed" lines printed under the later cells.
ProgressBar().register()
import pickle

In [6]:
# Open the cleaned dataset with dask; it is turned into the in-memory `result`
# frame by the .compute() call a few cells further down.
result_dd = dd.read_parquet('/data/work/shajikk/0308/data/full_cleaned')

In [7]:
# Distinct labels of the full dataset; their count is later passed to XGBoost as num_class.
all_labels = result_dd.label.unique()
print('Number of classes : ', len(all_labels))

[########################################] | 100% Completed |  3.7s
Number of classes :  4137


In [8]:
len(result_dd)

[########################################] | 100% Completed |  3.7s


1982138

In [9]:
# Evaluate the dask DataFrame once; `result` is what the sampling loop and the
# label encoder read from.
result = result_dd.compute()

[########################################] | 100% Completed |  3.8s


## Lemmatizer

In [10]:
lemmatizer = WordNetLemmatizer()

# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    """Map an NLTK POS tag to the matching WordNet POS constant.

    Only the leading letter of the tag is inspected: J -> adjective,
    V -> verb, N -> noun, R -> adverb.  Any other tag yields None.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if nltk_tag.startswith(prefix):
            return pos
    return None

from nltk.tokenize import RegexpTokenizer
# Tokenizer built from the pattern \w+ (runs of word characters); clean_sentence()
# uses it to split the lower-cased text into tokens.
tok = RegexpTokenizer(r'\w+')
    
def clean_sentence(sentence):
    """Turn a free-text description into a space-joined bag of words.

    Digit runs are removed, the text is lower-cased and tokenised with ``tok``,
    and each token is lemmatised according to its POS tag (a token whose tag
    has no WordNet equivalent is kept unchanged).  Stop words are replaced by
    empty strings rather than dropped.  The returned string holds the raw
    tokens followed by the lemmatised tokens, joined with single spaces.
    """
    without_digits = re.sub(r'\d+', '', sentence)
    tokens = tok.tokenize(without_digits.lower())

    # Lemmatise every token with the WordNet POS derived from its NLTK tag.
    lemmas = []
    for token, pos_tag in nltk.pos_tag(tokens):
        wn_tag = nltk_tag_to_wordnet_tag(pos_tag)
        lemmas.append(token if wn_tag is None else lemmatizer.lemmatize(token, wn_tag))

    def blank_stop_words(words):
        # Stop words become "" so that list positions are preserved.
        return ["" if word in stop else word for word in words]

    return " ".join(blank_stop_words(tokens) + blank_stop_words(lemmas))

## Read the HTS descriptions, process

In [11]:
hts = pd.read_csv("/data/work/shajikk/0308/hts_train.csv", dtype={'hs': str, 'desc' : str})

In [12]:
# Clean every HTS description, then keep only the (label, text) pair used downstream.
hts = (
    hts.assign(clean=hts.desc.apply(clean_sentence))
    .rename(columns={'hs': 'label', 'clean': 'text'})
    [['label', 'text']]
)

In [13]:
hts.head(1)

Unnamed: 0,label,text
0,10121,live horses asses mules hinnies horses purebr...


## Read the NACIS->HTS examples, process

In [14]:
nacis = pd.read_csv("/data/work/shajikk/0308/commodity_hts_extract.csv", dtype={'hts6': str, 'description_long' : str})

In [15]:
# Run the same cleaning used for the HTS descriptions over the long commodity descriptions.
nacis['clean'] = nacis['description_long'].apply(clean_sentence)

In [16]:
# Bring the frame to the same (label, text) layout as `hts`.
nacis = nacis.rename(columns={'hts6': 'label', 'clean': 'text'})[['label', 'text']]

In [17]:
nacis.head(1)

Unnamed: 0,label,text
0,910211,batteries wrist watches battery powered mech...


## Sample the full data (4137 classes). Create a subset of up to 200 examples per class, split 80/20 into training and validation sets, and save it off

In [18]:
all_train_df = []
all_valid_df = []
count = 200        # maximum number of rows sampled per class
recompute = False  # True: rebuild the samples and overwrite the pickles; False: load the cached pickles

# The cache file names carry the per-class sample size.
train_pickle = 'all_train_df_{}.pkl'.format(count)
valid_pickle = 'all_valid_df_{}.pkl'.format(count)

if recompute:
    for c in tqdm(all_labels):
        df = result[result.label == c]
        # At most `count` rows of this class (every row when the class is smaller).
        df_sampled = df.sample(frac=min(count/len(df), 1))
        df_hts = hts[hts.label == c]
        df_nacis = nacis[nacis.label == c]

        # 80/20 train/validation split of the sampled rows.
        train_df = df_sampled.sample(frac=0.8)
        valid_df = df_sampled.drop(train_df.index)

        # The HTS and NACIS descriptions of the class are used for training only.
        all_train_df.append(train_df)
        all_train_df.append(df_hts)
        all_train_df.append(df_nacis)

        all_valid_df.append(valid_df)

    with open(train_pickle, 'wb') as f:
        pickle.dump(all_train_df, f)
    with open(valid_pickle, 'wb') as f:
        pickle.dump(all_valid_df, f)
else:
    # NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
    with open(train_pickle, 'rb') as f:
        all_train_df = pickle.load(f)
    with open(valid_pickle, 'rb') as f:
        all_valid_df = pickle.load(f)

train_df  = pd.concat(all_train_df)
valid_df  = pd.concat(all_valid_df)

[########################################] | 100% Completed |  3.8s


  0%|          | 0/4137 [00:00<?, ?it/s]

[########################################] | 100% Completed |  3.7s


In [19]:
# Validation rows, training rows, and training rows divided by 40000 (the batch_size
# passed to make_model further down), i.e. the number of batches per training round.
len(valid_df['text']), len(train_df['text']), len(train_df['text'])/40000

(125227, 525877, 13.146925)

## Construct count vectorizer with HTS, NACIS keywords

In [20]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# The vocabulary is fitted on the HTS and NACIS texts only (capped at 30000 terms);
# the sampled training texts are merely transformed with it afterwards.
count_vector = CountVectorizer(max_features=30000)
count_vector.fit(list(hts['text']) + list(nacis['text']))

CountVectorizer(max_features=30000)

In [21]:
test = count_vector.transform(list(hts['text']) + list(nacis['text']))
test.shape

(34707, 14588)

In [22]:
X_train_counts = count_vector.transform(list(train_df['text']))
X_train_counts.shape

(525877, 14588)

In [23]:
# Fit the encoder on every label of the full dataset so that all classes get an id.
label_enc = LabelEncoder()
label_enc.fit(result['label'])
# Encoded training labels as a column vector of shape (n_rows, 1).
y_train = np.asarray(label_enc.transform(train_df['label']))[:, np.newaxis]

In [24]:
X_train_counts.shape, y_train.shape

((525877, 14588), (525877, 1))

In [25]:
# Persist the fitted vectorizer; the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open("count_vector.pickle", "wb") as f:
    pickle.dump(count_vector, f)

## Batch logic for GBT

In [26]:
def batch(csr, y, rows, random_row_array, n=1):
    """Yield (features, labels) mini-batches in the order given by random_row_array.

    csr              -- row-indexable matrix; each selected slice is densified with .todense()
    y                -- labels, indexable with the same row indices
    rows             -- only its length is used: the number of rows to walk through
    random_row_array -- row indices in the order they should be visited
    n                -- maximum number of rows per batch
    """
    total = len(rows)
    for start in range(0, total, n):
        stop = min(start + n, total)
        picked = random_row_array[start:stop]
        yield (csr[picked].todense(), y[picked])

## Enable batching class

In [27]:
class make_model():
    """Train an XGBoost booster incrementally on shuffled mini-batches.

    Every mini-batch is densified, wrapped in a DMatrix and used for exactly
    one boosting round on top of the booster built so far.  The learning rate
    ('eta') of each of those rounds is taken from a per-iteration schedule.
    """

    def __init__(self, param, lr, num_round = 5, batch_size=1000, save_after_round=4):
        """Store the training configuration.

        param            -- dict of XGBoost parameters; its 'eta' is overridden per iteration
        lr               -- sequence of learning rates, indexed by the global iteration counter
        num_round        -- number of passes over the training rows
        batch_size       -- number of rows per mini-batch
        save_after_round -- a checkpoint 'xgb_model_v2_<round>' is written at the end of every
                            round whose zero-based index is greater than this value
        """
        self.param     = param
        self.num_round = num_round
        self.batch_size = batch_size
        self.lr = lr
        self.save_after_round = save_after_round

    def fit(self, csr, y_val):
        """Run num_round shuffled passes over (csr, y_val); the booster ends up in self.model.

        csr   -- row-indexable sparse matrix of features (rows are densified per batch)
        y_val -- labels, indexable with an array of row indices

        self.model is None when no batch was processed (no rows or num_round == 0).
        """
        n_rows = csr.shape[0]
        # Ceiling division so the progress bar also counts the final, shorter batch.
        n_batches = (n_rows + self.batch_size - 1) // self.batch_size
        # Work on a copy: updating 'eta' must not modify the caller's parameter dict.
        params = dict(self.param)
        model = None
        iteration = 0
        print("Will run for {} rounds".format(self.num_round))
        for n_round in range(0, self.num_round):
            shuffled_rows = np.random.permutation(n_rows)
            with tqdm(total=n_batches) as progress_bar:
                # Slicing is done inline so that fit() does not depend on whichever
                # `batch` helper is currently bound in the notebook namespace.
                for start in range(0, n_rows, self.batch_size):
                    picked = shuffled_rows[start:start + self.batch_size]
                    dtrain = xgb.DMatrix(csr[picked].todense(), y_val[picked])
                    watchlist = [(dtrain, 'train')]

                    if model is None:
                        model = xgb.Booster(params, [dtrain])

                    # Once the schedule is exhausted, keep using its final learning rate.
                    eta = self.lr[min(iteration, len(self.lr) - 1)]
                    params['eta'] = eta
                    print('Round = {}, Iteration = {}, lr = {}'.format(n_round, iteration, eta))

                    model = xgb.train(params, dtrain, num_boost_round=1, xgb_model=model, evals=watchlist)
                    iteration = iteration + 1
                    progress_bar.update(1)
            if n_round > self.save_after_round and model is not None:
                name = 'xgb_model_v2_{}'.format(n_round)
                print("saving model: ", name)
                model.save_model(name)

        self.model  = model

## Carefully adjust Learning rate for each iteration so that training converges (else it won't work)

In [28]:
# Earlier schedule, kept for reference:
#lr =  [0.3]*100 + [0.1] * 20 + [0.05]*100 
# One learning rate per boosting iteration (make_model.fit reads lr[iteration]).
# The factor 14 presumably is the number of batches per round (525877 training
# rows / 40000 rows per batch, rounded up), so each term covers whole rounds.
lr =  [0.45]*14*2 + [0.4]*14*2 + [0.3] * 14 * 2 + [0.2]* 14 * 1 +  [0.1]* 14 * 1 +  [0.05]* 14 * 100

# 'eta' given here is only a starting value: fit() sets it from `lr` before each boosting step.
parameters = {'max_depth':5, 'objective':'multi:softprob', 'subsample':0.8, 
            'colsample_bytree':0.8, 'eta': 0.3, 'min_child_weight':0.1,
            'tree_method':'gpu_hist', 'gpu_id': 0, 'num_class' : len(all_labels)
            }

# 8 passes over the training matrix in batches of 40000 rows.
model = make_model(parameters, lr, num_round=8, batch_size=40000) 
model.fit(X_train_counts, y_train)

[########################################] | 100% Completed |  3.8s


## Load the saved, trained model for further experiments

In [29]:
# Swap the in-memory booster for the checkpoint written at the end of round 5
# (make_model.fit saves 'xgb_model_v2_<round>' for rounds 5 and later).
model.model = xgb.Booster(model_file='xgb_model_v2_5')

## Do prediction in batches, else it will crash with out of memory errors

In [83]:
def _iter_chunks(items, size):
    """Yield consecutive slices of `items` holding at most `size` elements.

    Deliberately not called `batch`, so it does not shadow the five-argument
    `batch` generator defined earlier in the notebook.
    """
    for start in range(0, len(items), size):
        yield items[start:start + size]

def do_predict_batch(input, batch_size=2000, top_k=5):
    """Predict the top_k most probable labels for each text, batch by batch.

    input      -- list of cleaned texts
    batch_size -- number of texts vectorised, densified and scored at a time
    top_k      -- number of ranked predictions kept per text

    Returns a tuple of top_k lists; list i holds, for every input text, the
    decoded label with the (i+1)-th highest predicted probability.
    """
    ranked = [[] for _ in range(top_k)]

    # The total comes from the actual input, not from a notebook-level frame.
    with tqdm(total=len(input)) as progress_bar:
        for texts in _iter_chunks(input, batch_size):
            counts = count_vector.transform(texts)
            probabilities = model.model.predict(xgb.DMatrix(counts.todense()))
            # Class indices of each row, most probable first.
            order = np.argsort(-probabilities)
            for rank in range(top_k):
                ranked[rank].extend(label_enc.inverse_transform(list(order[:, rank])))
            # Advance by the real batch length so the last, shorter batch does not overshoot.
            progress_bar.update(len(texts))
    return tuple(ranked)

y1, y2, y3, y4, y5 = do_predict_batch(list(valid_df['text']))

  0%|          | 0/125227 [00:00<?, ?it/s]

## Calculate accuracy for top 5 classes

In [87]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, plot_confusion_matrix, accuracy_score

# Share of validation rows whose true label equals the rank-1 ... rank-5 prediction.
# The label list is built once instead of once per rank.
y_true = list(valid_df['label'])
a1, a2, a3, a4, a5 = [accuracy_score(y_true, y_rank) for y_rank in (y1, y2, y3, y4, y5)]

print('Accuracy:', a1, a2, a3, a4, a5)
# Summing the five per-rank accuracies; NOTE(review): this reads as the top-5
# accuracy, assuming the five ranked predictions of a row are distinct labels.
print('Total:', a1+a2+a3+a4+a5)

Accuracy: 0.5625144737157323 0.09157769490605061 0.037811334616336734 0.022191699873030577 0.01474122992645356
Total: 0.7288364330376037


In [None]:
import time
# Wait 30 seconds before shutting the machine down (presumably a grace period).
time.sleep(30)
print('Terminate Instance')
# WARNING: terminates the EC2 instance with the hard-coded id below; check the id
# before running this notebook on another machine.
!aws ec2 terminate-instances --instance-ids i-0f41741a0c8b12972