In [19]:
!pip install -U scikit-learn
!pip install graphviz
!pip install transformers
!pip install bert
!pip install bert-tensorflow
!pip install keras
!pip install dask_ml
!pip install xgboost



In [38]:
import pandas as pd
import numpy as np
import re
import dask.dataframe as ddf
from math import nan
import panel as pn
import dask
import dask.dataframe as dd
import seaborn as sns
import plotly as pty
import plotly.express as px
import calendar
import matplotlib.pyplot as plt

from dask.diagnostics import ProgressBar
ProgressBar().register()

import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
nltk.download('wordnet')
nltk.download('stopwords')
stop = stopwords.words('english')

from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix, plot_confusion_matrix, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from collections import defaultdict

from xgboost import XGBClassifier

import pickle

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
# Notebook-wide settings: seed NumPy's global generator so random operations
# repeat between runs, and never truncate long text columns when frames are shown.
np.random.seed(500)
pd.options.display.max_colwidth = None

## Helper functions

In [22]:
def text_preprocessing(dataframe_column, arguments):
    """
    Apply the requested text clean-up steps to a pandas Series of strings.

    Parameters
    ----------
    dataframe_column : pandas.Series
        Text column, e.g. ``df['columnname']``. The input is not modified.
    arguments : list of str
        Names of the steps to run. Recognised values are ``'lower'``,
        ``'remove_stopwords'`` and ``'remove_HS_Codes'``; other values are ignored.
        The steps always run in that fixed order, whatever their order in the list.

    Returns
    -------
    pandas.Series
        The processed text, carrying the same index as the input.
    """
    new_series = dataframe_column
    if 'lower' in arguments:
        new_series = new_series.str.lower()
    # Remove stopwords and convert back to string
    if 'remove_stopwords' in arguments:
        # `stop` is the module-level NLTK stop-word list; a set makes every
        # membership test constant-time. Matching is case-sensitive.
        stop_words = set(stop)
        new_series = new_series.apply(
            lambda x: [item for item in x.split() if item not in stop_words]
        ).str.join(" ")
    # Remove HS Codes from Product Description as it is supposed to be predicted
    if 'remove_HS_Codes' in arguments:
        # Drops every run of four or more digits. The vectorised .str accessor
        # keeps the result a Series (index preserved) and passes missing values
        # through instead of raising.
        new_series = new_series.str.replace(r'\d{4,}', '', regex=True)
    return new_series


In [23]:
def train(classifier, X, y, X_test, y_test, labels=None, classifier_name='unnamed'):
    """
    Fit ``classifier``, pickle it, and score it on the held-out set.

    Parameters
    ----------
    classifier : object with ``fit`` and ``predict``
        The model (or pipeline) to train; it is fitted in place.
    X, y
        Training inputs and targets, passed straight to ``classifier.fit``.
    X_test, y_test
        Held-out inputs and their true targets used for scoring.
    labels
        Unused. Kept so existing calls that pass it keep working.
    classifier_name : str
        Stem of the pickle file. The fitted classifier is written to
        ``<classifier_name>.pkl`` in the working directory, replacing any
        existing file of that name.

    Returns
    -------
    list
        ``[accuracy, weighted precision, weighted recall]`` on the held-out set.
    """
    classifier.fit(X, y)

    print("Dev set results:")
    X_test_preds = classifier.predict(X_test)
    # save the classifier
    with open(classifier_name + '.pkl', 'wb') as fid:
        pickle.dump(classifier, fid)

    # zero_division=0 yields the same scores as the default behaviour but
    # without a warning for every class that is never predicted.
    return [
        accuracy_score(y_test, X_test_preds),
        precision_score(y_test, X_test_preds, average="weighted", zero_division=0),
        recall_score(y_test, X_test_preds, average="weighted", zero_division=0),
    ]



## Load the Parquet sample file and convert it to a pandas DataFrame

In [24]:
# Open the chapter 39/40 sample lazily as a Dask dataframe.
sample_df_parq = dd.read_parquet(
    '/data/common/trade_data/2019_updated/data_samples/sample_chap39_40.parq',
    engine='fastparquet',
    chunksize="100MB",
)

In [25]:
# Number of rows in the sample (Dask has to run a computation to answer this).
len(sample_df_parq)

[########################################] | 100% Completed |  0.9s
[########################################] | 100% Completed |  1.0s


134890

In [26]:
# Materialise the Dask dataframe in memory; every later cell works on this result.
sampled_df = sample_df_parq.compute()

[########################################] | 100% Completed |  0.8s
[########################################] | 100% Completed |  0.9s


In [27]:
# Character count of each free-text product description, stored as a new column.
description_lengths = sampled_df['Product Desc'].str.len()
sampled_df['Product_Desc_Length'] = description_lengths

In [28]:
# Preview three descriptions that are longer than 1000 characters.
is_long_description = sampled_df['Product_Desc_Length'] > 1000
sampled_df.loc[is_long_description, ['Product Desc', 'Product_Desc_Length']].head(3)

Unnamed: 0,Product Desc,Product_Desc_Length
8,173.250 LLDPE 218WJ 148 H.S. CODE: 3901.10 DELIVERY NO:80 1742665 SALES ORDER NO: 4503 669596 CARGO NET WEIGHT: 173 .250 MT CARGO GROSS WEIGHT: 177.065 MT CONTR TARE WEIGHT : 28.000 MT TOTAL GROSS WEIG HT: 205.065 MT TOTAL PALLETS : 126.00 AGENT AT DESTINATI ON: MAERSK PERU S.A. - ALCON SA AV. NESTOR GAMBETTA S/N KM 14.5 CARRETERA VENTANILLA CALLAO 100 CALLAO PERU PHO NE: 51 1 6140050 FAX: 51 1 5776153<br/>173.250 LLDPE 218WJ 148 H.S. CODE: 3901.10 DELIVERY NO:80 1742665 SALES ORDER NO: 4503 669596 CARGO NET WEIGHT: 173 .250 MT CARGO GROSS WEIGHT: 177.065 MT CONTR TARE WEIGHT : 28.000 MT TOTAL GROSS WEIG HT: 205.065 MT TOTAL PALLETS : 126.00 AGENT AT DESTINATI ON: MAERSK PERU S.A. - ALCON SA AV. NESTOR GAMBETTA S/N KM 14.5 CARRETERA VENTANILLA CALLAO 100 CALLAO PERU PHO NE: 51 1 6140050 FAX: 51 1 5776153<br/>173.250 LLDPE 218WJ 148 H.S. CODE: 3901.10 DELIVERY NO:80 1742665 SALES ORDER NO: 4503 669596 CARGO NET WEIGHT: 173 .250 MT CARGO GROSS WEIGHT: 177.065 MT CONTR TARE WEIGHT : 28.000 MT TOTAL GROSS WEIG HT: 205.065 MT TOTAL PALLETS : 126.00 AGENT AT DESTINATI ON: MAERSK PERU S.A. - ALCON SA AV. NESTOR GAMBETTA S/N KM 14.5 CARRETERA VENTANILLA CALLAO 100 CALLAO PERU PHO NE: 51 1 6140050 FAX: 51 1 5776153<br/>173.250 LLDPE 218WJ 148 H.S. CODE: 3901.10 DELIVERY NO:80 1742665 SALES ORDER NO: 4503 669596 CARGO NET WEIGHT: 173 .250 MT CARGO GROSS WEIGHT: 177.065 MT CONTR TARE WEIGHT : 28.000 MT TOTAL GROSS WEIG HT: 205.065 MT TOTAL PALLETS : 126.00 AGENT AT DESTINATI ON: MAERSK PERU S.A. - ALCON SA AV. NESTOR GAMBETTA S/N KM 14.5 CARRETERA VENTANILLA CALLAO 100 CALLAO PERU PHO NE: 51 1 6140050 FAX: 51 1 5776153<br/>173.250 LLDPE 218WJ 148 H.S. CODE: 3901.10 DELIVERY NO:80 1742665 SALES ORDER NO: 4503 669596 CARGO NET WEIGHT: 173 .250 MT CARGO GROSS WEIGHT: 177.065 MT CONTR TARE WEIGHT : 28.000 MT TOTAL GROSS WEIG HT: 205.065 MT TOTAL PALLETS : 126.00 AGENT AT DESTINATI ON: MAERSK PERU S.A. - ALCON SA AV. 
NESTOR GAMBETTA S/N KM 14.5 CARRETERA VENTANILLA CALLAO 100 CALLAO PERU PHO NE: 51 1 6140050 FAX: 51 1 5776153<br/>173.250 LLDPE 218WJ 148 H.S. CODE: 3901.10 DELIVERY NO:80 1742665 SALES ORDER NO: 4503 669596 CARGO NET WEIGHT: 173 .250 MT CARGO GROSS WEIGHT: 177.065 MT CONTR TARE WEIGHT : 28.000 MT TOTAL GROSS WEIG HT: 205.065 MT TOTAL PALLETS : 126.00 AGENT AT DESTINATI ON: MAERSK PERU S.A. - ALCON SA AV. NESTOR GAMBETTA S/N KM 14.5 CARRETERA VENTANILLA CALLAO 100 CALLAO PERU PHO NE: 51 1 6140050 FAX: 51 1 5776153<br/>173.250 LLDPE 218WJ 148 H.S. CODE: 3901.10 DELIVERY NO:80 1742665 SALES ORDER NO: 4503 669596 CARGO NET WEIGHT: 173 .250 MT CARGO GROSS WEIGHT: 177.065 MT CONTR TARE WEIGHT : 28.000 MT TOTAL GROSS WEIG HT: 205.065 MT TOTAL PALLETS : 126.00 AGENT AT DESTINATI ON: MAERSK PERU S.A. - ALCON SA AV. NESTOR GAMBETTA S/N KM 14.5 CARRETERA VENTANILLA CALLAO 100 CALLAO PERU PHO NE: 51 1 6140050 FAX: 51 1 5776153<br/>,2870
10,"POLYMERS OF ETHYLENE, IN PRIMARY FORMSI. PRIMARY - LDPE ZLF003<br/>POLYMERS OF ETHYLENE, IN PRIMARY FORMSI. PRIMARY - HDPE WHI0650<br/>POLYMERS OF ETHYLENE, IN PRIMARY FORMSI. PRIMARY - LDPE ZLF003<br/>POLYMERS OF ETHYLENE, IN PRIMARY FORMSI. PRIMARY - LDPE ZLF003<br/>POLYMERS OF ETHYLENE, IN PRIMARY FORMSI. PRIMARY - LDPE ZLF003<br/>POLYMERS OF ETHYLENE, IN PRIMARY FORMSI. PRIMARY - LDPE ZLF003 ==================== 8,910 (25KG) BAGS LOADED INTO 9 X 40 CONTAINERS LD PE ZLF003 (198.000 MT / 7,920 BAGS) - HS CODE390110 HDPE WHI0650 (24.750 MT / 990 BAGS) - HS CODE 390120 21 DAYS FREE TIME AT DESTINAT ION FREIGHT PREPAID ========================= = ========================== ======================= MEDITERRANEAN SHIPPING COMPANY (SHANGHAI) LIMITED - NINGBO BRANCH NO.88, CHANGSHA BAY, BAIZHONG ROAD BAIFENG BEILUN 315813 NINGBO, CHINA PHONE+86 574 8672 5042 FAX+86 574 2769 8762<br/>POLYMERS OF ETHYLENE, IN PRIMARY FORMSI. PRIMARY - LDPE ZLF003<br/>POLYMERS OF ETHYLENE, IN PRIMARY FORMSI. PRIMARY - LDPE ZLF003<br/>POLYMERS OF ETHYLENE, IN PRIMARY FORMSI. PRIMARY - LDPE ZLF003<br/>",1091
13,74.250 MT LLDPE 118NJ 148 DE LIVERY NO:800833132 SALES OR DER NO:4503354988 CARGO NET WEIGHT: 74.250 MT CARGO GROS S WEIGHT: 75.870 MT CONTAINE R TARE WEIGHT: 12.000 MT TOT AL GROSS WEIGHT:87.870 MT H. S.CODE 3901.10 TOTAL PALLETS 54 AGENT AT DESTINATION: MAERSK PERU S.A. - ALCONSA AV. NESTOR GAMBETTA S/N KM 14.5 CARRETERA VENTANILLA CA LLAO 100 CALLAO PERU PHONE: 51 1 6140050 FAX: 51 1 5776153<br/>74.250 MT LLDPE 118NJ 148 DE LIVERY NO:800833132 SALES OR DER NO:4503354988 CARGO NET WEIGHT: 74.250 MT CARGO GROS S WEIGHT: 75.870 MT CONTAINE R TARE WEIGHT: 12.000 MT TOT AL GROSS WEIGHT:87.870 MT H. S.CODE 3901.10 TOTAL PALLETS 54 AGENT AT DESTINATION: MAERSK PERU S.A. - ALCONSA AV. NESTOR GAMBETTA S/N KM 14.5 CARRETERA VENTANILLA CA LLAO 100 CALLAO PERU PHONE: 51 1 6140050 FAX: 51 1 5776153<br/>74.250 MT LLDPE 118NJ 148 DE LIVERY NO:800833132 SALES OR DER NO:4503354988 CARGO NET WEIGHT: 74.250 MT CARGO GROS S WEIGHT: 75.870 MT CONTAINE R TARE WEIGHT: 12.000 MT TOT AL GROSS WEIGHT:87.870 MT H. S.CODE 3901.10 TOTAL PALLETS 54 AGENT AT DESTINATION: MAERSK PERU S.A. - ALCONSA AV. NESTOR GAMBETTA S/N KM 14.5 CARRETERA VENTANILLA CA LLAO 100 CALLAO PERU PHONE: 51 1 6140050 FAX: 51 1 5776153<br/>,1206


In [29]:
# Summary statistics of the description lengths.
sampled_df['Product_Desc_Length'].describe()

count    134890.000000
mean        353.686589
std        2060.392594
min           8.000000
25%          68.000000
50%         151.000000
75%         338.000000
max      429170.000000
Name: Product_Desc_Length, dtype: float64

In [30]:
# Distinct HS codes present in the sample; these are the target classes.
all_classes = sampled_df.HS_Code.unique()
# Seed value declared for the notebook's random operations.
random_seed = 1
print('DEBUG', all_classes)

DEBUG [390110 390120 390130 390140 390190 390210 390220 390230 390290 390311
 390319 390320 390330 390390 390410 390421 390422 390430 390440 390450
 390461 390469 390490 390512 390519 390521 390529 390530 390591 390599
 390770 390791 390799 390610 390690 390710 390720 390730 390740 390750
 390761 390769 390810 390890 390910 390920 390931 390939 390940 390950
 391000 391110 391190 391211 391212 391220 391231 391239 391290 391310
 391390 391400 391510 391520 391530 391590 391610 391620 391690 391710
 391721 391722 391723 391729 391731 391732 391733 391739 391740 391810
 391890 391910 391990 392010 392020 392030 392043 392049 392051 392059
 392061 392062 392063 392069 392071 392073 392079 392091 392092 392093
 392094 392099 392111 392112 392113 392114 392119 392190 392210 392220
 392290 392310 392321 392329 392330 392340 392350 392390 392410 392490
 392510 392520 392530 392590 392610 392620 392630 392640 392690 400110
 400121 400122 400129 400130 400211 400219 400220 400231 400239 400241


In [31]:
# Keep only the modelling columns; the HS code becomes `label` and the
# free-text description becomes `text`.
modelling_columns = ['HS_Code', 'Product Desc', 'Product_Desc_Length', 'Merged_Description']
new_df = sampled_df[modelling_columns].rename(columns={'HS_Code' : 'label', 'Product Desc' : 'text'})

In [32]:
# Quick look at the first two rows of the modelling frame.
new_df.head(2)

Unnamed: 0,label,text,Product_Desc_Length,Merged_Description
0,390110,PACKAGE VISICO LE4421 1/2 SIZE OCT ABIN LD POLYETHYLENE =36 OCTABINS<br/>PACKAGE VISICO LE4421 1/2 SIZE OCT ABIN LD POLYETHYLENE =36 OCTABINS<br/>PACKAGE VISICO LE4421 1/2 SIZE OCT ABIN LD POLYETHYLENE =36 OCTABINS<br/>PACKAGE VISICO LE4421 1/2 SIZE OCT ABIN LD POLYETHYLENE =36 OCTABINS<br/>PACKAGE VISICO LE4421 1/2 SIZE OCT ABIN LD POLYETHYLENE =36 OCTABINS<br/>PACKAGE VISICO LE4421 1/2 SIZE OCT ABIN LD POLYETHYLENE =36 OCTABINS<br/>PACKAGE VISICO LE4421 1/2 SIZE OCT ABIN LD POLYETHYLENE =36 OCTABINS HS CODE: 390110 FREIGHT PREPAID<br/>,543,"Polymers of ethylene, in primary forms ;Polyethylene having a specific gravity of less than 0.94 ;Having a relative viscosity of 1.44 or more;Other;Linear low density polyethylene;Low density polyethylene, except linear low density polyethylene;Medium density polyethylene"
1,390110,"POLYETHYLENE COMMODITY CODE 390110 NET WE IGHT 21.420,0000 KG<br/>",66,"Polymers of ethylene, in primary forms ;Polyethylene having a specific gravity of less than 0.94 ;Having a relative viscosity of 1.44 or more;Other;Linear low density polyethylene;Low density polyethylene, except linear low density polyethylene;Medium density polyethylene"


In [33]:
# new_df['text'] = text_preprocessing(new_df['text'], ['remove_HS_Codes'])

#### Train / validation split

In [34]:
# Stratified split: 25% of the rows are held out for validation, with the class
# proportions preserved. The seed comes from `random_seed` (declared earlier in
# the notebook) instead of a second hard-coded literal.
Train_X, Val_X, Train_y, Val_y = model_selection.train_test_split(
    new_df['text'],
    new_df['label'],
    test_size=0.25,
    random_state=random_seed,
    stratify=new_df['label'],
)

In [35]:
# Sanity check: features and targets of each split must have matching sizes.
for split_title, split_features, split_targets in (
    ("Train : ", Train_X, Train_y),
    ("Val : ", Val_X, Val_y),
):
    print(split_title, len(split_features), len(split_targets))

Train :  101167 101167
Val :  33723 33723


## Model Testing

In [36]:
# Empty metrics table; one row is added for every model trained below.
column_names = [
    'Model Name',
    'Accuracy',
    'Precision',
    'Recall',
]
results = pd.DataFrame(columns=column_names)
results

Unnamed: 0,Model Name,Accuracy,Precision,Recall


### With Pipeline 

#### Multinomial Naive Bayes

In [39]:
# Multinomial NB on raw token counts.
model1a = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB()),
])

acc = train(model1a, Train_X, Train_y, Val_X, Val_y, labels = new_df['label'].unique(), classifier_name='MNB_CNTVect_Chp39_40')

# DataFrame.append was removed in pandas 2.0; pd.concat adds the row instead.
results = pd.concat([results, pd.DataFrame([['MNB_CNTVect'] + acc], columns=column_names)], ignore_index=True)

Dev set results:


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Multinomial NB on token counts with English stop words removed.
model1b = Pipeline([
    ('vectorizer', CountVectorizer(stop_words='english')),
    ('classifier', MultinomialNB()),
])

acc = train(model1b, Train_X, Train_y, Val_X, Val_y, labels = new_df['label'].unique(),classifier_name='MNB_CNTVect_NoSW_chp39_40')

# DataFrame.append was removed in pandas 2.0; pd.concat adds the row instead.
results = pd.concat([results, pd.DataFrame([['MNB_CNTVect_NoSW'] + acc], columns=column_names)], ignore_index=True)

Dev set results:


In [None]:
# Multinomial NB on token counts, stop words removed, and long digit runs
# (HS codes) stripped from both the training and validation text.
model1c = Pipeline([
    ('vectorizer', CountVectorizer(stop_words='english')),
    ('classifier', MultinomialNB()),
])

acc = train(model1c, text_preprocessing(Train_X, ['remove_HS_Codes' ]), Train_y, text_preprocessing(Val_X, ['remove_HS_Codes']), Val_y,
            labels = new_df['label'].unique(),
           classifier_name='MNB_CNTVect_NoSW_NoHSCode_chp39_40')

# DataFrame.append was removed in pandas 2.0; pd.concat adds the row instead.
results = pd.concat([results, pd.DataFrame([['MNB_CNTVect_NoSW_NoHSCode'] + acc], columns=column_names)], ignore_index=True)

In [None]:
# Metrics collected so far.
results

In [None]:
# Multinomial NB on TF-IDF features.
model2a = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB()),
])

acc = train(model2a, Train_X, Train_y, Val_X, Val_y, labels = new_df['label'].unique(),
           classifier_name='MNB_TFIDFVect_Chp39_40')

# DataFrame.append was removed in pandas 2.0; pd.concat adds the row instead.
results = pd.concat([results, pd.DataFrame([['MNB_TFIDFVect'] + acc], columns=column_names)], ignore_index=True)

In [None]:
# Multinomial NB on TF-IDF features with English stop words removed.
model2b = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words = 'english')),
    ('classifier', MultinomialNB()),
])

# An explicit classifier_name keeps this model's pickle from landing in the
# shared default file 'unnamed.pkl', where the next model would overwrite it.
acc = train(model2b, Train_X, Train_y, Val_X, Val_y, labels = new_df['label'].unique(),
            classifier_name='MNB_TFIDFVect_NoSW_Chp39_40')

# DataFrame.append was removed in pandas 2.0; pd.concat adds the row instead.
results = pd.concat([results, pd.DataFrame([['MNB_TFIDFVect_NoSW'] + acc], columns=column_names)], ignore_index=True)

In [None]:
# Multinomial NB on TF-IDF features, stop words removed, and long digit runs
# (HS codes) stripped from both the training and validation text.
model2c = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words = 'english')),
    ('classifier', MultinomialNB()),
])

# An explicit classifier_name keeps this model's pickle from landing in the
# shared default file 'unnamed.pkl', where the next model would overwrite it.
acc = train(model2c, text_preprocessing(Train_X, ['remove_HS_Codes' ]), Train_y, text_preprocessing(Val_X, ['remove_HS_Codes']), Val_y,
            labels = new_df['label'].unique(),
            classifier_name='MNB_TFIDFVect_NoSW_NoHSCode_Chp39_40')

# DataFrame.append was removed in pandas 2.0; pd.concat adds the row instead.
# The row label is spelt 'TFIDFVect' to match the other TF-IDF rows.
results = pd.concat([results, pd.DataFrame([['MNB_TFIDFVect_NoSW_NoHSCode'] + acc], columns=column_names)], ignore_index=True)

In [None]:
# Metrics for all Multinomial Naive Bayes variants trained so far.
results

### XGBoost

In [None]:
# XGBoost on raw token counts.
# NOTE(review): recent XGBoost releases appear to require class labels encoded
# as 0..n_classes-1, whereas the targets here are raw HS codes -- confirm this
# cell runs with the installed version (LabelEncoder is already imported).
model3a = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', XGBClassifier()),
])

# An explicit classifier_name keeps this model's pickle from landing in the
# shared default file 'unnamed.pkl', where the next model would overwrite it.
acc = train(model3a, Train_X, Train_y, Val_X, Val_y, labels = new_df['label'].unique(),
            classifier_name='XGBoost_CNTVect_Chp39_40')

# DataFrame.append was removed in pandas 2.0; pd.concat adds the row instead.
results = pd.concat([results, pd.DataFrame([['XGBoost_CNTVect'] + acc], columns=column_names)], ignore_index=True)

In [None]:
# XGBoost on token counts with English stop words removed.
model3b = Pipeline([
    ('vectorizer', CountVectorizer(stop_words='english')),
    ('classifier', XGBClassifier()),
])

# An explicit classifier_name keeps this model's pickle from landing in the
# shared default file 'unnamed.pkl', where the next model would overwrite it.
acc = train(model3b, Train_X, Train_y, Val_X, Val_y, labels = new_df['label'].unique(),
            classifier_name='XGBoost_CNTVect_NoSW_Chp39_40')

# DataFrame.append was removed in pandas 2.0; pd.concat adds the row instead.
results = pd.concat([results, pd.DataFrame([['XGBoost_CNTVect_NoSW'] + acc], columns=column_names)], ignore_index=True)

In [None]:
# XGBoost on token counts, stop words removed, and long digit runs (HS codes)
# stripped from both the training and validation text.
model3c = Pipeline([
    ('vectorizer', CountVectorizer(stop_words = 'english')),
    ('classifier', XGBClassifier()),
])

# An explicit classifier_name keeps this model's pickle from landing in the
# shared default file 'unnamed.pkl', where the next model would overwrite it.
acc = train(model3c, text_preprocessing(Train_X, ['remove_HS_Codes' ]), Train_y, text_preprocessing(Val_X, ['remove_HS_Codes']), Val_y,
            labels = new_df['label'].unique(),
            classifier_name='XGBoost_CNTVect_NoSW_NoHSCode_Chp39_40')

# DataFrame.append was removed in pandas 2.0; pd.concat adds the row instead.
results = pd.concat([results, pd.DataFrame([['XGBoost_CNTVect_NoSW_NoHSCode'] + acc], columns=column_names)], ignore_index=True)

In [None]:
# XGBoost on TF-IDF features.
model4a = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', XGBClassifier()),
])

# An explicit classifier_name keeps this model's pickle from landing in the
# shared default file 'unnamed.pkl', where the next model would overwrite it.
acc = train(model4a, Train_X, Train_y, Val_X, Val_y, labels = new_df['label'].unique(),
            classifier_name='XGBoost_TFIDFVect_Chp39_40')

# DataFrame.append was removed in pandas 2.0; pd.concat adds the row instead.
results = pd.concat([results, pd.DataFrame([['XGBoost_TFIDFVect'] + acc], columns=column_names)], ignore_index=True)

In [None]:
# XGBoost on TF-IDF features with English stop words removed.
model4b = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words = 'english')),
    ('classifier', XGBClassifier()),
])

# An explicit classifier_name keeps this model's pickle from landing in the
# shared default file 'unnamed.pkl', where the next model would overwrite it.
acc = train(model4b, Train_X, Train_y, Val_X, Val_y, labels = new_df['label'].unique(),
            classifier_name='XGBoost_TFIDFVect_NoSW_Chp39_40')

# DataFrame.append was removed in pandas 2.0; pd.concat adds the row instead.
results = pd.concat([results, pd.DataFrame([['XGBoost_TFIDFVect_NoSW'] + acc], columns=column_names)], ignore_index=True)

In [None]:
# XGBoost on TF-IDF features, stop words removed, and long digit runs (HS codes)
# stripped from both the training and validation text.
model4c = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words = 'english')),
    ('classifier', XGBClassifier()),
])

# An explicit classifier_name keeps this model's pickle from landing in the
# shared default file 'unnamed.pkl', where the next model would overwrite it.
acc = train(model4c, text_preprocessing(Train_X, ['remove_HS_Codes' ]), Train_y, text_preprocessing(Val_X, ['remove_HS_Codes']), Val_y,
            labels = new_df['label'].unique(),
            classifier_name='XGBoost_TFIDFVect_NoSW_NoHSCode_Chp39_40')

# DataFrame.append was removed in pandas 2.0; pd.concat adds the row instead.
# The row label is spelt 'TFIDFVect' to match the other TF-IDF rows.
results = pd.concat([results, pd.DataFrame([['XGBoost_TFIDFVect_NoSW_NoHSCode'] + acc], columns=column_names)], ignore_index=True)

In [None]:
# The sample file and the HS codes in it cover chapters 39 and 40.
print('Results from Chapter 39 and Chapter 40 dataset\n')
results

### Save Results

In [None]:
# The notebook's import cell does not import datetime, so it is imported here;
# without it this cell fails with a NameError.
import datetime

current_dt = datetime.datetime.now()
# Timestamp layout: ddmmYYYY_HHMM
dt_string = current_dt.strftime("%d%m%Y_%H%M")
print(dt_string)

In [None]:
# Write the metrics table to CSV without the index column.
# NOTE(review): the date part of the file name is a hard-coded literal, not the
# `dt_string` built in the previous cell, so re-runs overwrite the same file --
# confirm whether that is intended.
results.to_csv('ModelResults_Chp39_40_'+ '02282021' + '.csv', index=False)