In [1]:
pip install -U scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [30]:
import pandas as pd
import numpy as np
import re
import dask.dataframe as ddf
from math import nan
import panel as pn
import dask
import dask.dataframe as dd
import seaborn as sns
import plotly as pty
import plotly.express as px
import calendar
import matplotlib.pyplot as plt

from dask.diagnostics import ProgressBar
ProgressBar().register()

import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
np.random.seed(500)

In [4]:
!ls /data/common/trade_data/2019/data_samples

sample_87128714.parq


In [5]:
sample_df_parq = dd.read_parquet('/data/common/trade_data/2019/data_samples/sample_87128714.parq', engine='fastparquet', chunksize="100MB")

In [6]:
len(sample_df_parq)

[########################################] | 100% Completed |  2.3s


7269

In [7]:
sampled_df = sample_df_parq.compute()

[########################################] | 100% Completed |  0.1s


In [8]:
sampled_df.head(3)

Unnamed: 0,System Identity Id,Estimate Arrival Date,Actual Arrival Date,Bill of Lading,Master Bill of Lading,Bill Type Code,Carrier SASC Code,Vessel Country Code,Vessel Code,Vessel Name,...,Product Desc,Marks & Numbers,HS Code Sure Level,CIF,Indicator of true supplier,Indicator of true buyer,END,HS Code,HS_Code,Merged_Description
0,6003201907090000254809,20190705,20190705,CHSLTPE19060165,EGLV003901609611,H,"CHSL, CHISLEY MOTOR COACHES",PA,9306990,OOCL VANCOUVER,...,"ELLIPTICAL TRAINER,TREADMILL, ESCALATE<br/>",NO MARKS<br/>,5,0.0,Y,Y,END,871200,871200,Bicycles and other cycles (including delivery ...
1,6003201901040000381477,20181230,20190103,AMAWSHNGBA807064,APLUNPFB000624,H,AMAW,CN,APL BARCELONA,APL BARCELONA,...,SCH 830 TREADMILL<br/>,AS ADDRESSED<br/>,5,0.0,Y,Y,END,871200,871200,Bicycles and other cycles (including delivery ...
2,6003201904300000373799,20190425,20190429,SFOKNGB19040368,OOLU2106406710,H,SFOK,PA,9168855,EVER URANUS,...,FITNESSMACHINE; TREADMILL<br/>,NO MARKS<br/>,5,0.0,Y,Y,END,871200,871200,Bicycles and other cycles (including delivery ...


In [9]:
random_seed = 1
all_classes = sampled_df.HS_Code.unique()
print('DEBUG', all_classes)


DEBUG ['871200' '871410' '871420' '871491' '871492' '871493' '871494' '871495'
 '871496' '871499']


In [10]:
sampled_df = sampled_df[['HS_Code', 'Product Desc', 'Merged_Description']].rename({'HS_Code' : 'label', 'Product Desc' : 'text'}, axis=1)

In [11]:
sampled_df

Unnamed: 0,label,text,Merged_Description
0,871200,"ELLIPTICAL TRAINER,TREADMILL, ESCALATE<br/>",Bicycles and other cycles (including delivery ...
1,871200,SCH 830 TREADMILL<br/>,Bicycles and other cycles (including delivery ...
2,871200,FITNESSMACHINE; TREADMILL<br/>,Bicycles and other cycles (including delivery ...
3,871200,COMPLETE BICYCLES & BICYCLE PARTS & ELECTRIC B...,Bicycles and other cycles (including delivery ...
4,871200,LOUISIANA GRILLS LG900 W FRONT SHELF<br/>,Bicycles and other cycles (including delivery ...
...,...,...,...
7264,871499,HANDLEBAR/SEATPOST BICYCLE PARTS<br/>,Parts and accessories of vehicles of headings ...
7265,871499,"SHIPPER S LOAD, COUNT & SEAL (74P KGS) CY / CY...",Parts and accessories of vehicles of headings ...
7266,871499,BICYCLE ACCESSORIES BC SEAT PAD GELCORE REG WS...,Parts and accessories of vehicles of headings ...
7267,871499,ROTOR & WING<br/>,Parts and accessories of vehicles of headings ...


In [12]:
# Step - a : Remove blank rows if any.
sampled_df['text'].dropna(inplace=True)
# Step - b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
sampled_df['text'] = [entry.lower() for entry in sampled_df['text']]
sampled_df['Merged_Description'] = [entry.lower() for entry in sampled_df['Merged_Description']]
sampled_df['text'] = [entry.replace('<br/>', '') for entry in sampled_df['text']]

# # Step - c : Tokenization : In this each entry in the corpus will be broken into set of words
sampled_df['text']= [word_tokenize(entry) for entry in sampled_df['text']]
sampled_df['Merged_Description']= [word_tokenize(entry) for entry in sampled_df['Merged_Description']]

In [13]:
sampled_df['text'].dtype

dtype('O')

In [14]:
# Step - d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(sampled_df['text']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
#             Final_words.append(word) # No Lemmatization
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)

    # The final processed set of words for each iteration will be stored in 'Merged_final'
    sampled_df.loc[index,'text_final'] = str(Final_words)
    
# for index,entry in enumerate(sampled_df['Merged_Description']):
#     # Declaring Empty List to store the words that follow the rules for this step
#     Final_words = []
#     # Initializing WordNetLemmatizer()
#     word_Lemmatized = WordNetLemmatizer()
#     # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
#     for word, tag in pos_tag(entry):
#         # Below condition is to check for Stop words and consider only alphabets
#         if word not in stopwords.words('english') and word.isalpha():
#             word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
#             Final_words.append(word_Final)
#     # The final processed set of words for each iteration will be stored in 'text_final'
#     sampled_df.loc[index,'Merged_final'] = str(Final_words)    

In [16]:
for c in all_classes :
    df = sampled_df[sampled_df.label == c]
    Train_X, Val_X, Train_Y, Val_Y = model_selection.train_test_split(df['text_final'],df['label'],test_size=0.3)
    print(c,len(Train_X), len(Val_X))

871200 700 300
871410 554 238
871420 700 300
871491 221 95
871492 700 300
871493 700 300
871494 510 219
871495 43 19
871496 259 111
871499 700 300


In [17]:
Train_X, Val_X, Train_Y, Val_Y = pd.Series() , pd.Series() , pd.Series() , pd.Series()
for c in all_classes :
    df = sampled_df[sampled_df.label == c]
    T_X, V_X, T_Y, V_Y = model_selection.train_test_split(df['text_final'],df['label'],test_size=0.3)
    print(c,len(T_X), len(V_X))
    Train_X = Train_X.append(T_X, ignore_index=True)
    Train_Y = Train_Y.append(T_Y, ignore_index=True)
    Val_X = Val_X.append(V_X, ignore_index=True)
    Val_Y = Val_Y.append(V_Y, ignore_index=True)
    print(c,len(Train_X), len(Train_Y),len(Val_X), len(Val_Y))


871200 700 300
871200 700 700 300 300
871410 554 238
871410 1254 1254 538 538
871420 700 300
871420 1954 1954 838 838
871491 221 95
871491 2175 2175 933 933
871492 700 300
871492 2875 2875 1233 1233
871493 700 300
871493 3575 3575 1533 1533
871494 510 219
871494 4085 4085 1752 1752
871495 43 19
871495 4128 4128 1771 1771
871496 259 111
871496 4387 4387 1882 1882
871499 700 300
871499 5087 5087 2182 2182


  Train_X, Val_X, Train_Y, Val_Y = pd.Series() , pd.Series() , pd.Series() , pd.Series()


In [18]:
len(Train_X) + len(Val_X) == len(sampled_df)

True

In [19]:
Train_Y.unique()

array(['871200', '871410', '871420', '871491', '871492', '871493',
       '871494', '871495', '871496', '871499'], dtype=object)

In [20]:
Val_Y.unique()

array(['871200', '871410', '871420', '871491', '871492', '871493',
       '871494', '871495', '871496', '871499'], dtype=object)

In [21]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Val_Y = Encoder.fit_transform(Val_Y)

In [22]:
Tfidf_vect = TfidfVectorizer(max_features=10000)
Tfidf_vect.fit(sampled_df['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Val_X_Tfidf = Tfidf_vect.transform(Val_X)
print(len(Tfidf_vect.vocabulary_))

5296


In [36]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_NB = Naive.predict(Val_X_Tfidf)

# Metrics
print("Naive Bayes Accuracy Score -> ", accuracy_score(predictions_NB, Val_Y)*100)

Naive Bayes Accuracy Score ->  81.02658111824014


In [37]:
print(classification_report(Val_Y,predictions_NB, target_names=sampled_df['label'].unique()))

              precision    recall  f1-score   support

      871200       0.70      0.85      0.77       300
      871410       0.81      0.89      0.85       238
      871420       0.84      0.87      0.85       300
      871491       1.00      0.35      0.52        95
      871492       0.88      0.87      0.87       300
      871493       0.82      0.91      0.86       300
      871494       0.94      0.77      0.84       219
      871495       0.00      0.00      0.00        19
      871496       1.00      0.52      0.69       111
      871499       0.71      0.83      0.77       300

    accuracy                           0.81      2182
   macro avg       0.77      0.69      0.70      2182
weighted avg       0.82      0.81      0.80      2182



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [38]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', decision_function_shape='ovo')
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Val_X_Tfidf)

# Metrics
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Val_Y)*100)

SVM Accuracy Score ->  84.2346471127406


In [39]:
print(classification_report(Val_Y,predictions_SVM, target_names=sampled_df['label'].unique()))

              precision    recall  f1-score   support

      871200       0.78      0.84      0.81       300
      871410       0.84      0.88      0.86       238
      871420       0.77      0.90      0.83       300
      871491       0.79      0.57      0.66        95
      871492       0.92      0.89      0.91       300
      871493       0.95      0.89      0.92       300
      871494       0.89      0.84      0.86       219
      871495       0.75      0.32      0.44        19
      871496       0.98      0.75      0.85       111
      871499       0.76      0.81      0.79       300

    accuracy                           0.84      2182
   macro avg       0.84      0.77      0.79      2182
weighted avg       0.85      0.84      0.84      2182



In [25]:
## Train on merged description but predict on item description

In [26]:
from sklearn.model_selection import GridSearchCV 
  
# defining parameter range 
param_grid = [
              {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
              {'C': [0.1, 1, 10, 100, 1000],'kernel': ['linear']}
             ]
  
grid = GridSearchCV(svm.SVC(decision_function_shape='ovo'), param_grid, refit = True, verbose = 3) 
  
# fitting the model for grid search 
grid.fit(Train_X_Tfidf,Train_Y)

# print best parameter after tuning 
print(grid.best_params_) 
  
# print how our model looks after hyper-parameter tuning 
print(grid.best_estimator_)


Fitting 5 folds for each of 13 candidates, totalling 65 fits
[CV 1/5] END ...................C=1, gamma=0.001, kernel=rbf; total time=   1.6s
[CV 2/5] END ...................C=1, gamma=0.001, kernel=rbf; total time=   1.6s
[CV 3/5] END ...................C=1, gamma=0.001, kernel=rbf; total time=   1.6s
[CV 4/5] END ...................C=1, gamma=0.001, kernel=rbf; total time=   1.6s
[CV 5/5] END ...................C=1, gamma=0.001, kernel=rbf; total time=   1.6s
[CV 1/5] END ..................C=1, gamma=0.0001, kernel=rbf; total time=   1.5s
[CV 2/5] END ..................C=1, gamma=0.0001, kernel=rbf; total time=   1.5s
[CV 3/5] END ..................C=1, gamma=0.0001, kernel=rbf; total time=   1.5s
[CV 4/5] END ..................C=1, gamma=0.0001, kernel=rbf; total time=   1.5s
[CV 5/5] END ..................C=1, gamma=0.0001, kernel=rbf; total time=   1.5s
[CV 1/5] END ..................C=10, gamma=0.001, kernel=rbf; total time=   1.5s
[CV 2/5] END ..................C=10, gamma=0.001

In [43]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1000, kernel='rbf', decision_function_shape='ovo', gamma=0.001)
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Val_X_Tfidf)

# Metrics
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Val_Y)*100)

SVM Accuracy Score ->  84.6929422548121


In [44]:
print(classification_report(Val_Y,predictions_SVM, target_names=sampled_df['label'].unique()))

              precision    recall  f1-score   support

      871200       0.81      0.85      0.83       300
      871410       0.86      0.86      0.86       238
      871420       0.77      0.90      0.83       300
      871491       0.81      0.60      0.69        95
      871492       0.92      0.89      0.90       300
      871493       0.93      0.90      0.91       300
      871494       0.88      0.84      0.86       219
      871495       0.82      0.47      0.60        19
      871496       0.99      0.77      0.86       111
      871499       0.77      0.83      0.80       300

    accuracy                           0.85      2182
   macro avg       0.86      0.79      0.81      2182
weighted avg       0.85      0.85      0.85      2182



In [46]:
!whoami

root
