# Naive Bayes

In [3]:
import pandas as pd
import re
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, plot_confusion_matrix
from sklearn.model_selection import train_test_split
import numpy as np

In [4]:
import dask.dataframe as dd
# NOTE(review): `random_seed` is not referenced by any cell visible in this
# notebook; the train/dev split passes a literal random_state=91 instead.
# Confirm which seed is intended before wiring this constant in.
random_seed = 1

In [5]:
# Location of the parquet sample this notebook trains on.
sample_path = '/data/common/trade_data/2019_updated/data_samples/sample_chap39_40.parq'

# dask reads the parquet and .compute() materialises it as an in-memory pandas
# DataFrame. The two source columns are then renamed to label/text, cast to
# str, and every other column is dropped.
sampled_df = (
    dd.read_parquet(sample_path)
    .compute()
    .rename(columns={'HS_Code': 'label', 'Product Desc': 'text'})
    .astype({'label': 'str', 'text': 'str'})
    [['label', 'text']]
)
              

In [7]:
# Distinct labels present in the sample, in order of first appearance.
all_classes = sampled_df['label'].unique()

# Keep the two modelling columns and normalise the description text with
# vectorised string methods. The steps run in this order:
#   1. lower-case the text;
#   2. drop each '<br/>' together with everything after it on the same line;
#   3-5. mask digit runs of length >= 4 (optionally followed by one character
#        and one or two digits) with the placeholder 'xxxx'.
# NOTE(review): the '.' in the two digit patterns is unescaped, so it matches
# any character, not only a decimal point. Confirm that this is intended.
sampled_df = sampled_df[['label', 'text']].assign(
    text=lambda frame: (
        frame['text']
        .str.lower()
        .str.replace(r'<br/>.*', '', regex=True)
        .str.replace(r'\d{4,}.\d\d', 'xxxx', regex=True)
        .str.replace(r'\d{4,}.\d', 'xxxx', regex=True)
        .str.replace(r'\d{4,}', 'xxxx', regex=True)
    )
)

In [8]:
# Sanity check: display the columns that remain after the clean-up step.
sampled_df.columns

Index(['label', 'text'], dtype='object')

# Train and test split

In [9]:
# Stratified 80/20 split on the label column. The feature side is the whole
# frame (label and text), so later cells select the 'text' column explicitly.
# random_state is pinned so that the split is repeatable.
X_train, X_dev, y_train, y_dev = train_test_split(
    sampled_df,
    sampled_df['label'],
    test_size=0.2,
    random_state=91,
    stratify=sampled_df['label'],
)

In [10]:
# Despite its name, `count_vector` produces TF-IDF weights rather than raw
# counts; max_features caps the vocabulary at 30,000 terms.
count_vector = TfidfVectorizer(max_features=30000)

# Learn the vocabulary and IDF weights from the training text only, and
# vectorise that text in the same call.
X_train_counts = count_vector.fit_transform(X_train.text)

In [11]:
# Multinomial naive Bayes with default hyper-parameters, fitted on the TF-IDF
# training matrix. fit() returns the estimator itself, so clf1 is the fitted model.
clf1 = MultinomialNB().fit(X=X_train_counts, y=y_train)

In [12]:
# Vectorise the held-out text with the already-fitted vectoriser (transform
# only, no refit), then predict one label per dev row.
X_new_counts = count_vector.transform(X_dev.text)
predicted = clf1.predict(X_new_counts)

# Print the scores


In [13]:
# Macro-averaged scores on the dev split.
# zero_division=0 states explicitly what scikit-learn otherwise does while
# emitting an undefined-metric warning: a class with no predicted (or no true)
# samples contributes 0 to the average. The printed numbers are unchanged;
# only the warning goes away.
for metric_label, metric_fn in (
    ('F1 score:', f1_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
):
    print(metric_label, metric_fn(y_dev, predicted, average="macro", zero_division=0))

F1 score: 0.5410379313686002
Precision: 0.6777918647235479
Recall: 0.5267339535600387


  _warn_prf(average, modifier, msg_start, len(result))


# Do a sample prediction 


In [15]:
# Class labels in the classifier's own order, which is also the column order
# of clf1.predict_proba.
all_classes = np.array(list(clf1.classes_))


def convert(x, _classes=all_classes):
    """Map one class index, or an array of class indices, to class label(s).

    The label array is bound as a default argument when this cell runs, so
    the mapping keeps following the classifier's class order even if the
    global name `all_classes` is later rebound (the text clean-up cell
    assigns the same name to `sampled_df.label.unique()`, which is ordered
    differently).
    """
    return _classes[x]

def get_sample_prediction(text, num_samples=20) :
    """Return the most probable class labels for a single piece of text.

    Parameters
    ----------
    text : str
        Description to classify. It is handed to the fitted vectoriser as-is.
        NOTE(review): the regex clean-up applied to the training text is not
        applied here; confirm whether callers are expected to normalise first.
    num_samples : int, default 20
        Number of top-ranked rows to keep.

    Returns
    -------
    pandas.DataFrame
        Columns 'idx' (class label) and 'probablity' (predicted probability),
        ordered from most to least probable and truncated to `num_samples` rows.
    """
    features = count_vector.transform(pd.Series(data=[text]))
    first_row = clf1.predict_proba(features)[0]
    ranking = np.argsort(-first_row)
    # Look labels up on the classifier itself rather than through the mutable
    # module-level `all_classes`/`convert`: another cell rebinds `all_classes`
    # in a different order, which would silently mislabel the probabilities.
    labels = np.asarray(clf1.classes_)[ranking]
    return pd.DataFrame({'idx' : labels, 'probablity' : first_row[ranking]}).head(num_samples)

In [16]:
# Smoke test: show the 5 highest-probability labels for an arbitrary string.
get_sample_prediction("Hello", 5)

Unnamed: 0,idx,probablity
0,392030,0.015141
1,400251,0.009406
2,392113,0.007936
3,391990,0.00759
4,390130,0.007501


# Save the vectorizer and classifier as pickle files


In [19]:
import pickle

# Persist the fitted vectoriser and classifier to the working directory.
# The `with` blocks guarantee each file is flushed and closed even if
# pickling raises; the original left both handles open.
with open("count_vector.pickle", "wb") as vectorizer_file:
    pickle.dump(count_vector, vectorizer_file)

with open("nv_clf.pickle", "wb") as classifier_file:
    pickle.dump(clf1, classifier_file)