<a href="https://colab.research.google.com/github/amit-timalsina/Coding-assignment/blob/master/Prediction_on_product_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
except Exception:
    pass

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

%load_ext tensorboard

# Common imports
import numpy as np
import os
import pandas as pd
import sklearn

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images")
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [None]:
try:
  from google.colab import files
  files.upload()

except:
  print('No module files')

Saving test_set.csv to test_set.csv
Saving train_set.csv to train_set.csv


In [None]:
df = pd.read_csv('train_set.csv', encoding='latin-1')
df.head()

Unnamed: 0,label,text
0,85389000,pdscpm gb part of panel of chiller
1,85389000,nm p economical extended rot hand parts for c...
2,85389000,lv ma pd trip unit for cvs parts of circuit br...
3,85389000,lv na p trip unit for cvs switch parts of circ...
4,85389000,lv tmd pd trip unit for cvs parts of circuitbr...


In [None]:
df['label'].value_counts()

85389000    2936
85177090    2581
85369090    2438
39269099    2189
73181500    2033
85366990    1944
85238090    1720
85364900    1714
87089900    1673
33041000    1605
87082900    1451
84713010    1331
Name: label, dtype: int64

### Text preprocessing and Cleanup

In [None]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(df['label'])
df['label'] = encoder.transform(df['label'])
encoder.classes_

array([33041000, 39269099, 73181500, 84713010, 85177090, 85238090,
       85364900, 85366990, 85369090, 85389000, 87082900, 87089900])

In [None]:
inputs, labels = df['text'], df['label']

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(inputs, labels):
    X_train, y_train = inputs[train_index], labels[train_index]
    X_test, y_test = inputs[test_index], labels[test_index]

In [None]:
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(inputs, labels, test_size=0.2, random_state=42)
# print(X_train.shape, X_test.shape)

In [None]:
# Lets do some cleaning of this text
def clean_it(text):
    # Replacing possible issues with data. We can add or reduce the replacemtent in this chain
    s = str(text).strip().replace('  ',' ').replace('   ', ' ').replace('    ', ' ').lower()
    return s

# Now lets define a small function where we can use above cleaning on datasets
def clean_data(data):
    # Defining the new data
    text = data
    # cleaning it
    text = text.apply(lambda x: clean_it(x))
    return text

In [None]:
X_train_cleaned = clean_data(X_train)
X_test_cleaned = clean_data(X_test)
X_train_cleaned.head()

21277                                          frzhaemblem
12481    afd pn  mswp zpf connector list sl no not foe ...
5510         mobile lcd  unbranded integralpophone woboard
984      fxsmtes base unit programmable controller japa...
16802                     issue esitronic  dvd a part no p
Name: text, dtype: object

We are using Bag of 1 and 2 gram for vectorization.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(stop_words='english', ngram_range=(1, 2))
X_train_dtm = vect.fit_transform(X_train_cleaned)
X_test_dtm = vect.transform(X_test_cleaned)
X_train_dtm.shape, X_test_dtm.shape

((18892, 39294), (4723, 39294))

### Model selection

In [None]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB() # instantiate a Multinomial Naive Bayes model
%time nb.fit(X_train_dtm, y_train) # train the model(timing it with an IPython "magic command")
y_pred_class = nb.predict(X_test_dtm) # make class predictions for X_test_dtm

CPU times: user 27.2 ms, sys: 9.99 ms, total: 37.2 ms
Wall time: 38.5 ms


In [None]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred_class)

0.9129790387465594

In [None]:
from sklearn.metrics import f1_score

f1_score(y_test, y_pred_class, average='macro')

0.9178137118885111

In [None]:
y_pred_score = nb.predict_proba(X_test_dtm)

In [None]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test, y_pred_score, multi_class='ovo')

0.9932879119012563

In [None]:
from sklearn.model_selection import cross_val_score

nb_scores = cross_val_score(nb, X_train_dtm, y_train, scoring='f1_macro', cv=5)

In [None]:
def display_scores(scores):
    print('Scores', scores)
    print('Mean:', scores.mean())
    print('Std:', scores.std())
display_scores(nb_scores)

Scores [0.90861662 0.89976967 0.90494859 0.91391525 0.91617767]
Mean: 0.9086855579384153
Std: 0.005948850889121619


In [None]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred_class)

array([[321,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  3, 351,  23,   0,   4,   0,   0,   6,   6,  15,  12,  18],
       [  0,   9, 367,   0,  10,   0,   1,   1,   1,   8,   0,  10],
       [  0,   3,   0, 263,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   2,   7,   0, 499,   0,   0,   0,   0,   6,   0,   2],
       [  0,   1,   0,   0,   0, 341,   0,   0,   1,   1,   0,   0],
       [  0,   0,   0,   0,   0,   0, 332,   1,   6,   3,   1,   0],
       [  0,   6,   3,   0,   1,   0,   6, 336,  21,  13,   0,   3],
       [  0,   3,   2,   0,   4,   0,  38,  29, 399,  10,   0,   2],
       [  0,  11,   7,   0,   1,   0,   1,  10,  20, 531,   3,   3],
       [  0,   3,   1,   0,   0,   0,   0,   0,   1,   0, 270,  15],
       [  0,   7,  10,   0,   2,   0,   0,   0,   1,   3,  10, 302]])

Everything looks good till now

This is the baseline model

In [None]:
# Reducing the dimensionality in feature vectors
vect = CountVectorizer(stop_words='english', ngram_range=(1, 2), max_features=20000)
X_train_dtm = vect.fit_transform(X_train_cleaned)
X_test_dtm = vect.transform(X_test_cleaned)
X_train_dtm.shape, X_test_dtm.shape

((18892, 20000), (4723, 20000))

In [None]:
nb = MultinomialNB() # instantiate a Multinomial Naive Bayes model
%time nb.fit(X_train_dtm, y_train) # train the model(timing it with an IPython "magic command")
y_pred_class = nb.predict(X_test_dtm) # make class predictions for X_test_dtm
print("F1 score: ", f1_score(y_test, y_pred_class, average='macro'))

CPU times: user 21.9 ms, sys: 385 µs, total: 22.3 ms
Wall time: 22.4 ms
F1 score:  0.9062595618278818


In [None]:
nb_scores = cross_val_score(nb, X_train_dtm, y_train, scoring='f1_macro', cv=5)
display_scores(nb_scores)

Scores [0.90937372 0.90327089 0.90471609 0.91413329 0.91727571]
Mean: 0.9097539387606577
Std: 0.005353694422537561


We can conclude that reducing the feature vector size with the Naive Bayes classifier wasn't useful for this dataset

#### Logistic Regression

In [None]:
vect = CountVectorizer(stop_words='english', ngram_range=(1, 2))
X_train_dtm = vect.fit_transform(X_train_cleaned)
X_test_dtm = vect.transform(X_test_cleaned)

In [None]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(class_weight="balanced", max_iter=300) # instantiate a logistic regression model
%time log_reg.fit(X_train_dtm, y_train) # fit the model with training data

# Make predictions on train data
y_pred_class = log_reg.predict(X_test_dtm)

# calculate evaluation measures:
print("F1 score: ", f1_score(y_test, y_pred_class, average='macro'))
display_scores(cross_val_score(log_reg, X_train_dtm, y_train, scoring='f1_macro', cv=5))
# confusion_matrix(y_test, y_pred_class)

CPU times: user 19.5 s, sys: 18.8 s, total: 38.3 s
Wall time: 20 s
F1 score:  0.9493102099137518
Scores [0.94143276 0.93692653 0.93692777 0.94486279 0.94684242]
Mean: 0.9413984540698384
Std: 0.0040404112951348365


In [None]:
confusion_matrix(y_test, y_pred_class)

array([[321,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  2, 395,   6,   0,   5,   0,   0,   5,   2,  13,   2,   8],
       [  0,   6, 384,   0,   2,   0,   0,   0,   0,  12,   0,   3],
       [  0,   3,   0, 262,   0,   0,   0,   0,   0,   1,   0,   0],
       [  0,   8,   1,   0, 501,   0,   0,   0,   0,   6,   0,   0],
       [  0,   2,   0,   0,   0, 341,   0,   0,   0,   1,   0,   0],
       [  0,   2,   0,   0,   0,   0, 333,   2,   0,   5,   1,   0],
       [  0,   7,   1,   0,   0,   0,   2, 349,  18,  11,   0,   1],
       [  0,   8,   3,   0,   1,   0,   7,  18, 438,  12,   0,   0],
       [  0,  15,   2,   0,   0,   0,   1,   4,   6, 559,   0,   0],
       [  0,   2,   0,   0,   0,   0,   0,   0,   0,   2, 271,  15],
       [  0,   9,   1,   0,   0,   0,   0,   0,   0,   8,   7, 310]])

I tried reducing the vector size but it didn't improve the performance. So, sparse feature set is not the reason behind these errors.

#### SVM

In [None]:
from sklearn.svm import LinearSVC

svm_clf = LinearSVC(class_weight='balanced', max_iter=1500, ) # instantiate a Linear SVM model
%time svm_clf.fit(X_train_dtm, y_train, ) # fit the model with training data

# Make predictions on train data
y_pred_class = svm_clf.predict(X_test_dtm)

# calculate evaluation measures:
print("F1 score: ", f1_score(y_test, y_pred_class, average='macro'))
display_scores(cross_val_score(log_reg, X_train_dtm, y_train, scoring='f1_macro', cv=5))
# confusion_matrix(y_test, y_pred_class)

CPU times: user 4.89 s, sys: 6.33 ms, total: 4.89 s
Wall time: 4.87 s
F1 score:  0.9531366854417208
Scores [0.94143276 0.93692653 0.93692777 0.94486279 0.94684242]
Mean: 0.9413984540698384
Std: 0.0040404112951348365


In [None]:
confusion_matrix(y_test, y_pred_class, labels = np.arange(0, 12, 1))

array([[321,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  2, 397,   7,   1,   5,   0,   0,   4,   3,   9,   3,   7],
       [  0,  12, 382,   0,   2,   0,   0,   0,   1,   7,   0,   3],
       [  0,   1,   0, 265,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   5,   1,   0, 505,   0,   0,   0,   0,   5,   0,   0],
       [  0,   2,   0,   0,   0, 341,   0,   0,   0,   1,   0,   0],
       [  0,   2,   0,   0,   0,   0, 336,   2,   0,   2,   0,   1],
       [  0,  11,   2,   0,   2,   0,   2, 343,  20,   9,   0,   0],
       [  0,  13,   3,   0,   2,   0,   4,  11, 448,   6,   0,   0],
       [  0,  14,   2,   0,   0,   0,   1,   2,   6, 560,   2,   0],
       [  0,   0,   1,   0,   0,   0,   0,   0,   0,   1, 274,  14],
       [  0,   6,   3,   0,   0,   0,   0,   0,   0,   4,   8, 314]])

LinearSVM is our baseline model with F1 score of 0.953 on test set.

I also looked into if the model is suffering from unbalanced dataset problem. While the dataset is imbalanced it is not affecting the model. I have used hyperparameter `class_weight: balanced`. If it was affecting model's performance i could look into assigning class weight manually or sampling dataset.

### Wordembeddings

I used pretrained embedding - `Word2Vec`. But the vocabulary overlap of word2vec and our corpus is less. Nonetheless, it gave 0.83 __F1-score__ on test data.

### Subembeddings and fastText

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(df, df["label"]):
    train_df = df.loc[train_index]
    test_df = df.loc[test_index]

In [None]:
# Lets do some cleaning of this text
def clean_it(text,normalize=True):
    # Replacing possible issues with data. We can add or reduce the replacemtent in this chain
    s = str(text).strip().replace('  ',' ').replace('   ', ' ').replace('    ', ' ').lower()
    return s

# Now lets define a small function where we can use above cleaning on datasets
def clean_df(data, cleanit= False, label_prefix='__label__'):
    # Defining the new data
    df = data[['text']].copy(deep=True)
    df['label'] = label_prefix + data['label'].astype(str)
    
    # cleaning it
    if cleanit:
        df['text'] = df['text'].apply(lambda x: clean_it(x))
            
    return df


In [None]:
%%time
# Transform the datasets using the above clean functions
df_train_cleaned = clean_df(train_df, True)
df_test_cleaned = clean_df(test_df, True)

CPU times: user 64 ms, sys: 3.08 ms, total: 67.1 ms
Wall time: 82.4 ms


In [None]:
import csv

df_train_cleaned.to_csv('train.txt', header=None, index=False, sep = ' ', quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")
df_test_cleaned.to_csv('test.txt', header=None, index=False, sep = ' ', quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")

In [None]:
!pip install fasttext==0.9.2

Collecting fasttext==0.9.2
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[?25l[K     |████▊                           | 10 kB 23.7 MB/s eta 0:00:01[K     |█████████▌                      | 20 kB 10.2 MB/s eta 0:00:01[K     |██████████████▎                 | 30 kB 8.2 MB/s eta 0:00:01[K     |███████████████████             | 40 kB 7.5 MB/s eta 0:00:01[K     |███████████████████████▉        | 51 kB 4.3 MB/s eta 0:00:01[K     |████████████████████████████▋   | 61 kB 4.5 MB/s eta 0:00:01[K     |████████████████████████████████| 68 kB 2.7 MB/s 
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.8.1-py2.py3-none-any.whl (208 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3127258 sha256=9b57274e2fe47bda26ede6dec2a6f714da2ead2399de07b282215e38a18cf714
  Stored in directory: /root/.cache/pip/wheels/4e/ca/bf/b020d2be95f76418

In [None]:
import fasttext

In [None]:
%%time
## Using fastText for feature extraction and training
from fasttext.FastText import train_supervised

model = train_supervised(input='train.txt', label="__label__", lr=0.75, ws=21, epoch=200, loss='ova', wordNgrams=2, dim=100, thread=2, verbose=100)

CPU times: user 1min 2s, sys: 525 ms, total: 1min 3s
Wall time: 32.7 s


In [None]:
results = model.test('test.txt')
print(f"Test Samples: {results[0]} Precision@ : {results[1]*100:2.4f} Recall : {results[2]*100:2.4f}")

Test Samples: 4723 Precision@ : 94.8973 Recall : 94.8973


In [None]:
testing_data = pd.read_csv('test_set.csv', encoding='latin-1')
testing_data.head()

Unnamed: 0,text
0,lv tmd pd trip unit for nh parts of circuit br...
1,module tm analog outputs analog output expansi...
2,command group t iii mechanismt p parts forcir...
3,parts of relayelectrical contact issu e f xxup
4,parts for programmable logic controllers dm ...


In [None]:
X = testing_data['text']

In [None]:
X_cleaned = list(clean_data(X))

In [None]:
test_predictions, _ = model.predict(X_cleaned)

In [None]:
encoder.inverse_transform([9])

array([85389000])

In [None]:
test_predictions = [int(str(prediction[0])[-1]) for prediction in test_predictions]

In [None]:
test_labels = encoder.inverse_transform(test_predictions)

In [None]:
test_labels

array([85389000, 85389000, 85389000, ..., 84713010, 84713010, 84713010])

In [None]:
test_labels = pd.Series(test_labels, name='labels')

In [None]:
test_labels.to_csv('test_labels.csv', index=None)

Our corpus has a lot of words which are not found(rare) in pretrained embeddings like Word2Vec and Glove. Fasttext is able to achieve good performance on rare words by making use of character level information and also it solves `<OOV>` problem in a way.

### Using DeepLearning

I started with trying to use glove oretrained embedding, but as discussed earlier it didn't yeild good results due to rare words in our corpus.

In [51]:
max(X_train.apply(lambda x: len(x.split())))

21

In [52]:
MAX_SEQUENCE_LENGTH = 21
MAX_NUM_WORDS = 20000 
EMBEDDING_DIM = 100 
VALIDATION_SPLIT = 0.2

In [53]:
from tensorflow import keras
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(MAX_NUM_WORDS)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)
print("There are %s unique tokens" %len(word_index))

There are 12423 unique tokens


In [54]:
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

trainvalid_data = pad_sequences(train_sequences, MAX_SEQUENCE_LENGTH)
test_data = pad_sequences(test_sequences, MAX_SEQUENCE_LENGTH)
trainvalid_labels = to_categorical(y_train, num_classes=12)
test_labels = to_categorical(y_test, num_classes=12)

# split the training data into a training set and a validation set
indices = np.arange(trainvalid_data.shape[0])
np.random.shuffle(indices)
trainvalid_data = trainvalid_data[indices]
trainvalid_labels = trainvalid_labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * trainvalid_data.shape[0])
x_train = trainvalid_data[:-num_validation_samples]
y_train = trainvalid_labels[:-num_validation_samples]
x_val = trainvalid_data[-num_validation_samples:]
y_val = trainvalid_labels[-num_validation_samples:]
#This is the data we will use for CNN and RNN training
print('Splitting the train data into train and valid is done')

Splitting the train data into train and valid is done


In [57]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding

print("Defining and training an LSTM model, training embedding layer on the fly")

#model
rnnmodel = Sequential()
rnnmodel.add(Embedding(MAX_NUM_WORDS, 100))
rnnmodel.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
rnnmodel.add(Dense(12, activation='sigmoid'))
rnnmodel.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
print('Training the RNN')

early_stopping_cb = keras.callbacks.EarlyStopping(patience=6, restore_best_weights=True)
rnnmodel.fit(x_train, y_train,
          batch_size=32,
          epochs=20,
          validation_data=(x_val, y_val),
          callbacks = [early_stopping_cb])
score, acc = rnnmodel.evaluate(test_data, test_labels,
                            batch_size=16)
print('Test accuracy with RNN:', acc)

Defining and training an LSTM model, training embedding layer on the fly
Training the RNN
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Test accuracy with RNN: 0.9227185845375061


So, keras embedding gave 93% accuracy on Test set.
I also could have tried to fasttext embedding which i trained on my data using fasttext. But i didn't have enough time to do it.

I also used BERT pre-trained model which yeild 95% accuracy on test set on 4 epochs using standard hyperparameters. I couldn't tune hyperparameters due to lack of compute resources.