## 1. Mount Drive Data

In [1]:
# pandas is used by the data-loading cells further down.
import pandas as pd
# Colab helper that exposes Google Drive to the runtime.
from google.colab import drive
# Mount Drive under /content/drive so files stored there can be read and written.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 7.4MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 31.4MB/s 
[?25hCollecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |█

## 2. Libraries Import

In [25]:
## Imports
from transformers import DistilBertTokenizerFast
from sklearn.model_selection import train_test_split
import tensorflow as tf ## check for tf_gpu
from transformers import TFDistilBertForSequenceClassification
import numpy as np
from IPython.display import display

## 3. Read Data (Train and Test) + Label Encoding

In [3]:
# Load the labelled train / test sentences from the mounted Drive folder.
train = pd.read_csv(
    "drive/MyDrive/colab_work/intent_classification/input/data/v2/train/sofmattress_train.csv"
)
test = pd.read_csv(
    "drive/MyDrive/colab_work/intent_classification/input/data/v2/test/sofmattress_test.csv"
)

# Integer-encode the labels in the order they first appear in the training data.
# The name `l` is kept because a later cell reads len(l).
l = train['label'].unique()
label_to_no_dict = {}
no_to_label_dict = {}
for i, label_name in enumerate(l):
    label_to_no_dict[label_name] = i
    no_to_label_dict[i] = label_name

# Reserve one extra id, right after the training labels, for 'NO_NODES_DETECTED'.
fallback_id = len(l)
label_to_no_dict['NO_NODES_DETECTED'] = fallback_id
no_to_label_dict[fallback_id] = 'NO_NODES_DETECTED'

# Attach the numeric label to both frames.
for frame in (train, test):
    frame['label_value'] = frame['label'].map(label_to_no_dict)

In [4]:
# Show the last five rows of the training frame.
train.tail(5)

Unnamed: 0,sentence,label,label_value
323,May I please know about the offers,OFFERS,20
324,Available offers,OFFERS,20
325,Is offer available,OFFERS,20
326,Want to know the discount,OFFERS,20
327,Tell me about the latest offers,OFFERS,20


In [5]:
# Show the first rows of the test frame.
test.head()

Unnamed: 0,sentence,label,label_value
0,There are only 2 models,NO_NODES_DETECTED,21
1,Single,NO_NODES_DETECTED,21
2,What's difference between ergo and ortho,COMPARISON,4
3,Return order,RETURN_EXCHANGE,17
4,Hai not recieved my product,DELAY_IN_DELIVERY,15


## 4. Data Preparation

* Train, Val and Test Creation
* Get the train, val and test data into the right format
* Initialize DistilBert Tokenizer and get encodings
* Get Train, Val and Test data using TensorFlow Slices

In [7]:
## 4.1 Train / Val split (the training set is very small, but a validation set is still held out)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train['sentence'].values,
    train['label_value'].values,
    test_size=0.2,
    random_state=42,
    stratify=train['label_value'].values,  # keep the label proportions in both parts
)

## 4.2 Get the train, val and test data into the right format
test_texts = test['sentence'].values
test_labels = test['label_value'].values

# Convert every numpy array produced above into a plain Python list.
train_texts = list(train_texts)
val_texts = list(val_texts)
test_texts = list(test_texts)
train_labels = list(train_labels)
# Fix: this used to read `val_labels - list(val_labels)`, a subtraction whose
# result was thrown away, so val_labels was never converted.
val_labels = list(val_labels)
test_labels = list(test_labels)

## 4.3 Initialize the DistilBert tokenizer and get encodings
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

## 4.4 Build the train, val and test datasets with TensorFlow slices
# Each dataset element is an (encodings dict, label) pair.
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
))
test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    test_labels
))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




## 5. Model Training

In [58]:
# Seed TensorFlow's global RNG so repeated runs are more comparable
# (42 matches the random_state used for the train/val split).
tf.random.set_seed(42)

# One output per label seen in training: num_labels is len(l), which does not
# count the extra 'NO_NODES_DETECTED' id added to the label dictionaries.
model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(l),
)
optimizer = tf.keras.optimizers.Adam(learning_rate=6e-5)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
# Kept as an alternative to model.compute_loss (any Keras loss fn can be passed to compile).
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=[metric])

# The dataset is batched explicitly, so `batch_size` is not passed to fit():
# Keras expects it to be omitted when the input is a tf.data.Dataset.
# The held-out validation set is supplied so each epoch also reports
# validation loss/accuracy, which makes overfitting visible during training.
model.fit(
    train_dataset.shuffle(1000).batch(16),
    validation_data=val_dataset.batch(16),
    epochs=7,
)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'vocab_transform', 'activation_13', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'dropout_119', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use 

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<tensorflow.python.keras.callbacks.History at 0x7f0b521e3b90>

## 6. Prediction

In [59]:
def fn_prediction(df, model):
  """Predict a label for every sentence in ``df`` and print an accuracy summary.

  Uses the notebook-level ``tokenizer`` and ``no_to_label_dict``.

  Args:
    df: DataFrame with a 'sentence' column (text to classify) and a 'label'
      column (true label name).
    model: object whose ``predict(encoded_input)[0]`` gives one row of
      per-class scores; softmax is applied to that row here.

  Returns:
    A copy of ``df`` with four added columns: 'pred_label' (highest-scoring
    label name), 'max_pred_prob' (its softmax probability), 'all_prob'
    (dict of label name -> probability) and 'status' (True where
    'pred_label' equals 'label'). The caller's frame is left unmodified.
  """
  # Work on a copy so the caller's frame does not silently gain columns.
  df = df.copy()

  pred_labels = []
  max_probs = []
  all_probs = []
  for sentence in df['sentence']:
    predict_input = tokenizer.encode(sentence,
                                  truncation=True,
                                  padding=True,
                                  return_tensors="tf")
    tf_output = model.predict(predict_input)[0]
    # One sentence per call, so row 0 of the softmax holds its class probabilities.
    tf_prediction = tf.nn.softmax(tf_output, axis=1).numpy()[0]
    pred_labels.append(no_to_label_dict[np.argmax(tf_prediction)])
    max_probs.append(np.max(tf_prediction))
    all_probs.append({no_to_label_dict[i]: prob for i, prob in enumerate(tf_prediction)})

  df['pred_label'] = pred_labels
  df['max_pred_prob'] = max_probs
  df['all_prob'] = all_probs

  is_correct = (df['label'] == df['pred_label'])
  df['status'] = is_correct

  # An empty frame has no defined accuracy; report NaN instead of dividing by zero.
  n_rows = df.shape[0]
  accuracy = round((int(is_correct.sum()) / n_rows) * 100, 2) if n_rows else float('nan')

  print("Value Counts b/w True and Pred Label :\n {}".format(is_correct.value_counts()))
  print("Accuracy\n : {}".format(accuracy))
  print("=======")
  return df

# Score each split with the trained model; fn_prediction prints its own summary.
print("Training: ")
train_pred = fn_prediction(df=train, model=model)

print("Validation: ")
# Rebuild a frame for the validation split and turn the numeric ids back into label names.
df_val = pd.DataFrame({'sentence': val_texts, 'label': val_labels}).assign(
    label=lambda frame: frame['label'].map(no_to_label_dict)
)
val_pred = fn_prediction(df=df_val, model=model)

print("Test: ")
test_pred = fn_prediction(df=test, model=model)

Training: 
Value Counts b/w True and Pred Label :
 True     323
False      5
dtype: int64
Accuracy
 : 98.48
Validation: 
Value Counts b/w True and Pred Label :
 True     61
False     5
dtype: int64
Accuracy
 : 92.42
Test: 
Value Counts b/w True and Pred Label :
 False    223
True     174
dtype: int64
Accuracy
 : 43.83


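* The model appears to be heavily **overfit**: accuracy is high on the training and validation data but much lower on the test data. Regularization should be applied (e.g. tuning dropout, the learning rate and the number of epochs).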

## 7. Threshold Calibration for class value - 'NO_NODES_DETECTED'

In [60]:
# Summary statistics of the winning probability, restricted to correctly
# predicted rows, for the training and validation results.
for split_pred in (train_pred, val_pred):
    correct_mask = split_pred['status'] == True
    display(split_pred.loc[correct_mask, 'max_pred_prob'].describe())

count    323.000000
mean       0.821734
std        0.115923
min        0.328941
25%        0.754382
50%        0.861914
75%        0.910777
max        0.945839
Name: max_pred_prob, dtype: float64

count    61.000000
mean      0.801173
std       0.145571
min       0.328941
25%       0.765798
50%       0.850869
75%       0.905831
max       0.940842
Name: max_pred_prob, dtype: float64

In [65]:
def get_new_status(row, threshold=0.45):
    """Return the predicted label, or 'NO_NODES_DETECTED' when confidence is too low.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with 'max_pred_prob'
            and 'pred_label' entries.
        threshold: minimum 'max_pred_prob' (inclusive) required to keep the
            predicted label. Defaults to 0.45, the cutoff previously
            hard-coded here.

    Returns:
        ``row['pred_label']`` if ``row['max_pred_prob'] >= threshold``,
        otherwise the string 'NO_NODES_DETECTED'.
    """
    if row['max_pred_prob'] >= threshold:
        return row['pred_label']
    return 'NO_NODES_DETECTED'


# Replace low-confidence predictions with the fallback label on every split.
for scored in (train_pred, val_pred, test_pred):
    scored['pred_new_label'] = scored.apply(get_new_status, axis=1)

# Share of rows whose thresholded label matches (True) or differs from (False) the true label.
train_match = train_pred['pred_new_label'] == train_pred['label']
val_match = val_pred['pred_new_label'] == val_pred['label']
test_match = test_pred['pred_new_label'] == test_pred['label']

print("Training Accuracy: \n {}".format(train_match.value_counts(normalize=True)))
print("=========")
print("Val Accuracy: \n {}".format(val_match.value_counts(normalize=True)))
print("=========")
print("Test Accuracy: \n {}".format(test_match.value_counts(normalize=True)))

Training Accuracy: 
 True     0.969512
False    0.030488
dtype: float64
Val Accuracy: 
 True     0.878788
False    0.121212
dtype: float64
Test Accuracy: 
 True     0.672544
False    0.327456
dtype: float64


## 8. Saving Results
## To-Do : Model Saving

In [66]:
# Each split is written to its own file. Previously all three calls used the
# same train_res_... path, so the validation and test frames overwrote the
# training results and only the last write survived.
# NOTE(review): the file names say "5_epoch" while the training cell runs 7 epochs — confirm which is intended.
train_pred.to_csv('drive/MyDrive/colab_work/intent_classification/output/train_res_hug_distilbert_tf_5_epoch.csv', index=False)
val_pred.to_csv('drive/MyDrive/colab_work/intent_classification/output/val_res_hug_distilbert_tf_5_epoch.csv', index=False)
test_pred.to_csv('drive/MyDrive/colab_work/intent_classification/output/test_res_hug_distilbert_tf_5_epoch.csv', index=False)

## 9. Testing

In [67]:
# Smoke-test the model on one hand-written sentence.
test_sentence = "What about size"
predict_input = tokenizer.encode(
    test_sentence,
    truncation=True,
    padding=True,
    return_tensors="tf",
)
# Element 0 of the prediction output holds the scores that softmax turns into probabilities.
tf_output = model.predict(predict_input)[0]
tf_prediction = tf.nn.softmax(tf_output, axis=1).numpy()[0]
# Display the winning class id together with its label name.
best_idx = np.argmax(tf_prediction)
best_idx, no_to_label_dict[best_idx]

(8, 'WHAT_SIZE_TO_ORDER')

## END