#### Objective: Use Hugging Face Transformers to fine-tune a pre-trained model (here, DistilBERT) and apply it to our own dataset

In [55]:
import pandas as pd
# Read the tab-separated SMS corpus; column names are supplied explicitly.
# NOTE(review): the frame ends up with 5572 rows. If the raw file has more lines than that,
# default quote handling may be merging records (quoting=csv.QUOTE_NONE would avoid it) - TODO confirm.
df = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=["label", "message"],
)
messages = df  # alias bound to the same DataFrame object
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [56]:
# (rows, columns) of the loaded frame.
df.shape

(5572, 2)

In [57]:
# Independent feature: the message texts, pulled out of the frame as a plain Python list.
X = df['message'].tolist()

In [58]:
# Dependent feature: the 'ham' / 'spam' label strings as a plain Python list.
y = df['label'].tolist()

In [59]:
# Peek at the first ten labels (still strings at this point).
y[:10]

['ham', 'ham', 'spam', 'ham', 'ham', 'spam', 'ham', 'ham', 'spam', 'spam']

In [60]:
# Label-encode the dependent feature: 'ham' -> 0, 'spam' -> 1.
# The labels are rebuilt from df['label'] rather than from y, so re-running this cell
# always yields the same result. Plain Python ints are produced directly because
# pd.get_dummies returns boolean columns in pandas >= 2.0, which would turn the
# labels into True/False instead of 0/1.
y = [1 if label == 'spam' else 0 for label in df['label']]

In [61]:
# Confirm the first ten labels are now integers (0 = ham, 1 = spam).
y[:10]

[0, 0, 1, 0, 0, 1, 0, 0, 1, 1]

In [62]:
# Hold out 20% of the messages for evaluation; the fixed random_state keeps the split repeatable.
# NOTE(review): the split is not stratified on the label - consider stratify=y, since spam is the minority class.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [None]:
# Preview only: displaying the whole list would print thousands of messages.
X_train[:5]

In [None]:
# Preview only: displaying the whole list would print thousands of labels.
y_train[:5]

##### Steps:
1. Install Transformers
2. Call the pre-trained model (DistilBERT)
3. Call the tokenizer (DistilBERT tokenizer - distilbert-base-uncased)
4. Convert these encodings into Dataset objects; for TF these are tensors

In [None]:
# install Transfomers - odel Distill Bert
!pip install transformers

In [66]:
from transformers import DistilBertTokenizerFast
# Load the fast tokenizer published with the pre-trained checkpoint.
checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint)

In [67]:
# Tokenize the train and test texts with identical settings.
# padding=True pads every sequence to the longest one in that call (not to a fixed 512);
# truncation=True cuts sequences that exceed the model's maximum input length.
encode_options = {'truncation': True, 'padding': True}
train_encodings = tokenizer(X_train, **encode_options)
test_encodings = tokenizer(X_test, **encode_options)

In [68]:
# Slicing returns the per-example Encoding objects; the first ten are shown.
train_encodings[:10]

[Encoding(num_tokens=238, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=238, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=238, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=238, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=238, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=238, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=238, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=238, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=238

In [69]:
# Labels for the first ten training examples.
y_train[:10]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

4. Convert these encodings into Dataset objects; for TF these are tensors

In [70]:
import tensorflow as tf

def _to_tf_dataset(encodings, labels):
    """Pair tokenizer output (as a plain dict of features) with its labels in a tf.data.Dataset."""
    features = dict(encodings)
    return tf.data.Dataset.from_tensor_slices((features, labels))


train_dataset = _to_tf_dataset(train_encodings, y_train)
test_dataset = _to_tf_dataset(test_encodings, y_test)

In [71]:
train_dataset
# Each element is a (features dict with input_ids and attention_mask, integer label) pair;
# this is the dataset that is handed to the trainer for sequence classification.

<TensorSliceDataset element_spec=({'input_ids': TensorSpec(shape=(238,), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(238,), dtype=tf.int32, name=None)}, TensorSpec(shape=(), dtype=tf.int32, name=None))>

In [46]:
# TFDistilBertForSequenceClassification - DistilBERT with a sequence-classification head
# TFTrainingArguments - bundle of hyper-parameters and output/logging paths for the run
# TFTrainer - drives training, evaluation and prediction for the model
from transformers import (
    TFDistilBertForSequenceClassification,
    TFTrainer,
    TFTrainingArguments,
)

training_settings = {
    'output_dir': './results',            # output directory
    'num_train_epochs': 2,                # total number of training epochs
    'per_device_train_batch_size': 8,     # batch size per device during training
    'per_device_eval_batch_size': 16,     # batch size for evaluation
    'warmup_steps': 500,                  # number of warmup steps for learning rate scheduler
    'weight_decay': 0.01,                 # strength of weight decay
    'logging_dir': './logs',              # directory for storing logs
    'logging_steps': 10,                  # interval (in steps) between log entries
    'eval_steps': 10,                     # interval (in steps) between evaluations
}
training_args = TFTrainingArguments(**training_settings)

In [47]:
# Create the model inside the distribution-strategy scope supplied by the training arguments.
with training_args.strategy.scope():
    # The base checkpoint has no classification head; those layers are newly initialized
    # and are learned during the fine-tuning below.
    model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = TFTrainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset             # evaluation dataset
)

# Fine-tune on train_dataset using the settings in training_args.
trainer.train()

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_transform', 'activation_13', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'dropout_39', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [48]:
# Evaluate on the held-out test set; the returned dict holds the evaluation loss.
trainer.evaluate(test_dataset)

{'eval_loss': 0.019357943534851076}

In [49]:
# Returns a PredictionOutput: two raw scores per example (one per class),
# the true label_ids, and the evaluation metrics.
trainer.predict(test_dataset)

PredictionOutput(predictions=array([[ 2.9748192, -3.4620287],
       [-3.22135  ,  3.3555334],
       [ 3.023664 , -3.4481337],
       ...,
       [ 2.4020777, -2.8614504],
       [-3.2092023,  3.3522434],
       [ 2.571391 , -3.058572 ]], dtype=float32), label_ids=array([0, 1, 0, ..., 0, 1, 0], dtype=int32), metrics={'eval_loss': 0.019362197603498187})

In [50]:
# Index 1 of the PredictionOutput is label_ids; its shape gives the number of test examples.
# Note that this call runs prediction over the whole test set again.
trainer.predict(test_dataset)[1].shape

(1115,)

In [53]:
import numpy as np

# Turn the raw model scores into class ids: for each example the column with the
# larger score wins (column 0 = ham, column 1 = spam).
raw_scores = trainer.predict(test_dataset).predictions
output = np.argmax(raw_scores, axis=-1)

In [54]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels, columns the predicted labels (0 = ham, 1 = spam).
cm = confusion_matrix(y_true=y_test, y_pred=output)
cm

array([[954,   1],
       [  4, 156]])

In [None]:
# Persist the fine-tuned model under the path 'senti_model'.
# NOTE(review): the name suggests sentiment analysis, but this model classifies spam vs. ham.
trainer.save_model('senti_model')