<a href="https://colab.research.google.com/github/bagusamandira/Neural-Network/blob/main/TransferLearning-DistilBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import all packages ###

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import re
import unicodedata
import nltk
from nltk.corpus import stopwords
from tensorflow import keras
from tensorflow.keras.layers import Dense,Dropout, Input
from tqdm import tqdm
import pickle
from sklearn.metrics import confusion_matrix,f1_score,classification_report
import matplotlib.pyplot as plt
import itertools
from sklearn.utils import shuffle
from tensorflow.keras import regularizers
import transformers
from transformers import BertTokenizer, TFBertModel, BertConfig,TFDistilBertModel,DistilBertTokenizer,DistilBertConfig

###  Loading DistilBERT Tokenizer and the DistilBERT model  ###

In [None]:
# Load the pretrained tokenizer published with the 'distilbert-base-uncased' checkpoint.
# It is used further down to turn each sentence into input ids and an attention mask.
dbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Load the pretrained DistilBERT encoder (TensorFlow class) from the same checkpoint
# as the tokenizer; create_model() calls it to produce the sentence representation.
dbert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


###  Preprocessing and cleaning functions  ###

In [None]:
def unicode_to_ascii(s):
    """Return ``s`` with combining accent marks removed.

    The string is first decomposed with NFD normalisation, which splits an
    accented letter into its base letter plus separate combining marks; every
    character whose Unicode category is ``Mn`` (non-spacing mark) is then
    dropped and the remaining characters are joined back together.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            continue
        kept.append(ch)
    return ''.join(kept)

def clean_stopwords_shortwords(w):
    """Remove English stopwords and very short tokens from a sentence.

    Args:
        w: Whitespace-separated text.

    Returns:
        The words of ``w`` that are longer than two characters and are not in
        NLTK's English stopword list, joined by single spaces.

    Note:
        Relies on the NLTK ``stopwords`` corpus being available locally. This
        notebook never calls ``nltk.download('stopwords')``, so run that once
        beforehand if the corpus lookup fails.
    """
    # A set gives constant-time membership tests; with the raw list every word
    # of the sentence would trigger a linear scan of the stopword list.
    stopwords_set = set(stopwords.words('english'))
    clean_words = [
        word for word in w.split()
        if len(word) > 2 and word not in stopwords_set
    ]
    return " ".join(clean_words)

def preprocess_sentence(w):
    """Normalise a raw sentence into lowercase, accent-free, letters-only text.

    Steps, in order: lowercase and strip, remove accents, drop ``@mentions``,
    replace punctuation and any other non-letter run with a space, then remove
    stopwords and words of two characters or fewer.

    Args:
        w: Raw input sentence.

    Returns:
        The cleaned sentence as a single space-separated string.
    """
    w = unicode_to_ascii(w.lower().strip())
    # Mentions must be removed before the non-letter filter below: that filter
    # turns '@' into a space, after which the pattern could never match and the
    # handle would be kept as an ordinary word.
    w = re.sub(r'@\w+', '', w)
    w = re.sub(r"([?.!,¿])", r" ", w)
    # Collapse runs of spaces and double quotes into one space.
    w = re.sub(r'[" "]+', " ", w)
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    w = clean_stopwords_shortwords(w)
    return w

###  Reading and Cleaning the Dataset  ###

In [None]:
# Location of the emotion dataset inside the Colab runtime.
path = '/content/Emotion.csv'
df = pd.read_csv(path)

# Preview the first rows to confirm the file parsed as expected.
df.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [None]:
# Drop stray 'Unnamed: 2' .. 'Unnamed: 4' columns in one pass; a single character
# class replaces three copy-pasted filters and matches exactly the same names.
stray_columns = df.columns.str.contains(r'Unnamed: [234]', case=False)
df = df.loc[:, ~stray_columns]
print('File has {} rows and {} columns'.format(df.shape[0],df.shape[1]))
# Discard rows with any missing value, then report the size again.
df = df.dropna()
print('File has {} rows and {} columns'.format(df.shape[0],df.shape[1]))
# Fixed seed so the row order (and everything downstream) is the same after
# Restart & Run All.
df = shuffle(df, random_state=42)
df = df.reset_index(drop=True)

df.head()

File has 20000 rows and 2 columns
File has 20000 rows and 2 columns


Unnamed: 0,text,label
0,i feel stressed my intention is to remain in c...,anger
1,i have been feeling pretty crappy,sadness
2,i love this service because it is easy to use ...,love
3,i know mom s who would take once look at my fa...,anger
4,i checked on you was a long time ago i can say...,joy


In [None]:
# Emotion name -> integer class id used as the training target.
cat_to_num = {
    'sadness': 0,
    'anger': 1,
    'love': 2,
    'surprise': 3,
    'fear': 4,
    'joy': 5
}

df['gt'] = df['label'].map(cat_to_num)

# Series.map leaves NaN for any label that is not a key of cat_to_num; stop here
# rather than letting NaN targets reach the loss function.
unmapped_labels = df.loc[df['gt'].isna(), 'label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError('Labels missing from cat_to_num: {}'.format(list(unmapped_labels)))

print('Available labels: ',df.label.unique())

# Size the output layer from the mapping rather than from the data, so every
# class id fits even if some class happens to be absent from this file.
num_classes = len(cat_to_num)

df.head()

Available labels:  ['anger' 'sadness' 'love' 'joy' 'fear' 'surprise']


Unnamed: 0,text,label,gt
0,i feel stressed my intention is to remain in c...,anger,1
1,i have been feeling pretty crappy,sadness,0
2,i love this service because it is easy to use ...,love,2
3,i know mom s who would take once look at my fa...,anger,1
4,i checked on you was a long time ago i can say...,joy,5


###  Preparing input for the model  ###

In [None]:
# Fixed sequence length handed to the tokenizer and to the model's Input layers.
max_len = 32
sentences = df['text']
labels = df['gt']
# Sanity check: texts and targets must line up one-to-one.
tuple(len(series) for series in (sentences, labels))

(20000, 20000)

###  Create a basic NN model using DistilBERT embeddings to get the predictions  ###

In [None]:
def create_model():
    """Build the emotion classifier on top of the pretrained DistilBERT encoder.

    Reads the module-level ``dbert_model``, ``max_len`` and ``num_classes``.

    Returns:
        An uncompiled ``tf.keras.Model`` taking ``[input_ids, attention_mask]``
        (each of shape ``(batch, max_len)``, int64) and returning softmax
        probabilities of shape ``(batch, num_classes)``.
    """
    inps = Input(shape = (max_len,), dtype='int64')
    masks= Input(shape = (max_len,), dtype='int64')
    # [0] selects the encoder's last hidden state; [:, 0, :] keeps only the
    # vector at the first token position as the sentence representation.
    dbert_layer = dbert_model(inps, attention_mask=masks)[0][:,0,:]
    dense = Dense(512,activation='relu',kernel_regularizer=regularizers.l2(0.01))(dbert_layer)
    dropout= Dropout(0.5)(dense)
    pred = Dense(num_classes, activation='softmax',kernel_regularizer=regularizers.l2(0.01))(dropout)
    model = tf.keras.Model(inputs=[inps,masks], outputs=pred)
    # summary() prints the table itself and returns None, so wrapping it in
    # print() would emit an extra "None" line.
    model.summary()
    return model

In [None]:
# Instantiate the classifier; create_model() also prints the layer summary.
model=create_model()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 32)]                 0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 32)]                 0         []                            
                                                                                                  
 tf_distil_bert_model (TFDi  TFBaseModelOutput(last_hid   6636288   ['input_1[0][0]',             
 stilBertModel)              den_state=(None, 32, 768),   0          'input_2[0][0]']             
                              hidden_states=None, atten                                           
                             tions=None)                                                      

# Tokenize and prep input

In [None]:
# Per-sentence token ids and attention masks, each padded/truncated to max_len.
input_ids=[]
attention_masks=[]

for sent in sentences:
    # padding='max_length' replaces the deprecated pad_to_max_length=True and
    # produces the same fixed-length output.
    dbert_inps = dbert_tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
    )
    input_ids.append(dbert_inps['input_ids'])
    attention_masks.append(dbert_inps['attention_mask'])

# Convert to arrays so they can be split and fed to Keras directly.
input_ids=np.asarray(input_ids)
attention_masks=np.array(attention_masks)
labels=np.array(labels)



# Train Test Split

In [None]:
# 70 / 15 / 15 train / validation / test split.
# A fixed seed makes the split (and the reported scores) reproducible, and
# stratifying keeps class proportions equal across the three subsets.
split_seed = 42
train_inp,temp_inp,train_label,temp_label,train_mask,temp_mask = train_test_split(
    input_ids, labels, attention_masks,
    test_size=0.3, random_state=split_seed, stratify=labels,
)
val_inp, test_inp, val_label, test_label, val_mask, test_mask = train_test_split(
    temp_inp, temp_label, temp_mask,
    test_size=0.5, random_state=split_seed, stratify=temp_label,
)

In [None]:
# Sanity check: ids, masks and labels must contain the same number of samples.
tuple(len(arr) for arr in (input_ids, attention_masks, labels))

(20000, 20000, 20000)

### Compile the model ###

In [None]:
# The model's last layer already applies softmax, so the loss receives
# probabilities, not logits. from_logits must therefore be False; with True,
# Keras emits a warning during training about the mismatch.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)

model.compile(loss=loss,optimizer=optimizer, metrics=[metric])

### Training ###

In [None]:
# Train on the training split and track loss/accuracy on the validation split
# after every epoch.
history = model.fit(
    x=[train_inp, train_mask],
    y=train_label,
    batch_size=16,
    epochs=10,
    validation_data=([val_inp, val_mask], val_label),
)

Epoch 1/10


  output, from_logits = _get_logits(


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Evaluation metrics

In [None]:
# Class probabilities for the held-out test split.
preds = model.predict([test_inp,test_mask],batch_size=16)
# Predicted class id = index of the highest probability.
pred_labels = preds.argmax(axis=1)
# pos_label only applies to average='binary'; with average='micro' it is
# ignored and only triggers a warning, so it is omitted.
f1 = f1_score(test_label, pred_labels, average='micro')
f1





0.90925

In [None]:
# Class names listed in the order of their integer ids (0..5) in cat_to_num.
target_names = [
    'sadness',
    'anger',
    'love',
    'surprise',
    'fear',
    'joy',
]
print('F1 score', f1)
print('Classification Report')
report = classification_report(test_label, pred_labels, target_names=target_names)
print(report)

print('Training and saving built model.....')

F1 score 0.90925
Classification Report
              precision    recall  f1-score   support

     sadness       0.93      0.95      0.94      1170
       anger       0.91      0.92      0.91       539
        love       0.77      0.85      0.81       313
    surprise       0.71      0.90      0.79       138
        fear       0.92      0.83      0.87       479
         joy       0.95      0.92      0.93      1361

    accuracy                           0.91      4000
   macro avg       0.87      0.89      0.88      4000
weighted avg       0.91      0.91      0.91      4000

Training and saving built model.....
