<a href="https://colab.research.google.com/github/nots-2022-g1/nots-agls/blob/main/multiclass_text_classification_with_BERT_angular_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# INSTALL DEPENDENCIES
# !pip install -q -U "tensorflow-text==2.8.*"
!pip install -q tf-models-official==2.7.0
!pip install tensorflow_text



In [None]:
# IMPORT PACKAGES
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from official.nlp import optimization  # to create AdamW optimizer
from keras import backend as K

np.set_printoptions(precision=3, suppress=True) # Make numpy values easier to read.
tf.get_logger().setLevel('ERROR')

# Seed the global NumPy and TensorFlow generators so that the stochastic steps of this
# notebook (data shuffling, dropout, weight initialisation) start from the same state on
# every Restart & Run All.
# NOTE(review): some accelerator kernels can still be non-deterministic - confirm if
# bit-for-bit repeatability is required.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# LOAD DATA
# Read the labelled commit messages from the CSV file (path is relative to the working directory).
loaded_data_df = pd.read_csv(filepath_or_buffer='angular-test-dataset.csv')

# Preview the first five rows
loaded_data_df.head(n=5)

Unnamed: 0,learindex,git_commit_message,classification
0,0,compiler fix deprecation warning jasmine log a...,test
1,1,docsinfra upgrade cli command doc source to aa...,build
2,2,form property renaming safe code this fix prop...,fix
3,3,compiler specify angularcore a peer dependency...,refactor
4,4,core support typescript add support for typesc...,feat


In [None]:
# PREPROCESS LOADED DATA
# Keep only the two columns that are needed, under shorter names:
#   classification     -> label
#   git_commit_message -> text
df = pd.DataFrame(
    {
        "label": loaded_data_df["classification"],
        "text": loaded_data_df["git_commit_message"],
    }
)

# Convert the category labels to numbered labels
# Maps every label seen so far to its integer code.
encode_dict = {}

def encode_label(x):
    """Return the integer code of label ``x``, registering it on first sight.

    Codes are handed out in order of first appearance (0, 1, 2, ...) and are
    remembered in the module-level ``encode_dict``, so repeated calls with the
    same label return the same code.
    """
    # The default argument is evaluated before the lookup, so an unseen label
    # receives the current size of the dictionary as its code.
    return encode_dict.setdefault(x, len(encode_dict))

# Encode every row's label; codes follow the order in which labels first appear.
df['encoded_label'] = df['label'].map(encode_label)

# The text label column is kept on purpose: the data check further down counts rows per label name.
# df = df.drop(["label"], axis=1)

df.head()

Unnamed: 0,label,text,encoded_label
0,test,compiler fix deprecation warning jasmine log a...,0
1,build,docsinfra upgrade cli command doc source to aa...,1
2,fix,form property renaming safe code this fix prop...,2
3,refactor,compiler specify angularcore a peer dependency...,3
4,feat,core support typescript add support for typesc...,4


In [None]:
# CHECK DATA
# Number of distinct encoded labels; used later as the width of the one-hot targets
# and of the output layer.
num_classes = df['encoded_label'].nunique()

# Rows per label name, most frequent first
df['label'].value_counts()

fix         5463
refactor    3002
feat        2552
build       1897
docs        1671
test        1054
chore        877
perf         335
ci           190
style        144
cleanup       59
doc           40
feature       13
example        8
release        7
Name: label, dtype: int64

In [None]:
# SPLIT DATASET IN TRAIN AND TEST SET
from sklearn.model_selection import train_test_split

# One-hot encode the integer labels: one row per sample, one column per class.
y = tf.keras.utils.to_categorical(df["encoded_label"].values, num_classes=num_classes)

# random_state fixes the shuffle, so re-running the notebook yields the same split.
# stratify keeps each class at the same proportion in both parts; without it the rarest
# labels (a handful of rows each) can end up entirely in one part.
# Stratifying requires at least two rows per class.
x_train, x_test, y_train, y_test = train_test_split(
    df['text'],
    y,
    test_size=0.20,
    random_state=42,
    stratify=df["encoded_label"],
)

# Number of non-missing training texts
x_train.count()

13852

In [None]:
# LOAD MODEL
# TF-Hub handles of the text preprocessing layer and of the sentence encoder it feeds.
PREPROCESSOR_URL = "https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-preprocess/2"
ENCODER_URL = "https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-base/1"

preprocessor = hub.KerasLayer(PREPROCESSOR_URL)
encoder = hub.KerasLayer(ENCODER_URL)

def get_embeddings(sentences):
    """Embed raw text with the module-level ``preprocessor`` and ``encoder`` layers.

    Args:
        sentences: batch of strings (the notebook passes a list of strings).

    Returns:
        The ``'pooled_output'`` entry of the encoder result. For the single
        sentence embedded in this notebook it is a float32 tensor of shape (1, 768).
    """
    encoder_inputs = preprocessor(sentences)
    encoder_outputs = encoder(encoder_inputs)
    return encoder_outputs['pooled_output']


# Smoke test: embed one sentence and display the resulting tensor.
sample_sentences = [
    "Questa collezione di Haiku è una porta aperta sulla cultura giapponese.",
]
get_embeddings(sample_sentences)

<tf.Tensor: shape=(1, 768), dtype=float32, numpy=
array([[ 0.108, -0.092, -0.32 , -0.017,  0.334,  0.2  , -0.247, -0.543,
        -0.357, -0.298, -0.209,  0.142,  0.027, -0.434, -0.574, -0.883,
         0.237, -0.187, -0.257, -0.247,  0.632,  0.121, -0.396,  0.158,
        -0.066, -1.009,  0.303,  0.024, -0.098, -0.637,  0.115, -0.763,
        -0.544, -0.467, -0.624,  0.264, -0.441,  0.066, -0.389,  0.016,
         0.116, -0.435, -0.513,  0.275, -0.344, -0.188,  0.015, -0.324,
         0.303,  0.384,  0.373,  1.057,  0.265, -0.278,  0.06 ,  0.095,
         0.015, -0.305,  0.257, -0.221, -0.32 , -0.433, -0.184, -0.562,
        -0.361, -0.013, -0.227, -0.416, -0.153, -0.475, -0.823, -0.781,
         0.114, -0.397, -0.161, -0.015, -0.385, -0.142, -0.567, -0.197,
         0.093,  3.413, -0.441, -0.375,  0.306,  0.093, -0.175, -0.191,
         0.208,  0.157, -0.296,  0.534, -0.493, -0.61 ,  0.072, -0.236,
        -0.436,  0.285, -0.113, -0.087,  0.106, -0.278,  0.041, -0.087,
         0.658

In [None]:


def balanced_recall(y_true, y_pred):
    """Recall averaged with equal weight over the class columns of ``y_pred``.

    Per column k: recall_k = TP_k / (TP_k + FN_k).

    Notes:
        * Products and targets are clipped to [0, 1] and rounded, so a sample
          only counts as a true positive for class k when
          ``y_true[:, k] * y_pred[:, k]`` rounds to 1.
        * A class without positive rows in ``y_true`` contributes 0 to the
          average (numerator 0, denominator ``K.epsilon()``).

    Args:
        y_true: 2-D targets, one column per class.
        y_pred: 2-D scores with the same layout; ``y_pred.shape[1]`` must be a
            known integer because it drives the Python loop.

    Returns:
        Scalar: sum of the per-class recalls divided by the number of columns.
    """
    n_classes = y_pred.shape[1]
    per_class = []
    for k in range(n_classes):
        truth_k = y_true[:, k]
        hits = K.sum(K.round(K.clip(truth_k * y_pred[:, k], 0, 1)))
        support = K.sum(K.round(K.clip(truth_k, 0, 1)))
        # epsilon keeps the division defined when the class has no positive rows
        per_class.append(hits / (support + K.epsilon()))
    # built-in sum starts at 0 and adds left to right
    return sum(per_class) / n_classes

def balanced_precision(y_true, y_pred):
    """Precision averaged with equal weight over the class columns of ``y_pred``.

    Per column k: precision_k = TP_k / (TP_k + FP_k).

    Notes:
        * Scores are clipped to [0, 1] and rounded, so a class only counts as
          predicted for a sample when its score rounds to 1.
        * A class that is never predicted contributes 0 to the average
          (numerator 0, denominator ``K.epsilon()``).

    Args:
        y_true: 2-D targets, one column per class.
        y_pred: 2-D scores with the same layout; ``y_pred.shape[1]`` must be a
            known integer because it drives the Python loop.

    Returns:
        Scalar: sum of the per-class precisions divided by the number of columns.
    """
    n_classes = y_pred.shape[1]
    per_class = []
    for k in range(n_classes):
        scores_k = y_pred[:, k]
        hits = K.sum(K.round(K.clip(y_true[:, k] * scores_k, 0, 1)))
        predicted = K.sum(K.round(K.clip(scores_k, 0, 1)))
        # epsilon keeps the division defined when the class is never predicted
        per_class.append(hits / (predicted + K.epsilon()))
    # built-in sum starts at 0 and adds left to right
    return sum(per_class) / n_classes

def balanced_f1_score(y_true, y_pred):
    """Harmonic mean of ``balanced_precision`` and ``balanced_recall``.

    The score is derived from the two class-averaged metrics; it is not the
    average of per-class F1 values. ``K.epsilon()`` in the denominator keeps
    the result defined (0) when both inputs are 0.
    """
    p = balanced_precision(y_true, y_pred)
    r = balanced_recall(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [None]:
# DEFINE THE MODEL
# raw string -> preprocessor -> encoder 'pooled_output' -> dropout -> softmax over the classes

# Each sample is a single scalar string.
i = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

x = encoder(preprocessor(i))['pooled_output']
x = tf.keras.layers.Dropout(0.2, name="dropout")(x)
x = tf.keras.layers.Dense(num_classes, activation='softmax', name="output")(x)

model = tf.keras.Model(inputs=i, outputs=x)

In [None]:
# TRAIN THE MODEL
epochs = 20

# Metrics reported next to the loss for the training and the validation data.
METRICS = [
    tf.keras.metrics.CategoricalAccuracy(name="accuracy"),
    balanced_recall,
    balanced_precision,
    balanced_f1_score,
]

model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=METRICS,
)

# The held-out split is passed as validation data, so it is evaluated after every epoch.
model_fit = model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    epochs=epochs,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
