# Fine-Tuning DistilBERT for Multiclass Text Classification

## Model - 'distilbert-base-uncased'


In [1]:
!pip install -r packs.txt



In [2]:
import transformers

In [3]:
# Record the installed transformers version in the notebook output.
print(transformers.__version__)

4.22.1


In [4]:
from transformers import DistilBertTokenizer
import tensorflow as tf
import pandas as pd
import json
import gc

from sklearn.model_selection import train_test_split

import re
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stopword corpus so that stopwords.words() can read it.
nltk.download('stopwords')
# English stopword list (not referenced by the cells visible in this notebook).
stopw = stopwords.words('english')

import seaborn as sns
import matplotlib.pyplot as plt
# from plotly.offline import iplot

from tqdm import tqdm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
import os
import requests
from requests.auth import HTTPBasicAuth

def downloadFileFromRepo(username, repository, branch, filepath, token, timeout=30):
    """Download one file from a GitHub repository into the local 'data' directory.

    Parameters
    ----------
    username : str
        Owner segment of the raw.githubusercontent.com URL; also used as the
        basic-auth user name when a token is given.
    repository : str
        Repository segment of the URL.
    branch : str
        Branch segment of the URL.
    filepath : str
        Path of the file inside the repository. Its last '/'-separated
        component becomes the local file name.
    token : str
        Basic-auth password. When empty, the request is sent without
        credentials.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    str or None
        Path of the file written under 'data', or None when the download
        failed. Failures are reported with a printed message, not an exception.
    """
    # Construct the URL to download the file from GitHub
    url = f"https://raw.githubusercontent.com/{username}/{repository}/{branch}/{filepath}"

    # Extract the file name up front: an empty name would otherwise only fail
    # later, when trying to open the 'data' directory itself as a file.
    fileName = filepath.split('/')[-1]
    if not fileName:
        print(f"Failed to download file. '{filepath}' does not name a file.")
        return None

    # Only attach credentials when a token was actually supplied.
    auth = HTTPBasicAuth(username, token) if token else None

    # A timeout keeps the cell from hanging forever on a stalled connection;
    # connection problems are reported in the same style as HTTP failures.
    try:
        response = requests.get(url, auth=auth, timeout=timeout)
    except requests.exceptions.RequestException as error:
        print(f"Failed to download file. Request error: {error}")
        return None

    # Anything other than status code 200 is treated as a failed download
    if response.status_code != 200:
        print(f"Failed to download file. Status code: {response.status_code}")
        return None

    # Create the 'data' directory if it doesn't exist (no check-then-create race)
    os.makedirs('data', exist_ok=True)

    # Write the file content to a local file inside the 'data' directory
    localFilepath = os.path.join('data', fileName)
    with open(localFilepath, 'wb') as f:
        f.write(response.content)
    print(f"File '{fileName}' downloaded successfully.")
    return localFilepath

# Repository coordinates and access token (all blank in this copy of the notebook).
username = ""
repository = ""
branch = ""
repoToken = ""

# Download every required file in turn. After the loop `path_to_file` holds
# the last path in the tuple.
for path_to_file in ("", "cw2/data/trainLemmatized.csv", "cw2/tokenizer.json"):
    downloadFileFromRepo(username, repository, branch, path_to_file, repoToken)

File 'trainPreprocessed.csv' downloaded successfully.
File 'trainLemmatized.csv' downloaded successfully.
File 'tokenizer.json' downloaded successfully.


In [6]:
# Number of CSV rows delivered per batch of the streaming dataset.
trainBatchSize = 2056

# Stream the CSV in shuffled batches of (features, labels) rather than loading
# it all at once. Only the "data" column is kept as a feature; "labels" is the
# target. A fixed shuffle seed makes the batch composition repeatable, in line
# with the fixed random_state used for the train/validation split.
dataset = tf.data.experimental.make_csv_dataset(
    "./data/trainLemmatized.csv",
    batch_size=trainBatchSize,
    select_columns=["data", "labels"],
    label_name="labels",
    num_epochs=1,
    shuffle=True,
    shuffle_seed=0,
)

In [7]:
from transformers import TFDistilBertForSequenceClassification, TFTrainingArguments, TFTrainer

# Checkpoint shared by the tokenizer and the classification model.
pretrained_checkpoint = 'distilbert-base-uncased'

tokenizer = DistilBertTokenizer.from_pretrained(pretrained_checkpoint)

# Hyper-parameters handed to every TFTrainer built in the training cell.
training_args = TFTrainingArguments(
    output_dir='/results',
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=1e-5,
    eval_steps=100,
)

# Instantiate the 5-class model inside the distribution strategy scope
# provided by the training arguments.
with training_args.strategy.scope():
    trainer_model = TFDistilBertForSequenceClassification.from_pretrained(
        pretrained_checkpoint,
        num_labels=5,
    )


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'activation_13', 'vocab_layer_norm', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that

In [8]:
%%time

# Iterator to avoid loading the entire dataset
iterator = iter(dataset)
# To keep track of which batch we're operating on
progress = 0
# Folds for cross-validation
kSplits = 10

# For evaluation later
losses = []
accuracies = []

try:
  while True:
    # Obtain batch of text as a list
    data = next(iterator)
    data_texts = [text_tensor.numpy() for text_tensor in data[0].values()][0]
    data_texts = [sample.decode() for sample in data_texts]
    data_labels = data[1].numpy()
    data_labels = data_labels - 1

    train_texts, val_texts, train_labels, val_labels = train_test_split(data_texts, data_labels, test_size = 0.2, random_state = 0 )

    # Convert to matrix of binaries (1 if the word occurs, 0 otherwise)
    train_encodings = tokenizer(train_texts, truncation = True, padding = True, max_length = 512, return_tensors = 'tf')

    val_encodings = tokenizer(val_texts, truncation = True, padding = True, max_length = 512, return_tensors = 'tf')

    train_dataset = tf.data.Dataset.from_tensor_slices((
      dict(train_encodings),
      train_labels
      ))

    val_dataset = tf.data.Dataset.from_tensor_slices((
      dict(val_encodings),
      val_labels
      ))

    trainer = TFTrainer(
      model=trainer_model,
      args=training_args,
      train_dataset=train_dataset,
      eval_dataset=val_dataset,
      )

    trainer.train()

    trainer.evaluate()

    # Admin stuff
    progress = progress + 1
    clear_output(wait = True)
    print(f"Batch number: {progress}")

except StopIteration:
  print("End of iterator reached.")




KeyboardInterrupt: 

# Saving & Loading the model

In [9]:
# Directory that receives both the model and the tokenizer files.
save_directory = "/saved_models"

# Persist the model trained in the loop above.
trainer_model.save_pretrained(save_directory)

# Store the tokenizer next to the model. As the last expression of the cell,
# its return value is displayed.
tokenizer.save_pretrained(save_directory)

('/saved_models/tokenizer_config.json',
 '/saved_models/special_tokens_map.json',
 '/saved_models/vocab.txt',
 '/saved_models/added_tokens.json')

# Loading the Fine-Tuned Model

In [None]:
# Reload the tokenizer from the directory written by the save cell.
tokenizer_fine_tuned = DistilBertTokenizer.from_pretrained(save_directory)

# Reload the classification model from the same directory.
model_fine_tuned = TFDistilBertForSequenceClassification.from_pretrained(save_directory)

In [None]:
# NOTE(review): `test_text` is not defined in any cell visible here - make sure
# it is set before running this cell.
encode_kwargs = dict(truncation=True, padding=True, return_tensors='tf')
predict_input = tokenizer_fine_tuned.encode(test_text, **encode_kwargs)

# The first element of the model output is used as the per-class scores.
output = model_fine_tuned(predict_input)[0]

# Index of the highest-scoring class for the single input. The training cell
# subtracted 1 from the labels, so presumably the dataset label is index + 1.
prediction_value = tf.argmax(output, axis=1).numpy()[0]

prediction_value