<a href="https://colab.research.google.com/github/altair08/FYP/blob/main/finetuned_distilbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine Tuning DistilBERT for Toxic Comment Classification

In [1]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla T4


In [3]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
!pip install --upgrade transformers

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m83.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m121.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m83.9 MB/s[0m eta [36m0:00:

In [6]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification,DistilBertModel
from transformers import logging

logging.set_verbosity_warning()


## Setting up the device for GPU usage

Followed by that we will preapre the device for CUDA execeution. This configuration is needed if you want to leverage on onboard GPU.

In [8]:
import pandas as pd
import string
from sklearn.model_selection import train_test_split

dataset = pd.read_csv('/content/drive/MyDrive/Dats/Kaggle/pre_data_train.csv',encoding = 'latin1')
train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)

In [9]:
print(f"Total Training Records : {len(train_data)}")
train_data.head()

Total Training Records : 127656


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,preprocessed_text
140030,ed56f082116dcbd0,Grandma Terri Should Burn in Trash \nGrandma T...,1,0,0,0,0,0,grandma terri burn trash grandma terri trash h...
159124,f8e3cd98b63bf401,", 9 May 2009 (UTC)\nIt would be easiest if you...",0,0,0,0,0,0,may utc would easy admit member involve portug...
60006,a09e1bcf10631f9a,"""\n\nThe Objectivity of this Discussion is dou...",0,0,0,0,0,0,objectivity discussion doubtful nonexistent in...
65432,af0ee0066c607eb8,Shelly Shock\nShelly Shock is. . .( ),0,0,0,0,0,0,shelly shock shelly shock
154979,b734772b1a807e09,I do not care. Refer to Ong Teng Cheong talk p...,0,0,0,0,0,0,care refer ong teng cheong talk page la goutte...


## Removing id column and preparing labels into the single list column

In [10]:
# Drop the "comment_text" and "id" columns
train_data.drop(["comment_text", "id"], axis=1, inplace=True)

# Move the "preprocessed_text" column to the front
columns = train_data.columns.tolist()
columns = ["preprocessed_text"] + [col for col in columns if col != "preprocessed_text"]
train_data = train_data[columns]

# Print the updated dataframe
train_data.head()


Unnamed: 0,preprocessed_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
140030,grandma terri burn trash grandma terri trash h...,1,0,0,0,0,0
159124,may utc would easy admit member involve portug...,0,0,0,0,0,0
60006,objectivity discussion doubtful nonexistent in...,0,0,0,0,0,0
65432,shelly shock shelly shock,0,0,0,0,0,0
154979,care refer ong teng cheong talk page la goutte...,0,0,0,0,0,0


In [11]:
train_data['labels'] = train_data.iloc[:, 1:].values.tolist()
train_data.drop(train_data.columns.values[1:-1].tolist(), inplace=True, axis=1)
train_data.head()

Unnamed: 0,preprocessed_text,labels
140030,grandma terri burn trash grandma terri trash h...,"[1, 0, 0, 0, 0, 0]"
159124,may utc would easy admit member involve portug...,"[0, 0, 0, 0, 0, 0]"
60006,objectivity discussion doubtful nonexistent in...,"[0, 0, 0, 0, 0, 0]"
65432,shelly shock shelly shock,"[0, 0, 0, 0, 0, 0]"
154979,care refer ong teng cheong talk page la goutte...,"[0, 0, 0, 0, 0, 0]"


# Training Parameters <a id='section03'></a>

Defining some key variables that will be used later on in the training


In [12]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 32
EPOCHS = 1
LEARNING_RATE = 2e-05
NUM_WORKERS = 2

# Preparing the Dataset and Dataloader <a id='section04'></a>
We will start with defining few key variables that will be used later during the training/fine tuning stage.
Followed by creation of MultiLabelDataset class - This defines how the text is pre-processed before sending it to the neural network. We will also define the Dataloader that will feed  the data in batches to the neural network for suitable training and processing.
Dataset and Dataloader are constructs of the PyTorch library for defining and controlling the data pre-processing and its passage to neural network. For further reading into Dataset and Dataloader read the [docs at PyTorch](https://pytorch.org/docs/stable/data.html)

## *MultiLabelDataset* Dataset Class
- This class is defined to accept the `tokenizer`, `dataframe`, `max_length` and `eval_mode` as input and generate tokenized output and tags that is used by the BERT model for training.
- We are using the DistilBERT tokenizer to tokenize the data in the `text` column of the dataframe.
- The tokenizer uses the `encode_plus` method to perform tokenization and generate the necessary outputs, namely: `ids`, `attention_mask`, `token_type_ids`

- To read further into the tokenizer, [refer to this document](https://huggingface.co/transformers/model_doc/distilbert.html#distilberttokenizer)
- `targets` is the list of categories labled as `0` or `1` in the dataframe.
- The *MultiLabelDataset* class is used to create 2 datasets, for training and for validation.
- *Training Dataset* is used to fine tune the model: **80% of the original data**
- *Validation Dataset* is used to evaluate the performance of the model. The model has not seen this data during training.

## Dataloader
- Dataloader is used to for creating training and validation dataloader that load data to the neural network in a defined manner. This is needed because all the data from the dataset cannot be loaded to the memory at once, hence the amount of dataloaded to the memory and then passed to the neural network needs to be controlled.
- This control is achieved using the parameters such as `batch_size` and `max_len`.
- Training and Validation dataloaders are used in the training and validation part of the flow respectively

In [13]:
class MultiLabelDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len: int, eval_mode: bool = False):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.text = dataframe.preprocessed_text
        self.eval_mode = eval_mode
        if self.eval_mode is False:
            self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text.iloc[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        output = {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
        }

        if self.eval_mode is False:
            output['targets'] = torch.tensor(self.targets.iloc[index], dtype=torch.float)

        return output

## Loading tokenizer and generating training set

In [14]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)
training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

## Verify the data at index 0

In [15]:
training_set[0]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'ids': tensor([  101, 13055, 26568,  6402, 11669, 13055, 26568, 11669,  5223, 13055,
         26568, 14352,  3109,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,  

## Creating Dataloader

In [16]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': NUM_WORKERS
                }
training_loader = DataLoader(training_set, **train_params)

<a id='section05'></a>
# Neural Network for Fine Tuning


In [17]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model.

class DistilBERTClass(torch.nn.Module):

    def __init__(self):
        super(DistilBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, 6)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output


## Loading Neural Network model

In [18]:
model = DistilBERTClass()
model.to(device)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DistilBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in

## Loss Function and Optimizer

In [19]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [20]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

<a id='section06'></a>
# Fine Tuning the Model

In [21]:
def train(epoch):

    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%100==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        loss.backward()
        optimizer.step()

In [22]:
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]

Epoch: 0, Loss:  0.7387534976005554


101it [00:35,  3.16it/s]

Epoch: 0, Loss:  0.08681915700435638


200it [01:07,  2.99it/s]

Epoch: 0, Loss:  0.12310044467449188


300it [01:40,  3.02it/s]

Epoch: 0, Loss:  0.07798127830028534


401it [02:13,  3.07it/s]

Epoch: 0, Loss:  0.06017611175775528


500it [02:46,  3.00it/s]

Epoch: 0, Loss:  0.08096539974212646


600it [03:19,  3.03it/s]

Epoch: 0, Loss:  0.047065369784832


701it [03:52,  3.06it/s]

Epoch: 0, Loss:  0.03892412409186363


801it [04:25,  3.05it/s]

Epoch: 0, Loss:  0.05112088844180107


901it [04:58,  3.05it/s]

Epoch: 0, Loss:  0.038673318922519684


1001it [05:31,  3.04it/s]

Epoch: 0, Loss:  0.05600122734904289


1100it [06:04,  3.03it/s]

Epoch: 0, Loss:  0.061473436653614044


1201it [06:37,  3.02it/s]

Epoch: 0, Loss:  0.078066885471344


1301it [07:10,  3.05it/s]

Epoch: 0, Loss:  0.049198515713214874


1401it [07:43,  3.04it/s]

Epoch: 0, Loss:  0.03570166975259781


1501it [08:16,  3.05it/s]

Epoch: 0, Loss:  0.0815301388502121


1601it [08:49,  3.05it/s]

Epoch: 0, Loss:  0.032834358513355255


1701it [09:21,  3.06it/s]

Epoch: 0, Loss:  0.05292603373527527


1801it [09:54,  3.03it/s]

Epoch: 0, Loss:  0.05622285604476929


1900it [10:27,  3.03it/s]

Epoch: 0, Loss:  0.034931257367134094


2001it [11:00,  3.03it/s]

Epoch: 0, Loss:  0.028823863714933395


2100it [11:33,  3.03it/s]

Epoch: 0, Loss:  0.03997155278921127


2201it [12:06,  3.04it/s]

Epoch: 0, Loss:  0.010961750522255898


2301it [12:39,  3.03it/s]

Epoch: 0, Loss:  0.05188947543501854


2401it [13:13,  3.03it/s]

Epoch: 0, Loss:  0.05440803989768028


2501it [13:46,  3.05it/s]

Epoch: 0, Loss:  0.055745929479599


2600it [14:18,  3.04it/s]

Epoch: 0, Loss:  0.007078881841152906


2701it [14:52,  3.04it/s]

Epoch: 0, Loss:  0.0760469138622284


2801it [15:24,  3.03it/s]

Epoch: 0, Loss:  0.024454567581415176


2900it [15:57,  3.02it/s]

Epoch: 0, Loss:  0.03236212208867073


3001it [16:30,  3.05it/s]

Epoch: 0, Loss:  0.03601856902241707


3101it [17:03,  3.04it/s]

Epoch: 0, Loss:  0.06433703005313873


3201it [17:36,  3.04it/s]

Epoch: 0, Loss:  0.09096488356590271


3301it [18:09,  3.03it/s]

Epoch: 0, Loss:  0.01226779818534851


3401it [18:42,  3.04it/s]

Epoch: 0, Loss:  0.006079326383769512


3501it [19:15,  3.04it/s]

Epoch: 0, Loss:  0.07331155240535736


3601it [19:48,  3.05it/s]

Epoch: 0, Loss:  0.040099114179611206


3701it [20:21,  3.04it/s]

Epoch: 0, Loss:  0.06767019629478455


3801it [20:53,  3.06it/s]

Epoch: 0, Loss:  0.05117828771471977


3901it [21:26,  3.05it/s]

Epoch: 0, Loss:  0.056276969611644745


3990it [21:55,  3.03it/s]


# Generate Submissions.csv <a id='section07'></a>

In [23]:
test_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,preprocessed_text
119105,7ca72b5b9c688e9e,"Geez, are you forgetful! We've already discus...",0,0,0,0,0,0,geez forgetful weve already discus marx anarch...
131631,c03f72fd8f8bf54f,Carioca RFA \n\nThanks for your support on my ...,0,0,0,0,0,0,carioca rfa thank support request adminship fi...
125326,9e5b8e8fc1ff2e84,"""\n\n Birthday \n\nNo worries, It's what I do ...",0,0,0,0,0,0,birthday worry enjoy ur daytalke
111256,5332799e706665a6,Pseudoscience category? \n\nI'm assuming that ...,0,0,0,0,0,0,pseudoscience category im assume article pseud...
83590,dfa7d8f0b4366680,"(and if such phrase exists, it would be provid...",0,0,0,0,0,0,phrase exist would provide search engine even ...


In [24]:
# Drop the "comment_text" and "id" columns
test_data.drop(["comment_text","id"], axis=1, inplace=True)

# Move the "preprocessed_text" column to the front
columns = test_data.columns.tolist()
columns = ["preprocessed_text"] + [col for col in columns if col != "preprocessed_text"]
test_data = test_data[columns]

# Print the updated dataframe
test_data.head()

Unnamed: 0,preprocessed_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
119105,geez forgetful weve already discus marx anarch...,0,0,0,0,0,0
131631,carioca rfa thank support request adminship fi...,0,0,0,0,0,0
125326,birthday worry enjoy ur daytalke,0,0,0,0,0,0
111256,pseudoscience category im assume article pseud...,0,0,0,0,0,0
83590,phrase exist would provide search engine even ...,0,0,0,0,0,0


In [25]:
test_data['labels'] = test_data.iloc[:, 1:].values.tolist()
test_data.drop(test_data.columns.values[1:-1].tolist(), inplace=True, axis=1)
test_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['labels'] = test_data.iloc[:, 1:].values.tolist()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data.drop(test_data.columns.values[1:-1].tolist(), inplace=True, axis=1)


Unnamed: 0,preprocessed_text,labels
119105,geez forgetful weve already discus marx anarch...,"[0, 0, 0, 0, 0, 0]"
131631,carioca rfa thank support request adminship fi...,"[0, 0, 0, 0, 0, 0]"
125326,birthday worry enjoy ur daytalke,"[0, 0, 0, 0, 0, 0]"
111256,pseudoscience category im assume article pseud...,"[0, 0, 0, 0, 0, 0]"
83590,phrase exist would provide search engine even ...,"[0, 0, 0, 0, 0, 0]"


In [26]:
test_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN, eval_mode = True)
testing_params = {'batch_size': TRAIN_BATCH_SIZE,
               'shuffle': True,
               'num_workers': 2
                }
test_loader = DataLoader(test_set, **testing_params)

In [27]:
all_test_pred = []

def test(epoch):
    model.eval()

    with torch.inference_mode():

        for _, data in tqdm(enumerate(test_loader, 0)):

            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            outputs = model(ids, mask, token_type_ids)
            probas = torch.sigmoid(outputs)

            rounded_probas = torch.round(probas)  # Round probabilities to 0 or 1

            all_test_pred.append(rounded_probas)

    return torch.cat(all_test_pred)

In [28]:
probas = test(model)

998it [01:54,  8.70it/s]


In [29]:
from sklearn.metrics import f1_score, accuracy_score

# Convert predictions and true labels to numpy arrays
predicted_labels = probas.cpu().numpy()
true_labels = test_data.labels.values

In [30]:
true_labels

array([list([0, 0, 0, 0, 0, 0]), list([0, 0, 0, 0, 0, 0]),
       list([0, 0, 0, 0, 0, 0]), ..., list([0, 0, 0, 0, 0, 0]),
       list([1, 0, 1, 0, 1, 0]), list([0, 0, 0, 0, 0, 0])], dtype=object)

In [31]:
predicted_labels

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]], dtype=float32)

In [32]:

# Convert the true labels to numpy arrays
true_labels = np.array([np.array(label) for label in true_labels])

# Convert the predicted labels to numpy arrays
predicted_labels = np.round(predicted_labels).astype(int)

# Flatten the true labels and predicted labels arrays
true_labels_flat = true_labels.flatten()
predicted_labels_flat = predicted_labels.flatten()

# Calculate the F1 score
f1 = f1_score(true_labels_flat, predicted_labels_flat, average='micro')*100

# Calculate the accuracy
accuracy = accuracy_score(true_labels_flat, predicted_labels_flat)*100

print("F1 Score:", f1)
print("Accuracy:", accuracy)

F1 Score: 93.12183403833099
Accuracy: 93.12183403833099


In [33]:
model = DistilBERTClass()
model.l1.save_pretrained("/content/drive/MyDrive/finetuned_model")


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [34]:
import os

output_dir = os.path.expanduser('/content/drive/MyDrive/finetuned_distilbert')

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

tokenizer.save_pretrained(output_dir)
print('Saved')

Saved
