In [1]:
!pip install torch
!pip install transformers
!pip install accelerate
!pip install torchtext
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m82.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m89.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading s

In [2]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForSequenceClassification, DistilBertForSequenceClassification
from torch.nn import KLDivLoss
import torch.optim as optim
import time
import os

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Load IMDB dataset using pandas, ignoring errors
try:
    df = pd.read_csv('IMDB Dataset.csv', error_bad_lines=False)
except pd.errors.ParserError:
    df = pd.read_csv('IMDB Dataset.csv', engine='python', error_bad_lines=False)

# Split into train and test datasets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenize and preprocess the data
def preprocess(df):
    inputs = tokenizer(df['review'].tolist(), truncation=True, padding='max_length', max_length=512, return_tensors='pt')
    labels = torch.tensor((df['sentiment'] == 'positive').astype(int).tolist())
    return {'input_ids': inputs['input_ids'], 'attention_mask': inputs['attention_mask']}, labels

train_data = preprocess(train_df)
test_data = preprocess(test_df)

# Define models
teacher_model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
student_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

# Set up data loaders
train_loader = torch.utils.data.DataLoader(list(zip(train_data[0]['input_ids'], train_data[0]['attention_mask'], train_data[1])), batch_size=16, shuffle=True)
test_loader = torch.utils.data.DataLoader(list(zip(test_data[0]['input_ids'], test_data[0]['attention_mask'], test_data[1])), batch_size=16, shuffle=False)

# Define distillation function
def distill(student, teacher, train_loader, optimizer, loss_fct):
    # Training loop
    start_time = time.time()

    for epoch in range(3):
        student.train()
        teacher.eval()

        for batch in train_loader:
            optimizer.zero_grad()

            # Forward pass through the teacher model
            with torch.no_grad():
                teacher_logits = teacher(batch[0].to(device), attention_mask=batch[1].to(device)).logits

            # Forward pass through the student model
            student_logits = student(batch[0].to(device), attention_mask=batch[1].to(device)).logits

            # Compute the loss
            loss = loss_fct(
                torch.log_softmax(student_logits, dim=-1),
                torch.softmax(teacher_logits, dim=-1)
            )

            # Backward pass and optimization step
            loss.backward()
            optimizer.step()

    end_time = time.time()
    training_time = end_time - start_time
    print(f"Finished Training, time taken: {training_time:.2f}s")

# Define evaluation function
def evaluate(model, test_loader):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in test_loader:
            inputs = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)

            outputs = model(inputs, attention_mask=attention_mask)
            _, predicted = torch.max(outputs.logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f"Accuracy of the network on the test data: {100 * correct / total}%")

def main():
    # Initialize models
    teacher_model.to(device)
    student_model.to(device)

    # Define optimizer and loss function
    optimizer = optim.Adam(student_model.parameters(), lr=2e-5)
    loss_fct = KLDivLoss(reduction='batchmean')

    # Perform distillation
    distill(student_model, teacher_model, train_loader, optimizer, loss_fct)

    # Evaluate the student model
    evaluate(student_model, test_loader)

if __name__ == "__main__":
    main()


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt: 0.00B [00:00, ?B/s]

Downloading (…)/main/tokenizer.json: 0.00B [00:00, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



  df = pd.read_csv('IMDB Dataset.csv', error_bad_lines=False)


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.

Finished Training, time taken: 9730.48s
Accuracy of the network on the test data: 50.18%
