<a href="https://colab.research.google.com/github/YuhaiW/00/blob/main/Untitled17.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
!pip install accelerate -U



In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
import random

# Pick the compute device: the GPU when PyTorch reports CUDA as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Generate a synthetic dataset of random integer "vitals" with a rule-based label.
# Python's RNG is seeded first so that a fresh Restart & Run All produces the
# same records every time.
SEED = 42
NUM_SAMPLES = 10000
random.seed(SEED)

data = []
for _ in range(NUM_SAMPLES):
    # Draw order (age, blood pressure, weight) matters for reproducibility.
    age = random.randint(1, 100)
    blood_pressure = random.randint(60, 180)
    weight = random.randint(30, 150)
    # Labelling rule: positive (1) when age is above 80, or when blood pressure
    # is above 140 and weight is above 120; otherwise negative (0).
    label = int(age > 80 or (blood_pressure > 140 and weight > 120))
    data.append({"age": age, "blood_pressure": blood_pressure, "weight": weight, "label": label})

# Preprocess the data
# Render each record as a short sentence so it can be fed to a text tokenizer,
# e.g. "age 42 blood pressure 120 weight 70".
texts = [
    f"age {item['age']} blood pressure {item['blood_pressure']} weight {item['weight']}"
    for item in data
]
labels = [item["label"] for item in data]

# Hold out 20% of the samples for testing.
# random_state fixes the shuffle so the split is identical on every run;
# stratify=labels keeps the class proportions the same in the train and test splits.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

# Tokenization
# Load the tokenizer that belongs to the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Both splits are encoded with the same options:
#   truncation=True      - cut any sequence that is longer than max_length
#   padding=True         - pad shorter sequences up to the longest sequence in the
#                          same call, so each call yields rectangular tensors
#   max_length=128       - upper bound on the number of tokens kept per sequence
#   return_tensors="pt"  - return PyTorch tensors instead of Python lists
encode_options = dict(truncation=True, padding=True, max_length=128, return_tensors="pt")
train_encodings = tokenizer(train_texts, **encode_options)
test_encodings = tokenizer(test_texts, **encode_options)

# Convert labels to tensors
train_labels_tensor, test_labels_tensor = (
    torch.tensor(split_labels) for split_labels in (train_labels, test_labels)
)

# Model initialization
# Seed PyTorch before building the model: the classification head is not part of
# the 'distilbert-base-uncased' checkpoint and is freshly (randomly) initialized,
# so without a seed every run starts from different weights.
torch.manual_seed(42)

# DistilBERT encoder with pretrained weights plus a new sequence-classification
# head; num_labels=2 sizes that head for binary classification.
# The head still has to be trained on this task before its predictions mean anything.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2).to(device)

# Hyperparameters
EPOCHS = 3
BATCH_SIZE = 16
LR = 5e-5

# AdamW updates every model parameter (encoder and head) with learning rate LR.
optimizer = optim.AdamW(model.parameters(), lr=LR)
# Cross-entropy between the two-way logits and the integer class labels.
loss_fn = nn.CrossEntropyLoss()

# Training Loop
# Each epoch walks the training tensors in consecutive slices of BATCH_SIZE rows,
# in the same order every epoch, and takes one optimizer step per slice.
num_train = len(train_texts)
for epoch in range(EPOCHS):
    model.train()
    for i in range(0, num_train, BATCH_SIZE):
        batch = slice(i, i + BATCH_SIZE)

        # Move only the current slice to the device.
        input_ids = train_encodings["input_ids"][batch].to(device)
        attention_mask = train_encodings["attention_mask"][batch].to(device)
        labels_batch = train_labels_tensor[batch].to(device)

        # Forward pass and loss on this batch.
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels_batch)

        # Clear gradients left over from the previous step, then backpropagate and update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report the current batch loss on every 10th step.
        if (i // BATCH_SIZE) % 10 == 0:
            print(f"Epoch: {epoch}, Step: {i//BATCH_SIZE}, Loss: {loss.item()}")

# Evaluation
# Score the held-out split batch by batch with gradients disabled, counting how
# many predicted classes match the true labels.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for i in range(0, len(test_texts), BATCH_SIZE):
        batch = slice(i, i + BATCH_SIZE)
        input_ids = test_encodings["input_ids"][batch].to(device)
        attention_mask = test_encodings["attention_mask"][batch].to(device)
        labels_batch = test_labels_tensor[batch].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit in each row.
        predicted = outputs.logits.argmax(dim=1)

        total += labels_batch.size(0)
        correct += torch.eq(predicted, labels_batch).sum().item()

# Accuracy as a percentage of all evaluated samples.
print(f"Accuracy: {100 * correct / total}%")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 0, Step: 0, Loss: 0.6765906810760498
Epoch: 0, Step: 10, Loss: 0.5349210500717163
Epoch: 0, Step: 20, Loss: 0.4223354756832123
Epoch: 0, Step: 30, Loss: 0.5255326628684998
Epoch: 0, Step: 40, Loss: 0.20401522517204285
Epoch: 0, Step: 50, Loss: 0.055156201124191284
Epoch: 0, Step: 60, Loss: 0.2604790925979614
Epoch: 0, Step: 70, Loss: 0.04515562206506729
Epoch: 0, Step: 80, Loss: 0.276427686214447
Epoch: 0, Step: 90, Loss: 0.0654018223285675
Epoch: 0, Step: 100, Loss: 0.28220316767692566
Epoch: 0, Step: 110, Loss: 0.17815475165843964
Epoch: 0, Step: 120, Loss: 0.08412899821996689
Epoch: 0, Step: 130, Loss: 0.07154496014118195
Epoch: 0, Step: 140, Loss: 0.1600005179643631
Epoch: 0, Step: 150, Loss: 0.22450990974903107
Epoch: 0, Step: 160, Loss: 0.036532480269670486
Epoch: 0, Step: 170, Loss: 0.07497560977935791
Epoch: 0, Step: 180, Loss: 0.034460555762052536
Epoch: 0, Step: 190, Loss: 0.003224604297429323
Epoch: 0, Step: 200, Loss: 0.06172981858253479
Epoch: 0, Step: 210, Loss: 0.

In [11]:
# NOTE(review): this repeats the `accelerate` install from the first cell of the
# notebook, and no visible cell imports `accelerate` directly — confirm whether
# this cell is still needed.
!pip install accelerate -U


