In [9]:
import torch
from transformers import AutoTokenizer,AutoModelForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW
torch.__version__
torch.cuda.is_available()
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name="distilbert-base-uncased"
tokenizer=AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

model.to(device)

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
pre_classifier.bias     | MISSING    | 
pre_classifier.weight   | MISSING    | 
classifier.bias         | MISSING    | 
classifier.weight       | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSelfAttention(
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [10]:
train_texts = [
    "This movie is so dangerous for kids, governments should censor the some parts of that. ",
    "the acting of actors is so professional, the movie is the best of 2025",
    "I can not watch that to the end due to horrible scens",
    "maybe the movie can win the oscar , because of it's scenario "
]
#0, negative
#1, positive
train_labels = torch.tensor([0, 1, 0, 1])

In [15]:
encoding = tokenizer(train_texts, padding=True, truncation=True, return_tensors="pt")
dataset = TensorDataset(encoding["input_ids"], encoding["attention_mask"], train_labels)
loader = DataLoader(dataset, batch_size=2, shuffle=True)
optimizer = AdamW(model.parameters(), lr=1e-5)
epochs = 3
model.train()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSelfAttention(
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [19]:
for epoch in range(epochs):
  total_loss = 0
  for batch in loader:
    input_ids, attention_mask, labels = [b.to(device) for b in  batch]
    outputs  = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    total_loss += loss.item()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
  avg_loss = total_loss / len(loader)
  print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")
input_ids, attention_mask, labels = [b.to(device) for b in  batch]
outputs  = model(input_ids, attention_mask=attention_mask, labels=labels)
loss = outputs.loss
total_loss += loss.item()
optimizer.zero_grad()
loss.backward()
optimizer.step()
avg_loss = total_loss / len(loader)

Epoch 1/3, Loss: 0.5397
Epoch 2/3, Loss: 0.4826
Epoch 3/3, Loss: 0.4599


In [20]:
model.eval()
test_text = "I'm facinated about next episode."
with torch.no_grad():
  test_encoding = tokenizer(test_text, padding=True, truncation=True, return_tensors="pt")
  outputs = model(**test_encoding.to(device))
  print(outputs)
  logits = outputs.logits
  probabilities = torch.softmax(logits, dim=1)
  print(f"Probabilities: {probabilities}")
  predicted_class = torch.argmax(logits, dim=1).item()
  print(f"Predicted Class: {predicted_class}")

SequenceClassifierOutput(loss=None, logits=tensor([[-0.1818,  0.0022]]), hidden_states=None, attentions=None)
Probabilities: tensor([[0.4541, 0.5459]])
Predicted Class: 1


In [24]:
outputs = model(**test_encoding.to(device))
logits = outputs.logits
probabilities = torch.softmax(logits, dim=1)
predicted_class = torch.argmax(logits, dim=1).item()