# Implementing a custom model with a dataset and training it for sentiment analysis

In [11]:
!pip install transformers datasets torch
!pip install --upgrade torch torchvision torchaudio


Collecting torch
  Downloading torch-2.4.0-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting torchvision
  Downloading torchvision-0.19.0-cp310-cp310-manylinux1_x86_64.whl.metadata (6.0 kB)
Collecting torchaudio
  Downloading torchaudio-2.4.0-cp310-cp310-manylinux1_x86_64.whl.metadata (6.4 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting triton==3.0.0 (from torch)
  Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)
Downloading torch-2.4.0-cp310-cp310-manylinux1_x86_64.whl (797.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m797.2/797.2 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl (664.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25

In [12]:
# Load and prepare the data

from datasets import load_dataset

# Fetch the IMDb dataset; the result is a DatasetDict (a dict-like mapping of
# split name -> Dataset), which the two prints below make visible.
dataset = load_dataset("imdb")
print(type(dataset))  # dictionary-like container of splits
print(dataset.column_names)

# Pull out the training and test splits (the "unsupervised" split is not used)
train_dataset, test_dataset = dataset["train"], dataset["test"]

print(train_dataset)


<class 'datasets.dataset_dict.DatasetDict'>
{'train': ['text', 'label'], 'test': ['text', 'label'], 'unsupervised': ['text', 'label']}
Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})


In [13]:
# Preprocessing the data

from transformers import BertTokenizer

# Tokenizer matching the "bert-base-uncased" checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Args:
        examples: a batch as passed by ``Dataset.map(..., batched=True)``;
            only its ``"text"`` entry is read.

    Returns:
        The tokenizer's output for those texts, truncated and padded with
        ``padding="max_length"`` so every sequence has the same length.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")


# Tokenize both splits in batches (train first, then test)
train_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)


RuntimeError: THPDtypeType.tp_dict == nullptr INTERNAL ASSERT FAILED at "../torch/csrc/Dtype.cpp":176, please report a bug to PyTorch. 

In [None]:
# Prepare the data for training

from torch.utils.data import DataLoader

# Have both splits return PyTorch tensors, restricted to the columns used downstream
tensor_columns = ["input_ids", "attention_mask", "label"]
for split in (train_dataset, test_dataset):
    split.set_format(type="torch", columns=tensor_columns)

# Batch the data: the training split is reshuffled, the test split keeps its order
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
test_dataloader = DataLoader(test_dataset, batch_size=8)


In [None]:
# Create the model

from transformers import BertForSequenceClassification

# Pre-trained "bert-base-uncased" weights with a two-class classification head on top
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Train the model

import torch
from torch.optim import AdamW  # transformers.AdamW is deprecated; use the PyTorch optimizer
from torch.optim.lr_scheduler import StepLR

# Set up the device (GPU/CPU) and move the model there BEFORE building the
# optimizer, so the optimizer is created over the parameters on their final device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set up the optimizer.
# lr=2e-5 is in the usual range for fine-tuning BERT; the previous value (0.01)
# is orders of magnitude too large and destroys the pre-trained weights.
# weight_decay=0.0 keeps the default of the transformers.AdamW this replaces.
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)

# Set up the learning rate scheduler: halve the learning rate after every epoch
scheduler = StepLR(optimizer, step_size=1, gamma=0.5)

# Training loop
model.train()
for epoch in range(3):  # Train for 3 epochs
    running_loss = 0.0

    for batch in train_dataloader:
        # Move data to the device
        batch = {k: v.to(device) for k, v in batch.items()}

        # The dataset column is named "label" but the model expects "labels",
        # so take it out of the batch and pass it explicitly.
        labels = batch.pop('label')

        # Forward pass (the model computes the loss when labels are supplied)
        outputs = model(**batch, labels=labels)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Update learning rate once per epoch
    scheduler.step()

    # Report the mean loss over the epoch rather than the last batch's loss,
    # which is a single noisy sample (max(...) guards an empty dataloader).
    mean_loss = running_loss / max(len(train_dataloader), 1)
    print(f"Epoch {epoch + 1} completed. Loss: {mean_loss:.4f}")

KeyboardInterrupt: 

In [None]:
# Evaluate the model

from sklearn.metrics import accuracy_score

# Requires `model`, `device` and `test_dataloader` from the cells above.
model.eval()
predictions, true_labels = [], []

with torch.no_grad():  # no gradients needed for inference
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}

        # The dataset column is named "label": the batch has no "labels" key, and
        # the model's forward() does not accept a `label` argument, so it must be
        # removed from the batch before the forward pass.
        labels = batch.pop("label")

        # Forward pass
        logits = model(**batch).logits

        # Collect flat Python lists directly; no flattening step needed afterwards
        predictions.extend(torch.argmax(logits, dim=-1).cpu().tolist())
        true_labels.extend(labels.cpu().tolist())

# Calculate accuracy over the whole test split
accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy: {accuracy:.4f}")



NameError: name 'model' is not defined