# Model and Dataset:
- Model (Distilbert): https://huggingface.co/philschmid/tiny-distilbert-classification
- Dataset (amazon_polarity): https://huggingface.co/datasets/amazon_polarity

Josephine Lo (jlo10)


In [None]:
import re
import unicodedata

import pandas as pd
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW

In [None]:
# Pick the compute device: CUDA when PyTorch reports it as available,
# otherwise the CPU. Then ask PyTorch to release any cached CUDA memory.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
torch.cuda.empty_cache()

## Load and Tokenize Data

In [None]:
# Load the dataset by its identifier; the result is indexed by split name
# ('train' / 'test') further down in this notebook.
DATASET_NAME = 'amazon_polarity'
dataset = load_dataset(DATASET_NAME)

In [None]:
def clean_text(text):
    """Return *text* reduced to plain ASCII.

    The string is first NFKD-normalised so that accented letters are split
    into a base letter plus combining marks; encoding to ASCII with
    ``errors='ignore'`` then drops the marks and every other non-ASCII
    character, keeping the base letter (``"café"`` -> ``"cafe"``).

    The normalisation has to run *before* any non-ASCII stripping: removing
    non-ASCII characters first would delete accented letters outright and
    leave the normalisation with nothing to do.

    Args:
        text: The input string.

    Returns:
        A ``str`` containing only ASCII characters.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    # Bytes produced by an ASCII encode are always valid ASCII, so the
    # decode cannot fail.
    return decomposed.encode('ascii', 'ignore').decode('ascii')

In [None]:
def _with_clean_content(example):
    """Build the replacement 'content' field for one example via clean_text."""
    cleaned = clean_text(example['content'])
    return {'content': cleaned}


# Run the cleaning step over the dataset, one example at a time.
dataset = dataset.map(_with_clean_content)

Map:   0%|          | 0/3600000 [00:00<?, ? examples/s]

Map:   0%|          | 0/400000 [00:00<?, ? examples/s]

In [None]:
# The tokenizer and the sequence-classification model are both loaded from
# the same pretrained checkpoint identifier.
CHECKPOINT = 'philschmid/tiny-distilbert-classification'
tokenizer = DistilBertTokenizer.from_pretrained(CHECKPOINT)
model = DistilBertForSequenceClassification.from_pretrained(CHECKPOINT)

In [None]:
# Tokenize the 'content' column of each split. Sequences are truncated by the
# tokenizer and padded to a common length so they can be returned as tensors.
# NOTE(review): this encodes each whole split in a single call, which
# presumably needs a lot of memory for splits this large -- confirm it fits.
train_split = dataset['train']
test_split = dataset['test']

train_encodings = tokenizer(train_split['content'], truncation=True, padding=True, return_tensors='pt')
test_encodings = tokenizer(test_split['content'], truncation=True, padding=True, return_tensors='pt')

# Labels are read from the same split objects that supplied the text, so row
# i of the encodings lines up with label i.
train_labels = torch.tensor(train_split['label'])
test_labels = torch.tensor(test_split['label'])

# Each dataset item is an (input_ids, attention_mask, label) triple; the
# training and evaluation loops unpack batches in exactly this order.
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

# Shuffle only the training batches; the test order stays fixed.
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)


## Training and Evaluation

In [None]:
# Fine-tune `model` on `train_dataloader`: AdamW with a fixed learning rate,
# a fixed number of epochs, and the mean per-batch loss printed per epoch.
optimizer = AdamW(model.parameters(), lr=5e-5)

epochs = 3
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(epochs):
    model.train()  # put the module into training mode for this epoch
    running_loss = 0.0
    progress = tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{epochs}')

    for input_ids, attention_mask, labels in progress:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Move the batch onto the same device as the model.
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        logits = model(input_ids, attention_mask=attention_mask).logits

        # Cross-entropy between the raw logits and the integer class labels.
        batch_loss = torch.nn.functional.cross_entropy(logits, labels)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    # Mean over the number of batches, not the number of examples.
    average_loss = running_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{epochs}, Average Loss: {average_loss}")


In [None]:
# Score `model` on every batch of `test_dataloader`, then print overall
# accuracy followed by scikit-learn's classification report.
# `classification_report` is provided by sklearn.metrics.
model.eval()  # put the module into evaluation mode
all_predictions = []
true_labels = []

# Inference only: no gradients need to be tracked.
with torch.no_grad():
    for input_ids, attention_mask, labels in tqdm(test_dataloader, desc='Testing'):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        logits = model(input_ids, attention_mask=attention_mask).logits

        # The predicted class is the index of the largest logit in each row.
        predictions = torch.argmax(logits, dim=1)
        all_predictions.extend(predictions.tolist())
        # Labels are only compared on the host, so they are never moved to
        # the device.
        true_labels.extend(labels.tolist())

accuracy = accuracy_score(true_labels, all_predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")
print(classification_report(true_labels, all_predictions))