In [12]:
import pandas as pd
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import wandb
import torch

In [None]:
# Open the Weights & Biases run that this notebook logs to.
wandb.init(
    entity="yunchiz-new-york-university",
    project="Mlops-classification",
)

In [13]:
# Fixed seed so the train/test partition is identical on every Restart & Run All.
SEED = 42

# Raw dataset; the 'short_description' column is the model input and
# 'category' is the target label.
data = pd.read_csv("./dataset/NewsCategorizer.csv")

# 80/20 split. Stratifying on the label keeps the category proportions the
# same in both partitions, so every category seen at test time was also
# present when the label encoder was fitted on the training labels.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['short_description'],
    data['category'],
    test_size=0.2,
    shuffle=True,
    random_state=SEED,
    stratify=data['category'],
)

In [18]:
# Average number of whitespace-separated words per short description.
# The vectorised .str accessor replaces a per-row Python lambda and skips
# missing descriptions instead of raising on them.
data['short_description'].str.split().str.len().mean()

np.float64(22.9839)

In [3]:
# Learn the category -> integer mapping from the training labels only, then
# apply that same mapping to both splits.
label_encoder = LabelEncoder()
label_encoder.fit(train_labels)

train_labels_encoded = label_encoder.transform(train_labels)
test_labels_encoded = label_encoder.transform(test_labels)

In [5]:
# Pretrained checkpoint and local cache directory shared by tokenizer and model.
checkpoint = 'distilbert-base-uncased'
cache_location = './model'

# The classification head gets one output per distinct value in 'category'.
num_categories = len(data['category'].unique())

tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint, cache_dir=cache_location)
model = DistilBertForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_categories,
    cache_dir=cache_location,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Both splits are tokenised with the same options: truncate over-long inputs
# and pad every sequence in the split to a common length.
tokenizer_options = dict(truncation=True, padding=True)

train_encodings = tokenizer(train_texts.tolist(), **tokenizer_options)
test_encodings = tokenizer(test_texts.tolist(), **tokenizer_options)

In [7]:
# Convert the integer-encoded labels of each split into torch tensors.
train_labels_tensor, test_labels_tensor = (
    torch.tensor(encoded_labels)
    for encoded_labels in (train_labels_encoded, test_labels_encoded)
)

In [8]:
class NewsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping (anything exposing ``.items()``) from field name to
            a sequence that can be indexed by sample position.
        labels: Indexable, sized collection holding one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the labels collection.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors plus a ``'labels'`` entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # The label is passed through exactly as stored, without re-wrapping.
        sample['labels'] = self.labels[idx]
        return sample

In [9]:
# Wrap each split's encodings and label tensor in a NewsDataset.
train_dataset = NewsDataset(encodings=train_encodings, labels=train_labels_tensor)
test_dataset = NewsDataset(encodings=test_encodings, labels=test_labels_tensor)

In [10]:
# Training configuration collected in one place, then handed to
# TrainingArguments unchanged.
training_config = dict(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

training_args = TrainingArguments(**training_config)

In [30]:
def compute_metrics(eval_pred):
    """Compute accuracy from the (logits, labels) pair supplied by the Trainer.

    Args:
        eval_pred: Pair that unpacks into the model logits and the reference
            label ids for the evaluated samples.

    Returns:
        Dict with a single ``"accuracy"`` entry (fraction of samples whose
        arg-max logit equals the label).
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float((predictions == labels).mean())}


# Without compute_metrics, evaluation reports only the loss; passing it makes
# trainer.evaluate() report classification accuracy as well.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Step,Training Loss
10,2.3221


KeyboardInterrupt: 

In [None]:
# Finish the Weights & Biases run.
# NOTE(review): this cell sits before the `trainer.evaluate()` cell; if the
# Trainer reports to W&B, evaluation would then run after the run has been
# finished. Consider running this cell last. TODO confirm the intended order.
wandb.finish()

In [32]:
# Evaluate on the Trainer's eval_dataset and show the metrics as the cell's
# output; previously they were computed but never displayed.
results = trainer.evaluate()
results

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x3869709d0>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 38ce2afd0, raw_cell="results = trainer.evaluate()" store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/Users/yunchizhao/NYU/2025%20Spring/9183%20MLSys/project/ECE-GY-9183-Project/classification/classification.ipynb#X13sZmlsZQ%3D%3D>,),kwargs {}:


BrokenPipeError: [Errno 32] Broken pipe

KeyboardInterrupt: 

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x3869709d0>> (for post_run_cell), with arguments args (<ExecutionResult object at 38cf7b2d0, execution_count=32 error_before_exec=None error_in_exec= info=<ExecutionInfo object at 38ce2afd0, raw_cell="results = trainer.evaluate()" store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell:/Users/yunchizhao/NYU/2025%20Spring/9183%20MLSys/project/ECE-GY-9183-Project/classification/classification.ipynb#X13sZmlsZQ%3D%3D> result=None>,),kwargs {}:


BrokenPipeError: [Errno 32] Broken pipe

In [11]:
def classify(text):
    """Predict the integer class index for a single piece of text.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: The raw string to classify.

    Returns:
        int: Index of the highest-scoring logit. This is the encoded label id,
        not the category name.
    """
    # Switch off dropout so repeated calls on the same text give the same
    # prediction, even if the model was left in training mode.
    model.eval()

    inputs = tokenizer([text], max_length=512, truncation=True, padding=True, return_tensors="pt")
    # The Trainer may have moved the model to an accelerator; the inputs must
    # live on the same device as the model weights.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits

    return logits.argmax(dim=-1).item()

# Sample news passage used to try out classify() on a single input.
text = """New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York.
A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband.
Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other."""

# classify() returns the arg-max class index, so this prints an integer id
# rather than a category name.
predicted_class = classify(text)
print(f"Predicted class: {predicted_class}")

Predicted class: 8
