In [None]:
#First things first, we want to import our dataset and load it into a variable we can work with. We also want to load the transformer model we're working with,
#which in this case is DistilBERT. 
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset

# Load the "ModApte" configuration of the "reuters21578" dataset.
dataset = load_dataset(path="reuters21578", name="ModApte")

In [None]:
#This cell keeps only the examples that have at least one topic. (If you visualize the dataset, there are also a lot of columns we just do not need; those are pared down to the "text" and "label" columns in a later cell.)
from transformers import DistilBertTokenizer
from datasets import DatasetDict

def has_topic(example):
    """Tell whether an example carries at least one topic.

    Returns False when the 'topics' key is absent or its value is empty/falsy,
    and True otherwise.
    """
    if 'topics' not in example:
        return False
    return bool(example['topics'])

# Keep only the examples for which has_topic returns True
dataset = dataset.filter(has_topic)

In [None]:
# Many entries in the topics column are lists of several relevant topics. To simplify things, we take just the first one and call it "label".
def preprocess(examples):
    """Add a 'label' entry holding the first topic of each example in the batch.

    examples['topics'] is a list of per-example topic lists. An empty (or
    otherwise falsy) topic list yields None for that example. The batch is
    modified in place and returned.
    """
    first_topics = []
    for topic_list in examples['topics']:
        first_topics.append(topic_list[0] if topic_list else None)
    examples['label'] = first_topics
    return examples

# Apply the preprocess function to the dataset in batches, adding the 'label' column
dataset = dataset.map(preprocess, batched=True)


In [None]:
# Columns that are not needed for classification; they are dropped from the dataset.
columns_to_remove = [
    'text_type',
    'topics',
    'lewis_split',
    'cgis_split',
    'old_id',
    'new_id',
    'places',
    'people',
    'orgs',
    'exchanges',
    'date',
    'title',
]
dataset = dataset.remove_columns(columns_to_remove)

In [None]:
# Every distinct label found in any split, in sorted order
unique_labels = sorted(set().union(*(dataset[split]['label'] for split in dataset.keys())))

# Map each label seen in the training split to an integer ID; IDs follow the sorted label order
label_to_id = {name: position for position, name in enumerate(sorted(set(dataset["train"]["label"])))}

# Inverse mapping: the model outputs IDs, and this turns them back into label names
# (the string labels themselves cannot stay in the dataset the model trains on).
id_to_label = dict(zip(label_to_id.values(), label_to_id.keys()))

predicted_label_ids = [0, 2, 1]  # Example list of predicted label IDs from your model
predicted_labels = [id_to_label[class_id] for class_id in predicted_label_ids]


In [None]:
import json

# Saving the label_to_id mapping makes it possible to decode the model's predictions separately, after the model has been saved.
with open('label_to_id.json', 'w') as mapping_file:
    mapping_file.write(json.dumps(label_to_id))


In [None]:
# predicted_labels holds the label names looked up from the example predicted label IDs
print(predicted_labels)

# Inspect the label_to_id mapping (label name -> integer ID)
print(label_to_id)


In [None]:
from datasets import DatasetDict

def label_to_int(example):
    """Attach integer class IDs to a batch under the 'labels' key.

    Each entry of example['label'] is looked up in label_to_id; a label that is
    missing from the mapping gets -1. The batch is modified in place and returned.
    """
    label_ids = []
    for label_name in example['label']:
        label_ids.append(label_to_id.get(label_name, -1))
    example['labels'] = label_ids
    return example

# Apply the label_to_int function to all splits in the dataset (in batches),
# adding the integer 'labels' column next to the string 'label' column
dataset = dataset.map(label_to_int, batched = True)

In [None]:
# Now we load the DistilBERT tokenizer and tokenize the dataset, i.e. put it in a format the model can understand

# Tokenizer loaded from the "distilbert-base-uncased" checkpoint
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    """Tokenize the 'text' entries of a batch with the module-level tokenizer.

    Sequences are truncated and padded to a fixed length of 512 tokens.
    Returns whatever the tokenizer call returns.
    """
    return tokenizer(
        examples['text'],
        max_length=512,
        truncation=True,
        padding='max_length',
    )

# Tokenize the dataset in batches; the result is kept under a separate name, so `dataset` itself stays un-tokenized
tokenized_dataset = dataset.map(tokenize_function, batched=True)


In [None]:
# The string 'label' column is no longer needed once the integer 'labels' column exists, so drop it
tokenized_dataset = tokenized_dataset.remove_columns('label')

In [None]:
# Peek at the first rows of dataset["train"], which now carries the numericalized 'labels' column.
# Note: this shows the un-tokenized `dataset`; the token columns live in `tokenized_dataset`.
import pandas as pd
pd.DataFrame(dataset["train"]).head()

In [None]:
# The model needs one output class per known label. label_to_id holds exactly one
# entry per label ID, so its size is the class count (no need to scan a whole column).
num_labels = len(label_to_id)

# Passing id2label/label2id stores the label names in the model config, so the
# saved model carries its own ID <-> name mapping alongside the weights.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels,
    id2label=id_to_label,
    label2id=label_to_id,
)

In [None]:
# Hyperparameters and output locations for the fine-tuning run
training_args = TrainingArguments(
    output_dir='./results',          # where model checkpoints are written
    logging_dir='./logs',            # where logs are stored
    num_train_epochs=30,             # number of passes over the training set
    learning_rate=2e-5,              # learning rate
    weight_decay=0.01,               # strength of weight decay
    warmup_steps=500,                # warmup steps for the learning rate scheduler
    per_device_train_batch_size=16,  # training batch size
    per_device_eval_batch_size=64,   # evaluation batch size
)

import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for one evaluation pass.

    Args:
        eval_pred: A (logits, labels) pair. The logits are arg-maxed over their
            last axis to obtain the predicted class IDs.

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Averaging mode for precision/recall/F1. Options are 'micro', 'macro',
    # 'weighted', or None (None gives the scores for each class).
    average_type = 'macro'
    # zero_division=0 scores a class with no predicted/true samples as 0 -- the same
    # value the default produces -- without emitting a warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average=average_type, zero_division=0
    )
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc, # <- mostly interested in this one!
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


# Initialize the Trainer.
#
# label_to_int assigns -1 to every label that is missing from label_to_id, and -1
# is not a valid class ID for the model. The Trainer keeps a reference to the
# dataset objects it is given here, so examples with a negative label have to be
# dropped *before* they are handed over -- filtering `tokenized_dataset` in a
# later cell only rebinds that name and would not change what the Trainer holds.
def _has_valid_label(example):
    """Return True when the example's integer label is a non-negative class ID."""
    return example['labels'] >= 0

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'].filter(_has_valid_label),
    eval_dataset=tokenized_dataset['test'].filter(_has_valid_label),
    compute_metrics=compute_metrics  # Pass the compute_metrics function
)


In [None]:
# Collect the integer labels present in any split. A -1 in this set marks
# examples whose label was not found in label_to_id (see label_to_int).
unique_labels = set()
for split in dataset.keys():
    unique_labels.update(dataset[split]['labels'])
print(unique_labels)

import pandas as pd
# A `pd.DataFrame(tokenized_dataset['train']).tail()` expression used to sit here.
# It was not the last statement of the cell, so its result was never displayed --
# it only converted the whole padded training split to a DataFrame and threw the
# result away. The preview cell further down shows the same tail.

# Negative numericalized labels break the model, so examples carrying them must be filtered out.
def filter_negative_labels(example):
    """Return True when the example's 'labels' value is zero or greater."""
    return 0 <= example['labels']

# Drop the examples with a negative label from both the raw and the tokenized dataset.
# NOTE(review): these assignments rebind the two names to the filtered results; any
# object that was handed the earlier datasets (e.g. a Trainer built in a previous
# cell) keeps referencing what it was given.
dataset = dataset.filter(filter_negative_labels)
tokenized_dataset = tokenized_dataset.filter(filter_negative_labels)



In [None]:
# Compare which integer labels are present in the train and test splits of the tokenized data
unique_labels_train, unique_labels_test = (
    set(tokenized_dataset[split_name]['labels']) for split_name in ('train', 'test')
)
print("Unique labels in train set:", unique_labels_train)
print("Unique labels in test set:", unique_labels_test)


In [None]:
import pandas
# Gather the integer labels present across all splits of the dataset
unique_labels = set()
for split in dataset.keys():
    unique_labels |= set(dataset[split]['labels'])
print(unique_labels)

# Show the last rows of the tokenized training split
pandas.DataFrame(tokenized_dataset['train']).tail()



In [None]:
import os
from transformers.trainer_utils import get_last_checkpoint

# resume_from_checkpoint=True fails when the output directory holds no checkpoint
# yet (e.g. on the very first run). Look the latest checkpoint up explicitly
# instead: resume from it when there is one, otherwise train from scratch.
checkpoint_dir = training_args.output_dir
last_checkpoint = get_last_checkpoint(checkpoint_dir) if os.path.isdir(checkpoint_dir) else None
trainer.train(resume_from_checkpoint=last_checkpoint)

In [None]:
# Directory the trained model and tokenizer are saved to (and later reloaded from)
save_path = "./Classification_Model/"

In [None]:
# Save the trained model
# NOTE(review): both calls write into save_path; trainer.save_model presumably
# already saves the model, so the second call may be redundant -- confirm.
trainer.save_model(save_path)
model.save_pretrained(save_path)

In [None]:
# Save the tokenizer into the same directory as the model, so both can be reloaded from save_path
tokenizer.save_pretrained(save_path)

In [None]:
# Run evaluation with the Trainer; the returned result is shown as the cell output
trainer.evaluate()

In [None]:
# This is the text we want to test the model on (it is classified in the next cell)
text="Bubble teas fall under two categories: teas without milk and milk teas. Both varieties come with a choice of black, green, or oolong tea as the base.[1] Milk teas usually include powdered or fresh milk, but may also use condensed milk, almond milk, soy milk, or coconut milk"

In [20]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Reload the tokenizer and the fine-tuned model from the directory they were saved to
modelpath = save_path
tokenizer = AutoTokenizer.from_pretrained(modelpath)
model = AutoModelForSequenceClassification.from_pretrained(modelpath)

# Tokenize the input text into PyTorch tensors
encoded_input = tokenizer(
    text,
    max_length=512,
    truncation=True,
    padding=True,
    return_tensors='pt',
)

# Forward pass; no gradients are needed for prediction
with torch.no_grad():
    logits = model(**encoded_input).logits

# Softmax turns the logits into probabilities; the index of the largest one is the predicted class ID
probabilities = torch.softmax(logits, dim=1)
predicted_class_id = int(torch.argmax(probabilities))

# id_to_label must already exist (it is built in the label-mapping cell)
predicted_class_label = id_to_label[predicted_class_id]

print(f"Predicted Class ID: {predicted_class_id}, Predicted Class Label: {predicted_class_label}")


Predicted Class ID: 70, Predicted Class Label: tea
