In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import Dataset, DatasetDict
import numpy as np
import torch
from torchinfo import summary

In [2]:
# 1. Load the training CSV into a DataFrame
csv_path = "Final-Project/data/train.csv"
df = pd.read_csv(csv_path)

In [3]:
# 2. Hold out 20% of the rows as a test set; the fixed random_state keeps the
#    split identical from run to run
split_options = {"test_size": 0.2, "random_state": 42}
train_df, test_df = train_test_split(df, **split_options)

n_train_rows, n_test_rows = len(train_df), len(test_df)
print("Training set size:", n_train_rows)
print("Test set size:", n_test_rows)

Training set size: 6090
Test set size: 1523


In [4]:
# 3. Convert Pandas DataFrames to Hugging Face Datasets.
# Both frames come out of a shuffled train/test split, so their index is no
# longer a plain RangeIndex. With the default settings from_pandas would store
# that leftover index as an extra "__index_level_0__" column in each Dataset;
# preserve_index=False drops it so only the CSV columns are carried forward.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Bundle both splits so they can be processed together under "train" / "test".
datasets = DatasetDict({"train": train_dataset, "test": test_dataset})

In [5]:
# 4. Load the pretrained tokenizer for the uncased BERT base checkpoint
tokenizer_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)

In [6]:
def tokenize_function(examples):
    """Tokenize the ``text`` field of ``examples`` with the notebook-level ``tokenizer``.

    The tokenizer is asked to truncate and to pad with the ``'max_length'``
    strategy, using ``max_length=280``.

    Args:
        examples: Mapping with a ``'text'`` entry (the key must match the CSV
            column name).

    Returns:
        Whatever ``tokenizer(...)`` returns for the given texts.
    """
    texts = examples['text']
    encoding_options = dict(padding='max_length', truncation=True, max_length=280)
    return tokenizer(texts, **encoding_options)

In [7]:
# 5. Run the tokenizer over the train and test splits, passing rows in batches
tokenized_datasets = datasets.map(function=tokenize_function, batched=True)

Map:   0%|          | 0/6090 [00:00<?, ? examples/s]

Map:   0%|          | 0/1523 [00:00<?, ? examples/s]

In [8]:
# 6. Pick the compute device first (first CUDA GPU when available, CPU otherwise),
#    then load BERT with a 2-class classification head and move it there
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0") if use_cuda else torch.device("cpu")

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model = model.to(device)
print("Using device:", device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda:0


In [9]:
# 7. Build the torchinfo summary of the model and show it as the cell output
model_summary = summary(model)
model_summary

Layer (type:depth-idx)                                       Param #
BertForSequenceClassification                                --
├─BertModel: 1-1                                             --
│    └─BertEmbeddings: 2-1                                   --
│    │    └─Embedding: 3-1                                   23,440,896
│    │    └─Embedding: 3-2                                   393,216
│    │    └─Embedding: 3-3                                   1,536
│    │    └─LayerNorm: 3-4                                   1,536
│    │    └─Dropout: 3-5                                     --
│    └─BertEncoder: 2-2                                      --
│    │    └─ModuleList: 3-6                                  85,054,464
│    └─BertPooler: 2-3                                       --
│    │    └─Linear: 3-7                                      590,592
│    │    └─Tanh: 3-8                                        --
├─Dropout: 1-2                                               --
├─L

In [10]:
# 8. Define training arguments
# NOTE(review): no evaluation is scheduled during training here, so nothing
# monitors overfitting across the 50 epochs. Consider per-epoch evaluation on a
# separate validation split combined with early stopping.
training_args = TrainingArguments(
    output_dir='./results',            # checkpoints are written here
    num_train_epochs=50,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10,                  # report the training loss every 10 steps
    save_strategy="epoch",             # checkpoint at the end of every epoch
    # Without a cap, 50 epochs leave 50 full checkpoints in output_dir;
    # keep only the two most recent ones on disk.
    save_total_limit=2,
    label_names=["labels"],
)

In [11]:
# 9. Define a compute_metrics function to calculate accuracy
def compute_metrics(logits, labels=None):
    """Compute classification accuracy.

    Can be called in two ways:

    * ``compute_metrics(predictions, labels)`` - ``predictions`` may be either
      already-decided class ids (same shape as ``labels``) or raw per-class
      scores with one extra trailing axis, in which case the arg-max over that
      last axis is taken first.
    * ``compute_metrics(eval_pred)`` - a single object that unpacks into a
      ``(predictions, labels)`` pair. This is the shape of call a Hugging Face
      ``Trainer`` makes for its ``compute_metrics`` callback (assumes the
      installed ``EvalPrediction`` supports tuple unpacking - TODO confirm for
      the version in use).

    Args:
        logits: Predicted class ids, raw class scores, or a
            ``(predictions, labels)`` pair when ``labels`` is omitted.
        labels: Ground-truth class ids; omit when passing a pair as ``logits``.

    Returns:
        dict: ``{"accuracy": <fraction of predictions equal to labels>}`` with
        the value as a Python float.
    """
    if labels is None:
        # Single-argument form: split the pair into its two parts.
        logits, labels = logits

    # Coerce to arrays so that plain Python lists are compared element-wise
    # (``list == list`` would otherwise yield a single bool).
    predictions = np.asarray(logits)
    labels = np.asarray(labels)

    # Raw scores carry one more axis (the class axis) than the labels do;
    # collapse it to the index of the highest-scoring class.
    if predictions.ndim > labels.ndim:
        predictions = predictions.argmax(axis=-1)

    accuracy = float(np.mean(predictions == labels))
    return {"accuracy": accuracy}

In [12]:
# 10. Create a Trainer instance
# No compute_metrics callback is passed to the Trainer; accuracy is calculated
# manually from the output of trainer.predict() further down.
train_split = tokenized_datasets["train"]
eval_split = tokenized_datasets["test"]
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

In [13]:
# 11. Run the fine-tuning loop and show the returned training summary
train_output = trainer.train()
train_output



Step,Training Loss
10,0.6611
20,0.517
30,0.5201
40,0.4237
50,0.5002
60,0.4357
70,0.4376
80,0.4501
90,0.436
100,0.3985




TrainOutput(global_step=9550, training_loss=0.03889988115896332, metrics={'train_runtime': 2207.8936, 'train_samples_per_second': 137.914, 'train_steps_per_second': 4.325, 'total_flos': 4.38141573828e+16, 'train_loss': 0.03889988115896332, 'epoch': 50.0})

In [14]:
# 12. Run the trained model over the tokenized test split to get its predictions
test_split = tokenized_datasets["test"]
test_output = trainer.predict(test_split)



In [15]:
# Pull the raw model outputs out of the prediction result and, for each row,
# take the index of the highest value along axis 1 as the predicted label
predictions = test_output.predictions
predicted_labels = np.asarray(predictions).argmax(axis=1)

In [16]:
# Compare the predicted labels with the "labels" column of the tokenized test
# split; compute_metrics returns a dict keyed by "accuracy", which is printed as-is
test_split_labels = tokenized_datasets["test"]["labels"]
true_labels = np.array(test_split_labels)
accuracy = compute_metrics(predicted_labels, true_labels)
print("Test Accuracy:", accuracy)

Test Accuracy: {'accuracy': 0.8240315167432699}
