In [4]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score, accuracy_score
import numpy as np
import torch
import ast

# Read the augmented train / validation / test CSV splits into DataFrames
train_data, val_data, test_data = map(
    pd.read_csv,
    ('train_augmented.csv', 'val_augmented.csv', 'test_augmented.csv'),
)

# Function to preprocess the data
def preprocess_data(df):
    """Build the model-ready ``text`` / ``labels`` frame from a raw split.

    Args:
        df: DataFrame with a ``tokens`` column (a list of token strings, or
            the string repr of such a list as it comes back from CSV) and a
            ``sentence_label`` column that can be cast to ``int``.

    Returns:
        A new two-column DataFrame indexed like ``df``: ``text`` holds the
        tokens joined by single spaces and ``labels`` holds
        ``sentence_label`` as ``int``. The input frame is not modified.
    """
    def _as_token_list(value):
        # CSV round-trips a list as its repr string; values that are already
        # parsed pass through so in-memory frames work too.
        return ast.literal_eval(value) if isinstance(value, str) else value

    token_lists = df['tokens'].apply(_as_token_list)

    # Assemble a fresh frame instead of writing columns onto the caller's df.
    return pd.DataFrame(
        {
            'text': token_lists.apply(' '.join),
            'labels': df['sentence_label'].astype(int),
        },
        index=df.index,
    )


In [5]:

# Reduce every split to its text/labels columns.
# NOTE: the raw frames are rebound here, so re-running this cell without
# reloading the CSVs fails (the 'tokens' column is no longer present).
train_data, val_data, test_data = map(
    preprocess_data, (train_data, val_data, test_data)
)


In [6]:

# Wrap each pandas split in a Hugging Face Dataset
train_dataset, val_dataset, test_dataset = map(
    Dataset.from_pandas, (train_data, val_data, test_data)
)

# Seed torch before the model is built: the classification head is not part of
# the checkpoint and is freshly initialised, so without a fixed seed its
# starting weights (and therefore the reported metrics) change on every
# kernel restart. 42 mirrors the TrainingArguments default seed.
torch.manual_seed(42)

# Initialize tokenizer and model (5 output classes)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5)

# Tokenization function
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with the module-level tokenizer.

    Sequences are truncated to, and padded up to, 128 tokens.
    """
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(examples["text"], **encode_options)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:

# Tokenize every split in batches
tokenized_train, tokenized_val, tokenized_test = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

# Expose only the model inputs and labels, as torch tensors
model_columns = ["input_ids", "attention_mask", "labels"]
for tokenized_split in (tokenized_train, tokenized_val, tokenized_test):
    tokenized_split.set_format("torch", columns=model_columns)


Map: 100%|██████████| 1000/1000 [00:00<00:00, 7419.78 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 7305.69 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 7776.66 examples/s]


In [8]:

# Define metrics computation function
def compute_metrics(eval_pred):
    """Score a ``(logits, labels)`` pair with accuracy and weighted F1.

    The predicted class is the argmax over the last axis of the logits.
    Returns a dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    raw_scores, gold = eval_pred
    predicted = np.argmax(raw_scores, axis=-1)

    accuracy = accuracy_score(gold, predicted)
    weighted_f1 = f1_score(gold, predicted, average="weighted")
    return {"accuracy": accuracy, "f1": weighted_f1}

# Training configuration, grouped by concern
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir="./results_document_classifier",
    logging_dir='./logs_document_classifier',
    # optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # cadence: log every 10 steps; evaluate and checkpoint once per epoch so
    # the best checkpoint can be restored when training ends
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Wire the model, configuration, data splits and metric callback together
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

# Fine-tune; the returned TrainOutput is displayed as the cell's value
trainer.train()


  5%|▌         | 10/189 [02:40<46:49, 15.70s/it]

{'loss': 1.5096, 'grad_norm': 4.163740158081055, 'learning_rate': 1.8941798941798943e-05, 'epoch': 0.16}


 11%|█         | 20/189 [05:08<41:06, 14.60s/it]

{'loss': 1.3769, 'grad_norm': 9.211262702941895, 'learning_rate': 1.7883597883597884e-05, 'epoch': 0.32}


 16%|█▌        | 30/189 [07:29<37:19, 14.09s/it]

{'loss': 1.3008, 'grad_norm': 6.131967067718506, 'learning_rate': 1.6825396825396828e-05, 'epoch': 0.48}


 21%|██        | 40/189 [09:46<34:40, 13.97s/it]

{'loss': 1.1895, 'grad_norm': 5.6637444496154785, 'learning_rate': 1.576719576719577e-05, 'epoch': 0.63}


 26%|██▋       | 50/189 [12:12<34:33, 14.92s/it]

{'loss': 1.0845, 'grad_norm': 6.66760778427124, 'learning_rate': 1.470899470899471e-05, 'epoch': 0.79}


 32%|███▏      | 60/189 [14:37<31:02, 14.44s/it]

{'loss': 0.9284, 'grad_norm': 5.388881206512451, 'learning_rate': 1.3650793650793652e-05, 'epoch': 0.95}


                                                
 33%|███▎      | 63/189 [16:03<25:54, 12.34s/it]

{'eval_loss': 0.7923546433448792, 'eval_accuracy': 0.825, 'eval_f1': 0.8051698150821563, 'eval_runtime': 49.4381, 'eval_samples_per_second': 4.045, 'eval_steps_per_second': 0.263, 'epoch': 1.0}


 37%|███▋      | 70/189 [17:50<31:29, 15.88s/it]  

{'loss': 0.8978, 'grad_norm': 7.158100605010986, 'learning_rate': 1.2592592592592593e-05, 'epoch': 1.11}


 42%|████▏     | 80/189 [20:07<25:47, 14.20s/it]

{'loss': 0.741, 'grad_norm': 4.783382892608643, 'learning_rate': 1.1534391534391536e-05, 'epoch': 1.27}


 48%|████▊     | 90/189 [22:28<23:33, 14.28s/it]

{'loss': 0.6955, 'grad_norm': 9.361905097961426, 'learning_rate': 1.0476190476190477e-05, 'epoch': 1.43}


 53%|█████▎    | 100/189 [24:59<20:35, 13.88s/it]

{'loss': 0.6468, 'grad_norm': 6.909051895141602, 'learning_rate': 9.417989417989418e-06, 'epoch': 1.59}


 58%|█████▊    | 110/189 [27:29<19:45, 15.01s/it]

{'loss': 0.7308, 'grad_norm': 6.940736770629883, 'learning_rate': 8.35978835978836e-06, 'epoch': 1.75}


 63%|██████▎   | 120/189 [29:58<15:56, 13.86s/it]

{'loss': 0.7015, 'grad_norm': 8.622064590454102, 'learning_rate': 7.301587301587301e-06, 'epoch': 1.9}


                                                 
 67%|██████▋   | 126/189 [32:03<12:29, 11.90s/it]

{'eval_loss': 0.5780775547027588, 'eval_accuracy': 0.83, 'eval_f1': 0.8275103843524896, 'eval_runtime': 47.9906, 'eval_samples_per_second': 4.167, 'eval_steps_per_second': 0.271, 'epoch': 2.0}


 69%|██████▉   | 130/189 [33:03<18:13, 18.53s/it]

{'loss': 0.6487, 'grad_norm': 6.490359783172607, 'learning_rate': 6.243386243386243e-06, 'epoch': 2.06}


 74%|███████▍  | 140/189 [35:23<11:35, 14.19s/it]

{'loss': 0.5786, 'grad_norm': 4.582071304321289, 'learning_rate': 5.185185185185185e-06, 'epoch': 2.22}


 79%|███████▉  | 150/189 [37:38<08:59, 13.83s/it]

{'loss': 0.6172, 'grad_norm': 8.615361213684082, 'learning_rate': 4.126984126984127e-06, 'epoch': 2.38}


 85%|████████▍ | 160/189 [39:57<06:40, 13.81s/it]

{'loss': 0.4323, 'grad_norm': 4.794206619262695, 'learning_rate': 3.068783068783069e-06, 'epoch': 2.54}


 90%|████████▉ | 170/189 [42:29<04:46, 15.10s/it]

{'loss': 0.5038, 'grad_norm': 3.8936448097229004, 'learning_rate': 2.0105820105820108e-06, 'epoch': 2.7}


 95%|█████████▌| 180/189 [44:45<02:01, 13.52s/it]

{'loss': 0.5106, 'grad_norm': 3.1099720001220703, 'learning_rate': 9.523809523809525e-07, 'epoch': 2.86}


                                                 
100%|██████████| 189/189 [47:32<00:00, 11.71s/it]

{'eval_loss': 0.5309481024742126, 'eval_accuracy': 0.845, 'eval_f1': 0.8370239563567364, 'eval_runtime': 49.9777, 'eval_samples_per_second': 4.002, 'eval_steps_per_second': 0.26, 'epoch': 3.0}


100%|██████████| 189/189 [47:38<00:00, 15.12s/it]


{'train_runtime': 2858.4231, 'train_samples_per_second': 1.05, 'train_steps_per_second': 0.066, 'train_loss': 0.8244055112202963, 'epoch': 3.0}


TrainOutput(global_step=189, training_loss=0.8244055112202963, metrics={'train_runtime': 2858.4231, 'train_samples_per_second': 1.05, 'train_steps_per_second': 0.066, 'total_flos': 197338606848000.0, 'train_loss': 0.8244055112202963, 'epoch': 3.0})

In [9]:

# Score the held-out test split (metric keys keep the default "eval_" prefix)
test_results = trainer.evaluate(eval_dataset=tokenized_test)
print("Test results:", test_results)

# Persist the fine-tuned model to disk
trainer.save_model(output_dir="./document_classifier_model")
print("Model saved to ./document_classifier_model")

100%|██████████| 13/13 [00:47<00:00,  3.69s/it]


Test results: {'eval_loss': 0.6955267190933228, 'eval_accuracy': 0.755, 'eval_f1': 0.7386042154566745, 'eval_runtime': 52.0812, 'eval_samples_per_second': 3.84, 'eval_steps_per_second': 0.25, 'epoch': 3.0}
Model saved to ./document_classifier_model
