BERT Model Implementation for Classification (without strides; maximum length 512 tokens)

In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the labelled corpus; the 'Text' and 'Label' columns are read below.
df = pd.read_csv('Input.csv')

# Replace the raw labels with integer class ids. The fitted encoder is kept
# around so predicted ids can be mapped back to label names later.
label_encoder = LabelEncoder()
df['Label'] = label_encoder.fit_transform(df['Label'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
texts = df['Text'].values
labels = df['Label'].values
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both splits are tokenized with identical settings: truncation and padding
# enabled, with a maximum length of 512.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(train_texts.tolist(), **tokenize_kwargs)
val_encodings = tokenizer(val_texts.tolist(), **tokenize_kwargs)



In [3]:
class LegalCaseDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: Mapping from feature name to an indexable collection of
            per-example values (one entry per example).
        labels: Indexable collection of labels, aligned with ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label collection defines how many examples the dataset holds.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one example: every encoding field at position ``idx`` becomes
        # a tensor, and the matching label is stored under 'labels'.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels so they can be handed to the Trainer.
train_dataset, val_dataset = (
    LegalCaseDataset(train_encodings, train_labels),
    LegalCaseDataset(val_encodings, val_labels),
)

In [4]:
# The classification head gets one output per distinct label known to the encoder.
num_labels = len(label_encoder.classes_)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Evaluate and checkpoint once per epoch, log every 10 steps, and reload the
# best checkpoint when training ends (load_best_model_at_end=True).
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Run fine-tuning with the arguments configured above. As the cell's last
# expression, the returned TrainOutput is displayed after the progress log.
trainer.train()

  7%|▋         | 10/153 [01:56<26:49, 11.26s/it]

{'loss': 1.4151, 'grad_norm': 18.000797271728516, 'learning_rate': 4.673202614379085e-05, 'epoch': 0.2}


 13%|█▎        | 20/153 [03:43<23:46, 10.73s/it]

{'loss': 1.2746, 'grad_norm': 9.958771705627441, 'learning_rate': 4.3464052287581704e-05, 'epoch': 0.39}


 20%|█▉        | 30/153 [05:36<22:37, 11.04s/it]

{'loss': 1.2318, 'grad_norm': 9.39039134979248, 'learning_rate': 4.0196078431372555e-05, 'epoch': 0.59}


 26%|██▌       | 40/153 [07:29<21:20, 11.33s/it]

{'loss': 0.9962, 'grad_norm': 8.930109024047852, 'learning_rate': 3.6928104575163405e-05, 'epoch': 0.78}


 33%|███▎      | 50/153 [09:23<19:11, 11.18s/it]

{'loss': 0.8255, 'grad_norm': 6.846794128417969, 'learning_rate': 3.366013071895425e-05, 'epoch': 0.98}


                                                
 33%|███▎      | 51/153 [10:10<16:49,  9.90s/it]

{'eval_loss': 0.6924434900283813, 'eval_runtime': 40.624, 'eval_samples_per_second': 2.511, 'eval_steps_per_second': 0.32, 'epoch': 1.0}


 39%|███▉      | 60/153 [11:55<18:41, 12.06s/it]

{'loss': 0.68, 'grad_norm': 12.667296409606934, 'learning_rate': 3.0392156862745097e-05, 'epoch': 1.18}


 46%|████▌     | 70/153 [13:48<15:57, 11.54s/it]

{'loss': 0.5281, 'grad_norm': 9.976984024047852, 'learning_rate': 2.7124183006535947e-05, 'epoch': 1.37}


 52%|█████▏    | 80/153 [15:45<14:12, 11.67s/it]

{'loss': 0.5142, 'grad_norm': 11.930327415466309, 'learning_rate': 2.38562091503268e-05, 'epoch': 1.57}


 59%|█████▉    | 90/153 [17:48<12:47, 12.18s/it]

{'loss': 0.4499, 'grad_norm': 4.941445827484131, 'learning_rate': 2.058823529411765e-05, 'epoch': 1.76}


 65%|██████▌   | 100/153 [19:50<10:37, 12.03s/it]

{'loss': 0.3844, 'grad_norm': 2.046069860458374, 'learning_rate': 1.7320261437908496e-05, 'epoch': 1.96}


                                                 
 67%|██████▋   | 102/153 [21:02<09:32, 11.22s/it]

{'eval_loss': 0.3684450089931488, 'eval_runtime': 49.109, 'eval_samples_per_second': 2.077, 'eval_steps_per_second': 0.265, 'epoch': 2.0}


 72%|███████▏  | 110/153 [22:39<09:29, 13.25s/it]

{'loss': 0.2503, 'grad_norm': 1.446113109588623, 'learning_rate': 1.4052287581699347e-05, 'epoch': 2.16}


 78%|███████▊  | 120/153 [24:37<06:45, 12.27s/it]

{'loss': 0.3233, 'grad_norm': 4.0440449714660645, 'learning_rate': 1.0784313725490197e-05, 'epoch': 2.35}


 85%|████████▍ | 130/153 [26:42<04:33, 11.91s/it]

{'loss': 0.2488, 'grad_norm': 5.299689292907715, 'learning_rate': 7.5163398692810456e-06, 'epoch': 2.55}


 92%|█████████▏| 140/153 [28:39<02:33, 11.77s/it]

{'loss': 0.2396, 'grad_norm': 8.957122802734375, 'learning_rate': 4.2483660130718954e-06, 'epoch': 2.75}


 98%|█████████▊| 150/153 [30:40<00:35, 11.69s/it]

{'loss': 0.2665, 'grad_norm': 5.912170886993408, 'learning_rate': 9.80392156862745e-07, 'epoch': 2.94}


                                                 
100%|██████████| 153/153 [31:53<00:00, 10.29s/it]

{'eval_loss': 0.36355268955230713, 'eval_runtime': 42.5214, 'eval_samples_per_second': 2.399, 'eval_steps_per_second': 0.306, 'epoch': 3.0}


100%|██████████| 153/153 [31:55<00:00, 12.52s/it]

{'train_runtime': 1915.9096, 'train_samples_per_second': 0.634, 'train_steps_per_second': 0.08, 'train_loss': 0.6323285546957278, 'epoch': 3.0}





TrainOutput(global_step=153, training_loss=0.6323285546957278, metrics={'train_runtime': 1915.9096, 'train_samples_per_second': 0.634, 'train_steps_per_second': 0.08, 'total_flos': 319685672816640.0, 'train_loss': 0.6323285546957278, 'epoch': 3.0})

In [6]:
# Persist everything inference needs: the fine-tuned weights, the tokenizer
# files, and the fitted label encoder.
model.save_pretrained('./fine-tuned-bert512')
tokenizer.save_pretrained('./fine-tuned-bert512')

# The standalone inference cells reload the encoder from 'label_encoder.joblib',
# so it has to be written here; otherwise a fresh kernel cannot map predicted
# class ids back to label names.
import joblib
joblib.dump(label_encoder, 'label_encoder.joblib')
def classify_case(text):
    """Predict the case-type label for a single piece of text.

    Args:
        text: The case description to classify (one string). It is tokenized
            with truncation at a maximum length of 512.

    Returns:
        The label name obtained by passing the predicted class id through
        ``label_encoder.inverse_transform``.
    """
    # The model has just been through training; switch to eval mode explicitly
    # instead of relying on whatever mode it was left in.
    model.eval()

    encoding = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    # Inputs must live on the same device as the model's weights.
    encoding = encoding.to(model.device)

    # No gradients are needed for prediction; skipping them saves memory and time.
    with torch.no_grad():
        logits = model(**encoding).logits

    # Take the argmax over the class dimension rather than over the flattened tensor.
    predicted_class_id = logits.argmax(dim=-1).item()
    return label_encoder.inverse_transform([predicted_class_id])[0]

# Smoke-test the classifier on a short description of a boundary dispute.
example_text = (
    "A property owner files a lawsuit against their neighbor for encroaching "
    "on their land and demands that the boundary be restored to its rightful place."
)
prediction = classify_case(example_text)
print(f"Predicted Case Type: {prediction}")

Predicted Case Type: Civil Case


In [1]:
from transformers import BertTokenizer, BertForSequenceClassification
import joblib
# Reload the fine-tuned tokenizer and model from the directory they were saved to.
model_dir = './fine-tuned-bert512'
tokenizer = BertTokenizer.from_pretrained(model_dir)
model = BertForSequenceClassification.from_pretrained(model_dir)

# The encoder must be the one fitted during training so that class ids map back
# to the same label names. NOTE(review): joblib.load unpickles the file, so only
# load encoder files that come from a trusted source.
label_encoder = joblib.load('label_encoder.joblib')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def classify_case(text):
    """Predict the case-type label for a single piece of text.

    Args:
        text: The case description to classify (one string). It is tokenized
            with truncation at a maximum length of 512.

    Returns:
        The label name obtained by passing the predicted class id through
        ``label_encoder.inverse_transform``.
    """
    # torch is not imported at the top of this inference session, so bring it
    # into scope here for the no-grad context.
    import torch

    # Make sure dropout and similar training-only behaviour are switched off.
    model.eval()

    encoding = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    # Inputs must live on the same device as the model's weights.
    encoding = encoding.to(model.device)

    # No gradients are needed for prediction; skipping them saves memory and time.
    with torch.no_grad():
        logits = model(**encoding).logits

    # Take the argmax over the class dimension rather than over the flattened tensor.
    predicted_class_id = logits.argmax(dim=-1).item()
    return label_encoder.inverse_transform([predicted_class_id])[0]

# Check the reloaded model on a short description of a boundary dispute.
example_text = (
    "A property owner files a lawsuit against their neighbor for encroaching "
    "on their land and demands that the boundary be restored to its rightful place."
)
prediction = classify_case(example_text)
print(f"Predicted Case Type: {prediction}")

Predicted Case Type: Civil Case
