---

## ***Legal Analytics - BAIL Prediction***

#### **TFIDF + IndicBERT Model**

#### *(Without Quantum)*
---

#### *Necessary imports*

In [19]:
import ast
import gc
import json
import os
import re

import numpy as np
import pandas as pd
import seaborn as sns
import torch
from evaluate import load
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from tqdm.auto import tqdm
from tqdm import tqdm  # rebinds `tqdm`; this is the class patched into pandas below
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, Trainer, TrainingArguments

tqdm.pandas()

#### *Pretrained IndicBERT Tokenizer*

In [None]:
# Fetch the tokenizer that belongs to the pretrained IndicBERT checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    "ai4bharat/indic-bert",
)

#### *Pretrained IndicBERT model initialization*

In [21]:
# Factory passed to the Trainer so each run builds its own model instance
def model_init():
    """Return an IndicBERT sequence-classification model configured for two labels."""
    classifier = AutoModelForSequenceClassification.from_pretrained(
        "ai4bharat/indic-bert",
        num_labels=2,
    )
    return classifier

#### *Checking for GPU*

In [None]:
# Choose the compute device: CUDA when torch reports it available, CPU otherwise
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

---
### *Loading Dataset*
---

In [23]:
# Directory holding the dataset JSON files. The default is the original
# location; set the BP_DATA_DIR environment variable to run the notebook
# on a machine where the data lives elsewhere.
DATA_DIR = os.environ.get("BP_DATA_DIR", "D:/BP/data")

# Load the data
train_df = pd.read_json(os.path.join(DATA_DIR, "full_data_train.json"))
test_df = pd.read_json(os.path.join(DATA_DIR, "full_data_test.json"))

# Fail early, with a readable message, if a column the dataset class reads is missing
for split_name, split_df in (("train", train_df), ("test", test_df)):
    missing_cols = {"ranked-sentences", "label"} - set(split_df.columns)
    assert not missing_cols, f"{split_name} data is missing columns: {sorted(missing_cols)}"

# For hyperparameter search, use a sample of 10% of the data
# (fixed random_state so the same rows are drawn on every run)
hp_train_df = train_df.sample(frac=0.1, random_state=42)
# NOTE(review): this sample comes from the test split and is later used as the
# evaluation set for the hyperparameter search, so tuning sees test data.
# Consider sampling a validation subset from train_df instead.
hp_test_df = test_df.sample(frac=0.1, random_state=42)


#### *Custom Dataset class*

In [24]:
class LegalDataset(Dataset):
    """Torch dataset that tokenizes the top-ranked sentences of each row.

    Each "ranked-sentences" cell (a list, or a string holding a Python list
    literal) is reduced to its first ``top_k`` entries, joined with spaces and
    stored in a "text" column. Tokenization happens on item access.

    Args:
        df: DataFrame with "ranked-sentences" and "label" columns. The caller's
            frame is not modified; the dataset works on a re-indexed copy.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            'input_ids' and 'attention_mask'.
        max_length: Length each example is padded/truncated to (default 512).
        top_k: Number of leading ranked sentences kept per example (default 10).
    """

    def __init__(self, df, tokenizer, max_length=512, top_k=10):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.top_k = top_k

        ranked = self.df["ranked-sentences"]
        # progress_apply exists only after tqdm.pandas() has been called; fall
        # back to plain apply so the class does not depend on that hidden state.
        apply_fn = getattr(ranked, "progress_apply", ranked.apply)
        # Extracting text from ranked-sentences
        self.df["text"] = apply_fn(self._join_top_sentences)

    def _join_top_sentences(self, value):
        """Join the first ``top_k`` sentences of one "ranked-sentences" cell.

        Raises:
            ValueError or SyntaxError: if a non-list cell is not a valid Python literal.
        """
        if not isinstance(value, list):
            # Parse stored text as a literal only: eval() would execute
            # arbitrary expressions found in the data file.
            value = ast.literal_eval(value)
        return " ".join(value[:self.top_k])

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask' and 'label' tensors for row ``idx``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Get the text for the current index
        model_input = self.df['text'][idx]

        # Tokenize, padding/truncating every example to the same length
        encoded_sent = self.tokenizer.encode_plus(
            text=model_input,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_attention_mask=True,
            truncation=True
        )

        input_ids = torch.tensor(encoded_sent.get('input_ids'))
        attention_mask = torch.tensor(encoded_sent.get('attention_mask'))
        label = torch.tensor(self.df['label'][idx])

        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'label': label}

    def print_samples(self, num_samples=2):
        """Prints a few samples from the dataset (at most ``len(self)`` of them)."""
        # Clamp so a dataset shorter than num_samples does not raise on lookup
        for i in range(min(num_samples, len(self))):
            sample = self[i]
            print(f"Sample {i + 1}:")
            print(f"Input IDs: {sample['input_ids'][:10]}")  # Print first 10 input IDs
            print(f"Attention Mask: {sample['attention_mask'][:10]}")  # Print first 10 attention mask values
            print(f"Label: {sample['label']}")
            print("-" * 20)  # Separator for clarity

#### *Creating Datasets*

In [None]:
# Wrap each frame (full splits first, then the 10% search samples) in a
# LegalDataset that shares the same tokenizer
train_dataset, test_dataset, hp_train_dataset, hp_test_dataset = (
    LegalDataset(frame, tokenizer)
    for frame in (train_df, test_df, hp_train_df, hp_test_df)
)

In [None]:
# Preview the first two tokenized examples of the training set
print("Training Dataset Samples:")
train_dataset.print_samples(num_samples=2)

---
### *Analysis*
---

In [None]:
# Label counts in the training split, one bar colour per label
sns.countplot(
    data=train_df,
    x="label",
    hue="label",
    palette="Set1",
)

In [None]:
# Label counts in the test split, one bar colour per label
sns.countplot(
    data=test_df,
    x="label",
    hue="label",
    palette="Set1",
)

In [None]:
# NOTE(review): assumes label 1 = bail granted and 0 = bail dismissed —
# TODO confirm against the dataset documentation and swap if needed.
GRANTED_LABEL = 1
DISMISSED_LABEL = 0


def _granted_dismissed(frame):
    """Return (granted, dismissed) row counts of ``frame``, looked up by label value.

    value_counts() orders its result by frequency, so positional unpacking
    would attach the names to whichever class is larger in that split (and
    fail when a class is absent). Looking up by label keeps the mapping
    identical for every split; a missing class counts as 0.
    """
    per_label = frame["label"].value_counts()
    return int(per_label.get(GRANTED_LABEL, 0)), int(per_label.get(DISMISSED_LABEL, 0))


n1 = len(train_df)
n2 = len(test_df)
print("\n")
print("="*50)
print(f"No of Samples in Train Dataset : {n1}")
print(f"No of Samples in Test Dataset  : {n2}")
print(f"Total                          : {n1 + n2}")
print("="*50)

print("\n")
print("="*50)
print("Distribution in Train Dataset")
train_granted, train_dismissed = _granted_dismissed(train_df)
print(f"Bail Granted   : {train_granted}")
print(f"Bail Dismissed : {train_dismissed}")
print("="*50)

print("\n")
print("="*50)
print("Distribution in Test Dataset")
test_granted, test_dismissed = _granted_dismissed(test_df)
print(f"Bail Granted   : {test_granted}")
print(f"Bail Dismissed : {test_dismissed}")
print("="*50)

print("\n")
print("_"*50)
print(f"Total Granted   : {train_granted + test_granted}")
print(f"Total Dismissed : {train_dismissed + test_dismissed}")
print("_"*50)

---

#### **The label counts above show class imbalance**
---

#### *Defining Evaluation Metrics*

In [30]:
# Load evaluation metrics
accuracy_metric = load("accuracy")
f1_metric = load("f1")

In [31]:

# Metric computation function
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        dict with keys 'accuracy' and 'f1-score'.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    # Macro averaging: with one label per example, micro-averaged F1 is always
    # equal to accuracy, so it added no information. Macro F1 weights both
    # classes equally, which matters because the labels are imbalanced.
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="macro")
    return {'accuracy': accuracy["accuracy"], 'f1-score': f1["f1"]}


---

### ***Hyperparameter Search***
---

In [32]:
# Search space sampled through the trial object during hyperparameter tuning
def my_hp_space(trial):
    """Ask ``trial`` for one value per tuned optimizer hyperparameter.

    Learning rate and Adam epsilon are requested with ``log=True``; the
    remaining ranges use the trial's default sampling.

    Returns:
        dict mapping each hyperparameter name to the suggested value.
    """
    search_space = {}
    search_space["learning_rate"] = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    search_space["weight_decay"] = trial.suggest_float("weight_decay", 0.005, 0.05)
    search_space["adam_beta1"] = trial.suggest_float("adam_beta1", 0.75, 0.95)
    search_space["adam_beta2"] = trial.suggest_float("adam_beta2", 0.99, 0.9999)
    search_space["adam_epsilon"] = trial.suggest_float("adam_epsilon", 1e-9, 1e-7, log=True)
    return search_space

In [None]:
# Arguments for the search runs: 5 epochs each, evaluated and checkpointed
# once per epoch, with the best checkpoint chosen by eval_f1-score
training_args = TrainingArguments(**{
    "output_dir": os.path.join("TFIDF-INDIC", "output"),
    "logging_dir": os.path.join("TFIDF-INDIC", "logs"),
    "num_train_epochs": 5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "warmup_steps": 500,
    "weight_decay": 0.01,
    "logging_steps": 250,
    "evaluation_strategy": "epoch",
    "save_strategy": 'epoch',
    "load_best_model_at_end": True,
    "metric_for_best_model": "eval_f1-score",
})

# Search trainer: works on the 10% samples and builds a new model via model_init
trainer = Trainer(
    args=training_args,
    model_init=model_init,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=hp_train_dataset,
    eval_dataset=hp_test_dataset,
)

In [None]:
# Run 10 search trials over my_hp_space, maximizing the objective
best_run = trainer.hyperparameter_search(
    hp_space=my_hp_space,
    n_trials=10,
    direction="maximize",
)

In [None]:
# Show the result object returned by the hyperparameter search
print(
    "Best Hyperparameters:",
    best_run,
)

In [None]:
# Drop the search-phase trainer and its arguments before the full training run
del trainer
del training_args
gc.collect()
# gc.collect() only reclaims Python objects; PyTorch keeps freed GPU memory in
# its cache, so release that as well when a GPU is in use.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

---
### ***Model Training***
---

In [None]:
# Final run on the full training split
print("Starting final training...")

# Fresh arguments for the full run: 15 epochs, evaluated and checkpointed once
# per epoch, with the best checkpoint (by eval_f1-score) reloaded at the end
training_args = TrainingArguments(**{
    "output_dir": os.path.join("TFIDF-INDIC", "output"),
    "logging_dir": os.path.join("TFIDF-INDIC", "logs"),
    "num_train_epochs": 15,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "learning_rate": 0.00001,
    "warmup_steps": 500,
    "weight_decay": 0.01,
    "logging_steps": 250,
    "evaluation_strategy": "epoch",
    "save_strategy": 'epoch',
    "load_best_model_at_end": True,
    "metric_for_best_model": "eval_f1-score",
})

# Trainer for the full datasets; the test split serves as the evaluation set
trainer = Trainer(
    args=training_args,
    model_init=model_init,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
# Copy every hyperparameter found by the search onto the trainer's arguments,
# replacing the values given when the arguments were created
for hp_name, hp_value in best_run.hyperparameters.items():
    setattr(trainer.args, hp_name, hp_value)

In [None]:
# Train the model on the full training set, using the arguments currently set
# on the trainer (including any hyperparameters applied after it was created)
trainer.train()

In [None]:
# Where the fine-tuned model is written. The default is the original location;
# set the BP_MODEL_DIR environment variable to save somewhere else.
MODEL_DIR = os.environ.get("BP_MODEL_DIR", "D:/BP/model")
trainer.save_model(MODEL_DIR)