In [None]:
!pip install sentence-transformers

import pandas as pd
import random
import torch
import numpy as np
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
import os

# ------------------------------
# Load CSV data
# ------------------------------
# The dataset is expected to sit next to the notebook, i.e. in the current
# working directory. Point `data_file` at an absolute path instead if the
# CSV lives somewhere else.
current_directory = os.getcwd()
print("Current working directory:", current_directory)

data_file = os.path.join(current_directory, 'mllegaladvisordb.bns_sectionsnew.csv')

# Fail early with a readable message instead of a pandas traceback.
file_is_present = os.path.exists(data_file)
if not file_is_present:
    raise FileNotFoundError(f"The file '{data_file}' does not exist.")

df = pd.read_csv(data_file)

print("Data sample:")
print(df.head())

# Every column the rest of the script reads must be present; stop at the
# first one that is not.
required_columns = ['BNS_Section', 'IPC_Section', 'Description']
for col in required_columns:
    if col in df.columns:
        continue
    raise ValueError(f"Missing required column: {col}")

# ------------------------------
# Decide grouping column based on data distribution
# ------------------------------
# Group by BNS section when its labels repeat often enough (fewer distinct
# labels than half the row count); otherwise fall back to the IPC section.
if df['BNS_Section'].nunique() < df.shape[0] / 2:
    group_column = 'BNS_Section'
else:
    group_column = 'IPC_Section'
print(f"Grouping by: {group_column}")

# ------------------------------
# Prepare data in a simpler way
# ------------------------------
# Rows without a description or without a group label cannot take part in a
# triplet: a NaN description is a float, not text, and a NaN label never
# compares equal to itself, so it would never form a usable group.
usable_rows = df.dropna(subset=['Description', group_column])
skipped_rows = len(df) - len(usable_rows)
if skipped_rows:
    print(f"Skipping {skipped_rows} row(s) with a missing description or group label")

# Parallel lists: descriptions[i] belongs to groups[i]. Pulling whole columns
# avoids the per-row Series construction that DataFrame.iterrows() performs.
descriptions = usable_rows['Description'].astype(str).tolist()
groups = usable_rows[group_column].tolist()

# ------------------------------
# Load Pre-trained SentenceTransformer Model
# ------------------------------
# Use the GPU when one is visible to torch, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
model.to(device)

# ------------------------------
# Manual training loop using triplet loss approach
# ------------------------------
# Hyperparameters
num_epochs, learning_rate = 3, 2e-5
batch_size = 8  # Kept small to avoid memory issues
margin = 0.5  # Triplet-loss margin

# Directory the fine-tuned model is written to after training
output_path = 'fine_tuned_model'

# Adam over every parameter of the sentence-transformer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

print(f"Training on {device} for {num_epochs} epochs...")

def _embed_with_grad(texts):
    """Return sentence embeddings for *texts* with autograd enabled.

    ``model.encode`` is an inference helper: its outputs are detached from
    the computation graph, so a loss built on them cannot update the model.
    Tokenizing and calling the module's forward pass directly keeps the
    graph intact so ``loss.backward()`` reaches the transformer weights.
    """
    features = model.tokenize(texts)
    features = {
        key: value.to(device) if isinstance(value, torch.Tensor) else value
        for key, value in features.items()
    }
    return model(features)['sentence_embedding']


def _sample_triplets(group_to_descriptions, valid_groups, count):
    """Draw up to *count* random (anchor, positive, negative) text triplets.

    Anchor and positive are two distinct entries of one group; the negative
    comes from a different group in *valid_groups*. Returns an empty list
    (after printing the reason once) when no triplet can be formed.
    """
    if not valid_groups:
        print("No valid groups found with at least 2 descriptions")
        return []
    if len(valid_groups) < 2:
        # With a single usable group there is nowhere to draw negatives from.
        print("No different groups available for negative samples")
        return []

    sampled = []
    for _ in range(count):
        anchor_group = random.choice(valid_groups)
        anchor, positive = random.sample(group_to_descriptions[anchor_group], 2)
        negative_group = random.choice([g for g in valid_groups if g != anchor_group])
        negative = random.choice(group_to_descriptions[negative_group])
        sampled.append((anchor, positive, negative))
    return sampled


# The group index does not change between epochs, so build it once.
group_to_descriptions = {}
for desc, grp in zip(descriptions, groups):
    group_to_descriptions.setdefault(grp, []).append(desc)

# Only groups with at least 2 descriptions can supply an anchor/positive pair.
valid_groups = [g for g, descs in group_to_descriptions.items() if len(descs) >= 2]

# Number of freshly sampled triplets per epoch.
triplets_per_epoch = 100

for epoch in range(num_epochs):
    # Re-sample every epoch so the model sees different combinations.
    triplets = _sample_triplets(group_to_descriptions, valid_groups, triplets_per_epoch)

    if not triplets:
        print("No triplets could be created. Skipping epoch.")
        continue

    # Train on triplets
    model.train()
    train_loss = 0.0

    # Process in batches; the last batch may be shorter than batch_size.
    random.shuffle(triplets)
    batch_starts = range(0, len(triplets), batch_size)
    num_batches = len(batch_starts)

    progress_bar = tqdm(batch_starts, desc=f"Epoch {epoch+1}/{num_epochs}")

    for start_idx in progress_bar:
        batch_triplets = triplets[start_idx:start_idx + batch_size]

        # Split the batch into its three parallel text lists.
        anchors = [t[0] for t in batch_triplets]
        positives = [t[1] for t in batch_triplets]
        negatives = [t[2] for t in batch_triplets]

        # Reset gradients
        optimizer.zero_grad()

        # Forward passes that stay attached to the autograd graph.
        anchor_embeddings = _embed_with_grad(anchors)
        positive_embeddings = _embed_with_grad(positives)
        negative_embeddings = _embed_with_grad(negatives)

        # Cosine distance between each anchor and its own positive/negative
        # (row-wise, equivalent to the diagonal of the full similarity matrix).
        positive_distances = 1 - torch.nn.functional.cosine_similarity(
            anchor_embeddings, positive_embeddings, dim=1
        )
        negative_distances = 1 - torch.nn.functional.cosine_similarity(
            anchor_embeddings, negative_embeddings, dim=1
        )

        # Triplet loss: the positive should be closer than the negative by
        # at least `margin`.
        losses = torch.relu(positive_distances - negative_distances + margin)
        loss = torch.mean(losses)

        # Backpropagation
        loss.backward()
        optimizer.step()

        # Update stats
        batch_loss = loss.item()
        train_loss += batch_loss
        progress_bar.set_postfix({"loss": f"{batch_loss:.4f}"})

    # Epoch summary
    avg_loss = train_loss / num_batches
    print(f"Epoch {epoch+1}/{num_epochs} completed. Average loss: {avg_loss:.4f}")

# Leave training mode (disables dropout) before persisting the model.
model.eval()

# Save the model
model.save(output_path)
print(f"Model training complete. The fine-tuned model is saved to '{output_path}'.")

Current working directory: /content
Data sample:
                        _id      BNS_Section                 IPC_Section  \
0  67c8bccfdcd2f83aa7d0f449   Sec. 1 (1)-(6)                 S. 1 - S. 5   
1  67c8bccfdcd2f83aa7d0f44a  Sec. 2 (1)-(39)               S. 6 - S. 52A   
2  67c8bccfdcd2f83aa7d0f44b   Sec. 3 (1)-(9)  S. 6, 7, 27, 32, 34, 35-38   
3  67c8bccfdcd2f83aa7d0f44c           Sec. 4                       S. 53   
4  67c8bccfdcd2f83aa7d0f44d           Sec. 5              S. 54 - S. 55A   

                      Chapter  \
0      Chapter I: Preliminary   
1      Chapter I: Preliminary   
2      Chapter I: Preliminary   
3  Chapter II: Of Punishments   
4  Chapter II: Of Punishments   

                                         Description  
0  This section outlines the basic framework of t...  
1  This section provides definitions for key term...  
2  This section offers general explanations for c...  
3  This section details punishments under the BNS...  
4  This section gove

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Training on cpu for 3 epochs...


Epoch 1/3: 100%|██████████| 13/13 [00:08<00:00,  1.60it/s, loss=0.0588]


Epoch 1/3 completed. Average loss: 0.2218


Epoch 2/3: 100%|██████████| 13/13 [00:08<00:00,  1.60it/s, loss=0.2140]


Epoch 2/3 completed. Average loss: 0.2445


Epoch 3/3: 100%|██████████| 13/13 [00:07<00:00,  1.80it/s, loss=0.3114]


Epoch 3/3 completed. Average loss: 0.2340
Model training complete. The fine-tuned model is saved to 'fine_tuned_model'.


In [None]:
# Colab-only cell: package the saved model directory and hand it to the
# Colab download helper. The paths are absolute under /content, which is the
# working directory the training cell reported.
from google.colab import files
# IPython shell escape: recursively zip the fine-tuned model directory.
!zip -r /content/fine_tuned_model.zip /content/fine_tuned_model
# Pass the archive created above to Colab's file-download helper.
files.download('/content/fine_tuned_model.zip')

  adding: content/fine_tuned_model/ (stored 0%)
  adding: content/fine_tuned_model/model.safetensors (deflated 9%)
  adding: content/fine_tuned_model/2_Normalize/ (stored 0%)
  adding: content/fine_tuned_model/config.json (deflated 47%)
  adding: content/fine_tuned_model/modules.json (deflated 62%)
  adding: content/fine_tuned_model/1_Pooling/ (stored 0%)
  adding: content/fine_tuned_model/1_Pooling/config.json (deflated 57%)
  adding: content/fine_tuned_model/config_sentence_transformers.json (deflated 34%)
  adding: content/fine_tuned_model/README.md (deflated 64%)
  adding: content/fine_tuned_model/tokenizer.json (deflated 71%)
  adding: content/fine_tuned_model/sentence_bert_config.json (deflated 4%)
  adding: content/fine_tuned_model/vocab.txt (deflated 53%)
  adding: content/fine_tuned_model/.ipynb_checkpoints/ (stored 0%)
  adding: content/fine_tuned_model/tokenizer_config.json (deflated 73%)
  adding: content/fine_tuned_model/special_tokens_map.json (deflated 80%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# New section