In [1]:
!pip install rank_bm25
!pip install nltk

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from rank_bm25 import BM25Okapi
import nltk
import re
def read_csv(file_path):
    return pd.read_csv(file_path)

# Load the main annotated dataset
data = read_csv("/content/Annotated_data.xlsx.csv")

# Check data structure and column names
print(data.head())
print(data.columns)
data['Offense'] = data['Offense'].fillna('').str.strip()
data['Punishment'] = data['Punishment'].fillna('').str.strip()
X_filtered = data['Offense']
y_filtered = data['Punishment']
if len(X_filtered) < 2:
    raise ValueError("Insufficient data for training. Adjust filtering criteria or dataset.")

try:
    X_train, X_test, y_train, y_test = train_test_split(
        X_filtered, y_filtered, test_size=0.2, random_state=42, stratify=y_filtered
    )
except ValueError as e:
    print(f"Error during train-test split: {e}")
    min_samples = 2
    class_counts = y_filtered.value_counts()
    infrequent_classes = class_counts[class_counts < min_samples].index
    y_filtered = y_filtered.replace(infrequent_classes, "Other")

    X_train, X_test, y_train, y_test = train_test_split(
        X_filtered, y_filtered, test_size=0.2, random_state=42, stratify=y_filtered
    )

tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train_tfidf, y_train)

nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    return {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
        'Recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),
        'F1-Score': f1_score(y_test, y_pred, average='weighted', zero_division=0),
    }

lr_metrics = evaluate_model(lr_model, X_test_tfidf, y_test)
nb_metrics = evaluate_model(nb_model, X_test_tfidf, y_test)

print("\n--- Offense Classification ---")
print("Logistic Regression Metrics:", lr_metrics)
print("Naive Bayes Metrics:", nb_metrics)

data_filtered_penalty = data[data['Punishment'] != '']

X_penalty_filtered = data_filtered_penalty['Offense']
y_penalty_filtered = data_filtered_penalty['Punishment']

print("Class Distribution in 'Punishment':\n", y_penalty_filtered.value_counts())

try:
    X_train_penalty, X_test_penalty, y_train_penalty, y_test_penalty = train_test_split(
        X_penalty_filtered, y_penalty_filtered, test_size=0.2, random_state=42, stratify=y_penalty_filtered
    )
except ValueError as e:
    print(f"Error during penalty train-test split: {e}")
    X_train_penalty, X_test_penalty, y_train_penalty, y_test_penalty = train_test_split(
        X_penalty_filtered, y_penalty_filtered, test_size=0.1, random_state=42, stratify=None
    )

# Vectorize text using TF-IDF
X_train_penalty_tfidf = tfidf_vectorizer.fit_transform(X_train_penalty)
X_test_penalty_tfidf = tfidf_vectorizer.transform(X_test_penalty)

# Logistic Regression for Penalty Prediction
lr_penalty_model = LogisticRegression(random_state=42)
lr_penalty_model.fit(X_train_penalty_tfidf, y_train_penalty)

# Evaluate Penalty Prediction
penalty_metrics = evaluate_model(lr_penalty_model, X_test_penalty_tfidf, y_test_penalty)
print("\n--- Penalty Prediction ---")
print("Penalty Prediction Metrics (Logistic Regression):", penalty_metrics)

# Prepare corpus for BM25
corpus = data_filtered_penalty['Offense'].tolist()

# Tokenization using regular expressions (alternative to nltk.word_tokenize)
def simple_tokenize(text):
    return re.findall(r'\b\w+\b', text.lower())  # Simple word tokenization

tokenized_corpus = [simple_tokenize(doc) for doc in corpus]

# Initialize BM25
bm25 = BM25Okapi(tokenized_corpus)

# Query Matching
def query_match(query, corpus, bm25_model, top_n=3):
    tokenized_query = simple_tokenize(query)  # Tokenize query using the same method
    scores = bm25_model.get_scores(tokenized_query)
    top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_n]
    return [(corpus[i], scores[i]) for i in top_indices]



            id                                            Offense Punishment  \
0          NaN                                                NaN        NaN   
1   section 35  108. burden of proving that case of accused co...        NaN   
2   section 26                                                NaN        NaN   
3  section 162  11(i) a is accused of a crime. the facts that,...        NaN   
4  section 162                                                NaN        NaN   

  Exceptions Illustration                                   Offense_Keywords  
0        NaN          NaN                                                NaN  
1     except          NaN  burden, proving, case, accused, comes, exceptions  
2        NaN          NaN                                                NaN  
3        NaN          NaN  accused, crime, facts, commission, crime, absc...  
4        NaN          NaN                                                NaN  
Index(['id', 'Offense', 'Punishment', 'Except

In [3]:
!pip install transformers sentence-transformers scikit-learn xgboost joblib pandas tqdm torch


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [11]:
import pandas as pd
import numpy as np
import torch
import joblib
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from sklearn.feature_extraction.text import TfidfVectorizer

# Load dataset
data = pd.read_csv("/content/Annotated_data.xlsx.csv")

# Fill missing values
data['Offense'] = data['Offense'].fillna('').str.strip()
data['Punishment'] = data['Punishment'].fillna('').str.strip()

# Define input (X) and target (y)
X = data['Offense']
y = data['Punishment']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Convert Text Data to TF-IDF Vectors
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.pkl")

# Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_tfidf, y_train)
joblib.dump(rf_model, "random_forest_model.pkl")

from sklearn.preprocessing import LabelEncoder

# Encode labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
joblib.dump(label_encoder, "label_encoder.pkl")

# Train XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train_tfidf, y_train_encoded)
joblib.dump(xgb_model, "xgboost_model.pkl")

# Evaluate XGBoost
y_pred_xgb = xgb_model.predict(X_test_tfidf)
xgb_metrics = {
    'Accuracy': accuracy_score(y_test_encoded, y_pred_xgb),
    'Precision': precision_score(y_test_encoded, y_pred_xgb, average='weighted', zero_division=0),
    'Recall': recall_score(y_test_encoded, y_pred_xgb, average='weighted', zero_division=0),
    'F1-Score': f1_score(y_test_encoded, y_pred_xgb, average='weighted', zero_division=0),
}
print("\n--- XGBoost Model Performance ---")
print(xgb_metrics)

# BERT Dataset Class
class OffenseDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Tokenizer and DataLoader
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_dataset = OffenseDataset(X_train.tolist(), y_train.factorize()[0], tokenizer)
test_dataset = OffenseDataset(X_test.tolist(), y_test.factorize()[0], tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Fine-tune BERT Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(set(y_train)))
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)

trainer.train()

# Save trained model and tokenizer
model.save_pretrained("bert_offense_model")
tokenizer.save_pretrained("bert_offense_model")

print("\n--- Training Completed and Model Saved ---")


Parameters: { "use_label_encoder" } are not used.




--- XGBoost Model Performance ---
{'Accuracy': 0.8875, 'Precision': 0.78765625, 'Recall': 0.8875, 'F1-Score': 0.8346026490066226}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,No log,0.752481
2,No log,0.70883
3,No log,0.713746



--- Training Completed and Model Saved ---


In [14]:
import torch
import joblib
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load sentence transformer model
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Load dataset
data = pd.read_csv("/content/Annotated_data.xlsx.csv")

# Ensure no NaN values in 'Offense' column
data = data.dropna(subset=['Offense'])

corpus = data['Offense'].tolist()

# Compute and save embeddings (Run this only once)
corpus_embeddings = sbert_model.encode(corpus, convert_to_tensor=True)
joblib.dump(corpus_embeddings, "corpus_embeddings.pkl")
joblib.dump(corpus, "corpus.pkl")

print("\n--- Computed and Saved Corpus Embeddings ---")



--- Computed and Saved Corpus Embeddings ---


In [16]:
import torch
import joblib
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load sentence transformer model
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Load dataset
data = pd.read_csv("/content/Annotated_data.xlsx.csv")

# Drop NaN values and reset index
data = data.dropna(subset=['Offense']).reset_index(drop=True)

# Convert to list
corpus = data['Offense'].tolist()

# Compute and save embeddings
corpus_embeddings = sbert_model.encode(corpus, convert_to_tensor=True)
joblib.dump(corpus_embeddings, "corpus_embeddings.pkl")
joblib.dump(corpus, "corpus.pkl")

print("\n--- Computed and Saved Corpus Embeddings Without NaN ---")



--- Computed and Saved Corpus Embeddings Without NaN ---


In [17]:
import torch
import joblib
from sentence_transformers import SentenceTransformer, util

# Load sentence transformer model
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Load precomputed corpus and embeddings
corpus = joblib.load("corpus.pkl")
corpus_embeddings = joblib.load("corpus_embeddings.pkl")

print("\n--- Loaded Precomputed Corpus Embeddings ---")

# Semantic Search Function
def semantic_search(query, model, corpus, corpus_embeddings, top_k=3):
    query_embedding = model.encode(query, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(scores, k=top_k)
    return [(corpus[idx], scores[idx].item()) for idx in top_results.indices]

# Example Query
query = "Theft and property crime"
results = semantic_search(query, sbert_model, corpus, corpus_embeddings)

print("\n--- Top Matching Legal Sections ---")
for text, score in results:
    print(f"Score: {score:.4f} | Section: {text}")



--- Loaded Precomputed Corpus Embeddings ---

--- Top Matching Legal Sections ---
Score: 0.5504 | Section: (c) a is accused of receiving stolen property knowing it to have been stolen. it is proposed to prove
Score: 0.4618 | Section: c, accused, receiving, stolen, property, knowing, stolen, proposed, prove
Score: 0.4481 | Section: accused, possession, currency, knew
