In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## **Step-0: Importing Packages**

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import train_test_split
import warnings
import torch
warnings.filterwarnings('ignore')

In [None]:
torch.cuda.is_available()

False

In [None]:
torch.cuda.device_count()

0

In [None]:
# torch.cuda.get_device_name(0)

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
df = pd.read_csv("/content/drive/MyDrive/Datasets/dataset.csv")
df.head(5)

Unnamed: 0,input,expected_output
0,ujjagar singh the appellant herein a resident ...,"FACTS\nujjagar singh, the appellant herein, wa..."
1,this appeal with special leave is directed aga...,FACTS\nthis appeal with special leave is direc...
2,interpretation and or application of the provi...,FACTS\nthe government of gujarat in exercise o...
3,the state of manipur is in appeal before us qu...,FACTS\none shri a.j.tayeng was the revenue com...
4,these two appeals involve identical questions ...,FACTS\nthe appellants question correctness of ...


## **Step-01: Trimming the dataset**

In [None]:
df.drop(df.index[25:49],axis = 0, inplace = True)

In [None]:
df.shape

(26, 2)

## **Step-02: Text Preprocessing**

In [None]:
#Converting to lower case
df['input'] = df['input'].apply(lambda x: x.lower())
df['expected_output'] = df['expected_output'].apply(lambda x: x.lower())

In [None]:
#Replacing punctuations with space
import re
df['input'] = df['input'].apply(lambda x: re.sub(r'[^\w\s]',' ',x))
df.head(5)

Unnamed: 0,input,expected_output
0,ujjagar singh the appellant herein a resident ...,"facts\nujjagar singh, the appellant herein, wa..."
1,this appeal with special leave is directed aga...,facts\nthis appeal with special leave is direc...
2,interpretation and or application of the provi...,facts\nthe government of gujarat in exercise o...
3,the state of manipur is in appeal before us qu...,facts\none shri a.j.tayeng was the revenue com...
4,these two appeals involve identical questions ...,facts\nthe appellants question correctness of ...


In [None]:
#Removal of special characters with space(including \n)
df['expected_output'] = df['expected_output'].apply(lambda x: re.sub(r'[^a-zA-Z0-9]',' ',x))
df.head(5)

Unnamed: 0,input,expected_output
0,ujjagar singh the appellant herein a resident ...,facts ujjagar singh the appellant herein was...
1,this appeal with special leave is directed aga...,facts this appeal with special leave is direct...
2,interpretation and or application of the provi...,facts the government of gujarat in exercise of...
3,the state of manipur is in appeal before us qu...,facts one shri a j tayeng was the revenue comm...
4,these two appeals involve identical questions ...,facts the appellants question correctness of t...


## **Step-03: Tokenization**

In [None]:
# Tokenizing the input values
df['input_tokens'] = df['input'].apply(lambda x: word_tokenize(x))

# Tokenizing the expected summary values
df['summary_tokens'] = df['expected_output'].apply(lambda x: word_tokenize(x))
df.head(5)

Unnamed: 0,input,expected_output,input_tokens,summary_tokens
0,ujjagar singh the appellant herein a resident ...,facts ujjagar singh the appellant herein was...,"[ujjagar, singh, the, appellant, herein, a, re...","[facts, ujjagar, singh, the, appellant, herein..."
1,this appeal with special leave is directed aga...,facts this appeal with special leave is direct...,"[this, appeal, with, special, leave, is, direc...","[facts, this, appeal, with, special, leave, is..."
2,interpretation and or application of the provi...,facts the government of gujarat in exercise of...,"[interpretation, and, or, application, of, the...","[facts, the, government, of, gujarat, in, exer..."
3,the state of manipur is in appeal before us qu...,facts one shri a j tayeng was the revenue comm...,"[the, state, of, manipur, is, in, appeal, befo...","[facts, one, shri, a, j, tayeng, was, the, rev..."
4,these two appeals involve identical questions ...,facts the appellants question correctness of t...,"[these, two, appeals, involve, identical, ques...","[facts, the, appellants, question, correctness..."


## **Step-04: Stopword Removal**


In [None]:
stopwords_list = set(stopwords.words('english'))
print("Original Length of Stopwords List:",len(stopwords_list))
print(stopwords_list)

Original Length of Stopwords List: 179
{'each', 'does', 'being', 'with', 's', 'most', "shan't", 'she', 'more', 'can', 'him', "couldn't", 'were', "you've", "should've", 'further', 'they', "mightn't", 'isn', 'the', 'itself', 'we', 'ours', 'be', 'aren', 'myself', 'is', 'did', 'mustn', 'against', 'when', 'own', 'after', 'himself', 'than', 'here', 'such', 'o', 'couldn', "aren't", 'about', 'mightn', "isn't", "won't", 'haven', 'its', 'are', 'where', 'few', 'have', 'them', 'will', 'whom', 'ourselves', 'there', 'off', 'once', 'who', 'nor', 'only', 'so', 'no', 'by', "didn't", "don't", 'not', 'been', 'on', "hadn't", 'weren', 'was', 'yours', 'into', 'before', "wouldn't", 'it', 'through', "you're", 'any', 'don', 'then', 'd', 'same', 'didn', 'yourselves', 'or', 'to', 'while', "doesn't", 'theirs', 'over', 'why', "she's", 'am', "you'd", 'and', 'wouldn', 'should', 'because', 'my', 'hadn', 'if', "wasn't", 'herself', 'needn', 'ain', 'me', 'how', 'their', 'had', 'themselves', "needn't", 'm', 'these', 'out

In [None]:
# Trimming down the stopwords list
exclude_words = ["not","don't", 'should', "should've", "mightn't", 'mustn', "mustn't",'shouldn',
                "shouldn't", 'wasn', "wasn't", 'weren', "weren't", "won't", 'wouldn', "wouldn't",
                'when', 'where', 'why', 'how',"couldn't","didn't","doesn't","hadn't","haven't",]
stopwords_list = list(filter(lambda x: x not in exclude_words,stopwords_list))
print("New Length of Stopwords List:",len(stopwords_list))
print(stopwords_list)

New Length of Stopwords List: 154
['each', 'does', 'being', 'with', 's', 'most', "shan't", 'she', 'more', 'can', 'him', 'were', "you've", 'further', 'they', 'isn', 'the', 'itself', 'we', 'ours', 'be', 'aren', 'myself', 'is', 'did', 'against', 'own', 'after', 'himself', 'than', 'here', 'such', 'o', 'couldn', "aren't", 'about', 'mightn', "isn't", 'haven', 'its', 'are', 'few', 'have', 'them', 'will', 'whom', 'ourselves', 'there', 'off', 'once', 'who', 'nor', 'only', 'so', 'no', 'by', 'been', 'on', 'was', 'yours', 'into', 'before', 'it', 'through', "you're", 'any', 'don', 'then', 'd', 'same', 'didn', 'yourselves', 'or', 'to', 'while', 'theirs', 'over', "she's", 'am', "you'd", 'and', 'because', 'my', 'hadn', 'if', 'herself', 'needn', 'ain', 'me', 'their', 'had', 'themselves', "needn't", 'm', 'these', 'out', 'hers', 'your', 'for', 'her', 'do', 'this', 'those', 'from', 't', 'now', 'an', 'hasn', 'below', 'what', 'y', "hasn't", 'his', 'very', 're', 'again', 'our', 'down', 'of', "it's", 'doesn',

In [None]:
# For Input
df['input_tokens'] = df['input_tokens'].apply(lambda x: [word for word in x if not word in stopwords_list])

# For Output
df['summary_tokens'] = df['summary_tokens'].apply(lambda x:[word for word in x if not word in stopwords_list])

df.head(5)

Unnamed: 0,input,expected_output,input_tokens,summary_tokens
0,ujjagar singh the appellant herein a resident ...,facts ujjagar singh the appellant herein was...,"[ujjagar, singh, appellant, herein, resident, ...","[facts, ujjagar, singh, appellant, herein, tri..."
1,this appeal with special leave is directed aga...,facts this appeal with special leave is direct...,"[appeal, special, leave, directed, judgment, o...","[facts, appeal, special, leave, directed, judg..."
2,interpretation and or application of the provi...,facts the government of gujarat in exercise of...,"[interpretation, application, provisions, guja...","[facts, government, gujarat, exercise, power, ..."
3,the state of manipur is in appeal before us qu...,facts one shri a j tayeng was the revenue comm...,"[state, manipur, appeal, us, questioning, judg...","[facts, one, shri, j, tayeng, revenue, commiss..."
4,these two appeals involve identical questions ...,facts the appellants question correctness of t...,"[two, appeals, involve, identical, questions, ...","[facts, appellants, question, correctness, jud..."


In [None]:
!pip install contractions



In [None]:
# Expanding the contractions(don't -> do not)

import contractions

def expand_contractions(tokens):
    return [contractions.fix(token) for token in tokens]

In [None]:
df['input_tokens'] = df['input_tokens'].apply(expand_contractions)
df['summary_tokens'] = df['summary_tokens'].apply(expand_contractions)

In [None]:
df.head(5)

Unnamed: 0,input,expected_output,input_tokens,summary_tokens
0,ujjagar singh the appellant herein a resident ...,facts ujjagar singh the appellant herein was...,"[ujjagar, singh, appellant, herein, resident, ...","[facts, ujjagar, singh, appellant, herein, tri..."
1,this appeal with special leave is directed aga...,facts this appeal with special leave is direct...,"[appeal, special, leave, directed, judgment, o...","[facts, appeal, special, leave, directed, judg..."
2,interpretation and or application of the provi...,facts the government of gujarat in exercise of...,"[interpretation, application, provisions, guja...","[facts, government, gujarat, exercise, power, ..."
3,the state of manipur is in appeal before us qu...,facts one shri a j tayeng was the revenue comm...,"[state, manipur, appeal, us, questioning, judg...","[facts, one, shri, j, tayeng, revenue, commiss..."
4,these two appeals involve identical questions ...,facts the appellants question correctness of t...,"[two, appeals, involve, identical, questions, ...","[facts, appellants, question, correctness, jud..."


## **Step-05: Lemmatization**


In [None]:
lemmatizer = WordNetLemmatizer()

In [None]:
# List of exceptions
exceptions = ['rs','was','as','has','ms','vs']

In [None]:
df['input_tokens'] = df['input_tokens'].apply(lambda x: ' '.join([lemmatizer.lemmatize(i) for i in x]))
df['summary_tokens'] = df['summary_tokens'].apply(lambda x: ' '.join([lemmatizer.lemmatize(i) for i in x]))

df.head(5)

Unnamed: 0,input,expected_output,input_tokens,summary_tokens
0,ujjagar singh the appellant herein a resident ...,facts ujjagar singh the appellant herein was...,ujjagar singh appellant herein resident villag...,fact ujjagar singh appellant herein tried conv...
1,this appeal with special leave is directed aga...,facts this appeal with special leave is direct...,appeal special leave directed judgment order l...,fact appeal special leave directed judgment or...
2,interpretation and or application of the provi...,facts the government of gujarat in exercise of...,interpretation application provision gujarat t...,fact government gujarat exercise power conferr...
3,the state of manipur is in appeal before us qu...,facts one shri a j tayeng was the revenue comm...,state manipur appeal u questioning judgment or...,fact one shri j tayeng revenue commissioner go...
4,these two appeals involve identical questions ...,facts the appellants question correctness of t...,two appeal involve identical question therefor...,fact appellant question correctness judgment r...


## **Step-06: Splitting and Modelling**

In [None]:
df.drop(['input','expected_output'],axis = 1,inplace = True)
df.head(5)

Unnamed: 0,input_tokens,summary_tokens
0,ujjagar singh appellant herein resident villag...,fact ujjagar singh appellant herein tried conv...
1,appeal special leave directed judgment order l...,fact appeal special leave directed judgment or...
2,interpretation application provision gujarat t...,fact government gujarat exercise power conferr...
3,state manipur appeal u questioning judgment or...,fact one shri j tayeng revenue commissioner go...
4,two appeal involve identical question therefor...,fact appellant question correctness judgment r...


In [None]:
df.to_csv(r'/content/drive/MyDrive/Datasets/new.csv')

In [None]:
# !pip install transformers==4.30

In [None]:
!pip install datasets



#**Load the processed dataset from drive**

In [None]:
from datasets import load_dataset

dataset_file = '/content/drive/MyDrive/Datasets/new.csv'

dataset = load_dataset('csv', data_files=dataset_file, split='train'-+)

dataset = dataset.train_test_split(test_size=0.30)
train_dataset = dataset['train']
val_dataset = dataset['test']

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
val_dataset

Dataset({
    features: ['Unnamed: 0', 'input_tokens', 'summary_tokens'],
    num_rows: 8
})

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pprint

pp = pprint.PrettyPrinter()

In [None]:
# Loading the pre-trained model and its respective tokenizer
model_id="google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

In [None]:
df = pd.read_csv(dataset_file)

source_text = df['input_tokens']
target_text = df['summary_tokens']

from datasets import concatenate_datasets

# The maximum total input sequence length after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["input_tokens"], truncation=True), batched=True, remove_columns=["input_tokens", "summary_tokens"])
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["summary_tokens"], truncation=True), batched=True, remove_columns=["input_tokens", "summary_tokens"])
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")


Map:   0%|          | 0/26 [00:00<?, ? examples/s]

Max source length: 512


Map:   0%|          | 0/26 [00:00<?, ? examples/s]

Max target length: 512


In [None]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,input_tokens,summary_tokens
0,0,ujjagar singh appellant herein resident villag...,fact ujjagar singh appellant herein tried conv...
1,1,appeal special leave directed judgment order l...,fact appeal special leave directed judgment or...
2,2,interpretation application provision gujarat t...,fact government gujarat exercise power conferr...
3,3,state manipur appeal u questioning judgment or...,fact one shri j tayeng revenue commissioner go...
4,4,two appeal involve identical question therefor...,fact appellant question correctness judgment r...


In [None]:
def preprocess_function(sample, padding="max_length"):
    inputs = ["summarize the text with facts and indian law sections applied: " + item[:512] for item in sample["input_tokens"]]
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)
    labels = tokenizer(text_target=sample["summary_tokens"], max_length=max_target_length, padding=padding, truncation=True)

    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["input_tokens", "summary_tokens"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")


Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['Unnamed: 0', 'input_ids', 'attention_mask', 'labels']


In [None]:
!pip install evaluate



In [None]:
!pip install rouge_score



In [None]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

# Metric
metric = evaluate.load("rouge")

# helper function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 10, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from transformers import DataCollatorForSeq2Seq

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)


In [None]:
!pip install accelerate>=0.21.0 -U

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/trained/model",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=50,
    predict_with_generate=True,
    fp16=False,
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)


In [None]:
trainer.train()

Step,Training Loss
500,4.5651


TrainOutput(global_step=900, training_loss=4.412215440538194, metrics={'train_runtime': 5317.0221, 'train_samples_per_second': 0.169, 'train_steps_per_second': 0.169, 'total_flos': 167301454233600.0, 'train_loss': 4.412215440538194, 'epoch': 50.0})

In [None]:
tokenizer.save_pretrained("/content/drive/MyDrive/trained/model")
model.save_pretrained("/content/drive/MyDrive/trained/model")

In [None]:
trainer.evaluate()

{'eval_loss': 4.446713924407959,
 'eval_rouge1': 0.5668,
 'eval_rouge2': 0.4113,
 'eval_rougeL': 0.5312,
 'eval_rougeLsum': 0.5294,
 'eval_gen_len': 19.0,
 'eval_runtime': 23.1445,
 'eval_samples_per_second': 0.346,
 'eval_steps_per_second': 0.346,
 'epoch': 50.0}

In [None]:
from transformers import pipeline

summarizer = pipeline("summarization", model="/content/drive/MyDrive/trained/model", device=torch.device('cuda'), max_length = 100)

# select a random test sample
sample = dataset['test'][2]
print(f"dialogue: \n{sample['input_tokens']}\n---------------")
print(len(sample['input_tokens'].split()))

# Split the input into batches of 512 tokens
input_tokens = sample["input_tokens"]
batch_size = 512
num_batches = (len(input_tokens) + batch_size - 1) // batch_size
summaries = []

for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(input_tokens))
    batch_input = input_tokens[start_idx:end_idx]

    # summarize dialogue
    res = summarizer(batch_input)

    summaries.append(res[0]['summary_text'])

# Concatenate the summaries
full_summary = " ".join(summaries)

In [None]:
print(f"Summary:\n{full_summary}")

In [None]:
# Remove "summary:" from each line
cleaned_text = "\n".join([line.replace("summary:", "") for line in full_summary.strip().split("\n")])
print(cleaned_text)

In [None]:
print(len(cleaned_text.split()))

## **Finding the important sections**

In [None]:
pattern = r'\b(?:sc|section|act)\s+\d+\b'
matches = set(re.findall(pattern, cleaned_text))

# Store formatted sections in a set
formatted_sections_set = set()
for match in matches:
    sections = re.findall(r'\d{1,3}', match)
    formatted_sections = "\n".join([f"{match.split()[0]} {section}" for section in sections])
    formatted_sections_set.add(formatted_sections)

#Generate clickable links for each section
formatted_links = set()
for formatted_sections in formatted_sections_set:
    sections = re.findall(r'\d{1,3}', formatted_sections)
    for section in sections:
        link = f"<a href='https://www.google.com/search?q=Indian%20Legal%20section%20{section}' target='_blank'>section {section}</a>"
        formatted_links.add(link)

# Display the clickable links
print("Important Indian Legal Sections mentioned in the document: \n")
for link in formatted_links:
    print(link)