In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

## **Step-0: Importing Packages**

In [3]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import train_test_split
import warnings
import torch
warnings.filterwarnings('ignore')

In [4]:
torch.cuda.is_available()

False

In [5]:
torch.cuda.device_count()

0

In [6]:
# torch.cuda.get_device_name(0)

In [7]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to C:\Users\Ayush
[nltk_data]     Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Ayush
[nltk_data]     Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Ayush
[nltk_data]     Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
df = pd.read_csv("dataset.csv")
df.head(5)

Unnamed: 0,input,expected_output
0,ujjagar singh the appellant herein a resident ...,"FACTS\nujjagar singh, the appellant herein, wa..."
1,this appeal with special leave is directed aga...,FACTS\nthis appeal with special leave is direc...
2,interpretation and or application of the provi...,FACTS\nthe government of gujarat in exercise o...
3,the state of manipur is in appeal before us qu...,FACTS\none shri a.j.tayeng was the revenue com...
4,these two appeals involve identical questions ...,FACTS\nthe appellants question correctness of ...


## **Step-01: Trimming the dataset**

In [9]:
df.drop(df.index[25:49],axis = 0, inplace = True)

In [10]:
df.shape

(26, 2)

## **Step-02: Text Preprocessing**

In [11]:
#Converting to lower case
df['input'] = df['input'].apply(lambda x: x.lower())
df['expected_output'] = df['expected_output'].apply(lambda x: x.lower())

In [12]:
#Replacing punctuations with space
import re
df['input'] = df['input'].apply(lambda x: re.sub(r'[^\w\s]',' ',x))
df.head(5)

Unnamed: 0,input,expected_output
0,ujjagar singh the appellant herein a resident ...,"facts\nujjagar singh, the appellant herein, wa..."
1,this appeal with special leave is directed aga...,facts\nthis appeal with special leave is direc...
2,interpretation and or application of the provi...,facts\nthe government of gujarat in exercise o...
3,the state of manipur is in appeal before us qu...,facts\none shri a.j.tayeng was the revenue com...
4,these two appeals involve identical questions ...,facts\nthe appellants question correctness of ...


In [13]:
#Removal of special characters with space(including \n)
df['expected_output'] = df['expected_output'].apply(lambda x: re.sub(r'[^a-zA-Z0-9]',' ',x))
df.head(5)

Unnamed: 0,input,expected_output
0,ujjagar singh the appellant herein a resident ...,facts ujjagar singh the appellant herein was...
1,this appeal with special leave is directed aga...,facts this appeal with special leave is direct...
2,interpretation and or application of the provi...,facts the government of gujarat in exercise of...
3,the state of manipur is in appeal before us qu...,facts one shri a j tayeng was the revenue comm...
4,these two appeals involve identical questions ...,facts the appellants question correctness of t...


## **Step-03: Tokenization**

In [14]:
# Tokenizing the input values
df['input_tokens'] = df['input'].apply(lambda x: word_tokenize(x))

# Tokenizing the expected summary values
df['summary_tokens'] = df['expected_output'].apply(lambda x: word_tokenize(x))
df.head(5)

Unnamed: 0,input,expected_output,input_tokens,summary_tokens
0,ujjagar singh the appellant herein a resident ...,facts ujjagar singh the appellant herein was...,"[ujjagar, singh, the, appellant, herein, a, re...","[facts, ujjagar, singh, the, appellant, herein..."
1,this appeal with special leave is directed aga...,facts this appeal with special leave is direct...,"[this, appeal, with, special, leave, is, direc...","[facts, this, appeal, with, special, leave, is..."
2,interpretation and or application of the provi...,facts the government of gujarat in exercise of...,"[interpretation, and, or, application, of, the...","[facts, the, government, of, gujarat, in, exer..."
3,the state of manipur is in appeal before us qu...,facts one shri a j tayeng was the revenue comm...,"[the, state, of, manipur, is, in, appeal, befo...","[facts, one, shri, a, j, tayeng, was, the, rev..."
4,these two appeals involve identical questions ...,facts the appellants question correctness of t...,"[these, two, appeals, involve, identical, ques...","[facts, the, appellants, question, correctness..."


## **Step-04: Stopword Removal**


In [15]:
stopwords_list = set(stopwords.words('english'))
print("Original Length of Stopwords List:",len(stopwords_list))
print(stopwords_list)

Original Length of Stopwords List: 179
{'from', 'their', 'me', 'was', 'the', 'to', 'we', 'more', 'any', 'been', 'why', 'not', 'doesn', "you'd", 'has', 'were', 'during', 're', 'himself', 'these', 'hasn', 'her', 'his', 'ma', 'didn', 'for', 'did', 'and', 'don', 'each', 'weren', 'which', 'aren', 'how', 'against', 'is', 'few', 'hers', 'have', 'won', 'wouldn', 'such', 'will', "it's", 'both', 'other', "doesn't", "needn't", 'just', 'off', 'should', "don't", 'own', 't', 'o', 'ain', 'with', 'll', 'theirs', 'all', 'y', 'them', 'yourselves', "haven't", 'now', 'him', 'about', 'mightn', "you've", 'what', 'if', 'so', 'do', 'into', 'yours', 'doing', 'itself', 'no', 'mustn', 'they', 'haven', "that'll", 'as', "hadn't", 'once', 'further', 'below', "wasn't", 'above', 'or', 'there', "couldn't", 'while', 'herself', 'ourselves', 'shouldn', 'out', 'through', 'does', 'm', 'at', 'she', 'over', 'my', 'very', 'can', 'nor', 'because', 'am', 'it', 'of', 'but', 'shan', 'i', "she's", 'only', "you're", 'this', 'had', 

In [16]:
# Trimming down the stopwords list
exclude_words = ["not","don't", 'should', "should've", "mightn't", 'mustn', "mustn't",'shouldn',
                "shouldn't", 'wasn', "wasn't", 'weren', "weren't", "won't", 'wouldn', "wouldn't",
                'when', 'where', 'why', 'how',"couldn't","didn't","doesn't","hadn't","haven't",]
stopwords_list = list(filter(lambda x: x not in exclude_words,stopwords_list))
print("New Length of Stopwords List:",len(stopwords_list))
print(stopwords_list)

New Length of Stopwords List: 154
['from', 'their', 'me', 'was', 'the', 'to', 'we', 'more', 'any', 'been', 'doesn', "you'd", 'has', 'were', 'during', 're', 'himself', 'these', 'hasn', 'her', 'his', 'ma', 'didn', 'for', 'did', 'and', 'don', 'each', 'which', 'aren', 'against', 'is', 'few', 'hers', 'have', 'won', 'such', 'will', "it's", 'both', 'other', "needn't", 'just', 'off', 'own', 't', 'o', 'ain', 'with', 'll', 'theirs', 'all', 'y', 'them', 'yourselves', 'now', 'him', 'about', 'mightn', "you've", 'what', 'if', 'so', 'do', 'into', 'yours', 'doing', 'itself', 'no', 'they', 'haven', "that'll", 'as', 'once', 'further', 'below', 'above', 'or', 'there', 'while', 'herself', 'ourselves', 'out', 'through', 'does', 'm', 'at', 'she', 'over', 'my', 'very', 'can', 'nor', 'because', 'am', 'it', 'of', 'but', 'shan', 'i', "she's", 'only', "you're", 'this', 'had', "isn't", 'by', 'most', 'd', 'on', 'a', 'then', 'up', "aren't", 'again', 'couldn', 'ours', 'yourself', 'isn', 'some', 'you', 've', 'until',

In [17]:
# For Input
df['input_tokens'] = df['input_tokens'].apply(lambda x: [word for word in x if not word in stopwords_list])

# For Output
df['summary_tokens'] = df['summary_tokens'].apply(lambda x:[word for word in x if not word in stopwords_list])

df.head(5)

Unnamed: 0,input,expected_output,input_tokens,summary_tokens
0,ujjagar singh the appellant herein a resident ...,facts ujjagar singh the appellant herein was...,"[ujjagar, singh, appellant, herein, resident, ...","[facts, ujjagar, singh, appellant, herein, tri..."
1,this appeal with special leave is directed aga...,facts this appeal with special leave is direct...,"[appeal, special, leave, directed, judgment, o...","[facts, appeal, special, leave, directed, judg..."
2,interpretation and or application of the provi...,facts the government of gujarat in exercise of...,"[interpretation, application, provisions, guja...","[facts, government, gujarat, exercise, power, ..."
3,the state of manipur is in appeal before us qu...,facts one shri a j tayeng was the revenue comm...,"[state, manipur, appeal, us, questioning, judg...","[facts, one, shri, j, tayeng, revenue, commiss..."
4,these two appeals involve identical questions ...,facts the appellants question correctness of t...,"[two, appeals, involve, identical, questions, ...","[facts, appellants, question, correctness, jud..."


In [18]:
# Expanding the contractions(don't -> do not)

import contractions

def expand_contractions(tokens):
    return [contractions.fix(token) for token in tokens]

In [19]:
df['input_tokens'] = df['input_tokens'].apply(expand_contractions)
df['summary_tokens'] = df['summary_tokens'].apply(expand_contractions)

In [20]:
df.head(5)

Unnamed: 0,input,expected_output,input_tokens,summary_tokens
0,ujjagar singh the appellant herein a resident ...,facts ujjagar singh the appellant herein was...,"[ujjagar, singh, appellant, herein, resident, ...","[facts, ujjagar, singh, appellant, herein, tri..."
1,this appeal with special leave is directed aga...,facts this appeal with special leave is direct...,"[appeal, special, leave, directed, judgment, o...","[facts, appeal, special, leave, directed, judg..."
2,interpretation and or application of the provi...,facts the government of gujarat in exercise of...,"[interpretation, application, provisions, guja...","[facts, government, gujarat, exercise, power, ..."
3,the state of manipur is in appeal before us qu...,facts one shri a j tayeng was the revenue comm...,"[state, manipur, appeal, us, questioning, judg...","[facts, one, shri, j, tayeng, revenue, commiss..."
4,these two appeals involve identical questions ...,facts the appellants question correctness of t...,"[two, appeals, involve, identical, questions, ...","[facts, appellants, question, correctness, jud..."


## **Step-05: Lemmatization**


In [21]:
lemmatizer = WordNetLemmatizer()

In [22]:
# List of exceptions
exceptions = ['rs','was','as','has','ms','vs']

In [23]:
df['input_tokens'] = df['input_tokens'].apply(lambda x: ' '.join([lemmatizer.lemmatize(i) for i in x]))
df['summary_tokens'] = df['summary_tokens'].apply(lambda x: ' '.join([lemmatizer.lemmatize(i) for i in x]))

df.head(5)

Unnamed: 0,input,expected_output,input_tokens,summary_tokens
0,ujjagar singh the appellant herein a resident ...,facts ujjagar singh the appellant herein was...,ujjagar singh appellant herein resident villag...,fact ujjagar singh appellant herein tried conv...
1,this appeal with special leave is directed aga...,facts this appeal with special leave is direct...,appeal special leave directed judgment order l...,fact appeal special leave directed judgment or...
2,interpretation and or application of the provi...,facts the government of gujarat in exercise of...,interpretation application provision gujarat t...,fact government gujarat exercise power conferr...
3,the state of manipur is in appeal before us qu...,facts one shri a j tayeng was the revenue comm...,state manipur appeal u questioning judgment or...,fact one shri j tayeng revenue commissioner go...
4,these two appeals involve identical questions ...,facts the appellants question correctness of t...,two appeal involve identical question therefor...,fact appellant question correctness judgment r...


## **Step-06: Splitting and Modelling**

In [24]:
df.drop(['input','expected_output'],axis = 1,inplace = True)
df.head(5)

Unnamed: 0,input_tokens,summary_tokens
0,ujjagar singh appellant herein resident villag...,fact ujjagar singh appellant herein tried conv...
1,appeal special leave directed judgment order l...,fact appeal special leave directed judgment or...
2,interpretation application provision gujarat t...,fact government gujarat exercise power conferr...
3,state manipur appeal u questioning judgment or...,fact one shri j tayeng revenue commissioner go...
4,two appeal involve identical question therefor...,fact appellant question correctness judgment r...


In [25]:
df.to_csv(r'new.csv')

In [26]:
# !pip install transformers==4.30

#**Load the processed dataset from drive**

In [42]:
from datasets import load_dataset

dataset_file = 'new.csv'

dataset = load_dataset('csv', data_files=dataset_file, split='train')

dataset = dataset.train_test_split(test_size=0.30)
train_dataset = dataset['train']
val_dataset = dataset['test']

In [43]:
val_dataset

Dataset({
    features: ['Unnamed: 0', 'input_tokens', 'summary_tokens'],
    num_rows: 8
})

In [44]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pprint

pp = pprint.PrettyPrinter()

In [76]:
# Loading the pre-trained model and its respective tokenizer
model_id="google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)


In [77]:
df = pd.read_csv(dataset_file)

source_text = df['input_tokens']
target_text = df['summary_tokens']

from datasets import concatenate_datasets

# The maximum total input sequence length after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["input_tokens"], truncation=True), batched=True, remove_columns=["input_tokens", "summary_tokens"])
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization.
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["summary_tokens"], truncation=True), batched=True, remove_columns=["input_tokens", "summary_tokens"])
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")


Max source length: 512


Map:   0%|          | 0/26 [00:00<?, ? examples/s]

Max target length: 512


In [78]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,input_tokens,summary_tokens
0,0,ujjagar singh appellant herein resident villag...,fact ujjagar singh appellant herein tried conv...
1,1,appeal special leave directed judgment order l...,fact appeal special leave directed judgment or...
2,2,interpretation application provision gujarat t...,fact government gujarat exercise power conferr...
3,3,state manipur appeal u questioning judgment or...,fact one shri j tayeng revenue commissioner go...
4,4,two appeal involve identical question therefor...,fact appellant question correctness judgment r...


In [79]:
def preprocess_function(sample, padding="max_length"):
    inputs = ["summarize: " + item[:512] for item in sample["input_tokens"]]
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)
    labels = tokenizer(text_target=sample["summary_tokens"], max_length=max_target_length, padding=padding, truncation=True)

    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["input_tokens", "summary_tokens"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")


Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['Unnamed: 0', 'input_ids', 'attention_mask', 'labels']


In [80]:
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
nltk.download("punkt")

# Metric
metric = evaluate.load("rouge")

# helper function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v, 2) for k, v in result.items()}  # Convert scores to percentages
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result



[nltk_data] Downloading package punkt to C:\Users\Ayush
[nltk_data]     Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [81]:
from transformers import DataCollatorForSeq2Seq

# we want to ignore tokenizer pad token in the loss
label_pad_token_id = -100
# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)


In [82]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir="./trained/model",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False, 
    learning_rate=1e-4,
    num_train_epochs=5,
    logging_strategy="steps",
    logging_steps=500,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
)
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments


# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir="./trained/model",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False, 
    learning_rate=5e-5,
    num_train_epochs=5,
    logging_strategy="steps",
    logging_steps=500,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)


In [83]:
trainer.train()

  0%|          | 0/15 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [38]:
tokenizer.save_pretrained("./trained/model")
model.save_pretrained("./trained/model")

In [39]:
trainer.evaluate()

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.4621164798736572,
 'eval_rouge1': 6.037,
 'eval_rouge2': 4.988,
 'eval_rougeL': 5.5817,
 'eval_rougeLsum': 5.5709,
 'eval_gen_len': 19.0,
 'eval_runtime': 22.7002,
 'eval_samples_per_second': 0.352,
 'eval_steps_per_second': 0.044,
 'epoch': 10.0}

In [40]:
from transformers import pipeline

summarizer = pipeline("summarization", model="./trained/model", device=torch.device('cpu'))

# select a random test sample
sample = dataset['test'][4]
print(f"dialogue: \n{sample['input_tokens']}\n---------------")
print(len(sample['input_tokens'].split()))

# summarize dialogue
res = summarizer(sample["input_tokens"])

print(f"flan-t5-base summary:\n{res[0]['summary_text']}")
print(len(res[0]['summary_text'].split()))


Token indices sequence length is longer than the specified maximum sequence length for this model (5588 > 512). Running this sequence through the model will result in indexing errors


dialogue: 
appeal directed final judgment order dated 03 10 2007 passed high court judicature andhra pradesh hyderabad criminal appeal 436 2001 whereby high court dismissed appeal filed appellant herein confirmed judgment dated 19 03 2001 passed special judge c b case visakhapatnam c c 2 1998 brief fact appellant accused working head clerk traffic cadre section office senior divisional personnel officer south central railway vijayawada period april1992 november1997 nature duty appellant accused included dealing processing matter like promotion transfer seniority list roster list pay fixation promotion retirement resignation etc personnel one k rama rao complainant examined pw 1was posted yard point man grade station superintendent south central railway tanuku december1995 june1997 june1997due excess staff tanuku instructed report head quarter vijayawada accordingly when reported asked go back tanuku thereafter went back tanuku where subsequently transferred rajahmundry thereafter pw 1 

In [41]:
print(len(dataset["test"][4]['input_tokens'].split()))
print(len(dataset["test"][4]['summary_tokens'].split()))

3664
1271
