### Script to generate fine tuning data using CLS similarity method

Change the datasets variable according to the requirements

In [1]:
import glob
from nltk import tokenize
import nltk
import transformers
from torch.utils.data import DataLoader, TensorDataset, random_split, RandomSampler, Dataset
import pandas as pd
import numpy as np
import torch.nn.functional as F
import torch

def get_root_path():
    '''
    function to get root path of dataset

    change the path variable to the path of the dataset
    '''
    path = "/kaggle/input/legal-nlp/dataset"
    return path

def get_summary_data(dataset, train):
    '''
    function to get names, documents, and summaries

    change the path variable to the path of the dataset
    '''
    if dataset == "N2":
        path = get_root_path() + '/N2/Full-Text/India'
        all_files = glob.glob(path + "/*.txt")

        data_source = []
        names = []
        for filename in all_files:
            with open(filename, 'r') as f: 
                p = filename.rfind("/")
#                 print(filename[p+1:])
                names.append(filename[p+1:])
                a = f.read()
                data_source.append(a)
        return names, data_source, []
    
    path = get_root_path() + '/' + dataset+'-Abs' + '/' + train + '-data/judgement'
    all_files = glob.glob(path + "/*.txt")
    data_source = []
    names = []
    for filename in all_files:
        with open(filename, 'r') as f:
            p = filename.rfind("/")
            names.append(filename[p+1:])
            a = f.read()
            data_source.append(a)
    path = get_root_path()  + '/' + dataset+'-Abs' + '/' + train + '-data/summary'
    all_files = glob.glob(path + "/*.txt")
    data_summary = []
    for filename in all_files:
        with open(filename, 'r') as f: 
            a = f.read()
            l = len(a)
            data_summary.append(a)
            
    return names, data_source, data_summary

def get_summary_data_rhet_train(dataset):
    '''
    function to get names, documents, and summaries for Rhetorical labeled documents - train

    change the path variable to the path of the Rhetorical labeled dataset
    '''
    path = get_root_path() + '/rhet/' + dataset.lower() + '_ft_rhet' # use your path
    all_files = glob.glob(path + "/*.txt")

    data_source = []
    names = []
    for filename in all_files:
        with open(filename, 'r') as f: 
            p = filename.rfind("/")
            names.append(filename[p+1:])
            a = f.read()
            data_source.append(a)

    path = get_root_path() + '/rhet/RhetSumm_Dataset/raw_files/'+ dataset +'/summary' # use your path
    all_files = glob.glob(path + "/*.txt")

    data_summary = {}
    for filename in all_files:
        with open(filename, 'r') as f: 
            p = filename.rfind("/")
            a = f.read()
            l = len(a)
            data_summary[filename[p+1:]] = (a)
    return names, data_source, data_summary

def get_summary_data_rhet_test(dataset):
    '''
    function to get names, documents, and summaries for Rhetorical labeled documents - test

    change the path variable to the path of the Rhetorical labeled dataset
    '''
    path = get_root_path() + '/rhet/RhetSumm_Dataset/rhet/' + dataset + "/" # use your path
    all_files = glob.glob(path + "/*.txt")

    data_source = []
    names = []
    for filename in all_files:
        with open(filename, 'r') as f: 
            p = filename.rfind("/")
#             print(filename[p+1:])
            names.append(filename[p+1:])
            a = f.read()
            data_source.append(a)

    return names, data_source


def get_req_len_dict(dataset, istrain):
    '''
    function to required length data for each summary

    change the path variable to the path to Summary_Length_India.txt/ Summary_Length_Uk.txt file
    '''
    
    if dataset == "N2":
        f = open(get_root_path() + "/N2/Summary_Length_India.txt", "r")
        a = (f.read())
        a = a.split("\n")
        dict_names = {}
        for i in a:
            b = i.split("	")
            dict_names[b[0] + ".txt"] = int(b[1])
        return dict_names 
    
    f = open(get_root_path() + "/Summary-Data-"+ dataset +"/length-file-" + istrain +".txt", "r")
    a = (f.read())
    a = a.split("\n")
    dict_names = {}
    for i in a:
        b = i.split("	")
        try:
            tp = int(b[2])
            dict_names[b[0]] = tp
        except:
            print(b)
    return dict_names  

def split_to_sentences(para):
    sents = nltk.sent_tokenize(para)
    return sents

def nest_sentencesV2(document,chunk_length):
    '''
    function to chunk a document
    input:  document           - Input document
            chunk_length        - chunk length
    output: list of chunks. Each chunk is a list of sentences.
    '''
    nested = []
    sent = []
    length = 0
    for sentence in nltk.sent_tokenize(document):
        length += len(sentence.split(" "))
        if length < chunk_length:
            sent.append(sentence)
        else:
            nested.append(sent)
            sent = []
            sent.append(sentence)
            length = 0
    if len(sent)>0:
        nested.append(sent)
    return nested

def nest_sentencesMV2(document_sents,chunk_length):
    '''
    function to chunk a document
    input:  doc_sents           - Input document sentences
            chunk_length        - chunk length
    output: list of chunks. Each chunk is a list of sentences.
    '''
    nested = []
    sent = []
    length = 0
    #modeified v2
    
    for sentence in document_sents:
        length += len((sentence.split(" ")))
        if length < chunk_length:
            sent.append(sentence)
        else:
            nested.append(sent)
            sent = []
            sent.append(sentence)
            length = 0
    if len(sent)>0:
        nested.append(sent)
    return nested

def nest_sentences(document,chunk_length):
    '''
    function to chunk a document
    input:  document           - Input document
            chunk_length        - chunk length
    output: list of chunks. Each chunk is a string.
    '''
    nested = []
    sent = []
    length = 0
    for sentence in nltk.sent_tokenize(document):
        length += len(sentence.split(" "))
        if length < chunk_length:
            sent.append(sentence)
        else:
            nested.append(" ".join(sent))
            sent = []
            sent.append(sentence)
            length = 0
    if len(sent)>0:
        nested.append(" ".join(sent))
    return nested
  

def nest_sentencesV3(doc_sents,chunk_length, dict_sents_labels):
    '''
    function to first segment the document using rhetorical roles and then chunk if required
    input:  doc_sents           - Input document sentences
            chunk_length        - chunk length
            dict_sents_labels   - dictionary for every sentence and label
    output: list of chunks
    '''
    s = list(set(dict_sents_labels.values()))
#     print(s)
    all_chunks = []
    
    for label in s:
        doc_sents_withlabels = []
        for sent in doc_sents:
            if sent == '':continue
            if dict_sents_labels[sent] == label:
                doc_sents_withlabels.append(sent)
        chunks = nest_sentencesMV2(doc_sents_withlabels, chunk_length)
        
        edited_chunks = []
        for chunk in chunks:
            edited_chunks.append(["<" + label + ">"] + chunk)
        
        all_chunks = all_chunks + edited_chunks

    return all_chunks   

def get_doc_sens_and_labels(doc):
    '''
    Function to read a Rhetorically labeled document.
    returns a list of sentences, the label of each sentence, dictionary: keys-sentences, values-labels
    '''
    sents = []
    labels = []
    dict_sents_labels = {}
    ss = doc.split("\n")
    for i in ss:
        try:
            spt = i.split("\t")
            sents.append(spt[0])
            labels.append(spt[1])
            dict_sents_labels[spt[0]] = spt[1] 
        except:
            pass
    return sents, labels, dict_sents_labels



In [2]:
dataset = "IN" # Options: IN, UK 

In [3]:
import pandas as pd
import numpy as np
import glob
import json
import os
import sys
from tqdm import tqdm
sys.path.insert(0, '../')
# from utilities import *
from sklearn.metrics.pairwise import cosine_similarity
import torch

In [4]:
#Reading the documents and summaries 
names, data_source, data_summary = get_summary_data(dataset, "train")
print(len(names))
print(len(data_source))
print(len(data_summary))

7030
7030
7030


In [5]:
# Loading Model and tokenizer
from transformers import AutoTokenizer
from transformers import  BertModel
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
bert_model = BertModel.from_pretrained('bert-base-uncased',output_hidden_states = False).to("cuda")
bert_model.eval()

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [6]:
def get_sen_encoding(sents):
    '''
    Function to generate encoding for each word in the input list 
    input: sents - List of sentences
    returns the list of the sentence encoding 
    '''
    a = 0.001
    answer = None
    for sent in sents:
        ip =tokenizer(sent, return_tensors='pt', max_length=150, truncation=True, padding='max_length')
        tokens = tokenizer.convert_ids_to_tokens(ip['input_ids'][0])
        ip = ip.to("cuda")
        bert_output = bert_model(**ip)
        embedding = bert_output['pooler_output'].clone().detach()
        embedding = embedding.to("cpu")
        if answer == None:
            answer = embedding
            answer.resize_(1, 768)
        else:
            embedding.resize_(1, 768)
            answer = torch.cat((answer, embedding),0)
    return answer

In [7]:
def similarity_l_l(l1, l2):
    '''
    Function to find the most similar sentence in the document for each sentence in the summary 
    input:  l1 - Summary sentences
            l2 - Document sentences
    returns a list of document sentence indexes for each sentence in the summary 
    '''
    l = l1+l2
    sents_encodings = get_sen_encoding(l)
    similarities=cosine_similarity(sents_encodings)
    
    result = []
    for i in range(len(l1)):
        vals = similarities[i]
        vals = vals[len(l1):]
        idx = np.argmax(vals)
        result.append(idx)
    return result

In [8]:
def get_chunks_data_from_docV2(doc, summ):
    '''
    Function to generate chunks along with their summaries 
    input:  doc - legal Document
            summ - Gold standard summary
    returns a list of chunks and their summaries 
    '''
    chunk_summ_word_threshold = 150
    sentence_mapping = {}
    doc_sents = split_to_sentences(doc)
    summ_sents = split_to_sentences(summ)
    
    result = (similarity_l_l(summ_sents,doc_sents))
    
    for i in range(len(summ_sents)):
        sentence_mapping[doc_sents[result[i]]] = summ_sents[i]
    
    final_chunks = []
    final_summ = []
    for chunk in nest_sentencesV2(doc, 512):
        summ = ""
        for chunk_sent in chunk:
            if chunk_sent in sentence_mapping:
                summ = summ + sentence_mapping[chunk_sent]
        if len(tokenizer.tokenize(summ)) >= chunk_summ_word_threshold:
            final_chunks.append(" ".join(chunk))
            final_summ.append(summ)
    return final_chunks, final_summ


In [9]:
#loop to pass every document, generate the fine tuning data and saving in a excel file 
import pandas as pd
training_chunks = []
training_summs = []
for i in tqdm(range(len(data_source))):
    cks, summs = get_chunks_data_from_docV2(data_source[i],data_summary[i])
    training_chunks = training_chunks + cks
    training_summs = training_summs + summs
#     print(i, len(training_summs), end = ", ", sep = " : ")
    if i%100 == 0: 
        full = list(zip(training_chunks,training_summs))
        df = pd.DataFrame(full,columns=['data', 'summary'])
        df.to_excel("FD_"+dataset+"_CLS_BK.xlsx")
#         break
full = list(zip(training_chunks,training_summs))
df = pd.DataFrame(full,columns=['data', 'summary'])
df.to_excel("FD_"+dataset+"_CLS.xlsx")

  1%|▏         | 104/7030 [04:09<3:35:28,  1.87s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (580 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 7030/7030 [4:33:41<00:00,  2.34s/it]
