In [2]:
import os
import json
import nltk.data
import pandas as pd
from sentence_transformers import SentenceTransformer, util

In [3]:
# Initializing NLTK sentence tokenizer.
# Loads the English Punkt model from NLTK's data directory. The resulting
# object is kept as a notebook-level global and is used later through
# `tokenizer.tokenize(text)` to split judgment text into sentences.
# NOTE(review): presumably requires the 'punkt' resource to be installed
# beforehand (e.g. via nltk.download) — confirm for fresh environments.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [10]:
# Loading a suitable SentenceTransformers model.
# `model` is the shared encoder that the corpus loop uses to embed each
# passage chunk via `model.encode(...)`.
# NOTE(review): the progress bars in this cell's output show the
# 'all-mpnet-base-v2' weights being downloaded; presumably this only happens
# when they are not already cached locally — confirm.
model = SentenceTransformer('all-mpnet-base-v2')






Downloading pytorch_model.bin:   0%|                                                                   | 0.00/438M [00:00<?, ?B/s][A[A[A[A[A




Downloading pytorch_model.bin:   2%|█▍                                                         | 10.5M/438M [00:27<18:24, 387kB/s][A[A[A[A[A




Downloading pytorch_model.bin:   2%|█▍                                                         | 10.5M/438M [00:42<18:24, 387kB/s][A[A[A[A[A




Downloading pytorch_model.bin:   5%|██▊                                                        | 21.0M/438M [00:51<16:48, 413kB/s][A[A[A[A[A




Downloading pytorch_model.bin:   5%|██▊                                                        | 21.0M/438M [01:02<16:48, 413kB/s][A[A[A[A[A




Downloading pytorch_model.bin:   7%|████▏                                                      | 31.5M/438M [01:09<14:12, 477kB/s][A[A[A[A[A




Downloading pytorch_model.bin:   7%|████▏                                                      

In [11]:
# Function to split text into chunks of sentences (5 per chunk by default)
def split_text_into_chunks(text, chunk_size=5):
    """Split ``text`` into consecutive groups of sentences.

    The text is segmented with the notebook-level ``tokenizer`` and the
    resulting sentences are grouped in their original order, ``chunk_size``
    at a time. The final group holds whatever is left over and may therefore
    contain fewer sentences.

    Args:
        text: The string to segment into sentences.
        chunk_size: Maximum number of sentences per chunk. Defaults to 5,
            which matches the previous hard-coded behaviour.

    Returns:
        A list of chunks, each chunk being a slice of the tokenizer's
        sentence list. The list is empty when no sentences are found.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer. Without this
            check a negative value would silently produce an empty result.
    """
    if not isinstance(chunk_size, int) or chunk_size < 1:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )

    sentences = tokenizer.tokenize(text)
    return [
        sentences[start:start + chunk_size]
        for start in range(0, len(sentences), chunk_size)
    ]

In [12]:
# Directory containing the corpus.
# Relative path, so it is resolved against the notebook's working directory.
# The loading loop reads the files ending in '_Technical.txt' from here and
# opens a '_Metadata.json' file with the same name stem for each of them.
corpus_dir = 'Corpus'

In [16]:
# Accumulator for the per-chunk records; the corpus loop appends one dict
# (passage text, metadata, embedding) to it for every passage chunk.
data = list()

In [17]:
# Looping through each judgment file in the corpus.
# `data` is rebuilt from scratch here so that re-running this cell does not
# append a second copy of every passage to the existing list.
data = []
technical_suffix = '_Technical.txt'
metadata_suffix = '_Metadata.json'

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the resulting dataset reproducible across runs and machines.
for filename in sorted(os.listdir(corpus_dir)):
    if not filename.endswith(technical_suffix):
        continue

    # Extracting passages from the judgment file. Only the read happens inside
    # the `with` block so the file handle is released as early as possible.
    with open(os.path.join(corpus_dir, filename), 'r', encoding='utf-8') as file:
        judgment_text = file.read()

    # Paragraphs are delimited by the '__paragraph__' marker; blank ones are dropped.
    passages = [passage.strip() for passage in judgment_text.split('__paragraph__') if passage.strip()]

    # Combining passages into one and split into chunks
    combined_passage = ' '.join(passages)
    passage_chunks = split_text_into_chunks(combined_passage)

    # Extracting metadata from the corresponding metadata file. Only the
    # trailing suffix is swapped; str.replace() would also rewrite any earlier
    # occurrence of '_Technical.txt' inside the file name.
    metadata_filename = filename[:-len(technical_suffix)] + metadata_suffix
    metadata_path = os.path.join(corpus_dir, metadata_filename)
    with open(metadata_path, 'r', encoding='utf-8') as metadata_file:
        metadata = json.load(metadata_file)

    # Generate embeddings for each passage chunk and append to the data list.
    # Every chunk of a file shares the same `metadata` dict object.
    for passage_chunk in passage_chunks:
        chunk_text = ' '.join(passage_chunk)  # joined once, used for both encoding and storage
        embedding = model.encode(chunk_text, convert_to_tensor=True)
        data.append({'Passage': chunk_text, 'Metadata': metadata, 'Embedding': embedding})

In [18]:
# Creating a DataFrame from the list of data.
# Each dict appended to `data` becomes one row, so the columns are the keys
# used when the records were built: 'Passage', 'Metadata' and 'Embedding'.
df = pd.DataFrame(data)

In [19]:
# Save the data to a CSV file.
# CSV cells are plain text, so nested values are serialised explicitly as JSON
# before writing. Left to to_csv(), each embedding tensor would be stored as
# its printed representation ("tensor([...])"), which rounds the values for
# display and wraps across lines, and each metadata dict would be stored as a
# Python repr rather than JSON. Neither can be read back faithfully.
# Reading back: json.loads() on the 'Metadata' and 'Embedding' columns.
df_export = df.copy()
if 'Metadata' in df_export.columns:
    df_export['Metadata'] = df_export['Metadata'].map(
        lambda meta: json.dumps(meta, ensure_ascii=False)
    )
if 'Embedding' in df_export.columns:
    # .tolist() yields plain Python floats, so full precision is kept.
    df_export['Embedding'] = df_export['Embedding'].map(
        lambda emb: json.dumps(emb.tolist())
    )
df_export.to_csv('passage_metadata_emb.csv', index=False)

Downloading pytorch_model.bin:   0%|                                                                 | 0.00/438M [1:02:31<?, ?B/s]
Downloading pytorch_model.bin:  46%|█████████████████████████▍                             | 41.9M/90.9M [54:36<1:03:44, 12.8kB/s]
Downloading pytorch_model.bin:  35%|███████████████████                                    | 31.5M/90.9M [43:26<1:22:04, 12.1kB/s]
Downloading pytorch_model.bin:   2%|█▎                                                     | 10.5M/438M [39:51<27:04:56, 4.39kB/s]
Downloading pytorch_model.bin:  36%|████████████████████▍                                    | 157M/438M [37:07<1:06:14, 70.6kB/s]
