In [None]:
!pip install -q sentence-transformers tqdm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m76.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m58.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm
import json
import os

# Mount Google Drive so that paths under /content/drive/ (used by the
# configuration cell for the input, output and checkpoint files) are reachable.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import torch
# Report whether CUDA is visible to PyTorch. Informational only: the device
# object built here is printed, not stored or passed to any later code.
print("GPU available:", torch.cuda.is_available())
print("Using device:", torch.device("cuda" if torch.cuda.is_available() else "cpu"))

GPU available: True
Using device: cuda


In [None]:
# JSONL input, read line by line (one JSON chunk record per line).
INPUT_JSONL_PATH = "/content/drive/MyDrive/judgments_chunks_cleaned_unique.jsonl"
# Output files are named f"{OUTPUT_FILE_PREFIX}_{resume_index}.jsonl" by the driver cell.
OUTPUT_FILE_PREFIX = "/content/drive/MyDrive/embedding_output_part"
# Text file holding the index the next run should resume from.
LOG_FILE = "/content/drive/MyDrive/last_chunk_index.txt"

# Intended cap on the size of a single output file.
# NOTE(review): no code visible in this notebook reads this constant — confirm it is still needed.
MAX_OUTPUT_MB = 5000  # ~5GB
# How often (in chunks) the resume index is written to LOG_FILE.
CHUNK_SAVE_INTERVAL = 500

In [None]:
def read_jsonl_from_offset(filepath, skip_lines=0):
    """Lazily yield parsed JSON objects from a JSONL file, skipping a prefix.

    Args:
        filepath: Path to a text file with one JSON document per line.
        skip_lines: Number of leading lines to skip before yielding. Values
            larger than the file length simply produce an empty iterator;
            zero or negative values skip nothing.

    Yields:
        The object decoded from each remaining line, in file order.

    Raises:
        json.JSONDecodeError: If a non-skipped line is not valid JSON
            (this includes blank lines).
    """
    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(filepath, 'r', encoding="utf-8") as f:
        for line_number, line in enumerate(f):
            # Skipping by index instead of calling next(f) avoids a
            # StopIteration escaping the generator (a RuntimeError under
            # PEP 479) when skip_lines exceeds the number of lines.
            if line_number < skip_lines:
                continue
            yield json.loads(line)

In [None]:
def validate_metadata_field(field_value, field_name, allow_empty=False):
    """Validate one metadata value and normalise it to a scalar.

    Args:
        field_value: Raw value from the input record. Supported types are
            list, str, int, float, bool and None.
        field_name: Name of the field, used only in error messages.
        allow_empty: When True, an empty value (None, "" or []) is replaced
            by the placeholder string "None" instead of raising.

    Returns:
        - For a non-empty list: its items converted with str() and joined by ", ".
        - For a non-empty scalar: the value unchanged.
        - For an empty value with allow_empty=True: the string "None".

    Raises:
        ValueError: If the value is empty and allow_empty is False, or if its
            type is not supported.
    """
    # None must be handled before the type dispatch: it is not an instance of
    # any supported scalar type, so it would otherwise be rejected as an
    # unsupported type even when empty values are allowed.
    if field_value is None:
        if allow_empty:
            return "None"
        raise ValueError(f"Metadata field '{field_name}' is empty.")

    if isinstance(field_value, list):
        if not field_value:
            if allow_empty:
                return "None"
            raise ValueError(f"Metadata field '{field_name}' is an empty list.")
        return ", ".join(str(v) for v in field_value)

    if isinstance(field_value, (str, int, float, bool)):
        # Only the empty string counts as empty; 0 and False are real values.
        if field_value == "":
            if allow_empty:
                return "None"
            raise ValueError(f"Metadata field '{field_name}' is empty.")
        return field_value

    raise ValueError(f"Metadata field '{field_name}' has unsupported type: {type(field_value)}")

In [None]:
# Metadata fields copied from each input record, paired with whether an empty
# value is tolerated (passed as allow_empty to validate_metadata_field).
_METADATA_FIELDS = (
    ("doc_id", False),
    ("case_title", False),
    ("date_of_judgment", False),
    ("citation", True),
    ("bench", True),
    ("article_references", True),
    ("bench_strength", False),
    ("source_pdf", False),
    ("chunk_index", False),
    ("total_chunks", False),
)


def _build_output_record(model, doc):
    """Validate one input record, embed its text and return the output record."""
    # Validate first: it is cheap, so an invalid record never costs an encode call.
    metadata = {
        name: validate_metadata_field(doc[name], name, allow_empty=allow_empty)
        for name, allow_empty in _METADATA_FIELDS
    }
    return {
        "id": doc["chunk_id"],
        "embedding": model.encode(doc["chunk_text"]).tolist(),
        "metadata": metadata,
        "document": doc["chunk_text"],
    }


def _write_checkpoint(next_line_index):
    """Store in LOG_FILE the input line index a later run should resume from."""
    with open(LOG_FILE, "w") as log:
        log.write(str(next_line_index))


def write_embeddings_to_file(input_path, output_path, start_index=0, max_output_size_mb=None):
    """Embed the chunks of a JSONL file and write them to a JSONL output file.

    Each input line from ``start_index`` onwards is validated, embedded with
    the "all-MiniLM-L6-v2" SentenceTransformer model (loaded on every call)
    and written as one JSON line with keys "id", "embedding", "metadata" and
    "document". Records that fail validation or embedding are reported and
    skipped.

    The resume index written to LOG_FILE counts every input line consumed,
    including skipped ones. Resuming skips raw input lines, so counting only
    successful records would make a resumed run re-embed and duplicate chunks.

    Args:
        input_path: Path of the input JSONL file.
        output_path: Path of the output JSONL file; it is overwritten.
        start_index: Number of leading input lines to skip.
        max_output_size_mb: Optional cap on the bytes written to
            ``output_path``. When reached, processing stops after the current
            record and the checkpoint allows a later call to continue.
            ``None`` (default) means no limit.

    Raises:
        OSError: If the input, output or checkpoint file cannot be accessed.
            Output write errors are not treated as skippable records.
        json.JSONDecodeError: If an input line is not valid JSON.
    """
    model = SentenceTransformer("all-MiniLM-L6-v2")

    # One pass over the input just to size the progress bar.
    with open(input_path, 'r', encoding="utf-8") as f:
        total_chunks = sum(1 for _ in f)

    byte_limit = None if max_output_size_mb is None else max_output_size_mb * 1024 * 1024
    lines_consumed = 0
    bytes_written = 0

    # Clamp so a start_index beyond the end of the file does not give a negative total.
    pbar = tqdm(total=max(total_chunks - start_index, 0), desc="Embedding chunks", unit="chunk")

    try:
        with open(output_path, 'w', encoding="utf-8") as output_file:
            for doc in read_jsonl_from_offset(input_path, start_index):
                try:
                    json_str = json.dumps(_build_output_record(model, doc)) + "\n"
                except Exception as e:
                    # Per-record data problems are reported and skipped. The
                    # output write stays outside this try so that I/O failures
                    # (e.g. a full disk) stop the run instead of being skipped.
                    chunk_id = doc.get('chunk_id', 'UNKNOWN') if isinstance(doc, dict) else 'UNKNOWN'
                    print(f"Skipping chunk {chunk_id} due to error: {e}")
                else:
                    output_file.write(json_str)
                    bytes_written += len(json_str.encode("utf-8"))

                lines_consumed += 1
                pbar.update(1)

                if lines_consumed % CHUNK_SAVE_INTERVAL == 0:
                    # Flush buffered records before advancing the checkpoint so the
                    # checkpoint never points past data still held in memory.
                    output_file.flush()
                    _write_checkpoint(start_index + lines_consumed)

                if byte_limit is not None and bytes_written >= byte_limit:
                    break
    finally:
        # Runs on success, on the size-limit break and on errors. The output
        # file is already closed here, so the recorded progress is on disk.
        pbar.close()
        _write_checkpoint(start_index + lines_consumed)
        print(f"✔️ Saved up to chunk index: {start_index + lines_consumed}")

In [None]:
def _load_resume_index(log_path):
    """Return the index stored in the checkpoint file, or 0 if it does not exist.

    Raises:
        ValueError: If the file exists but does not hold a non-negative
            integer (for example, it was left empty by an interrupted write).
            Falling back to 0 silently would restart the whole job and
            overwrite the first output part, so the problem is surfaced.
    """
    if not os.path.exists(log_path):
        return 0

    with open(log_path, "r") as log:
        raw_value = log.read().strip()

    try:
        index = int(raw_value)
    except ValueError:
        raise ValueError(
            f"Checkpoint file {log_path!r} does not contain a valid index "
            f"(found {raw_value!r}); fix or delete it before resuming."
        ) from None

    if index < 0:
        raise ValueError(
            f"Checkpoint file {log_path!r} contains a negative index ({index}); "
            "fix or delete it before resuming."
        )
    return index


resume_index = _load_resume_index(LOG_FILE)

# Each run writes its own part file, named after the index it starts from.
output_file = f"{OUTPUT_FILE_PREFIX}_{resume_index}.jsonl"

write_embeddings_to_file(
    input_path=INPUT_JSONL_PATH,
    output_path=output_file,
    start_index=resume_index
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding chunks:   0%|          | 0/680253 [00:00<?, ?chunk/s]

✔️ Saved up to chunk index: 2150563
