## Chunk legal judgment documents

##### Install Prerequisites

In [None]:
%%capture

!pip install tiktoken==0.3.3

#### Imports 

In [2]:
import tiktoken
import requests
import logging
import os

##### Setup logging

In [3]:
# Configure the 'sagemaker' logger to emit DEBUG-and-above records.
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
# logging.getLogger returns the same logger object on every call, so
# re-running this cell would otherwise stack another StreamHandler and print
# every record multiple times. Attach a handler only when none is present.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler())

##### Log versions of dependencies 

In [4]:
# Record the installed `requests` version in the log output.
logger.info(f'Using requests=={requests.__version__}')

Using requests==2.28.2


#### Setup essentials 

##### Initialize encoder
`cl100k_base` is the encoding used by OpenAI's `gpt-4`, `gpt-3.5-turbo`, and `text-embedding-ada-002` models.

In [16]:
# Module-level tokenizer shared by the token-counting and chunking cells below.
encoding = tiktoken.get_encoding('cl100k_base')

In [17]:
# Directory scanned for the source documents to chunk.
DOC_DIR_PATH = './data/docs'

#### Encode documents with tiktoken, count tokens, and split into chunks

In [22]:
def num_tokens_from_doc(doc: str) -> int:
    """
    Counts the tokens the module-level ``encoding`` produces for a string.

    :param doc: text to tokenize
    :return: length of the token sequence returned by ``encoding.encode(doc)``
    """
    return len(encoding.encode(doc))

In [45]:
# Maximum number of tokens written to a single chunk file.
CHUNK_SIZE = 768

In [46]:
def doc_iterator(dir_path: str):
    """
    Yields (filename, contents) for every regular file directly inside a directory.

    Sub-directories are skipped (no recursion). Files are visited in sorted
    filename order so repeated runs are deterministic, and are decoded as
    UTF-8 rather than with the platform's default encoding.

    :param dir_path: path of the directory to scan
    :return: generator of (filename, file_contents) string pairs
    :raises FileNotFoundError: if dir_path does not exist (raised on first iteration)
    :raises UnicodeDecodeError: if a file is not valid UTF-8
    """
    for filename in sorted(os.listdir(dir_path)):
        file_path = os.path.join(dir_path, filename)
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            file_contents = file.read()
        # Yield after the file is closed so no handle stays open while the
        # consumer processes the document.
        yield filename, file_contents

In [47]:
# Directory that receives the chunk files, one file per chunk.
CHUNK_DIR_PATH = './data/chunks'
# Create the output directory up front; open(..., 'w') does not create
# missing parent directories.
os.makedirs(CHUNK_DIR_PATH, exist_ok=True)

# Tokenize each document and write it out as consecutive chunks of at most
# CHUNK_SIZE tokens, named '<doc_id>_<chunk_id>'. An empty document produces
# no chunk files.
for doc_name, doc in doc_iterator(DOC_DIR_PATH):
    # Strip only the final extension so dotted names such as 'case.v2.txt'
    # keep a distinct id ('case.v2') instead of colliding on 'case' and
    # overwriting each other's chunks.
    doc_id = os.path.splitext(doc_name)[0]
    tokens = encoding.encode(doc)
    # Walk the token list in CHUNK_SIZE strides; chunk ids are 1-based.
    for chunk_id, start in enumerate(range(0, len(tokens), CHUNK_SIZE), start=1):
        # NOTE(review): a chunk boundary can presumably fall inside a
        # multi-byte character, in which case decode() would emit replacement
        # characters at the chunk edges — confirm against the tiktoken docs.
        chunk = encoding.decode(tokens[start:start + CHUNK_SIZE])
        chunk_path = os.path.join(CHUNK_DIR_PATH, f'{doc_id}_{chunk_id}')
        # Write as UTF-8 explicitly so output does not depend on the locale.
        with open(chunk_path, 'w', encoding='utf-8') as f:
            f.write(chunk)