In [None]:
#| default_exp scripts/downloader_files_split_calculate

In [None]:
#| export
import argparse
import sys
from typing import List, Optional

import pandas as pd
import tiktoken

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [None]:
#| export
# Per-split ceilings checked by `_calculate_splits`: a split is closed as soon
# as adding the next document would push it past either bound.
MAX_FILE_SIZE = 512 * 2**20  # bytes (512 MiB)
MAX_FILE_TOKENS = 2_000_000  # tokens, as counted by the tiktoken encoding
# Model name that `process` hands to `tiktoken.encoding_for_model`.
TOKENIZER_MODEL = "gpt-4-turbo-preview"

In [None]:
#| export
def _calculate_splits(df: pd.DataFrame, encoding: tiktoken.Encoding) -> List[int]:
    df["size_bytes"] = (df["content"] + "\n\n").apply(lambda text: len(text.encode("utf-8")))
    df["size_tokens"] = (df["content"] + "\n\n").apply(lambda text: len(encoding.encode(text)))
    cum_size_bytes = 0
    cum_size_tokens = 0
    file_idx = 0
    files = []
    for _, row in df.iterrows():
        cum_size_bytes += row["size_bytes"]
        cum_size_tokens += row["size_tokens"]
        if cum_size_bytes > MAX_FILE_SIZE or cum_size_tokens > MAX_FILE_TOKENS:
            file_idx += 1
            cum_size_bytes = row["size_bytes"]
            cum_size_tokens = row["size_tokens"]
        files.append(file_idx)
    return pd.Series(files, index=df.index)


def process(file_name_content: str, file_name_splits: str) -> None:
    """Compute a split index per downloaded document and save it as JSON Lines.

    Args:
        file_name_content: Path of the input JSON Lines file; each record must
            provide at least the `url` and `content` fields.
        file_name_splits: Path of the output JSON Lines file; each record
            holds a `url` and the `split` it was assigned to.
    """
    documents = pd.read_json(path_or_buf=file_name_content, lines=True, orient="records")
    tokenizer = tiktoken.encoding_for_model(TOKENIZER_MODEL)
    split_ids = _calculate_splits(documents, tokenizer)
    # Persist only the url -> split mapping, not the document bodies.
    url_to_split = documents.assign(split=split_ids)[["url", "split"]]
    url_to_split.to_json(file_name_splits, lines=True, orient="records")

In [None]:
# Notebook-only run (this cell is not exported): compute splits for the
# downloaded Markdown that sits in the repository's data directory.
process(
    file_name_content="../data/urls--downloaded-markdown.jsonl",
    file_name_splits="../data/urls--downloaded-markdown--splits.jsonl",
)

In [None]:
# | export
# Command-line entry point. The argv check keeps this from firing when the
# module body is executed inside a Jupyter kernel.
if __name__ == "__main__" and not any("ipykernel_launcher" in arg for arg in sys.argv):
    parser = argparse.ArgumentParser()
    # Both paths are mandatory string options; register them from one table.
    for flag, description in (
        ("--file_name_content", "JSONL file with downloaded Markdown"),
        ("--file_name_splits", "JSONL file with calculated splits"),
    ):
        parser.add_argument(flag, type=str, required=True, help=description)
    args = parser.parse_args()
    process(
        file_name_content=args.file_name_content,
        file_name_splits=args.file_name_splits,
    )

In [None]:
#| hide
# Regenerate the exported Python module(s) from this notebook's `export` cells.
import nbdev

nbdev.nbdev_export()