# Generating LLM Fine-tuning Datasets from Documents
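This notebook generates question-answer (QA) datasets for LLM fine-tuning from [Nike's fiscal 2023 annual report (SEC Form 10-K)](https://s1.q4cdn.com/806093406/files/doc_downloads/2023/414759-1-_5_Nike-NPS-Combo_Form-10-K_WR.pdf). It loads the PDF, generates training and validation datasets with an LLM, saves and uploads them to Amazon S3, and validates them.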

In [None]:
# Enable IPython's autoreload extension in mode 2: modules are re-imported
# before each cell executes, so edits to local source files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Load the jupyter_black extension (formats code cells with Black).
%load_ext jupyter_black

In [None]:
import os
import sys
import boto3
from langchain_community.document_loaders import PyPDFLoader

# Add the parent directory (os.pardir, i.e. "..") to the import search path
# before importing `core`. The entry is relative, so it is resolved against
# the current working directory at import time.
# NOTE(review): presumably `core` lives in this notebook's parent directory
# and the notebook is run from its own folder; confirm.
sys.path.append(os.pardir)
from core import (
    ChatModelId,
    get_llm,
    QaDatasetGenerator,
    QaDatasetValidator,
    save_docs_to_jsonl,
)

Define the file paths and AWS configuration.

In [None]:
# Source PDF, and the JSONL file the loaded documents are saved to in Step 1
# (Step 2 builds the dataset generator from that JSONL file).
PDF_FILE_PATH = "../assets/nke-10k-2023.pdf"
DOCS_FILE_PATH = "../assets/nke-10k-2023.jsonl"

# Local paths for the generated datasets (written in Steps 3 and 4, read
# again in Step 5).
TRAIN_DATASET_PATH = "../assets/train_dataset.jsonl"
VALIDATION_DATASET_PATH = "../assets/validation_dataset.jsonl"

# AWS settings. REGION_NAME is used for both the boto3 session below and the
# LLM in Step 2. S3_BUCKET_NAME is a placeholder: replace it with a real
# bucket name before running the upload steps.
REGION_NAME = "us-west-2"
S3_BUCKET_NAME = "<YOUR-S3-BUCKET-NAME>"

# boto3 session handed to save_and_upload() in Steps 3 and 4.
boto_session = boto3.Session(region_name=REGION_NAME)

### Step 1: Load the PDF document and save to a JSONL file

In [None]:
# Read the source PDF with LangChain's PyPDFLoader.
loader = PyPDFLoader(PDF_FILE_PATH)
documents = loader.load()

# Persist the loaded documents as JSONL; Step 2 creates the QA generator from
# this file rather than from the in-memory `documents` list.
save_docs_to_jsonl(documents, DOCS_FILE_PATH)

### Step 2: Initialize the LLM and create a QA dataset generator

In [None]:
# Create the chat model selected by ChatModelId.CLAUDE_V3_5_SONNET, using the
# same AWS region as the boto3 session.
llm = get_llm(
    ChatModelId.CLAUDE_V3_5_SONNET,
    region_name=REGION_NAME,
)

# Build the generator from the documents saved to DOCS_FILE_PATH in Step 1.
qa_dataset_generator = QaDatasetGenerator.from_jsonl(llm, DOCS_FILE_PATH)

### Step 3: Generate, save and upload training dataset

In [None]:
# Generate the training dataset. Only dataset_type is passed, so every other
# generate() option keeps its default.
train_dataset = qa_dataset_generator.generate(dataset_type="train")

# Save the dataset to TRAIN_DATASET_PATH and upload it to S3_BUCKET_NAME via
# the boto3 session. Assigning the result to `_` keeps the notebook from
# echoing the return value as cell output.
_ = qa_dataset_generator.save_and_upload(
    train_dataset,
    TRAIN_DATASET_PATH,
    boto_session=boto_session,
    bucket_name=S3_BUCKET_NAME,
)

### Step 4: Generate, save and upload validation dataset

In [None]:
# Generate the validation dataset from the same generator (and therefore the
# same source documents) as the training dataset.
# NOTE(review): sampling_rate=0.2 presumably restricts generation to roughly
# 20% of the source documents; confirm against QaDatasetGenerator.generate.
validation_dataset = qa_dataset_generator.generate(
    sampling_rate=0.2,
    dataset_type="validation",
)

# Save the dataset to VALIDATION_DATASET_PATH and upload it to S3_BUCKET_NAME
# via the boto3 session. Assigning the result to `_` keeps the notebook from
# echoing the return value as cell output.
_ = qa_dataset_generator.save_and_upload(
    validation_dataset,
    VALIDATION_DATASET_PATH,
    boto_session=boto_session,
    bucket_name=S3_BUCKET_NAME,
)

### Step 5: Validate training and validation datasets for fine-tuning

In [None]:
# Check the locally saved training and validation files.
# NOTE(review): this runs after both datasets were already uploaded in
# Steps 3-4, so a validation failure means re-generating and re-uploading;
# consider validating before upload.
qa_dataset_validator = QaDatasetValidator()

# This call is the cell's last expression, so the notebook displays whatever
# validate_data() returns.
qa_dataset_validator.validate_data(TRAIN_DATASET_PATH, VALIDATION_DATASET_PATH)