# Initialization of LLM model

This is an initial test of using an LLM (in this case, Google's Gemini model) as the baseline model for the RAG pipeline.

## Loading / Splitting Data

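This will be done with `DirectoryLoader` from `langchain_community.document_loaders`, combined with a small custom `JSONLoader` that turns each paragraph of a JSON file into its own document.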

In [3]:
import json
import os
from typing import List
from langchain.docstore.document import Document
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders.base import BaseLoader
from pathlib import Path

class JSONLoader(BaseLoader):
    """
    Turn one JSON file into LangChain Document objects, one per paragraph.

    The file's top-level value is read with ``.get``. Each entry of its
    "paragraphs" value becomes the page content of one Document, while
    "url" and "title" are copied into every Document's metadata under
    the keys "source" and "title". Missing keys fall back to an empty
    paragraph list, "No Url" and "No Title" respectively.
    """
    def __init__(self, file_path: str):
        # Only the path is stored here; the file is read inside load().
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Read the file and return one Document per entry in "paragraphs"."""
        raw_text = Path(self.file_path).read_text(encoding='utf-8')
        record = json.loads(raw_text)

        source_url = record.get("url", "No Url")
        page_title = record.get("title", "No Title")

        # The metadata dict literal sits inside the comprehension so that
        # every Document receives its own dict rather than a shared one.
        return [
            Document(
                page_content=text,
                metadata={"source": source_url, "title": page_title},
            )
            for text in record.get("paragraphs", [])
        ]

# Search ../data (including sub-directories, via the "**" glob) for *.json
# files and hand each match to JSONLoader.
loader = DirectoryLoader('../data', glob="**/*.json", loader_cls=JSONLoader)
documents = loader.load()

# Sanity check: report the total and, if anything was loaded, preview the
# first document (content truncated to its first 100 characters).
print(f"Number of documents loaded: {len(documents)}")
if documents:
    first_doc = documents[0]
    preview = first_doc.page_content[:100]
    print("\nExample Document:")
    print(f"Page Content: {preview}...")
    print(f"Metadata: {first_doc.metadata}")

Number of documents loaded: 57353

Example Document:
Page Content: [1][2]...
Metadata: {'source': 'https://splatoonwiki.org/wiki/Cometz_Octobrush', 'title': 'Cometz Octobrush'}
