In [None]:
import os
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# --- Configuration ---
# Gemini API key, taken from the environment; this is None when
# GOOGLE_API_KEY is not set, so export the variable before running.
API_KEY = os.environ.get("GOOGLE_API_KEY")

# Directory in which the FAISS vector store is persisted between runs.
DB_FAISS_PATH = "faiss_index"

# Embedding model handed to FAISS when the index is built or loaded.
embeddings = GoogleGenerativeAIEmbeddings(
    google_api_key=API_KEY,
    model="models/gemini-embedding-001",
)

# The translation model and the fact-checking model are configured
# identically (temperature 0), so both are built from one settings mapping.
_chat_model_settings = {
    "model": "gemini-2.0-flash-001",
    "temperature": 0,
    "google_api_key": API_KEY,
}
llm_translator = ChatGoogleGenerativeAI(**_chat_model_settings)
llm_rag = ChatGoogleGenerativeAI(**_chat_model_settings)

# --- Vector Store Creation or Loading ---
# Building the index means translating every PDF page through the Gemini API,
# so the finished index is saved under DB_FAISS_PATH and reused on later runs.
vector_store = None

# Check if the vector store already exists on disk
if os.path.exists(DB_FAISS_PATH):
    print(f"Loading existing vector store from: {DB_FAISS_PATH}")
    # Load the vector store from disk
    # The `allow_dangerous_deserialization` flag is required for loading FAISS indexes.
    # This is only safe because the directory is written by this script itself;
    # never point DB_FAISS_PATH at an index obtained from an untrusted source.
    vector_store = FAISS.load_local(
        DB_FAISS_PATH,
        embeddings,
        allow_dangerous_deserialization=True,
    )

else:
    print("Creating new vector store. This will happen only once.")

    print("Loading documents...")
    pdf_paths = ["/home/yslcoat/data/text_documents/arbeiderpartiets-partiprogram.pdf", "/home/yslcoat/data/text_documents/Hoyres-stortingsvalgprogram-bokmal-ensidig.pdf"]

    all_documents = []
    for path in pdf_paths:
        print(f"\nProcessing document: {path}")
        loader = PyPDFLoader(path)
        documents = loader.load()

        print(f"Translating {len(documents)} pages via Gemini API. This may take a few minutes...")
        translated_documents = []
        for i, doc in enumerate(documents):
            original_text = doc.page_content

            # A page with no extractable text gives the model nothing to
            # translate; whatever it answered to the bare instruction would be
            # indexed as if it were party-programme content. Skip such pages
            # and keep them out of the index.
            if not original_text or not original_text.strip():
                print(f"  - Skipped blank page {i + 1}/{len(documents)}")
                continue

            prompt_text = f"Translate the following Norwegian text to English. Do not add any commentary, preamble, or notes. Output only the translated English text.\n\nNORWEGIAN TEXT:\n{original_text}"

            translated_text = llm_translator.invoke(prompt_text).content

            # Overwrite the page text in place so the page metadata set by the
            # loader stays attached to the translated content.
            doc.page_content = translated_text
            translated_documents.append(doc)

            print(f"  - Translated page {i + 1}/{len(documents)}")

        all_documents.extend(translated_documents)

    print("\nSplitting translated documents into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    all_chunks = text_splitter.split_documents(all_documents)

    print(f"Total document chunks created: {len(all_chunks)}")

    # With no chunks there is nothing to index; stop with a clear message
    # instead of handing an empty list to the vector store.
    if not all_chunks:
        raise ValueError(
            "No text could be extracted from the configured PDFs; "
            "cannot build the vector store."
        )

    print("Creating embeddings with Gemini and storing in FAISS vector store...")
    vector_store = FAISS.from_documents(all_chunks, embeddings)

    # Save the newly created vector store to disk for future runs
    print(f"Saving vector store to disk at: {DB_FAISS_PATH}")
    vector_store.save_local(DB_FAISS_PATH)

# --- RAG Chain Definition ---
print("\nDefining the RAG chain...")
# Fetch the 7 most similar chunks for each claim.
retriever = vector_store.as_retriever(search_kwargs={"k": 7})

prompt_template = """
You are a meticulous fact-checker. Your task is to verify the following claim based *only* on the provided context from translated political party policies.

Analyze the context and determine if the claim is TRUE, FALSE, or UNVERIFIABLE.

Provide a clear, one-word answer (TRUE, FALSE, or UNVERIFIABLE) followed by a brief, neutral explanation citing the relevant text from the context. Do not use any outside knowledge.

CONTEXT:
{context}

CLAIM:
{input}

ANSWER:
"""

prompt = ChatPromptTemplate.from_template(prompt_template)


def _format_docs(docs):
    """Join the text of the retrieved chunks into one plain-text context string."""
    return "\n\n".join(doc.page_content for doc in docs)


# The retriever yields Document objects. Without the formatting step the
# prompt would receive the string form of that list (object reprs with
# metadata and escaped newlines) rather than readable policy text.
# The chain takes the claim as a plain string and returns the model's message.
rag_chain = (
    {"context": retriever | _format_docs, "input": RunnablePassthrough()}
    | prompt
    | llm_rag
)

# --- Application Ready ---
# Everything above is set up; announce it and run one sample fact-check.
print("\n--- Gemini-Powered Fact-Checking Application Ready ---")

# Sample claim to verify against the indexed party programmes.
claim1 = "The Conservative Party's program promises to lower income tax for everyone by 10%."
print(f"\nChecking Claim: '{claim1}'")

# The chain returns a chat message; its text is the verdict plus explanation.
response = rag_chain.invoke(claim1)
print(response.content)

Creating new vector store. This will happen only once.
Loading documents...

Processing document: /home/yslcoat/data/text_documents/arbeiderpartiets-partiprogram.pdf
Translating 116 pages via Gemini API. This may take a few minutes...
  - Translated page 1/116
  - Translated page 2/116
  - Translated page 3/116
  - Translated page 4/116
  - Translated page 5/116
  - Translated page 6/116
  - Translated page 7/116
  - Translated page 8/116
  - Translated page 9/116
  - Translated page 10/116
  - Translated page 11/116
  - Translated page 12/116
  - Translated page 13/116
  - Translated page 14/116
  - Translated page 15/116
  - Translated page 16/116
  - Translated page 17/116
  - Translated page 18/116
  - Translated page 19/116
  - Translated page 20/116
  - Translated page 21/116
  - Translated page 22/116
  - Translated page 23/116
  - Translated page 24/116
  - Translated page 25/116
  - Translated page 26/116
  - Translated page 27/116
  - Translated page 28/116
  - Translated pag