## Embeddings and Vectorstore

Hugging Face embeddings are used because they do not require an API key.

FAISS is used for the vectorstore, which is stored locally.

First, create a vector store with embeddings from the CSV file containing all checked facts (FACTors).

In [1]:
import pandas as pd
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
import numpy as np

# --- load ---
# Read the fact-check table and turn every missing cell (NaN) into an empty
# string in one pass, so downstream text formatting never sees NaN values.
df = pd.read_csv("Data/FACTors.csv").replace({np.nan: ""})

def to_doc(row):
    """Build a Document from a single fact-check row.

    The page content is a block of ``Label: value`` lines (one per field,
    in a fixed order). The URL is deliberately left out of the page content
    and stored in the metadata only, together with a few fields that are
    handy for rendering results.

    Args:
        row: Mapping-like row (e.g. a pandas Series) supporting ``row[key]``
            and ``row.get(key, default)``.

    Returns:
        A ``Document`` with the formatted text as ``page_content`` and the
        stripped string values of ``url``, ``title``, ``date_published`` and
        ``organisation`` as ``metadata``.
    """
    # (label shown in the text, column it is read from), in output order.
    # IMPORTANT: "url" is intentionally absent from this list.
    labelled_columns = (
        ("Title", "title"),
        ("Claim", "claim"),
        ("Date published", "date_published"),
        ("Author", "author"),
        ("Organisation", "organisation"),
        ("Original Verdict", "original_verdict"),
        ("Normalized Rating", "normalised_rating"),
    )
    page = "\n".join(
        f"{label}: {row[column]}" for label, column in labelled_columns
    )

    # Metadata values fall back to "" when the column is absent and are
    # always stored as whitespace-stripped strings.
    metadata_columns = ("url", "title", "date_published", "organisation")
    metadata = {
        column: str(row.get(column, "")).strip() for column in metadata_columns
    }

    return Document(page_content=page, metadata=metadata)

# One Document per row of the fact-check table.
documents = [to_doc(record) for _index, record in df.iterrows()]

# Chunk the documents (the splitter carries each document's metadata over
# to the chunks it produces).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=700,
)
split_docs = text_splitter.split_documents(documents)

# Embed the chunks and persist the resulting FAISS index locally.
embeddings = HuggingFaceEmbeddings(
    encode_kwargs={"normalize_embeddings": False},
    model_kwargs={"device": "cuda"},  # use "cpu" when no GPU is available
    model_name="sentence-transformers/all-mpnet-base-v2",
)
vectorstore = FAISS.from_documents(split_docs, embeddings)
vectorstore.save_local("faiss_index")


Add more spreadsheets (CSV files) with information on the organisations and authors publishing the claims.

In [None]:
import pandas as pd
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# Re-create the embedding model with the same settings that were used to
# build the index, then reopen the index saved under "faiss_index".
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = dict(device="cuda")
encode_kwargs = dict(normalize_embeddings=False)
embeddings = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)
# NOTE: allow_dangerous_deserialization=True opts in to pickle-based loading;
# only open index folders that were created locally / come from a trusted source.
vectorstore = FAISS.load_local(
    "faiss_index",
    embeddings,
    allow_dangerous_deserialization=True,
)

# Read the author and organisation statistics tables.
author_df, org_df = map(
    pd.read_csv,
    ("Data/author_stats.csv", "Data/org_stats.csv"),
)

# Convert files into documents
def process_sheet1(df):
    """Convert every row of *df* into a text-only Document.

    Each row is rendered as ``Label: value`` lines for the title, claim,
    publication date, author, organisation, original verdict and normalised
    rating. Columns are read with ``row[column]``, so all of them must be
    present in *df*. No metadata is attached to the documents.

    Args:
        df: DataFrame whose rows are iterated with ``iterrows()``.

    Returns:
        A list with one ``Document`` per row, in row order.
    """
    # (label shown in the text, column it is read from), in output order.
    labelled_columns = (
        ("Title", "title"),
        ("Claim", "claim"),
        ("Date published", "date_published"),
        ("Author", "author"),
        ("Organisation", "organisation"),
        ("Original Verdict", "original_verdict"),
        ("Normalized Rating", "normalised_rating"),
    )
    return [
        Document(
            page_content="\n".join(
                f"{label}: {record[column]}"
                for label, column in labelled_columns
            )
        )
        for _index, record in df.iterrows()
    ]

# --- Convert Sheet 2 (different structure, e.g., metadata and comments) ---
def process_sheet2(df):
    """Convert every row of *df* into a text-only Document.

    Each row is rendered as ``Label: value`` lines for the statement, source,
    reviewer comment and publication date. Columns are read with
    ``row[column]``, so all of them must be present in *df*. No metadata is
    attached to the documents.

    Args:
        df: DataFrame whose rows are iterated with ``iterrows()``.

    Returns:
        A list with one ``Document`` per row, in row order.
    """
    # (label shown in the text, column it is read from), in output order.
    labelled_columns = (
        ("Statement", "statement"),
        ("Source", "source"),
        ("Reviewer Comment", "comment"),
        ("Published On", "published_date"),
    )
    return [
        Document(
            page_content="\n".join(
                f"{label}: {record[column]}"
                for label, column in labelled_columns
            )
        )
        for _index, record in df.iterrows()
    ]

# --- Combine all new documents ---
# Use the frames that were actually loaded (`author_df`, `org_df`); the names
# `sheet1_df` / `sheet2_df` were never defined and raised a NameError here.
# NOTE(review): process_sheet1 / process_sheet2 read fixed column names
# (e.g. 'title', 'claim', ... and 'statement', 'source', ...). Confirm that
# author_stats.csv and org_stats.csv provide those columns, and that this
# frame-to-function pairing is the intended one.
all_new_documents = process_sheet1(author_df) + process_sheet2(org_df)

# --- Split into chunks ---
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
split_docs = text_splitter.split_documents(all_new_documents)

# --- Add to vectorstore and save ---
# Saving to the same folder overwrites the previously stored index with the
# extended one.
vectorstore.add_documents(split_docs)
vectorstore.save_local("faiss_index")
