In [None]:
!pip install langchain langchain-openai langchain-chroma pandas

In [None]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('../data/articles_content_cleaned.csv')

# Combine the relevant columns into a single text field.
# A plain `+` propagates NaN: one missing headline/description/body would turn
# the whole combined value into NaN (a float) instead of text. Missing fields
# are therefore replaced by empty strings before joining; rows where all three
# fields are present get exactly the same value as before.
df['combined_text'] = (
    df['article_headline'].fillna('').astype(str)
    + ' '
    + df['article_short_description'].fillna('').astype(str)
    + ' '
    + df['article_text'].fillna('').astype(str)
)

# Include 'article_domain', 'article_id', and 'article_url' in metadata if any of them exist
def create_metadata(row):
    """Return the metadata dict for a single article row.

    Args:
        row: A pandas Series for one article, indexed by column name (this is
            what ``DataFrame.apply(..., axis=1)`` passes in).

    Returns:
        A dict containing each of 'article_domain', 'article_id' and
        'article_url' that is present in ``row.index``. Values are converted
        with ``str``; missing values (as judged by ``pd.notna``) become ''.
        Fields absent from the row are omitted.
    """
    metadata = {}
    # Check the row itself rather than a global DataFrame, so the function
    # does not depend on notebook state and can be called on any row.
    for field in ('article_domain', 'article_id', 'article_url'):
        if field in row.index:
            value = row[field]
            metadata[field] = str(value) if pd.notna(value) else ''
    return metadata

# Run create_metadata once per row and keep the resulting dicts in a 'metadata' column
df['metadata'] = df.apply(create_metadata, axis='columns')


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

# Pair each article's combined text with its metadata as a Document
documents = [
    Document(page_content=content, metadata=article_metadata)
    for content, article_metadata in zip(df['combined_text'], df['metadata'])
]

# Set up the splitter (chunk size 1000, overlap 200)
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)

# Break every document into chunks
chunks = text_splitter.split_documents(documents)


In [None]:
from langchain_openai import OpenAIEmbeddings

import os
from getpass import getpass

# Keep the secret out of the notebook: read the key from the OPENAI_API_KEY
# environment variable and only fall back to a hidden interactive prompt when
# the variable is unset or empty.
openai_api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

# Initialize the OpenAI embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002", api_key=openai_api_key)

# Note: Embedding generation will be handled during vector store creation

In [None]:
from langchain_chroma import Chroma

# Build the Chroma vector store from the chunks, using the embeddings object
# defined earlier, under the 'fact_checker_collection' collection name.
# NOTE(review): re-running this cell against an existing ./chroma_db presumably
# adds the same chunks again — confirm before re-executing.
vector_store = Chroma.from_documents(
    chunks,
    embedding=embeddings,
    persist_directory='./chroma_db',  # Directory to persist the database
    collection_name='fact_checker_collection',
)