In [None]:
!pip install langchain langchain-openai langchain-chroma pandas

In [None]:
import pandas as pd

# Load the cleaned articles (path is relative to the notebook's working directory).
df = pd.read_csv('../data/articles_content_cleaned.csv')

# Combine the relevant columns into a single text field.
# Missing values are replaced with '' first: string concatenation with NaN
# would otherwise turn the whole combined value into NaN for that row.
text_parts = (
    df[['article_headline', 'article_short_description', 'article_text']]
    .fillna('')
    .astype(str)
)
df['combined_text'] = text_parts['article_headline'].str.cat(
    [text_parts['article_short_description'], text_parts['article_text']],
    sep=' ',
)

# Optional: if both 'summary' and 'domain' columns exist, carry them as metadata.
if 'summary' in df.columns and 'domain' in df.columns:
    # One dict per row, keys in the order 'summary', 'domain'.
    # to_dict('records') also copes with an empty frame, where a row-wise
    # apply would not produce a column of dicts.
    df['metadata'] = df[['summary', 'domain']].to_dict('records')
else:
    # Build a distinct dict for every row; `[{}] * n` would reuse one shared
    # dict object, so mutating one row's metadata would change all of them.
    df['metadata'] = [{} for _ in range(len(df))]


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document


def _to_document(text, meta):
    """Wrap one article's combined text and its metadata dict in a Document."""
    return Document(page_content=text, metadata=meta)


# One Document per row of df, pairing 'combined_text' with 'metadata'.
documents = list(map(_to_document, df['combined_text'], df['metadata']))

# Splitter configuration: chunk size 1000 with an overlap of 200 between
# neighbouring chunks (units are whatever the splitter's length function
# counts; presumably characters by default, confirm against LangChain docs).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

# Break every Document into chunks.
chunks = text_splitter.split_documents(documents)


In [None]:
import os
from getpass import getpass

from langchain_openai import OpenAIEmbeddings

# Never paste the API key into the notebook: it would be saved with the file
# and end up in version control. Read it from the OPENAI_API_KEY environment
# variable, and prompt for it (without echoing) only when it is not set.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Initialize the OpenAI embeddings
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    api_key=os.environ["OPENAI_API_KEY"],
)

# Note: Embedding generation will be handled during vector store creation

In [None]:
from langchain_chroma import Chroma

# Settings shared by the "open existing" and "build from scratch" paths.
chroma_settings = {
    'collection_name': 'fact_checker_collection',
    'persist_directory': './chroma_db',  # Directory to persist the database
}

# Open the persisted collection first (it is created empty if it does not exist).
vector_store = Chroma(embedding_function=embeddings, **chroma_settings)

# Only embed and insert the chunks when the collection holds no records yet.
# Without this guard every re-run of the cell re-embeds all chunks and appends
# them again to the persisted collection, duplicating entries.
# To force a rebuild (e.g. after the source data changed, or after a run that
# was interrupted part-way), delete the persist directory and re-run this cell.
if not vector_store.get(limit=1)['ids']:
    vector_store = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        **chroma_settings,
    )