In [13]:
pip install -U langchain langchain-huggingface transformers chromadb sentence-transformers torch

Collecting langchain-huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting sentence-transformers
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Collecting torch
  Downloading torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl.metadata (28 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collec

In [3]:
import os
import pandas as pd

# Define the processed file path
processed_file = '/kaggle/working/articles_processed.pkl'

# Check if the dataset has already been processed and saved
if os.path.exists(processed_file):
    print("Loading previously processed dataset...")
    df = pd.read_pickle(processed_file)
else:
    print("Loading CSV and processing data...")
    # Load the CSV file (ensure the correct path)
    df = pd.read_csv('/kaggle/input/articles-content-cleaned-csv/articles_content_cleaned.csv')

    # Combine the relevant columns into a single text field
    df['combined_text'] = df['article_headline'] + ' ' + df['article_short_description'] + ' ' + df['article_text']

    # Function to create metadata
    def create_metadata(row):
        metadata = {}
        if 'article_domain' in df.columns:
            metadata['article_domain'] = str(row['article_domain']) if pd.notna(row['article_domain']) else ''
        if 'article_id' in df.columns:
            metadata['article_id'] = str(row['article_id']) if pd.notna(row['article_id']) else ''
        if 'article_url' in df.columns:
            metadata['article_url'] = str(row['article_url']) if pd.notna(row['article_url']) else ''
        return metadata

    df['metadata'] = df.apply(create_metadata, axis=1)

    # Save processed data to avoid reloading
    df.to_pickle(processed_file)
    print("Dataset processed and saved.")


Loading CSV and processing data...
Dataset processed and saved.


In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Create Document objects with metadata
documents = [
    Document(page_content=text, metadata=meta)
    for text, meta in zip(df['combined_text'], df['metadata'])
]

# Split documents into smaller chunks
chunks = text_splitter.split_documents(documents)


In [5]:
from langchain.embeddings import HuggingFaceEmbeddings

# Use a local embedding model from Hugging Face (Free)
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Alternative: OpenAI embeddings (if API key is available)
# from langchain_openai import OpenAIEmbeddings
# embeddings = OpenAIEmbeddings(model="text-embedding-ada-002", api_key="your_openai_api_key")


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
from langchain_chroma import Chroma

# Initialize Chroma vector store
vector_store = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    collection_name='fact_checker_collection',
    persist_directory='./chroma_db'  # Directory to persist the database
)


In [7]:
query = "F1 is not being held in Singapore"
results = vector_store.similarity_search(query, k=5)  # Retrieve top 5 matches

# Print retrieved documents
for i, doc in enumerate(results):
    print(f"Match {i+1}: {doc.page_content[:300]}")  # Show only first 300 chars


Match 1: The F1 Singapore Grand Prix takes place from Sep 20 to 22 at the Marina Bay Street Circuit.
Catch the Formula 1 Singapore Airlines Singapore Grand Prix 2024 practice, qualifying and main races on Channel 5 and mewatch.
Match 2: Ms Emily Prazer, F1 chief commercial officer, said: "Singapore has become one of the most revered grands prix on the calendar, and it is through the hard work and dedication of partners such as Singapore Airlines that we can continue to deliver such a strong event.
"We look forward to continuing to 
Match 3: Aerial activities to be temporarily restricted during F1 Singapore Grand Prix week These restrictions will be in force during certain time periods from Sep 14 to Sep 18. The Civil Aviation Authority of Singapore (CAAS) will be establishing temporary restricted areas (TRAs) over parts of the country 
Match 4: While there are more big events scheduled for the second half of the year, including the Formula 1 Singapore Grand Prix in September, it is bec

In [8]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
import torch

# Choose the model (small, base, large, xl, or xxl)
model_name = "google/flan-t5-large"  # Use "flan-t5-base" for lower RAM usage

# Load tokenizer and model
print("Loading model... (This may take a while initially)")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Create a text generation pipeline
llm_pipeline = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if device == "cuda" else -1  # Use GPU if available
)

# Ensure `query` and `results` exist
if "query" not in locals() or "results" not in locals() or not results:
    raise ValueError("Error: 'query' or 'results' is missing or empty.")

print(query)
print({results[0].page_content})


Loading model... (This may take a while initially)


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cuda:0


F1 is not being held in Singapore
{'The F1 Singapore Grand Prix takes place from Sep 20 to 22 at the Marina Bay Street Circuit.\nCatch the Formula 1 Singapore Airlines Singapore Grand Prix 2024 practice, qualifying and main races on Channel 5 and mewatch.'}


In [9]:

# Define input text
verification_prompt = f"""
You are a fact-checker. Analyze the following statement and determine whether it is true or false based on the provided document. Provide a clear and concise response structured as follows:

1. Verification: (True/False)
2. Explanation: (A detailed explanation based on the document)
3. Conclusion: (A summary of your analysis)

Example:
Statement: "AI is widely used in healthcare."
Document: "AI applications in healthcare include diagnostics, drug discovery, and patient monitoring."
Response:
1. Verification: True
2. Explanation: The document confirms that AI is used in healthcare for diagnostics, drug discovery, and patient monitoring, which supports the statement.
3. Conclusion: The statement is true based on the evidence provided.

Now, analyze the following statement and document:

Statement: "{query}"

Document:
{results[0].page_content}

Your response must strictly follow the structure above.
"""

# Generate a response
print("Generating response...")
output = llm_pipeline(
    verification_prompt,
    max_new_tokens=512,
    num_beams=5,
    no_repeat_ngram_size=2,
    early_stopping=True,
)

# Print full model output for debugging
print("Full Model Output:\n", output[0]['generated_text'])

# Post-process the output
response = output[0]['generated_text']

# Extract Verification
if "1. Verification:" in response:
    verification = response.split("1. Verification:")[1].split("2. Explanation:")[0].strip()
else:
    verification = "Not found"

# Extract Explanation
if "2. Explanation:" in response:
    explanation = response.split("2. Explanation:")[1].split("3. Conclusion:")[0].strip()
else:
    explanation = "Not found"

# Extract Conclusion
if "3. Conclusion:" in response:
    conclusion = response.split("3. Conclusion:")[1].strip()
else:
    conclusion = "Not found"

print("Verification:", verification)
print("Explanation:", explanation)
print("Conclusion:", conclusion)

Generating response...
Full Model Output:
 1. False 2. Explanation: F1 Singapore Grand Prix takes place in Singapore. 3. Conclusion: The statement is false.
Verification: Not found
Explanation: F1 Singapore Grand Prix takes place in Singapore.
Conclusion: The statement is false.
