In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found,
# so the available dataset files are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/research-papers-corpus/AdapterFusion_Non-Destructive_Task_Composition_for_Transfer_Learning.pdf
/kaggle/input/research-papers-corpus/Parameter-Efficient_Transfer_Learning_with_Adapters.pdf
/kaggle/input/research-papers-corpus/Attention_Is_All_You_Need.pdf
/kaggle/input/research-papers-corpus/Transformers_for_Low-Resource_Languages_Is_Feidir_Linn.pdf
/kaggle/input/research-papers-corpus/Unsupervised_Cross-lingual_Representation_Learning_at_Scale.pdf
/kaggle/input/research-papers-corpus/XTREME-R_Towards_More_Challenging_and_Nuanced_Multilingual_Evaluation.pdf
/kaggle/input/research-papers-corpus/chrF_Character_n-gram_F-score_for_Automatic_MT_Evaluation.pdf
/kaggle/input/research-papers-corpus/Multilingual_Denoising_Pre-training_for_Neural_Machine_Translation.pdf
/kaggle/input/research-papers-corpus/BLEU_A_Method_for_Automatic_Evaluation_of_Machine_Translation.pdf
/kaggle/input/research-papers-corpus/Simple_Scalable_Adaptation_for_Neural_Machine_Translation.pdf
/kaggle/input

In [2]:
# Install dependencies
!pip install google-generativeai chromadb pypdf2 pandas rouge-score

Collecting chromadb
  Downloading chromadb-1.0.5-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi==0.115.9 (from chromadb)
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.25.0-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting onnxruntime>=1.14.1 (f

In [3]:
# Import required libraries
import google.generativeai as genai
from pprint import pprint
import json
import os
import csv
import pandas as pd
from chromadb import Client
import textwrap
from rouge_score import rouge_scorer
from PyPDF2 import PdfReader
from kaggle_secrets import UserSecretsClient

In [4]:
# Load Gemini API key from Kaggle Secrets
try:
    # Read the key from the Kaggle secret store instead of hard-coding it in the notebook.
    user_secrets = UserSecretsClient()
    # Export the key through the environment and hand the same value to the SDK.
    os.environ["GOOGLE_API_KEY"] = user_secrets.get_secret("GOOGLE_API_KEY")
    genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
except Exception as e:
    # Report the failure in the cell output, then re-raise so it is not silently ignored.
    print(f"Error loading API key: {e}")
    raise

In [5]:
# Initialize Gemini model
try:
    # Module-level model handle; the prompt helpers in this notebook call model.generate_content().
    model = genai.GenerativeModel("gemini-2.0-flash")
except Exception as e:
    # Report the failure in the cell output, then re-raise so it is not silently ignored.
    print(f"Error initializing Gemini model: {e}")
    raise

In [6]:
# --- Utility Functions ---
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Read a PDF from disk and return the text of all its pages.

    Page texts are concatenated in page order with no separator. A page for
    which the extractor yields nothing contributes an empty string.

    Args:
        pdf_path (str): Location of the PDF file on disk.

    Returns:
        str: The combined text of every page in the PDF.

    Raises:
        FileNotFoundError: If no file exists at ``pdf_path``.
        Exception: If anything else fails while opening or parsing the PDF;
            the message includes the path and the underlying error.
    """
    try:
        with open(pdf_path, 'rb') as handle:
            reader = PdfReader(handle)
            # Extraction must finish while the file handle is still open.
            page_texts = [page.extract_text() or "" for page in reader.pages]
        return "".join(page_texts)
    except FileNotFoundError:
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")
    except Exception as e:
        raise Exception(f"Error reading PDF {pdf_path}: {e}")

def create_document(pdf_path: str, title: str, abstract: str) -> dict:
    """
    Bundle a PDF's extracted text with its descriptive metadata.

    Args:
        pdf_path (str): Location of the PDF file on disk.
        title (str): Title to record for the document.
        abstract (str): Abstract to record for the document.

    Returns:
        dict: ``{'content': <full PDF text>, 'metadata': {...}}`` where the
              metadata holds 'title', 'abstract' and 'filename' (the base
              name of ``pdf_path``).
    """
    return {
        "content": extract_text_from_pdf(pdf_path),
        "metadata": {
            "title": title,
            "abstract": abstract,
            "filename": os.path.basename(pdf_path),
        },
    }

def extract_sample_pdf_text(pdf_path: str, max_chars: int = 6000) -> str:
    """
    Return the leading portion of a PDF's text, never raising on failure.

    Args:
        pdf_path (str): Location of the PDF file on disk.
        max_chars (int, optional): Upper bound on the number of characters
            returned. Defaults to 6000.

    Returns:
        str: At most ``max_chars`` characters from the start of the PDF text.
             If extraction fails, the error is printed and an empty string
             is returned instead.
    """
    try:
        sample = extract_text_from_pdf(pdf_path)[:max_chars]
    except Exception as e:
        # Best-effort helper: report the problem and fall back to no text.
        print(f"Error extracting sample text from {pdf_path}: {e}")
        sample = ""
    return sample

In [7]:
# --- 1. Structured Output ---
def generate_structured_output(text: str) -> dict:
    """
    Ask the model for a JSON summary of a research paper and parse the reply.

    Only the first 6000 characters of ``text`` are placed in the prompt, and
    the model is asked to respond with the 'application/json' MIME type.

    Args:
        text (str): The text content of the research paper.

    Returns:
        dict: The parsed JSON reply. The prompt requests the keys 'title',
              'authors', 'key_findings', 'methodology' and 'implications'.
              If the reply is not valid JSON, the raw reply is printed and
              an empty dictionary is returned.
    """
    prompt = f"""
    You are an expert at summarizing research papers. Extract the following information from the text provided:
    
    {text[:6000]}
    
    Format the output as a JSON object with the following keys:
    'title': The title of the paper.
    'authors': A list of the paper's authors (if available, else 'Not specified').
    'key_findings': A bulleted list of the most important findings.
    'methodology': A concise description of the main methods used.
    'implications': A brief summary of the significance and potential impact of the research.
    """
    json_config = {"response_mime_type": "application/json"}
    response = model.generate_content(prompt, generation_config=json_config)
    try:
        parsed = json.loads(response.text)
    except json.JSONDecodeError:
        print(f"Error decoding JSON: {response.text}")
        return {}
    return parsed

In [8]:
# --- 2. Few-Shot Prompting ---
def few_shot_summary(sample_abstracts: list, current_text: str) -> str:
    """
    Produce a one-sentence summary of a paper using a two-example prompt.

    The prompt shows the first 180 characters of the first two sample
    abstracts, each paired with a fixed example summary written into the
    prompt itself, followed by the first 2000 characters of
    ``current_text``.

    Args:
        sample_abstracts (list): Dictionaries that each provide an 'abstract'
                                  string; the first two entries are used.
        current_text (str): The text content of the research paper to summarize.

    Returns:
        str: The model's reply with surrounding whitespace removed.
    """
    # NOTE(review): the example summaries below are fixed strings, not derived
    # from the abstracts they are paired with - confirm this is intended.
    first_abstract = sample_abstracts[0]['abstract'][:180]
    second_abstract = sample_abstracts[1]['abstract'][:180]
    paper_excerpt = current_text[:2000]
    prompt = f"""
    You are a research assistant skilled at summarizing papers. Below are example summaries of research papers related to multilingual language models and evaluation. Each summary is concise (1 sentence) and focuses on the main contribution:
    
    Example 1:
    Abstract: {first_abstract}...
    Summary: This paper proposes ethical guidelines for AI systems to ensure fairness and transparency.
    
    Example 2:
    Abstract: {second_abstract}...
    Summary: This paper advocates adopting machine translation evaluation methods to improve multilingual LLM evaluation.
    
    Now, provide a concise 1-sentence summary of the following research paper, focusing on its main contribution:
    
    Content: {paper_excerpt}...
    
    Summary:
    """
    return model.generate_content(prompt).text.strip()

In [9]:
# --- 3. Document Understanding ---
def document_understanding(text: str) -> str:
    """
    Request a structured breakdown of a research paper from the model.

    Only the first 6000 characters of ``text`` are included in the prompt.

    Args:
        text (str): The text content of the research paper.

    Returns:
        str: The model's reply, stripped of surrounding whitespace. The prompt
             asks for 'Key Findings', 'Methodology' and 'In Simpler Terms'
             sections.
    """
    prompt = f"""
    Analyze the following research paper text and provide a structured breakdown:
    
    {text[:6000]}
    
    Include:
    **Key Findings:** A bulleted list of the most important findings.
    **Methodology:** A brief paragraph explaining the primary research methods used.
    **In Simpler Terms:** A concise paragraph explaining the paper's main argument to a non-expert.
    """
    return model.generate_content(prompt).text.strip()

In [10]:
# --- 4. Agents (Simulated with Direct Function Call) ---
def agent_retrieve_paper(query: str, chroma_client: Client, k: int = 1) -> str:
    """
    Look up the best-matching paper for a query and describe it as text.

    The search is delegated to ``vector_search_improved``; only the first
    result is reported, whatever the value of ``k``.

    Args:
        query (str): The search query to find relevant papers.
        chroma_client (Client): The ChromaDB client connected to the vector store.
        k (int, optional): Number of results requested from the search. Defaults to 1.

    Returns:
        str: A block holding a content preview (shortened to 600 characters)
             and the JSON-formatted metadata of the top result. Returns
             "Vector store not initialized." when ``chroma_client`` is falsy
             and "No relevant papers found." when the search yields nothing.
    """
    if not chroma_client:
        return "Vector store not initialized."
    matches = vector_search_improved(chroma_client, query, k=k)
    if not matches:
        return "No relevant papers found."
    best = matches[0]
    preview = textwrap.shorten(best['document'], width=600, placeholder='...')
    metadata_json = json.dumps(best['metadata'], indent=2)
    return f"""
        Retrieved Paper:
        Content Preview: {preview}
        Metadata: {metadata_json}
        """

In [11]:
# --- 5. Long Context Window ---
def long_context_summary(text: str) -> str:
    """
    Request a detailed summary of a paper using a larger slice of its text.

    Up to the first 10000 characters of ``text`` are included in the prompt.

    Args:
        text (str): The full text content of the research paper.

    Returns:
        str: The model's reply, stripped of surrounding whitespace.
    """
    prompt = f"""
    Summarize the key findings and implications of the following research paper in detail:
    
    {text[:10000]}
    """
    return model.generate_content(prompt).text.strip()

In [12]:
# --- 6. Context Caching (Simulated) ---
def context_caching_summary(text: str) -> str:
    """
    Request a concise summary of a paper's core argument and contributions.

    This stands in for context caching: no caching is performed here, the
    function issues a single generation call over the first 6000 characters
    of ``text``.

    Args:
        text (str): The text content of the research paper.

    Returns:
        str: The model's reply, stripped of surrounding whitespace.
    """
    prompt = f"""
    Provide a concise summary of the core argument and key contributions of the following research paper:
    
    {text[:6000]}
    """
    return model.generate_content(prompt).text.strip()

In [13]:
# --- 7. Gen AI Evaluation ---
def evaluate_summary(ground_truth: str, model_summary: str) -> dict:
    """
    Score a generated summary against a reference using ROUGE-1 and ROUGE-L.

    Stemming is enabled on the scorer.

    Args:
        ground_truth (str): The reference (ground truth) summary.
        model_summary (str): The summary generated by the language model.

    Returns:
        dict: One entry per metric ('rouge1', 'rougeL'), each the score
              converted to a plain dictionary via ``_asdict()``.
    """
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    as_plain_dicts = {}
    for metric_name, metric_score in rouge.score(ground_truth, model_summary).items():
        as_plain_dicts[metric_name] = metric_score._asdict()
    return as_plain_dicts

In [14]:
# --- 8. Grounding ---
def grounding_analysis(text: str) -> str:
    """
    Ask the model how a paper relates to the prior work it mentions.

    Only the first 6000 characters of ``text`` are included in the prompt.

    Args:
        text (str): The text content of the research paper.

    Returns:
        str: The model's reply, stripped of surrounding whitespace, covering
             how the paper builds upon, compares to, or differs from the
             research, benchmarks, or datasets mentioned in the text.
    """
    prompt = f"""
    Analyze the following research paper text. Identify mentions of existing research, benchmarks, or datasets. Explain how the paper builds upon, compares to, or differs from these prior works:
    
    {text[:6000]}
    """
    return model.generate_content(prompt).text.strip()

In [15]:
# --- 9. Embeddings ---
def generate_embeddings(text_list: list[str]) -> list[list[float]]:
    """
    Embed each input text as a retrieval document, one request per text.

    Every text is cut to its first 8192 characters before being sent to the
    "models/embedding-001" embedding model.

    Args:
        text_list (list[str]): A list of text strings to embed.

    Returns:
        list[list[float]]: The 'embedding' value of each response, in the
                           same order as ``text_list``.
    """
    def _embed(document: str) -> list[float]:
        # One API call per document; the slice limits the input length.
        reply = genai.embed_content(
            model="models/embedding-001",
            content=document[:8192],
            task_type="retrieval_document"
        )
        return reply["embedding"]

    return [_embed(document) for document in text_list]

In [16]:
# --- 10. RAG (Retrieval-Augmented Generation) ---
def rag_query(query: str, chroma_client: Client, k: int = 3) -> str:
    """
    Answer a query from retrieved paper excerpts (retrieval-augmented generation).

    Documents are retrieved with ``vector_search_improved``, each shortened
    to 800 characters, and passed to the model as context for the answer.

    Args:
        query (str): The user's query.
        chroma_client (Client): The ChromaDB client connected to the vector store.
        k (int, optional): Number of documents to retrieve. Defaults to 3.

    Returns:
        str: The model's answer, stripped of surrounding whitespace. Returns
             "Vector store not initialized." when ``chroma_client`` is falsy
             and "No relevant documents found for the query." when the
             search yields nothing.
    """
    if not chroma_client:
        return "Vector store not initialized."
    hits = vector_search_improved(chroma_client, query, k=k)
    if not hits:
        return "No relevant documents found for the query."
    excerpts = []
    for hit in hits:
        snippet = textwrap.shorten(hit['document'], width=800, placeholder='...')
        excerpts.append(f"Paper Excerpt:\n{snippet}")
    context = "\n\n".join(excerpts)
    prompt = f"""
    Based on the following research paper excerpts, answer the query: '{query}'
    
    Context:
    {context}
    
    Answer:
    """
    return model.generate_content(prompt).text.strip()

In [17]:
# --- 11. Vector Search (Improved) ---
def initialize_vector_store(csv_path: str, pdf_folder: str) -> Client:
    """
    Build the "research_papers" ChromaDB collection from a metadata CSV and PDFs.

    Each CSV row supplies 'filename', 'title' and 'abstract'. Rows whose
    filename is not a string, whose PDF is missing from ``pdf_folder``, or
    whose PDF cannot be processed are skipped with a printed warning. The
    remaining documents are embedded and added under ids "doc_0", "doc_1", ...

    Args:
        csv_path (str): The path to the CSV file containing paper metadata.
        pdf_folder (str): The path to the folder containing the PDF files.

    Returns:
        Client: The ChromaDB client holding the "research_papers" collection.
                The collection is created but left empty when no document
                could be loaded.

    Raises:
        Exception: Whatever ``pd.read_csv`` raises for the metadata file is
            printed and re-raised unchanged.
    """
    try:
        corpus_metadata = pd.read_csv(csv_path, quoting=csv.QUOTE_ALL, on_bad_lines="warn")
    except Exception as e:
        print(f"Error loading corpus_metadata.csv: {e}")
        raise

    documents_data = []
    for index, row in corpus_metadata.iterrows():
        filename_raw, title, abstract = row['filename'], row['title'], row['abstract']
        if not isinstance(filename_raw, str):
            print(f"Warning: Skipping row {index} due to invalid filename: {filename_raw}")
            continue
        filepath = os.path.join(pdf_folder, filename_raw.strip())
        if not os.path.exists(filepath):
            print(f"Warning: PDF file does not exist: {filepath}")
            continue
        try:
            documents_data.append(create_document(filepath, title, abstract))
        except Exception as e:
            # A single unreadable PDF should not abort the whole corpus load.
            print(f"Warning: Error processing {filepath}: {e}")

    client = Client()
    collection = client.get_or_create_collection("research_papers")
    if not documents_data:
        print("Warning: No valid documents to add to vector store.")
        return client

    texts = [entry['content'] for entry in documents_data]
    metadatas = [entry['metadata'] for entry in documents_data]
    collection.add(
        documents=texts,
        metadatas=metadatas,
        embeddings=generate_embeddings(texts),
        ids=[f"doc_{position}" for position in range(len(documents_data))]
    )
    return client

def vector_search_improved(chroma_client: Client, query: str, k: int = 4) -> list[dict]:
    """
    Run an embedding-based search against the "research_papers" collection.

    The query is embedded with "models/embedding-001" using the
    'retrieval_query' task type, then passed to the collection's ``query``.

    Args:
        chroma_client (Client): The ChromaDB client connected to the vector store.
        query (str): The search query string.
        k (int, optional): Number of results to request. Defaults to 4.

    Returns:
        list[dict]: One dictionary per returned document with the keys 'id',
                     'distance', 'document' and 'metadata', in the order the
                     collection returned them.
    """
    collection = chroma_client.get_collection("research_papers")
    query_embedding = genai.embed_content(
        model="models/embedding-001",
        content=query,
        task_type="retrieval_query"
    )["embedding"]
    results = collection.query(query_embeddings=[query_embedding], n_results=k)

    # Results are nested per query; index 0 selects the single query sent above.
    hits = []
    for rank, doc_id in enumerate(results['ids'][0]):
        hits.append({
            "id": doc_id,
            "distance": results['distances'][0][rank],
            "document": results['documents'][0][rank],
            "metadata": results['metadatas'][0][rank]
        })
    return hits

In [18]:
# --- 12. MLOps Monitoring ---
def monitor_performance(scores: dict, threshold: float = 0.45) -> str:
    """
    Check a summary's ROUGE-L F1 score against an acceptance threshold.

    Args:
        scores (dict): ROUGE scores shaped like the output of
                       `evaluate_summary`: a mapping that contains 'rougeL',
                       itself a mapping with a numeric 'fmeasure'.
        threshold (float, optional): Minimum acceptable ROUGE-L F1.
                       Defaults to 0.45.

    Returns:
        str: A message saying whether the score is below the threshold or
             within acceptable limits. If ``scores`` does not have the
             expected shape (not a mapping, missing 'rougeL' or 'fmeasure',
             or a non-numeric value), an error is printed and
             "Performance evaluation failed." is returned.
    """
    try:
        # Index directly rather than using .get() defaults: a missing score
        # must be reported as a failure, not treated as an F1 of 0.
        rouge_l_f1 = scores["rougeL"]["fmeasure"]
        is_below = rouge_l_f1 < threshold
    except (KeyError, TypeError, AttributeError):
        print("Error: ROUGE scores not in expected format.")
        return "Performance evaluation failed."
    if is_below:
        return f"Performance below acceptable threshold (ROUGE-L F1 < {threshold}). Consider further investigation or retraining."
    return f"Performance within acceptable limits (ROUGE-L F1 >= {threshold})."

In [19]:
# --- Main Execution ---
def main():
    """
    Run the twelve-step demonstration over the sample paper and the corpus.

    Checks that the sample paper and metadata CSV exist, extracts the sample
    text, builds the vector store, then runs and prints each demonstration
    step in order. Returns early with a printed error when an input file is
    missing or no text could be extracted from the sample paper. The
    "research_papers" collection is deleted when the steps finish, including
    when one of them raises.
    """
    csv_path = "/kaggle/input/metadata/corpus_metadata.csv"
    pdf_folder = "/kaggle/input/research-papers-corpus"
    sample_paper_folder = "/kaggle/input/sample-paper"
    paper_file = "Deja_Vu_Multilingual_LLM_Evaluation_through_the_Lens_of_Machine_Translation_Evaluation.pdf"
    paper_path = os.path.join(sample_paper_folder, paper_file)

    # Verify file paths
    if not os.path.exists(paper_path):
        print(f"Error: Sample paper not found at {paper_path}. Please ensure the path is correct.")
        return
    if not os.path.exists(csv_path):
        print(f"Error: corpus_metadata.csv not found at {csv_path}.")
        return

    # Extract sample paper text first: extract_sample_pdf_text returns "" on
    # failure, and every prompt below would otherwise be sent without content.
    sample_paper_text = extract_sample_pdf_text(paper_path, max_chars=6000)
    if not sample_paper_text:
        print(f"Error: No text could be extracted from {paper_path}.")
        return

    # Initialize vector store
    vector_client = initialize_vector_store(csv_path, pdf_folder)

    try:
        # 1. Structured Output
        print("\n--- 1. Structured Output ---")
        structured_output = generate_structured_output(sample_paper_text)
        print(json.dumps(structured_output, indent=2))

        # 2. Few-Shot Prompting
        print("\n--- 2. Few-Shot Prompting ---")
        corpus_metadata = pd.read_csv(csv_path, quoting=csv.QUOTE_ALL, on_bad_lines="warn")
        sample_abstracts = corpus_metadata[['title', 'abstract']].head(2).to_dict('records')
        few_shot_summary_result = few_shot_summary(sample_abstracts, sample_paper_text)
        print(f"Summary: {few_shot_summary_result}")

        # 3. Document Understanding
        print("\n--- 3. Document Understanding ---")
        document_understanding_result = document_understanding(sample_paper_text)
        print(document_understanding_result)

        # 4. Agents
        print("\n--- 4. Agents (Simulated) ---")
        agent_query = "Multilingual LLM evaluation methods"
        agent_retrieval_result = agent_retrieve_paper(agent_query, vector_client)
        print(agent_retrieval_result)

        # 5. Long Context Window
        print("\n--- 5. Long Context Window ---")
        long_context_result = long_context_summary(extract_text_from_pdf(paper_path))
        print(long_context_result)

        # 6. Context Caching
        print("\n--- 6. Context Caching ---")
        context_caching_result = context_caching_summary(sample_paper_text)
        print(context_caching_result)

        # 7. Gen AI Evaluation
        print("\n--- 7. Gen AI Evaluation ---")
        ground_truth_summary = "This paper advocates adopting machine translation evaluation practices to enhance multilingual LLM evaluation with a meta-evaluation checklist."
        # Score the summary printed in step 2 instead of generating a second
        # one, so the scores describe the text the reader actually saw.
        evaluation_scores = evaluate_summary(ground_truth_summary, few_shot_summary_result)
        print(f"Evaluation Scores:\n{json.dumps(evaluation_scores, indent=2)}")

        # 8. Grounding
        print("\n--- 8. Grounding ---")
        grounding_result = grounding_analysis(sample_paper_text)
        print(grounding_result)

        # 9. Embeddings
        print("\n--- 9. Embeddings ---")
        # generate_embeddings applies its own input-length limit.
        embeddings_result = generate_embeddings([sample_paper_text])
        print(f"Embeddings for sample paper (first 10 values):\n{embeddings_result[0][:10]}")

        # 10. RAG
        print("\n--- 10. RAG (Retrieval-Augmented Generation) ---")
        rag_query_str = "Evaluation metrics for multilingual LLMs"
        rag_result = rag_query(rag_query_str, vector_client)
        print(rag_result)

        # 11. Vector Search
        print("\n--- 11. Vector Search (Improved) ---")
        search_query = "multilingual evaluation"
        search_results = vector_search_improved(vector_client, search_query)
        for i, result in enumerate(search_results):
            print(f"Result {i+1}:")
            print(f"ID: {result['id']}")
            print(f"Distance: {result['distance']}")
            print(f"Content Preview: {textwrap.shorten(result['document'], width=300, placeholder='...')}")
            print(f"Metadata: {json.dumps(result['metadata'], indent=2)}")
            print("-" * 20)

        # 12. MLOps Monitoring
        print("\n--- 12. MLOps Monitoring ---")
        monitoring_result = monitor_performance(evaluation_scores)
        print(f"Monitoring Result: {monitoring_result}")
    finally:
        # Clean up even when a step above fails, so a re-run does not find a
        # leftover collection.
        vector_client.delete_collection("research_papers")

In [20]:
# Run the full demonstration when this module is the entry point.
if __name__ == "__main__":
    main()


--- 1. Structured Output ---
{
  "title": "D\u00e9j\u00e0 Vu: Multilingual LLM Evaluation through the Lens of Machine Translation Evaluation",
  "authors": [
    "Julia Kreutzer",
    "Eleftheria Briakou",
    "Sweta Agrawal",
    "Marzieh Fadaee",
    "Kocmi Tom"
  ],
  "key_findings": [
    "Current generative evaluation approaches for multilingual models lack nuances in reporting, reproducibility, standardization, robustness and reliability, and most notably, meta-evaluation.",
    "Multilingual models shine especially in generative tasks, outperforming monolingual models across the bench.",
    "Many existing benchmarks have reached saturation and are not sufficiently separating models, making them unreliable predictors of generative abilities of mLLMs.",
    "Identified five concrete evaluation principles lacking in mLLM evaluations but established in MT.",
    "Established prerequisites necessary for meta-evaluations of mLLMs."
  ],
  "methodology": "The paper draws parallels be