# Data Acquisition (Research Papers)

In [8]:
!pip install requests xmltodict

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [9]:
!pip install requests PyPDF2 nltk spacy gensim textblob

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [18]:
# ==============================================================================
# 1. IMPORTS AND SETUP
# ==============================================================================
import requests
import json
import time
import os
import xmltodict
import PyPDF2
import nltk
import spacy
from textblob import TextBlob
from gensim import corpora, models
import sqlite3
import logging

# Configure logging for clear output
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# ==============================================================================
# 2. CONSTANTS AND CONFIGURATION
# ==============================================================================
ARXIV_API_URL = "http://export.arxiv.org/api/query"  # arXiv query endpoint (Atom/XML responses)
PDF_DIR = "arxiv_pdfs"                               # directory where downloaded PDFs are saved
DB_FILE = "arxiv_papers.db"                          # SQLite database file for processed papers
os.makedirs(PDF_DIR, exist_ok=True)

# Define search keywords.
# Each keyword is sent to arXiv as an exact-phrase query, so spelling matters.
SEARCH_KEYWORDS = [
    "alpha factors", 'alpha generation', 'alpha mining', 'factor investing',
    'formulaic alphas', 'stock price prediction', 'artificial intelligence trading',
    'AI stock selection', 'large language models finance', 'reinforcement learning portfolio',
    'portfolio optimization', 'trading strategies', 'algorithmic trading',
    'quantitative trading', 'mixture of experts trading'
]

# ==============================================================================
# 3. NLTK AND SPACY MODEL LOADING
# ==============================================================================
# Download NLTK data quietly.
# nltk.data.find() raises LookupError when a resource is not installed, so that
# is the exception that must trigger the download.
for _resource_path, _resource_name in (
    ("tokenizers/punkt", "punkt"),
    ("corpora/stopwords", "stopwords"),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_resource_name, quiet=True)

# Load spaCy model, downloading if necessary
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    logging.info("Downloading spaCy model 'en_core_web_sm'...")
    # Run the download with the kernel's own interpreter so the model lands in
    # the environment this notebook is actually using.
    import subprocess
    import sys
    subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)
    nlp = spacy.load("en_core_web_sm")

# The stopwords corpus reader must be imported explicitly; `import nltk` alone
# does not bind the name `stopwords`. Imported here, after the download above.
from nltk.corpus import stopwords

STOP_WORDS = set(stopwords.words('english'))

# ==============================================================================
# 4. CORE FUNCTIONS (DECOMPOSED AND REFACTORED)
# ==============================================================================

def parse_arxiv_entry(entry: dict) -> dict:
    """Parses a single raw XML entry from arXiv into a clean dictionary.

    Args:
        entry: One ``<entry>`` element as a dict (as produced by xmltodict).
            Repeated child elements may arrive as a list, a single child as a
            dict, and a text element as either a plain string or a dict with a
            ``'#text'`` key; all of these shapes are accepted.

    Returns:
        A dict with the keys ``id``, ``title``, ``abstract``, ``authors``,
        ``categories``, ``journal_ref``, ``doi`` and ``published``. Missing
        values are ``None`` (or ``'No Title'`` / ``'No Abstract'`` / ``[]``).
    """
    def _as_list(value):
        # A single child element is a bare dict; a missing/empty one is None.
        if value is None:
            return []
        return value if isinstance(value, list) else [value]

    def _node_text(value):
        # Elements carrying XML attributes are dicts holding the text under '#text'.
        if isinstance(value, dict):
            value = value.get('#text')
        return value if isinstance(value, str) else None

    def _clean(value, default):
        text = _node_text(value)
        if text is None:
            text = default
        # Collapse line breaks and the indentation that follows wrapped lines.
        return ' '.join(text.split())

    paper = {}
    raw_id = entry.get('id')
    paper['id'] = raw_id.split('/')[-1] if raw_id else None
    paper['title'] = _clean(entry.get('title'), 'No Title')
    paper['abstract'] = _clean(entry.get('summary'), 'No Abstract')

    # Handle both single author (dict) and multiple authors (list of dicts)
    paper['authors'] = [
        author.get('name') for author in _as_list(entry.get('author')) if isinstance(author, dict)
    ]
    # Same single-vs-many handling for categories
    paper['categories'] = [
        cat.get('@term') for cat in _as_list(entry.get('category')) if isinstance(cat, dict)
    ]

    # Underscore key names match the database schema convention
    paper['journal_ref'] = _node_text(entry.get('arxiv:journal_ref'))
    paper['doi'] = _node_text(entry.get('arxiv:doi'))
    paper['published'] = entry.get('published')
    return paper

def fetch_arxiv_metadata(query: str, max_results: int = 5, retry_count: int = 3, timeout: float = 30.0) -> list:
    """
    Fetches paper metadata from the arXiv API for a given query.
    Handles network errors with retries and exponential backoff.

    Args:
        query: Phrase to search for; sent as an exact-phrase ``all:"..."`` query.
        max_results: Maximum number of entries requested from the API.
        retry_count: Total number of attempts before giving up.
        timeout: Per-request timeout passed to ``requests.get`` so a stalled
            connection cannot hang the pipeline.

    Returns:
        A list of parsed paper dicts (see ``parse_arxiv_entry``); an empty list
        when nothing matched or every attempt failed.
    """
    params = {
        "search_query": f'all:"{query}"',
        "start": 0,
        "max_results": max_results,
    }
    for attempt in range(retry_count):
        try:
            response = requests.get(ARXIV_API_URL, params=params, timeout=timeout)
            response.raise_for_status()  # Raises HTTPError for bad responses (4xx or 5xx)

            xml_dict = xmltodict.parse(response.content)
            entries = (xml_dict.get('feed') or {}).get('entry') or []
            # A feed with exactly one <entry> is parsed as a single dict rather
            # than a one-element list; normalise so we never iterate dict keys.
            if isinstance(entries, dict):
                entries = [entries]

            logging.info(f"Successfully fetched {len(entries)} entries for query: '{query}'")
            return [parse_arxiv_entry(e) for e in entries]

        except requests.exceptions.RequestException as e:
            logging.warning(f"Network error on attempt {attempt + 1} for '{query}': {e}")
        except (xmltodict.expat.ExpatError, KeyError) as e:
            logging.warning(f"XML parsing error on attempt {attempt + 1} for '{query}': {e}")

        if attempt < retry_count - 1:
            time.sleep(2 ** attempt)  # Exponential backoff
        else:
            logging.error(f"Failed to fetch metadata for '{query}' after {retry_count} attempts.")
            return []
    # Only reached when retry_count <= 0 (no attempt was made).
    return []


def download_pdf(pdf_url: str, paper_id: str) -> str | None:
    """Downloads a PDF given its URL and saves it locally. Returns the file path.

    The body is streamed into a temporary ``.part`` file that is renamed to the
    final name only after the download completes, so a failed transfer never
    leaves a truncated ``<paper_id>.pdf`` behind.

    Args:
        pdf_url: URL of the PDF to fetch.
        paper_id: Identifier used as the file name inside ``PDF_DIR``.

    Returns:
        Path of the saved PDF, or ``None`` if the request or the file write failed.
    """
    file_path = os.path.join(PDF_DIR, f"{paper_id}.pdf")
    partial_path = f"{file_path}.part"
    try:
        # The context manager releases the streamed connection even on errors.
        with requests.get(pdf_url, stream=True, timeout=30) as response:
            response.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        os.replace(partial_path, file_path)
        logging.info(f"Downloaded PDF: {file_path}")
        return file_path
    except (requests.exceptions.RequestException, OSError) as e:
        logging.error(f"Error downloading PDF for paper {paper_id}: {e}")
        # Best-effort cleanup of the incomplete download.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        return None


def extract_text_from_pdf(pdf_path: str) -> str | None:
    """Extracts text content from a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages that yielded any text, separated by newlines, or
        ``None`` when the file cannot be read or contains no extractable text.
    """
    try:
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            page_texts = []
            for page in reader.pages:
                # Extract once per page; the call is expensive.
                page_text = page.extract_text()
                if page_text:
                    page_texts.append(page_text)
        # Join with a newline so the last word of one page is not fused with
        # the first word of the next.
        text = "\n".join(page_texts)
        return text if text.strip() else None
    except (OSError, PyPDF2.errors.PdfReadError) as e:
        logging.error(f"Error extracting text from PDF {pdf_path}: {e}")
        return None


def extract_keywords(text: str, max_keywords: int = 20) -> list:
    """Extracts keywords from text using spaCy for better linguistic understanding.

    Keywords are the lemmas of alphabetic, non-stop-word nouns, proper nouns and
    adjectives found in the lower-cased text.

    Args:
        text: Text to analyse; empty or ``None`` yields an empty list.
        max_keywords: Maximum number of keywords returned.

    Returns:
        Unique keywords in order of first appearance, at most ``max_keywords``.
    """
    if not text:
        return []
    lowered = text.lower()
    # spaCy refuses input longer than nlp.max_length; analyse the leading part
    # of very long documents instead of failing.
    max_chars = getattr(nlp, "max_length", None)
    if max_chars and len(lowered) > max_chars:
        logging.warning(f"Text has {len(lowered)} characters; truncating to {max_chars} for keyword extraction.")
        lowered = lowered[:max_chars]
    doc = nlp(lowered)
    keywords = [
        token.lemma_
        for token in doc
        if token.pos_ in ("NOUN", "PROPN", "ADJ") and not token.is_stop and token.is_alpha
    ]
    # Return unique keywords while preserving order
    return list(dict.fromkeys(keywords))[:max_keywords]


def summarize_text(text: str, num_sentences: int = 3) -> str:
    """Summarizes text by taking the first few sentences.

    Args:
        text: Text to summarise; empty or ``None`` yields ``""``.
        num_sentences: Number of leading sentences to keep.

    Returns:
        The first ``num_sentences`` sentences separated by single spaces, or the
        original text when spaCy finds no sentences.
    """
    if not text:
        return ""
    # spaCy refuses input longer than nlp.max_length; the summary only needs
    # the beginning of the document anyway.
    max_chars = getattr(nlp, "max_length", None)
    snippet = text[:max_chars] if max_chars else text
    doc = nlp(snippet)
    sentences = list(doc.sents)
    if not sentences:
        return text
    # Span.text carries no trailing whitespace, so join with a space to keep
    # consecutive sentences from running together.
    return " ".join(sent.text.strip() for sent in sentences[:num_sentences])


def analyze_sentiment(text: str) -> float | None:
    """Analyzes sentiment using TextBlob. Returns polarity score or None."""
    if not text:
        return None # FIX: Return None for missing data to store as NULL
    analysis = TextBlob(text)
    return analysis.sentiment.polarity


def init_database(conn: sqlite3.Connection):
    """Makes sure the 'papers' table is present on ``conn`` and commits.

    The statement uses CREATE TABLE IF NOT EXISTS, so calling this on a
    database that already has the table leaves it untouched.
    """
    create_papers_table_sql = '''
        CREATE TABLE IF NOT EXISTS papers (
            id TEXT PRIMARY KEY,
            title TEXT,
            abstract TEXT,
            authors TEXT,
            categories TEXT,
            journal_ref TEXT,
            doi TEXT,
            published TEXT,
            pdf_path TEXT,
            full_text TEXT,
            keywords TEXT,
            summary TEXT,
            sentiment REAL
        )
    '''
    db_cursor = conn.cursor()
    db_cursor.execute(create_papers_table_sql)
    conn.commit()


def store_paper_in_db(paper_data: dict, conn: sqlite3.Connection):
    """Stores a single processed paper dictionary into the database.

    Args:
        paper_data: Paper record. Keys matching the 'papers' table columns are
            stored; missing keys are stored as NULL. List, tuple and dict
            values are serialised to JSON strings.
        conn: Open SQLite connection whose database has the 'papers' table.

    The row is written with INSERT OR REPLACE, so an existing row with the same
    id is overwritten. Database errors are logged and rolled back, not raised.
    """
    cursor = conn.cursor()
    columns = [
        'id', 'title', 'abstract', 'authors', 'categories', 'journal_ref', 'doi',
        'published', 'pdf_path', 'full_text', 'keywords', 'summary', 'sentiment'
    ]

    placeholders = ", ".join(["?"] * len(columns))
    # Only the fixed column names above are interpolated; all values are bound parameters.
    query = f"INSERT OR REPLACE INTO papers ({', '.join(columns)}) VALUES ({placeholders})"

    values = []
    for col in columns:
        value = paper_data.get(col)
        if isinstance(value, (list, tuple, dict)):
            value = json.dumps(value)  # Serialize containers to JSON strings
        values.append(value)

    try:
        cursor.execute(query, tuple(values))
        conn.commit()
        # .get() keeps a record without an 'id' key from raising after the commit.
        logging.info(f"Stored/Updated paper {paper_data.get('id')} in the database.")
    except sqlite3.Error as e:
        logging.error(f"Database error for paper {paper_data.get('id')}: {e}")
        conn.rollback()

# ==============================================================================
# 5. MAIN ORCHESTRATION FUNCTION
# ==============================================================================

def main():
    """
    Main execution function to orchestrate the entire data pipeline.

    Steps:
      1. Delete any existing database file and create a fresh 'papers' table.
      2. For every search keyword, fetch arXiv metadata; for each paper not seen
         under an earlier keyword, download the PDF, extract its text, run the
         NLP steps (keywords, summary, sentiment) and store the record.
      3. Fit an LDA topic model on the collected full texts and print the topics.
    """
    # Pause between consecutive arXiv API queries to stay polite to the service.
    arxiv_request_delay_seconds = 3
    # Fixed seed so repeated runs on the same corpus print the same topics.
    lda_random_state = 42

    # ======================================================================
    # === DATABASE RESET LOGIC (GUARANTEES A FRESH START) ===
    # This block solves the "table has no column" error by deleting the
    # old database file before a new one is created.
    if os.path.exists(DB_FILE):
        logging.warning(
            f"Found existing database '{DB_FILE}'. Deleting it to ensure the correct schema is applied."
        )
        os.remove(DB_FILE)
    # ======================================================================

    all_processed_papers = []
    # The same paper often matches several keywords; handle each id only once so
    # it is not downloaded again or counted twice in the topic model.
    seen_paper_ids = set()

    # Use a try...finally block to ensure the database connection is closed
    conn = sqlite3.connect(DB_FILE)
    try:
        init_database(conn) # Ensure table exists

        for keyword_index, keyword in enumerate(SEARCH_KEYWORDS):
            if keyword_index > 0:
                time.sleep(arxiv_request_delay_seconds)

            logging.info(f"--- Starting search for keyword: '{keyword}' ---")
            paper_metadata_list = fetch_arxiv_metadata(keyword, max_results=5)

            for metadata in paper_metadata_list:
                paper_id = metadata['id']
                if not paper_id:
                    logging.warning("Skipping entry with no ID.")
                    continue
                if paper_id in seen_paper_ids:
                    logging.info(f"Paper {paper_id} was already handled for an earlier keyword. Skipping.")
                    continue
                seen_paper_ids.add(paper_id)

                # --- Process a single paper ---
                logging.info(f"Processing paper ID: {paper_id}")
                pdf_url = f"https://arxiv.org/pdf/{paper_id}.pdf"
                metadata['pdf_path'] = download_pdf(pdf_url, paper_id)

                if metadata['pdf_path']:
                    metadata['full_text'] = extract_text_from_pdf(metadata['pdf_path'])

                if metadata.get('full_text'):
                    # Perform NLP analysis
                    full_text = metadata['full_text']
                    metadata['keywords'] = extract_keywords(full_text)
                    metadata['summary'] = summarize_text(full_text)
                    metadata['sentiment'] = analyze_sentiment(full_text)

                    # Store the complete paper record
                    store_paper_in_db(metadata, conn)
                    all_processed_papers.append(metadata)
                else:
                    logging.warning(f"Could not extract text for paper {paper_id}. Skipping NLP and storage.")

    finally:
        logging.info("Closing database connection.")
        conn.close()

    # --- Final Analysis: Topic Modeling ---
    logging.info("--- Starting Topic Modeling on all collected papers ---")
    texts = [p.get('full_text') for p in all_processed_papers if p.get('full_text')]

    if texts:
        tokenized_texts = [[word for word in text.lower().split() if word.isalnum() and word not in STOP_WORDS] for text in texts]
        dictionary = corpora.Dictionary(tokenized_texts)
        corpus = [dictionary.doc2bow(text) for text in tokenized_texts]

        # `corpus` holds one (possibly empty) bag per document, so it is non-empty
        # whenever `texts` is; the vocabulary size is what tells us whether there
        # is anything to model.
        if len(dictionary) > 0:
            lda_model = models.LdaModel(
                corpus, num_topics=5, id2word=dictionary, passes=10, random_state=lda_random_state
            )
            topics = lda_model.print_topics(num_words=5)
            print("\n--- Identified Topics ---")
            for topic in topics:
                print(topic)
        else:
            print("\nCould not create a corpus for topic modeling (likely all documents were empty).")
    else:
        print("\nNo text available to perform topic modeling.")

    logging.info(f"\nPipeline finished. Processed a total of {len(all_processed_papers)} papers.")


# ==============================================================================
# 6. SCRIPT ENTRY POINT
# ==============================================================================
if __name__ == "__main__":
    main()

2025-07-15 19:22:24,290 - INFO - --- Starting search for keyword: 'alpha factors' ---
2025-07-15 19:22:25,495 - INFO - Successfully fetched 5 entries for query: 'alpha factors'
2025-07-15 19:22:25,500 - INFO - Processing paper ID: 2406.18394v5
2025-07-15 19:22:29,559 - INFO - Downloaded PDF: arxiv_pdfs/2406.18394v5.pdf
2025-07-15 19:22:32,129 - INFO - Stored/Updated paper 2406.18394v5 in the database.
2025-07-15 19:22:32,129 - INFO - Processing paper ID: 2409.05144v3
2025-07-15 19:22:34,416 - INFO - Downloaded PDF: arxiv_pdfs/2409.05144v3.pdf
2025-07-15 19:22:39,018 - INFO - Stored/Updated paper 2409.05144v3 in the database.
2025-07-15 19:22:39,018 - INFO - Processing paper ID: 1210.5413v1
2025-07-15 19:22:41,796 - INFO - Downloaded PDF: arxiv_pdfs/1210.5413v1.pdf
2025-07-15 19:22:43,262 - INFO - Stored/Updated paper 1210.5413v1 in the database.
2025-07-15 19:22:43,263 - INFO - Processing paper ID: 2401.02710v2
2025-07-15 19:22:45,237 - INFO - Downloaded PDF: arxiv_pdfs/2401.02710v2.pd


--- Identified Topics ---
(0, '0.012*"trading" + 0.008*"learning" + 0.006*"stock" + 0.005*"1" + 0.005*"monoidal"')
(1, '0.027*"alpha" + 0.009*"alphas" + 0.007*"search" + 0.006*"mining" + 0.006*"formulaic"')
(2, '0.010*"stock" + 0.010*"price" + 0.007*"portfolio" + 0.007*"trading" + 0.007*"model"')
(3, '0.006*"operator" + 0.004*"set" + 0.004*"points" + 0.003*"point" + 0.003*"shift"')
(4, '0.008*"model" + 0.006*"data" + 0.006*"portfolio" + 0.006*"probability" + 0.005*"time"')


In [20]:
import sqlite3
import os

DB_FILE = "arxiv_papers.db"

def check_paper_count():
    """Connects to the database and prints the number of papers.

    Prints an error message instead when the database file does not exist or
    when SQLite reports an error (for example, the 'papers' table is missing).
    """
    if not os.path.exists(DB_FILE):
        print(f"Error: Database file '{DB_FILE}' not found.")
        return

    # Initialised up front so the finally block can tell whether connect() succeeded.
    conn = None
    try:
        # Connect to the SQLite database
        conn = sqlite3.connect(DB_FILE)
        cursor = conn.cursor()

        # Count all rows in the 'papers' table; fetchone() returns a one-element tuple.
        cursor.execute("SELECT COUNT(*) FROM papers")
        paper_count = cursor.fetchone()[0]

        print(f"Total number of papers stored in the database: {paper_count}")

    except sqlite3.Error as e:
        print(f"Database error: {e}")
    finally:
        # Make sure to close the connection
        if conn is not None:
            conn.close()

if __name__ == "__main__":
    check_paper_count()

Total number of papers stored in the database: 48


# LlamaIndex & Ollama: Multimodal RAG Agent for Financial Alpha Generation from PDFs (for Phase I)

In [1]:
!pip install llama-index
!pip install llama-index-core llama-index-readers-file llama-index-llms-ollama llama-index-embeddings-huggingface
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding



In [2]:
!pip install tf_keras
!pip install frontend
!pip install PyMuPDF --force-reinstall
!pip install torch torchvision sentence-transformers transformers ollama langchain
!pip install llama-index-multi-modal-llms-ollama llama-index-embeddings-clip Pillow
!pip install llama-index llama-index-core llama-index-multi-modal-llms-ollama llama-index-embeddings-clip llama-index-embeddings-huggingface
!pip install git+https://github.com/openai/CLIP.git
!pip install torch torchvision torchaudio
!pip install PyMuPDF torch torchvision clip-openai 
# Note: 'clip-openai' is the package installed via git previously. 'langchain' might be needed for the output parser.

Collecting PyMuPDF
  Using cached pymupdf-1.26.3-cp39-abi3-macosx_11_0_arm64.whl.metadata (3.4 kB)
Using cached pymupdf-1.26.3-cp39-abi3-macosx_11_0_arm64.whl (22.4 MB)
Installing collected packages: PyMuPDF
  Attempting uninstall: PyMuPDF
    Found existing installation: PyMuPDF 1.26.3
    Uninstalling PyMuPDF-1.26.3:
      Successfully uninstalled PyMuPDF-1.26.3
Successfully installed PyMuPDF-1.26.3
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /private/var/folders/v3/j0t3d9ws5bzdcq6f1g3c9thr0000gn/T/pip-req-build-o7c0x1q5
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /private/var/folders/v3/j0t3d9ws5bzdcq6f1g3c9thr0000gn/T/pip-req-build-o7c0x1q5
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25ldone


In [None]:
# (Checkpoint) Verify that the llama-index Ollama integration is installed and can be imported.
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

print("Ollama imported successfully!")  # If this prints, the import works

# If the import works, try constructing an Ollama LLM client.
# NOTE(review): only the constructor is called here and no prompt is sent, so this
# presumably does not prove that the Ollama server or the model is reachable — confirm.

# To make a model available locally, run in a terminal of your computer: ollama run <model>
# (an earlier version of this note said `ollama run llama3.2`, but the code below requests "llama4";
# use the same model name in both places).
try:
  llm = Ollama(model="llama4") # or another model available to you
  print("Ollama LLM instantiated!")
except Exception as e:
  # Any failure while constructing the client is reported rather than raised.
  print(f"Error instantiating Ollama: {e}")

Ollama imported successfully!
Ollama LLM instantiated!


In [None]:
# --- Imports ---
# (Keep all imports the same as the previous working version)
import torch
import asyncio
import json
import re
import base64
import io
import os
import glob
import traceback
import sys
import pkg_resources
import pymupdf
from PIL import Image
try:
    from langchain.output_parsers import StructuredOutputParser, ResponseSchema
except ImportError:
    # langchain is optional: fall back to minimal stand-ins with the same names.
    class ResponseSchema:
        """Stand-in that only records a field's name and description."""

        def __init__(self, name, description):
            self.name = name
            self.description = description

    class StructuredOutputParser:
        """Stand-in whose factory returns a parser with placeholder format instructions."""

        @staticmethod
        def from_response_schemas(schemas):
            class DummyParser:
                def get_format_instructions(self):
                    return "Format instructions N/A."

            return DummyParser()

from llama_index.core import Settings, StorageContext, load_index_from_storage, VectorStoreIndex, Document
from llama_index.core.indices import MultiModalVectorStoreIndex
from llama_index.core.schema import ImageDocument, ImageNode as LlamaIndexImageNode
from llama_index.multi_modal_llms.ollama import OllamaMultiModal
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.clip import ClipEmbedding
from typing import List, Dict, Sequence, Tuple

# --- Global Settings & Instantiation ---
# Runs at cell execution time: sets the llama-index global text embedding model,
# creates a separate CLIP embedding model for images, and builds the multimodal LLM client.
print("Setting up global settings...")
# Text embedding model stored on the global llama-index Settings object.
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
# Image embedding model, kept in its own variable; embeds one item per batch.
image_embed_model = ClipEmbedding(embed_batch_size=1)
print("Embedding models set.")

# --- DEFINE LLM PARAMETERS ---
LLM_MODEL = "llama4"  # model name passed to OllamaMultiModal below
LLM_MAX_NEW_TOKENS = 2000  # passed as max_new_tokens
LLM_TEMPERATURE = 0.1  # passed as temperature
LLM_TIMEOUT = 720.0  # passed as request_timeout — presumably seconds; TODO confirm

# NOTE(review): confirm that OllamaMultiModal actually honours `max_new_tokens`;
# if it is not a recognised option the limit may be silently ignored.
multi_modal_llm = OllamaMultiModal(
    model=LLM_MODEL,
    max_new_tokens=LLM_MAX_NEW_TOKENS,
    temperature=LLM_TEMPERATURE,
    request_timeout=LLM_TIMEOUT,
)

# --- Helper Function ---
# (Keep pil_image_to_base64 the same)
def pil_image_to_base64(pil_image: Image.Image, format="JPEG") -> str:
    """Encodes a PIL image as a base64 string (UTF-8 text).

    The image is written as PNG only when ``format`` is "PNG" and the image
    mode is "RGBA" or "LA". In every other case it is converted to RGB when
    needed and written as JPEG with quality 90.
    """
    buffer = io.BytesIO()
    keep_alpha_png = format == "PNG" and pil_image.mode in ("RGBA", "LA")
    if keep_alpha_png:
        pil_image.save(buffer, format="PNG")
    else:
        # Palette ("P") and all other non-RGB modes go through the same RGB conversion.
        rgb_image = pil_image if pil_image.mode == "RGB" else pil_image.convert("RGB")
        rgb_image.save(buffer, format="JPEG", quality=90)
    return base64.b64encode(buffer.getvalue()).decode("utf-8")

# --- Functions ---

# (Keep process_pdf_for_multimodal the same as your last working version)
def process_pdf_for_multimodal(pdf_path) -> List:
    """Extracts text/images from PDF using PyMuPDF with logging.

    For every page, the page text (if not blank) becomes one ``Document`` and
    every embedded image that PIL can open and verify becomes one
    ``ImageDocument``. Both carry ``source_file`` and ``page_number`` metadata.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A list of ``Document`` / ``ImageDocument`` objects in page order. The
        list is empty when the PDF cannot be opened. Errors on individual pages
        or images are printed and skipped.
    """
    all_documents = []
    page_image_counts = {}
    total_images_processed = 0
    pdf_basename = os.path.basename(pdf_path)
    try:
        doc = pymupdf.open(pdf_path)
    except Exception as e:
        print(f"!!! ERROR opening PDF {pdf_basename}: {type(e).__name__}: {e}. Skipping.")
        return all_documents

    # try/finally guarantees the document is closed even if iterating the pages raises.
    try:
        print(f"  Processing {doc.page_count} pages in {pdf_basename}...")
        for page_num, page in enumerate(doc):
            page_id = f"{pdf_basename}_page_{page_num+1}"
            shared_metadata = {"source_file": pdf_basename, "page_number": page_num + 1}
            images_on_page = 0

            # Text Extraction
            try:
                page_text = page.get_text(sort=True)
                if page_text.strip():
                    all_documents.append(
                        Document(text=page_text, id_=f"{page_id}_text", metadata=shared_metadata.copy())
                    )
            except Exception as e:
                print(f"    Page {page_num+1}: Error extracting text: {e}")

            # Image Extraction
            try:
                image_list = page.get_images(full=True)
                for img_index, img_info in enumerate(image_list):
                    xref = img_info[0]
                    if xref == 0:
                        continue
                    try:
                        base_image = doc.extract_image(xref)
                        if base_image and base_image.get("image"):
                            image_bytes_from_pdf = base_image["image"]
                            try:
                                # Only keep images that PIL can open and verify.
                                img_io = io.BytesIO(image_bytes_from_pdf)
                                img_pil = Image.open(img_io)
                                img_pil.verify()
                                # NOTE(review): raw image bytes are passed as `image`; confirm
                                # that ImageDocument accepts bytes rather than a base64 string.
                                all_documents.append(
                                    ImageDocument(
                                        image=image_bytes_from_pdf,
                                        id_=f"{page_id}_img_{img_index}",
                                        metadata=shared_metadata.copy(),
                                    )
                                )
                                images_on_page += 1
                                total_images_processed += 1
                            except Exception as img_err:
                                print(f"    ! Page {page_num+1}, Img {img_index} (xref {xref}): ERROR validating: {type(img_err).__name__}: {img_err}. Skipping.")
                    except Exception as extract_err:
                        print(f"    ! Page {page_num+1}, Img {img_index} (xref {xref}): ERROR calling extract_image: {type(extract_err).__name__}: {extract_err}. Skipping.")
            except Exception as e:
                print(f"    ! Page {page_num+1}: ERROR during image loop: {type(e).__name__}: {e}")

            if images_on_page > 0:
                page_image_counts[page_num + 1] = images_on_page
    finally:
        doc.close()

    print(f"  ---> Finished {pdf_basename}. Validated {total_images_processed} images. Pages: {list(page_image_counts.keys()) if page_image_counts else 'None'}")
    return all_documents

# (Keep prepare_multimodal_context_and_images the same as your last working version)
async def prepare_multimodal_context_and_images(query: str, query_engine):
    """Retrieves nodes for ``query`` and splits them into a text context and image nodes.

    Uses ``query_engine.retriever.aretrieve(query)``. Every retrieved node adds
    one delimited section to the context string; image nodes are additionally
    collected in the returned list.

    Returns:
        ``(text_context, image_nodes)``. When nothing is retrieved the result is
        ``("No relevant context found.", [])``; when retrieval raises, the error
        is printed with its traceback and ``("Error retrieving context.", [])``
        is returned.
    """
    image_nodes = []
    context_text = ""
    try:
        node_retriever = query_engine.retriever
        top_k_val = getattr(node_retriever, 'similarity_top_k', 'Default')
        print(f"--- Retrieving top {top_k_val} nodes for query: '{query[:100]}...' ---")
        hits = await node_retriever.aretrieve(query)
        print(f"--- Retrieved {len(hits)} nodes total ---")
        if not hits:
            print("Warning: No documents or images retrieved for the query.")
            return "No relevant context found.", []

        sections = []
        for hit in hits:
            node = hit.node
            score_str = "N/A" if hit.score is None else f"{hit.score:.4f}"
            source_file = node.metadata.get('source_file', 'Unknown')
            page_number = node.metadata.get('page_number', 'N/A')
            source_info = f"Source: {source_file}, Page: {page_number}, Score: {score_str}"

            is_image_node = isinstance(node, (ImageDocument, LlamaIndexImageNode))
            if not is_image_node:
                # Text node: include a short preview of its content.
                try:
                    node_content = node.get_content(metadata_mode='all')[:200] + "..."
                except Exception:
                    node_content = "[Could not get node content]..."
                sections.append(f"--- Retrieved Document ---\n[{node_content}]\n[{source_info}]\n--- End Document ---")
                print(f"    Found TextNode: {node.id_} (Score: {score_str})")
                continue

            # Image node: keep the node itself and leave a marker in the text context.
            image_nodes.append(node)
            sections.append(f"--- Retrieved Image ---\n[{source_info}]\n--- End Image ---")
            print(f"    Found ImageNode: {node.id_} (Score: {score_str})")

        context_text = "\n\n".join(sections).strip()
    except Exception as e:
        print(f"Error during retrieval: {e}")
        traceback.print_exc()
        return "Error retrieving context.", []

    print(f"---> Retrieved {len(image_nodes)} images for context.")
    return context_text, image_nodes


# *************************************************************************
# *** MODIFIED generate_seed_alphas to de-emphasize context slightly ***
# *************************************************************************
async def generate_seed_alphas(
    query: str,
    query_engine,
    llm_instance,
    send_images_to_llm: bool = False # *** ADDED FLAG to control image sending ***
    ) -> Tuple[Dict, List]:
    """
    Generate seed alphas with an Ollama-backed LLM using retrieved text/image context.

    Steps: retrieve context for `query`, build the instruction prompt, optionally
    attach the retrieved images (base64 encoded), call the underlying Ollama chat
    client, then extract and validate the JSON object found in the reply.

    Args:
        query: Topic the alphas should relate to; also used for retrieval.
        query_engine: Passed through to prepare_multimodal_context_and_images().
        llm_instance: Object exposing `.model` and an Ollama client as `_aclient`
            (async, preferred) or `_client` (sync, run in a worker thread).
        send_images_to_llm: When True, encoded images are attached to the user
            message; when False they are retrieved and encoded but not sent.

    Returns:
        (result, images): `result` is a dict with an "alphas" list (empty when the
        call fails or the reply cannot be parsed/validated); `images` is the list
        of image documents returned by the retrieval step.

    Errors from the LLM call and from JSON parsing are printed, not raised.
    """
    parsed_data_result = {"alphas": []}

    # 1. Get context string and LlamaIndex ImageDocument objects
    context_str, retrieved_llamaindex_image_docs = await prepare_multimodal_context_and_images(query, query_engine)
    retrieved_images_for_return = retrieved_llamaindex_image_docs

    # 2. Construct the LLM Prompt string
    # NOTE: this is an f-string, so literal JSON braces are doubled ({{ }}).
    prompt_content = f"""Based on the following context documents, generate *high-quality, independent and unique seed alphas* related to: {query}.

            Context Documents (Use these for inspiration or validation, but prioritize the core instructions and examples below):
            ```
            {context_str if context_str else "No text context retrieved."}
            ```

            Instructions:
            1. Categorize the alphas into one of these financial domains: Momentum, Mean Reversion, Volatility, Fundamental, Liquidity, Quality, Growth, Technical, Macro Economics.
            2. Provide a descriptive 'name' for each alpha.
            3. Provide the calculation 'code' (formula) for each alpha.
            4. Focus on alphas factors suitable for *daily stock market data in Hong Kong*.
            5. The 'code' (formula) *MUST ONLY* use 'OPEN', 'LOW', 'HIGH', 'CLOSE', 'VOLUME', standard arithmetic/logical operators, or functions commonly found in libraries like TA-Lib (e.g., SMA, EMA, RSI, ATR, STD, MIN, MAX, DELAY, MEAN). Assume DELAY(X, n) means value of X n periods ago. Use standard function names.
            6. Ensure the 'name' of each alpha factor accurately reflects its 'code' (formula).
            7. Verify the correctness of each generated alpha factor's formula logic. Avoid trivial or redundant alphas.
            8. **Crucially, rely primarily on the examples below and the general task description to generate diverse alphas. Treat the 'Context Documents' above as supplementary.**
            9. **YOU MUST NOT GENERATE ANY PYTHON CODES.**

            Output Format:
            Return the result as a *single, valid JSON object (dictionary)*. The JSON object *must* strictly adhere to the following structure, with no extra text, comments, or explanations before or after the JSON block:

            ```json
            {{
            "alphas": [
                {{
                "domain": "Example Domain",
                "name": "Example Alpha Name",
                "code": "Example Formula (e.g., CLOSE - DELAY(CLOSE, 1))"
                }},
                // ... more alpha objects can follow here
            ]
            }}
            ```

            Example Alphas (Illustrative - Generate new ones based on context and query):
            
            ```json
            {{
            "alphas": [
                {{
                "domain": "Momentum",
                "name": "Price Momentum (10 days)",
                "code": "((CLOSE - DELAY(CLOSE, 10)) / DELAY(CLOSE, 10))"
                }},
                {{
                "domain": "Momentum",
                "name": "Moving Average Crossover (10 vs 50 day)",
                "code": "(SMA(CLOSE, 10) - SMA(CLOSE, 50))"
                }},
                {{
                "domain": "Momentum",
                "name": "Volume Momentum",
                "code": "(VOLUME - DELAY(VOLUME, 50))"
                }},
                {{
                "domain": "Mean Reversion",
                "name": "Mean Reversion (20 days)",
                "code": "(MEAN(CLOSE, 20) - CLOSE)"
                }},
                {{
                "domain": "Mean Reversion",
                "name": "Moving Average Reversion",
                "code": "(SMA(CLOSE, 20) - CLOSE)"
                }},
                {{
                "domain": "Mean Reversion",
                "name": "Stochastic Oscillator (%K, 14-day)",
                "code": "(CLOSE - MIN(LOW, 14)) / (MAX(HIGH, 14) - MIN(LOW, 14))"
                }},
                {{
                "domain": "Volatility",
                "name": "Average True Range (ATR, 14-day)",
                "code": "(ATR(14))"
                }},
                {{
                "domain": "Volatility",
                "name": "Daily High-Low Range",
                "code": "(HIGH - LOW)"
                }},
                {{
                "domain": "Volatility",
                "name": "Normalized Bollinger Band Width (20-day, 2 StdDev)",
                "code": "((SMA(CLOSE, 20) + 2 * STD(CLOSE, 20)) - (SMA(CLOSE, 20) - 2 * STD(CLOSE, 20))) / SMA(CLOSE, 20)"
                }},
                {{
                "domain": "Liquidity",
                "name": "Volume Rate of Change (VROC, 10-day)",
                "code": "((VOLUME / DELAY(VOLUME, 10)) - 1)"
                }},
                {{
                "domain": "Liquidity",
                "name": "Trading Volume",
                "code": "VOLUME"
                }},
                {{
                "domain": "Technical",
                "name": "Moving Average (MA)",
                "code": "SMA(CLOSE, 20)"
                }},
                {{
                "domain": "Technical",
                "name": "Exponential Moving Average (EMA)",
                "code": "EMA(CLOSE, 20)"
                }},
                {{
                "domain": "Technical",
                "name": "Relative Strength Index (RSI)",
                "code": "RSI(14)"
                }},
                // ... more examples
            ]
            }}
            ```
            Ensure the entire response is only the valid JSON object shown above. If no relevant alphas can be generated, return an empty list within the JSON: {{"alphas": []}}. Ensure all keys and string values within the JSON are enclosed in double quotes. Do not add any introductory text like "Here is the JSON..." before the opening brace {{.
            """

    # 3. Encode retrieved images to base64 so they can optionally be attached
    base64_images = [] # Images encoded, ready to be potentially sent
    if retrieved_llamaindex_image_docs:
        print("--- Encoding retrieved images to base64... ---")
        for img_doc in retrieved_llamaindex_image_docs:
            img_data_from_doc = img_doc.image
            if isinstance(img_data_from_doc, bytes):
                img_bytes_to_encode = img_data_from_doc
            elif isinstance(img_data_from_doc, str):
                try:
                    # A short string naming an existing file is a path, not base64 data: skip it.
                    if len(img_data_from_doc) < 260 and os.path.exists(img_data_from_doc):
                        continue
                    img_bytes_to_encode = base64.b64decode(img_data_from_doc)
                except Exception:
                    continue # Skip decode errors
            else:
                continue # Skip other types

            if not img_bytes_to_encode:
                continue

            try:
                pil_img = Image.open(io.BytesIO(img_bytes_to_encode))
                # `format` is the file format ("PNG", "JPEG", ...); `mode` is the pixel
                # layout ("RGB", "RGBA", ...) and can never equal "PNG".
                img_format = "PNG" if pil_img.format == "PNG" else "JPEG"
                if img_format == "JPEG" and pil_img.mode not in ("RGB", "L", "CMYK"):
                    # JPEG cannot store alpha/palette modes; normalise before encoding.
                    pil_img = pil_img.convert("RGB")
                b64_string = pil_image_to_base64(pil_img, format=img_format)
                base64_images.append(b64_string) # Add to list of *potentially* sent images
            except Exception as encode_err:
                print(f"    Warning: Failed Pillow/base64 encoding for {img_doc.id_}: {encode_err}")
        print(f"--- Encoded {len(base64_images)} images successfully. ---")

    # 4. Build the single user message, attaching images only when requested
    user_message_dict = { "role": "user", "content": prompt_content }
    images_actually_sent = []
    if base64_images and send_images_to_llm:
        print(f"Attaching {len(base64_images)} base64 encoded images to the request (send_images_to_llm=True).")
        user_message_dict["images"] = base64_images
        images_actually_sent = base64_images # Track what was sent
    elif base64_images:
        # The 'images' key is deliberately left out of the message.
        print("INFO: Images were retrieved and encoded, but configured NOT to be sent to the LLM (send_images_to_llm=False).")
    ollama_messages = [user_message_dict]

    # 5. Query the LLM using the underlying client and parse the response
    print("--- Sending Prompt via underlying ollama client ---")
    print(f"Prompt length (approx chars): {len(prompt_content)}")
    print(f"Number of base64 images being sent: {len(images_actually_sent)}") # Log count *actually* sent
    print("---------------------------------------------------------------------------")

    try:
        chat_kwargs = {
            "model": llm_instance.model,
            "messages": ollama_messages,
            "options": { "temperature": LLM_TEMPERATURE, "num_predict": LLM_MAX_NEW_TOKENS },
        }
        if hasattr(llm_instance, '_aclient'):
            response_obj = await llm_instance._aclient.chat(**chat_kwargs)
        elif hasattr(llm_instance, '_client'):
            # Sync-only client: run the blocking call in a worker thread so the
            # event loop is not stalled. Assumes `_client.chat` accepts the same
            # keyword arguments as the async client -- TODO confirm.
            import asyncio
            import functools
            running_loop = asyncio.get_running_loop()
            response_obj = await running_loop.run_in_executor(
                None, functools.partial(llm_instance._client.chat, **chat_kwargs)
            )
        else:
            raise AttributeError("LLM instance missing compatible Ollama client.")

        # Expect an object exposing `.message.content` as a string
        if response_obj and hasattr(response_obj, 'message') and response_obj.message and \
           hasattr(response_obj.message, 'content') and isinstance(response_obj.message.content, str):
            completion_text = response_obj.message.content.strip()
            print("Successfully extracted content using attribute access.")
        else:
            print(f"Warning: Unexpected response structure: {response_obj}")
            return parsed_data_result, retrieved_images_for_return

        print(f"--- Raw LLM Response ---")
        print(completion_text)
        print("------------------------")

        # Locate the JSON payload: fenced block first, then a bare object,
        # then the widest {...} span as a last resort.
        json_string = None
        json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', completion_text, re.DOTALL | re.IGNORECASE)
        if json_match:
            json_string = json_match.group(1)
            print("Extracted JSON via markdown.")
        elif completion_text.startswith('{') and completion_text.endswith('}'):
            json_string = completion_text
            print("Assuming raw response is JSON.")
        else: # Loose extraction
            json_start = completion_text.find('{')
            json_end = completion_text.rfind('}')
            if json_start != -1 and json_end > json_start:
                json_string = completion_text[json_start:json_end+1]
                print("Warning: Using loose JSON extraction.")
            else:
                print("ERROR: No JSON block found.")
                return parsed_data_result, retrieved_images_for_return

        if json_string:
            try:
                parsed_data = json.loads(json_string)
                if isinstance(parsed_data, dict) and isinstance(parsed_data.get("alphas"), list):
                    print("Successfully parsed JSON and validated structure.")
                    parsed_data_result = parsed_data
                else:
                    print("Error: Parsed JSON lacks expected structure.")
            except json.JSONDecodeError as json_err:
                print(f"Error decoding JSON: {json_err}\nString: {json_string}")
            except Exception as parse_error:
                print(f"Error parsing JSON: {parse_error}\nString: {json_string}")

        return parsed_data_result, retrieved_images_for_return

    except AttributeError as attr_err:
        print(f"Attribute Error during LLM call: {attr_err}")
        traceback.print_exc()
        return parsed_data_result, retrieved_images_for_return
    except Exception as llm_error:
        print(f"Error during Ollama call: {llm_error}")
        traceback.print_exc()
        return parsed_data_result, retrieved_images_for_return


# --- Main Function ---
async def main():
    """
    Load (or build and persist) the multimodal index, then generate seed alphas.

    Returns the dict produced by generate_seed_alphas() (an {"alphas": []} default
    if that call raises), or None when the PDF directory is missing, no index can
    be loaded or built, or no query engine is available.
    """
    data_dir = "arxiv_pdfs"
    persist_dir = "storage_multimodal"
    # Kept small on purpose to limit how much context is retrieved
    similarity_top_k_setting = 3

    # --- Environment Setup & PDF Check ---
    if not os.path.exists(data_dir):
        print(f"Error: PDF dir '{data_dir}' not found.")
        return None
    pdf_files = glob.glob(os.path.join(data_dir, "*.pdf"))
    print(f"Found {len(pdf_files)} PDF files in '{data_dir}'.")

    index = None
    query_engine = None
    llm_to_use = multi_modal_llm

    def _clock():
        """Current event-loop time, used only for the elapsed-time log lines."""
        return asyncio.get_event_loop().time()

    def _make_query_engine(built_index):
        """Create the query engine for an index and log its retriever top_k."""
        engine = built_index.as_query_engine(
            llm=llm_to_use,
            image_embed_model=image_embed_model,
            similarity_top_k=similarity_top_k_setting,
        )
        print(f"Query Engine created. Retriever top_k={engine.retriever.similarity_top_k}")
        return engine

    # --- Load or Create MULTIMODAL Index ---
    try:
        # First choice: reuse the persisted index
        if not os.path.exists(persist_dir):
            raise FileNotFoundError("Persistence dir not found.")
        print(f"Attempting to load index from '{persist_dir}'...")
        t_start_load = _clock()
        storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
        index = load_index_from_storage(storage_context, image_embed_model=image_embed_model)
        print(f"Index loaded successfully from storage. (Took {_clock() - t_start_load:.2f}s)")
        query_engine = _make_query_engine(index)
    except Exception as load_error:
        # Any failure above falls through to building the index from the PDFs
        print(f"Info: Failed to load index ({type(load_error).__name__}).")
        if not pdf_files:
            print("Error: No PDFs found and cannot load index.")
            return None
        print(f"Creating new index from '{data_dir}'...")
        try:
            print("--- Processing PDFs (with enhanced logging)... ---")
            t_start_process = _clock()
            all_documents = []
            # Consider using asyncio.gather for parallel processing if many PDFs
            for pdf_path in pdf_files:
                all_documents.extend(process_pdf_for_multimodal(pdf_path))
            print(f"--- Finished processing PDFs. (Took {_clock() - t_start_process:.2f}s) ---")
            if not all_documents:
                print("Error: No documents processed.")
                return None

            # ImageDocument instances are counted as images; every other Document is counted as text
            img_count = len([doc for doc in all_documents if isinstance(doc, ImageDocument)])
            text_count = len([
                doc for doc in all_documents
                if isinstance(doc, Document) and not isinstance(doc, ImageDocument)
            ])
            print(f"Processed {len(all_documents)} total objects ({text_count} text, {img_count} image).")
            if img_count == 0:
                print("Warning: No images processed.")

            print("Building Multimodal Index (this may take time)...")
            t_start_build = _clock()
            storage_context = StorageContext.from_defaults()
            index = MultiModalVectorStoreIndex.from_documents(
                all_documents,
                storage_context=storage_context,
                image_embed_model=image_embed_model,
                show_progress=True,
            )
            print(f"Index created successfully. (Took {_clock() - t_start_build:.2f}s)")

            print(f"Persisting index to '{persist_dir}'...")
            t_start_persist = _clock()
            os.makedirs(persist_dir, exist_ok=True)
            index.storage_context.persist(persist_dir=persist_dir)
            print(f"Index persisted. (Took {_clock() - t_start_persist:.2f}s)")

            query_engine = _make_query_engine(index)
        except Exception as create_error:
            print(f"Fatal Error creating index: {create_error}")
            traceback.print_exc()
            return None

    # --- Run the Generation ---
    final_result_dict = {"alphas": []}
    if not query_engine:
        print("Error: Query engine not initialized.")
        return None

    # True  -> attach retrieved images to the LLM request (multimodal model)
    # False -> text context only (text-only model, or to force reliance on text)
    SEND_IMAGES = True
    print(f"\nConfiguration: Will send images to LLM: {SEND_IMAGES}")

    query = "Find all documents (with texts, images, charts or graphs), related to the generation of alpha factors."
    print(f"\n--- Generating Result for Query: '{query}' ---")
    try:
        final_result_dict, retrieved_images_list = await generate_seed_alphas(
            query, query_engine, llm_to_use, send_images_to_llm=SEND_IMAGES
        )
        print(f"--- Alpha generation call completed. Retrieved {len(retrieved_images_list)} image node(s) during the process. ---")
    except Exception as gen_err:
        # final_result_dict keeps its default empty "alphas" list
        print(f"Error during alpha generation process: {gen_err}")
        traceback.print_exc()

    return final_result_dict


# --- Script Entry Point ---
if __name__ == "__main__":
    print("Starting multimodal script...")
    final_json_result = None
    try:
        loop = asyncio.get_running_loop(); print("Event loop already running..."); final_json_result = await main()
    except RuntimeError: print("No event loop running..."); final_json_result = asyncio.run(main())
    except Exception as main_err: print(f"\n!!! Error during main execution: {main_err} !!!"); traceback.print_exc()

    # --- Result Handling ---
    print("\n--- Final Result ---")
    # (Keep result printing logic the same)
    if final_json_result and isinstance(final_json_result, dict) and 'alphas' in final_json_result:
        print("Generated Alphas (JSON Output):")
        try: print(json.dumps(final_json_result, indent=2))
        except Exception as json_dump_err: print(f"Error formatting JSON: {json_dump_err}\nRaw: {final_json_result}")
        print("--------------------")
    else:
        print("Script finished, but final result was not expected alpha dictionary.")
        if final_json_result is not None: print(f"Final result type: {type(final_json_result)}, value: {final_json_result}")
        else: print("Final result was None (likely due to error).")
        print("Check logs above.")

    print("\nScript finished.")

2025-07-15 20:48:31,217 - INFO - Load pretrained SentenceTransformer: BAAI/bge-base-en-v1.5


Setting up global settings...


2025-07-15 20:48:46,241 - INFO - 2 prompts are loaded, with the keys: ['query', 'text']


Embedding models set.
Starting multimodal script...
Event loop already running...
Found 120 PDF files in 'arxiv_pdfs'.
Attempting to load index from 'storage_multimodal'...


2025-07-15 20:48:52,008 - INFO - Loading all indices.


Index loaded successfully from storage. (Took 4.15s)
Query Engine created. Retriever top_k=3

Configuration: Will send images to LLM: True

--- Generating Result for Query: 'Find all documents (with texts, images, charts or graphs), related to the generation of alpha factors.' ---
--- Retrieving top 3 nodes for query: 'Find all documents (with texts, images, charts or graphs), related to the generation of alpha factor...' ---
--- Retrieved 5 nodes total ---
    Found TextNode: c44a71a0-eb5d-48e1-bf72-185716de7526 (Score: 0.6640)
    Found TextNode: 21064109-3ad5-4020-8b09-829a1b1dc383 (Score: 0.6489)
    Found TextNode: ec116cd3-55ed-4e1c-8752-5f01c2ed2f17 (Score: 0.6340)
    Found ImageNode: 9da09395-8ce9-4b41-abb6-0c884f7fb965 (Score: 0.2907)
    Found ImageNode: 0693c029-bd50-45f2-b002-ca7f3fd9a165 (Score: 0.2893)
---> Retrieved 2 images for context.
--- Encoding retrieved images to base64... ---
--- Encoded 2 images successfully. ---
Attaching 2 base64 encoded images to the request

2025-07-15 20:49:04,334 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"


Successfully extracted content using attribute access.
--- Raw LLM Response ---
Here's a list of unique seed alphas related to generating alpha factors for daily stock market data in Hong Kong:

```
{
  "alphas": [
    {
      "domain": "Momentum",
      "name": "Price Momentum (10 days)",
      "code": "((CLOSE - DELAY(CLOSE, 10)) / DELAY(CLOSE, 10))"
    },
    {
      "domain": "Momentum",
      "name": "Moving Average Crossover (10 vs 50 day)",
      "code": "(SMA(CLOSE, 10) - SMA(CLOSE, 50))"
    },
    {
      "domain": "Momentum",
      "name": "Volume Momentum",
      "code": "VOLUME - DELAY(VOLUME, 50)"
    },
    {
      "domain": "Mean Reversion",
      "name": "Mean Reversion (20 days)",
      "code": "(MEAN(CLOSE, 20) - CLOSE)"
    },
    {
      "domain": "Mean Reversion",
      "name": "Moving Average Reversion",
      "code": "(SMA(CLOSE, 20) - CLOSE)"
    },
    {
      "domain": "Mean Reversion",
      "name": "Stochastic Oscillator (%K, 14-day)",
      "code": "(CLOS

# Load the LLM results into a DataFrame, then export it to a CSV file

In [32]:
# Sanity check: confirm the generation step produced a dict before building a DataFrame from it
print(type(final_json_result))

<class 'dict'>


In [36]:
import pandas as pd

# main() returns None when setup fails, so guard before calling .get() on the
# result; fall back to an empty list so the cell still yields a DataFrame.
alphas = final_json_result.get("alphas", []) if isinstance(final_json_result, dict) else []

# One row per alpha dict (the prompt asks for 'domain', 'name' and 'code' keys)
original_df_new = pd.DataFrame(alphas)
original_dfs = [original_df_new]

# Always bind original_combined_df; previously it was assigned only inside an `if`
original_combined_df = pd.concat(original_dfs, ignore_index=True)

# Show every row and the full formula text when the frame is displayed
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

original_combined_df

Unnamed: 0,domain,name,code
0,Momentum,Price Momentum (10 days),"((CLOSE - DELAY(CLOSE, 10)) / DELAY(CLOSE, 10))"
1,Momentum,Moving Average Crossover (10 vs 50 day),"(SMA(CLOSE, 10) - SMA(CLOSE, 50))"
2,Momentum,Volume Momentum,"VOLUME - DELAY(VOLUME, 50)"
3,Mean Reversion,Mean Reversion (20 days),"(MEAN(CLOSE, 20) - CLOSE)"
4,Mean Reversion,Moving Average Reversion,"(SMA(CLOSE, 20) - CLOSE)"
5,Mean Reversion,"Stochastic Oscillator (%K, 14-day)","(CLOSE - MIN(LOW, 14)) / (MAX(HIGH, 14) - MIN(LOW, 14))"
6,Volatility,"Average True Range (ATR, 14-day)",(ATR(14))
7,Volatility,Daily High-Low Range,(HIGH - LOW)
8,Volatility,"Normalized Bollinger Band Width (20-day, 2 StdDev)","((SMA(CLOSE, 20) + 2 * STD(CLOSE, 20)) - (SMA(CLOSE, 20) - 2 * STD(CLOSE, 20))) / SMA(CLOSE, 20)"
9,Liquidity,"Volume Rate of Change (VROC, 10-day)","((VOLUME / DELAY(VOLUME, 10)) - 1)"


In [None]:
# (Optional add-on) Handle the case where formulas in the 'code' column are wrapped in an extra pair of outer quotes.
import pandas as pd
import re  # Import the regular expression library

def remove_outer_quotes(code_string):
    """Strip one pair of enclosing double quotes from a string, if present.

    Args:
        code_string: Value to clean. Non-string values (None, NaN, numbers)
            are returned unchanged instead of raising AttributeError.

    Returns:
        The string without its first and last character when it both starts
        and ends with '"' and is at least two characters long; otherwise the
        input unchanged. A lone '"' is left as is rather than reduced to ''.
    """
    if not isinstance(code_string, str):
        return code_string
    if len(code_string) >= 2 and code_string.startswith('"') and code_string.endswith('"'):
        return code_string[1:-1]
    return code_string

# Clean each formula in the 'code' column by stripping any wrapping quotes
original_combined_df['code'] = original_combined_df['code'].map(remove_outer_quotes)

# Show the DataFrame after the clean-up
original_combined_df

In [37]:
# Display the combined alpha table as the cell output
original_combined_df

Unnamed: 0,domain,name,code
0,Momentum,Price Momentum (10 days),"((CLOSE - DELAY(CLOSE, 10)) / DELAY(CLOSE, 10))"
1,Momentum,Moving Average Crossover (10 vs 50 day),"(SMA(CLOSE, 10) - SMA(CLOSE, 50))"
2,Momentum,Volume Momentum,"VOLUME - DELAY(VOLUME, 50)"
3,Mean Reversion,Mean Reversion (20 days),"(MEAN(CLOSE, 20) - CLOSE)"
4,Mean Reversion,Moving Average Reversion,"(SMA(CLOSE, 20) - CLOSE)"
5,Mean Reversion,"Stochastic Oscillator (%K, 14-day)","(CLOSE - MIN(LOW, 14)) / (MAX(HIGH, 14) - MIN(LOW, 14))"
6,Volatility,"Average True Range (ATR, 14-day)",(ATR(14))
7,Volatility,Daily High-Low Range,(HIGH - LOW)
8,Volatility,"Normalized Bollinger Band Width (20-day, 2 StdDev)","((SMA(CLOSE, 20) + 2 * STD(CLOSE, 20)) - (SMA(CLOSE, 20) - 2 * STD(CLOSE, 20))) / SMA(CLOSE, 20)"
9,Liquidity,"Volume Rate of Change (VROC, 10-day)","((VOLUME / DELAY(VOLUME, 10)) - 1)"


In [38]:
# Write the alpha table to CSV; index=False keeps the row index out of the file
original_combined_df.to_csv("HK_final_comprehensive_result_4.csv", index=False)