# Data Acquisition (Research Papers)

In [7]:
!pip install requests xmltodict



In [8]:
!pip install requests PyPDF2 nltk spacy gensim textblob



In [9]:
import requests
import json
import time
import os
import xmltodict
import PyPDF2
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
from spacy.lang.en import English
from gensim import corpora, models
import sqlite3
from textblob import TextBlob  # Import TextBlob at the top level

# Tokenizer and stop-word resources needed by word_tokenize() and stopwords.words().
nltk.download('punkt', quiet=True)
# Newer NLTK releases load the Punkt tokenizer from 'punkt_tab' rather than 'punkt';
# requesting both keeps word_tokenize() working regardless of the installed version.
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

# Load spaCy model ONCE, at the top level:
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    import subprocess
    import sys

    print("Downloading spaCy model 'en_core_web_sm'...")
    # Install with the interpreter that runs this kernel (a bare "python" on PATH may be
    # a different environment) and fail loudly if the download itself fails.
    subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)
    nlp = spacy.load("en_core_web_sm")

In [10]:
# Free-text queries sent to arXiv, one search per entry.
# Fix: 'artifical intelligence trading' was misspelled, which weakens that search.
search_keywords = [
    "alpha factors",
    'alpha generation',
    'alpha mining',
    'factor investing',
    'formulaic alphas',
    'alpha portfolio construction',
    'stock price prediction',
    'artificial intelligence trading',
    'AI stock selection',
    'large language models finance',
    'reinforcement learning portfolio',
    'portfolio optimization',
    'trading strategies',
    'algorithmic trading',
    'quantitative trading',
    'mixture of experts trading',
]
len(search_keywords)

16

In [11]:
import requests
import json
import time
import os
import xmltodict
import PyPDF2  # For PDF processing (install: pip install PyPDF2)
import io # For in-memory file handling

ARXIV_API_URL = "http://export.arxiv.org/api/query"

def search_arxiv_papers(query, max_results=50, retry_count=3):
    """Searches arXiv, downloads each hit's PDF and enriches it with NLP features.

    Args:
        query: Value sent as the arXiv ``search_query`` parameter.
        max_results: Maximum number of entries requested from the API.
        retry_count: Number of attempts made to fetch and parse the feed.

    Returns:
        A list of paper dicts as produced by ``parse_arxiv_entry``, each extended
        with ``pdf_path`` and, when text extraction succeeded, ``full_text``,
        ``keywords``, ``summary`` and ``sentiment``. Returns ``[]`` when no valid
        response could be obtained.
    """
    params = {
        "search_query": query,
        "start": 0,
        "max_results": max_results,
        "id_list": ""
    }

    # Phase 1: fetch + parse the Atom feed, retrying with exponential backoff.
    # Network failures are now retried too (previously only XML errors were).
    entries = None
    for attempt in range(retry_count):
        try:
            response = requests.get(ARXIV_API_URL, params=params, timeout=30)
            response.raise_for_status()
            feed = xmltodict.parse(response.content).get('feed') or {}
            entries = feed.get('entry') or []
            break
        except requests.exceptions.RequestException as e:
            print(f"Error searching arXiv: {e}")
        except (xmltodict.expat.ExpatError, KeyError) as e:
            print(f"Error parsing XML response: {e}. Retrying...")
        if attempt < retry_count - 1:
            time.sleep(2**attempt)

    if entries is None:
        print("Error: Failed to get valid response after multiple retries.")
        return []

    # xmltodict yields a bare dict (not a list) when the feed holds exactly one
    # <entry>; iterating that dict would walk its keys instead of the entry.
    if isinstance(entries, dict):
        entries = [entries]

    # Phase 2: download and process each paper. This runs outside the retry loop
    # so a processing error can no longer trigger a re-download of every PDF.
    results = []
    for entry in entries:
        paper = parse_arxiv_entry(entry)

        if paper.get('id'):
            pdf_url = f"https://arxiv.org/pdf/{paper['id']}.pdf"
            paper['pdf_path'] = download_pdf(pdf_url, paper['id'])
        else:
            # Without an id there is no PDF URL to build.
            paper['pdf_path'] = None

        if paper['pdf_path']:
            paper['full_text'] = extract_text_from_pdf(paper['pdf_path'])

        if paper.get('full_text'):
            paper['keywords'] = extract_keywords(paper['full_text'], method="spacy")
            paper['summary'] = summarize_text(paper['full_text'])
            paper['sentiment'] = analyze_sentiment(paper['full_text'])

            print(f"Keywords: {paper['keywords']}")
            print(f"Summary: {paper['summary']}")
            print(f"Sentiment: {paper['sentiment']}")

            store_in_db(paper)  # Persist the enriched record

        results.append(paper)

    return results

def download_pdf(pdf_url, paper_id):
    """Downloads a PDF given its URL and saves it under ``arxiv_pdfs/``.

    Args:
        pdf_url: URL of the PDF to fetch.
        paper_id: Identifier used as the file name (``<paper_id>.pdf``).

    Returns:
        The local file path on success, or ``None`` if the download or the
        write failed.
    """
    pdf_dir = "arxiv_pdfs"
    file_path = os.path.join(pdf_dir, f"{paper_id}.pdf")
    started_writing = False

    try:
        # Create directory if doesn't exist
        os.makedirs(pdf_dir, exist_ok=True)

        # The context manager releases the streamed connection even on error;
        # the timeout keeps a stalled server from hanging the notebook.
        with requests.get(pdf_url, stream=True, timeout=60) as response:
            response.raise_for_status()

            with open(file_path, "wb") as f:
                started_writing = True
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

        print(f"Downloaded PDF: {file_path}")
        return file_path

    except (requests.exceptions.RequestException, OSError) as e:
        print(f"Error downloading PDF: {e}")
        # Remove a truncated file so later steps never parse a partial PDF.
        # Only done once writing began, so an earlier good copy is not deleted
        # when the request itself fails.
        if started_writing and os.path.exists(file_path):
            try:
                os.remove(file_path)
            except OSError:
                pass
        return None

def extract_text_from_pdf(pdf_path):
    """Extracts the concatenated text of every page of a PDF.

    Args:
        pdf_path: Path to the PDF file on disk.

    Returns:
        The text of all pages joined in page order, or ``None`` when the file
        is missing or PyPDF2 cannot read it.
    """
    try:
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            # `or ""` guards against pages that yield no text (None), which
            # would otherwise raise TypeError during concatenation.
            return "".join(page.extract_text() or "" for page in reader.pages)
    except (FileNotFoundError, PyPDF2.errors.PdfReadError) as e:
        print(f"Error extracting text from PDF: {e}")
        return None


def _as_list(value):
    """Normalizes an xmltodict child (missing, single item, or list) to a list."""
    if value is None:
        return []
    if isinstance(value, list):
        return value
    return [value]


def parse_arxiv_entry(entry):
    """Parses an individual arXiv entry (xmltodict dict) into a flat dictionary.

    Args:
        entry: Mapping for one Atom ``<entry>`` element.

    Returns:
        Dict with keys ``id``, ``title``, ``abstract``, ``authors``,
        ``categories``, ``journal-ref``, ``doi`` and ``submitted``.
    """
    paper = {}
    raw_id = entry.get('id')
    paper['id'] = raw_id.split('/')[-1] if raw_id else None  # Last URL segment
    paper['title'] = (entry.get('title') or '').replace('\n', ' ')
    paper['abstract'] = (entry.get('summary') or '').replace('\n', ' ')
    # xmltodict returns a dict (not a list) for a single <author>/<category>;
    # _as_list keeps single-author and single-category papers from being lost.
    paper['authors'] = [
        author.get('name')
        for author in _as_list(entry.get('author'))
        if isinstance(author, dict)
    ]
    # XML attributes are exposed with an '@' prefix, so the category code is
    # under '@term'; plain 'term' is kept as a fallback.
    paper['categories'] = [
        cat.get('@term', cat.get('term'))
        for cat in _as_list(entry.get('category'))
        if isinstance(cat, dict)
    ]
    # arXiv extension elements keep their namespace prefix in the parsed keys.
    paper['journal-ref'] = entry.get('arxiv:journal_ref', entry.get('journal-ref'))
    paper['doi'] = entry.get('arxiv:doi', entry.get('doi'))
    paper['submitted'] = entry.get('published')  # Use published for submission date

    return paper

def extract_keywords(text, method="nltk"):
    """Returns up to 20 keyword candidates from ``text``.

    Args:
        text: Input text; ``None`` yields an empty list.
        method: ``"nltk"`` keeps alphanumeric non-stop-word tokens;
            ``"spacy"`` keeps lemmas of alphabetic, non-stop nouns, proper
            nouns and adjectives. Any other value yields an empty list.

    Returns:
        The first 20 matching items in document order.
    """
    if text is None:
        return []

    lowered = text.lower()  # Both strategies work on lower-cased text

    if method == "spacy":
        lemmas = []
        for token in nlp(lowered):
            if token.pos_ not in ("NOUN", "PROPN", "ADJ"):
                continue
            if token.is_stop or not token.is_alpha:
                continue
            lemmas.append(token.lemma_)
        return lemmas[:20]

    if method == "nltk":
        english_stops = set(stopwords.words('english'))
        kept = [
            tok
            for tok in word_tokenize(lowered)
            if tok not in english_stops and tok.isalnum()
        ]
        return kept[:20]

    return []

def summarize_text(text, num_sentences=3):
    """Summarizes text by returning its leading sentences.

    Args:
        text: Text to summarize; ``None`` yields ``""``.
        num_sentences: Number of leading sentences to keep.

    Returns:
        ``text`` unchanged when it has at most ``num_sentences`` sentences,
        otherwise the first ``num_sentences`` sentences separated by spaces.
    """
    if text is None:
        return ""

    doc = nlp(text)
    sentences = list(doc.sents)
    if len(sentences) <= num_sentences:
        return text

    # Very basic summarization: take the first num_sentences.
    # Joined with a space: a sentence span's text carries no trailing
    # whitespace, so joining on "" fused neighbouring sentences together.
    summary = " ".join(sent.text.strip() for sent in sentences[:num_sentences])
    return summary

def analyze_sentiment(text):
    """Scores the sentiment polarity of ``text`` with TextBlob.

    Returns:
        The polarity score (between -1 and 1), or the string ``"N/A"`` when
        ``text`` is ``None``.
    """
    if text is not None:
        from textblob import TextBlob # Install: pip install textblob
        return TextBlob(text).sentiment.polarity

    return "N/A"

def create_topics(texts, num_topics=5):
    """Creates topics using LDA.

    Args:
        texts: Iterable of document strings; falsy items are ignored.
        num_topics: Number of LDA topics to fit.

    Returns:
        The result of ``LdaModel.show_topics()``, or ``[]`` when there is
        nothing to model (no documents, or no tokens left after filtering).
    """
    if not texts:
        return []

    documents = [text for text in texts if text]
    if not documents:
        return []

    # Load the stop-word list once into a set; the original re-read it for
    # every single word of every document.
    stop_words = set(stopwords.words('english'))
    tokenized_texts = [
        [word for word in text.lower().split() if word.isalnum() and word not in stop_words]
        for text in documents
    ]

    dictionary = corpora.Dictionary(tokenized_texts)
    if len(dictionary) == 0:
        # LDA cannot be trained on an empty vocabulary.
        return []
    corpus = [dictionary.doc2bow(text) for text in tokenized_texts]

    lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary)
    topics = lda_model.show_topics()
    return topics

def store_in_db(paper_data, db_path='arxiv_papers.db'):
    """Inserts one paper record into the ``papers`` table of a SQLite database.

    The table is created on first use. List-valued fields (authors,
    categories, keywords) are stored as JSON strings; missing values are
    stored as the string ``'N/A'``. A paper whose id already exists is skipped.

    Args:
        paper_data: Dict describing the paper (see ``columns`` for the keys read).
        db_path: SQLite database file; defaults to ``'arxiv_papers.db'``.
    """
    # Define the columns (important for dynamic handling)
    columns = [
        'id', 'title', 'abstract', 'authors', 'categories', 'journal_ref', 'doi',
        'submitted', 'pdf_path', 'full_text', 'keywords', 'summary', 'sentiment'
    ]
    list_columns = ('authors', 'categories', 'keywords')

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Create the table if it doesn't exist (only needs to be done once)
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS papers (
                id TEXT PRIMARY KEY,
                title TEXT,
                abstract TEXT,
                authors TEXT,
                categories TEXT,
                journal_ref TEXT,
                doi TEXT,
                submitted TEXT,
                pdf_path TEXT,
                full_text TEXT,
                keywords TEXT,
                summary TEXT,
                sentiment REAL
            )
        ''')

        try:
            values_placeholders = ", ".join(["?"] * len(columns))  # "?, ?, ..., ?"

            # Extract the values in column order, handling missing data
            values = []
            for col in columns:
                value = paper_data.get(col)
                if value is None and col == 'journal_ref':
                    # parse_arxiv_entry stores this field under the hyphenated
                    # key; without the fallback the column was always 'N/A'.
                    value = paper_data.get('journal-ref')
                if col in list_columns and value is not None:
                    value = json.dumps(value)
                elif value is None:
                    value = 'N/A'  # Handle None case
                values.append(value)

            # Column names come from the fixed list above; values are bound
            # as parameters.
            query = f"INSERT INTO papers ({', '.join(columns)}) VALUES ({values_placeholders})"
            cursor.execute(query, tuple(values))

            conn.commit()
        except sqlite3.IntegrityError:
            print(f"Paper with id {paper_data.get('id')} already exists. Skipping.")
        except sqlite3.Error as e:  # Catch other potential database errors
            print(f"Database error: {e}")
            conn.rollback()  # Rollback in case of error
    finally:
        # Always release the connection, even if an unexpected error escapes.
        conn.close()

# Fix: 'artifical intelligence trading' was misspelled, which weakens that search.
search_keywords = ["alpha factors", 'alpha generation', 'alpha mining', 'factor investing', 'formulaic alphas', 'alpha portfolio construction', 'stock price prediction', 'artificial intelligence trading', 'AI stock selection', 'large language models finance', 'reinforcement learning portfolio', 'portfolio optimization', 'trading strategies', 'algorithmic trading', 'quantitative trading', 'mixture of experts trading']
# search_keywords = ["financial time series prediction", 'mixture of experts', 'portfolio optimization', 'finance', 'stock selection', 'factor investing', 'deep reinforcement learning', 'trading', 'large language models', 'alpha generation', 'alpha mining']

all_papers = []  # Store all papers from all searches

os.makedirs("arxiv_texts", exist_ok=True)  # Output directory for extracted text

for keyword_index, keyword in enumerate(search_keywords):
    if keyword_index:
        # Pause between API calls to stay polite to the arXiv service.
        time.sleep(3)

    papers = search_arxiv_papers(keyword, max_results=5)  # Search for ONE keyword

    if not papers:
        continue

    all_papers.extend(papers)  # Add the new papers to the combined list
    for paper in papers:
        # Show metadata only; the full text is previewed separately below
        # instead of being dumped in its entirety.
        paper_metadata = {key: value for key, value in paper.items() if key != 'full_text'}
        print(json.dumps(paper_metadata, indent=4))
        if paper.get('full_text'):
            print(f"Full Text (First 200 characters of {len(paper['full_text'])}):")
            print(paper['full_text'][:200] + "...")
            print()
            print('Processing:')
            text_file_path = os.path.join("arxiv_texts", f"{paper['id']}.txt")
            with open(text_file_path, "w", encoding="utf-8") as text_file:  # UTF-8 encoding
                text_file.write(paper['full_text'])
            print(f"Full text saved to: {text_file_path}")

# The original attached this message to the for-loop's `else`, which runs whenever
# the loop finishes without `break`, i.e. always. Report only when nothing was found.
if not all_papers:
    print("No papers found.")

# Example of topic modeling (after searching for papers).
# Uses every collected paper; `papers` only holds the results of the last keyword.
texts = [paper.get('full_text') for paper in all_papers if paper.get('full_text')]
topics = create_topics(texts)
print("\nTopics:")
for topic in topics:
    print(topic)

KeyboardInterrupt: 

In [None]:
# Number of paper records accumulated in all_papers across the keyword searches.
len(all_papers)

# First LLM Agent for Phase I (displayed a comprehensive set of alpha factors)

In [12]:
!pip install requests xmltodict



In [13]:
!pip install requests PyPDF2 nltk spacy gensim textblob



In [16]:
!pip install tf-keras

Collecting tf-keras
  Downloading tf_keras-2.19.0-py3-none-any.whl.metadata (1.8 kB)
INFO: pip is looking at multiple versions of tf-keras to determine which version is compatible with other requirements. This could take a while.
  Downloading tf_keras-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting numpy<2.1.0,>=1.26.0 (from tensorflow<2.19,>=2.18->tf-keras)
  Downloading numpy-2.0.2-cp311-cp311-macosx_14_0_arm64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m195.9 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Downloading tf_keras-2.18.0-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hDownloading numpy-2.0.2-cp311-cp311-macosx_14_0_arm64.whl (5.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling col

In [14]:
import requests
import json
import time
import os
import xmltodict
import PyPDF2
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
from spacy.lang.en import English
from gensim import corpora, models
import sqlite3
from textblob import TextBlob  # Import TextBlob at the top level

# Tokenizer and stop-word resources needed by word_tokenize() and stopwords.words().
nltk.download('punkt', quiet=True)
# Newer NLTK releases load the Punkt tokenizer from 'punkt_tab' rather than 'punkt';
# requesting both keeps word_tokenize() working regardless of the installed version.
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

# Load spaCy model ONCE, at the top level:
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    import subprocess
    import sys

    print("Downloading spaCy model 'en_core_web_sm'...")
    # Install with the interpreter that runs this kernel (a bare "python" on PATH may be
    # a different environment) and fail loudly if the download itself fails.
    subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)
    nlp = spacy.load("en_core_web_sm")

In [18]:
import torch
from sentence_transformers import SentenceTransformer
from transformers import CLIPProcessor, CLIPModel
import asyncio
import json
from llama_index.core import Settings, StorageContext, load_index_from_storage  # Import missing classes
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex, Document, SimpleDirectoryReader, Settings
import fitz
from PIL import Image
import re
import io
import os  # Import os for directory checking
import pandas as pd #For table handling if needed
import glob
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from typing import List, Dict
# ... other imports (fitz, vector database library, etc.)


# Set global settings
# LlamaIndex-wide defaults: the embedding model and the LLM are assigned on the shared
# Settings object; Settings.llm is the model later called by generate_seed_alphas().
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
Settings.llm = Ollama(model="llama3.2", request_timeout=720.0)

# Stand-alone encoders used by get_multimodal_embeddings(): a sentence-transformer for
# text and CLIP (model + matching processor) for images.
text_embedding_model = SentenceTransformer('BAAI/bge-base-en-v1.5')
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


def process_pdf(pdf_path, image_size=(2000, 2000)):
    """Extracts the text and the embedded images of a PDF.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF (``fitz``).
        image_size: ``(width, height)`` every extracted image is resized to.

    Returns:
        Dict with ``"text"`` (text of all pages concatenated in page order) and
        ``"images"`` (list of resized images re-encoded as bytes). Images that
        cannot be extracted, decoded or re-encoded are skipped.
    """
    document_data = {
        "text": "",
        "images": [],
        }
    # Pillow expects format names; map extensions whose upper-cased form is not
    # one of them (e.g. "JPG" is rejected, "JPEG" is accepted).
    pil_format_names = {"jpg": "JPEG", "jpx": "JPEG2000"}

    # Context manager closes the document even when a page fails to parse.
    with fitz.open(pdf_path) as doc:
        document_data["text"] = "".join(page.get_text() for page in doc)

        for page in doc:
            for image_info in page.get_images():
                xref = image_info[0]
                try:
                    base_image = doc.extract_image(xref)
                    if not base_image:
                        # Nothing extracted: skip instead of reusing the bytes
                        # of the previously processed image.
                        continue
                    image_ext = base_image["ext"]

                    with Image.open(io.BytesIO(base_image["image"])) as pil_image:
                        resized_image = pil_image.resize(image_size)

                    # Re-encode the resized image in its original format.
                    encoded = io.BytesIO()
                    save_format = pil_format_names.get(image_ext.lower(), image_ext.upper())
                    resized_image.save(encoded, format=save_format)
                    document_data["images"].append(encoded.getvalue())
                except Exception as e:
                    print(f"Error processing or resizing image: {e}")
                    continue  # Skip to the next image if there's an issue
    return document_data
    
async def retrieve_documents(query, query_engine):
    """Runs ``query`` against ``query_engine`` and returns the retrieved source nodes.

    The previous version also re-processed every PDF in ``arxiv_pdfs`` on each
    call (text extraction plus image resizing) into a local ``documents`` list
    that was never used; that dead work has been removed. The query engine is
    expected to have been built by the caller.

    Args:
        query: Query string passed to ``query_engine.aquery``.
        query_engine: Object exposing an async ``aquery(query)`` method whose
            result has a ``source_nodes`` attribute.

    Returns:
        ``response.source_nodes`` of the query result.
    """
    response = await query_engine.aquery(query)
    return response.source_nodes


def get_multimodal_embeddings(data):
    """Embeds a document's text and each of its images.

    Args:
        data: Dict with ``"text"`` (string) and ``"images"`` (iterable of images,
            either already-decoded image objects or encoded image bytes such as
            those produced by ``process_pdf``).

    Returns:
        Tuple ``(text_emb, image_embs)``: the sentence-transformer embedding of
        the text and a list with one CLIP image-feature array per image.
    """
    text_emb = text_embedding_model.encode(data["text"])
    image_embs = []
    for image in data["images"]:
        if isinstance(image, (bytes, bytearray)):
            # process_pdf stores encoded bytes; the CLIP processor needs a
            # decoded image, so decode (and normalize to RGB) first.
            image = Image.open(io.BytesIO(image)).convert("RGB")
        inputs = clip_processor(images=[image], return_tensors="pt")
        with torch.no_grad():  # Inference only; no gradients needed
            image_features = clip_model.get_image_features(**inputs)
        image_emb = image_features.cpu().numpy()
        image_embs.append(image_emb)
    return text_emb, image_embs #, table_embs, figure_embs

async def retrieving_documents_and_creating_multimodal_input(query, query_engine):
    """Retrieves nodes for ``query`` and flattens them into one prompt string.

    The text of every retrieved node is appended in order. For each image
    payload found in a node's metadata that can be opened, a one-line
    ``"Image: ..."`` placeholder is appended as well.

    Args:
        query: Query string used for retrieval.
        query_engine: Query engine forwarded to ``retrieve_documents``.

    Returns:
        The concatenated context string.
    """
    # 1. Retrieve relevant documents
    retrieved_documents = await retrieve_documents(query, query_engine)

    # 2. Process documents and create multimodal input
    multimodal_input = ""
    for source_node in retrieved_documents:
        node = source_node.node
        multimodal_input += node.text

        metadata = node.metadata
        # Accept both a list under "images" and the single payload this notebook
        # stores under "image_bytes"; only "images" was checked before.
        image_payloads = list(metadata.get("images") or [])
        if metadata.get("image_bytes") is not None:
            image_payloads.append(metadata["image_bytes"])

        for image_bytes in image_payloads:
            try:
                # Open only to confirm the payload is a readable image, then
                # close it right away.
                Image.open(io.BytesIO(image_bytes)).close()
                image_summary = f"Image from document: {node.text[:50]}..."
                multimodal_input += f"Image: {image_summary}\n"
            except Exception as e:
                print(f"Error processing image for summary: {e}")
    return multimodal_input

async def generate_seed_alphas(query, query_engine):
    """Asks the configured LLM for seed alphas related to ``query``.

    Retrieved document context is embedded in the prompt, the prompt is sent
    to ``Settings.llm`` and the reply is parsed with a ``StructuredOutputParser``
    expecting a single ``"alphas"`` field.

    Args:
        query: Topic the alphas should relate to; also used for retrieval.
        query_engine: Query engine used to retrieve supporting context.

    Returns:
        The parsed output on success, ``{"alphas": []}`` when the reply cannot
        be parsed, or ``None`` when the LLM call (or anything else) fails.
    """
    multimodal_input = await retrieving_documents_and_creating_multimodal_input(query, query_engine)

    response_schemas = [
        ResponseSchema(name="alphas", description="A list of alpha objects."),
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # Query the LLM
    prompt = f"""Generate *50 unique seed alphas* related to: {query}. Categorize them into financial domains (Momentum, Mean Reversion, Volatility, Fundamental, Liquidity, Quality, Growth, Technical, Micro Economics etc.) and provide the alpha name and code.  Focus on alphas suitable for daily stock market data.

            Return the result as a *valid JSON object (dictionary)*.  The JSON object *must* have the following structure:

            ```json
            {{
            "alphas": [
                {{
                "domain": "Momentum",
                "name": "Price Momentum (14 days)",
                "code": "((CLOSE - DELAY(CLOSE, 14)) / DELAY(CLOSE, 14))"
                }},
                {{
                "domain": "Mean Reversion",
                "name": "Mean Reversion (20 days)",
                "code": "(MEAN(CLOSE, 20) - CLOSE)"
                }},
                {{
                "domain": "Volatility",
                "name": "20-Day Volatility",
                "code": "STD(CLOSE, 20)"
                }},
                {{
                "domain": "Fundamental",
                "name": "Price-to-Earnings Ratio (P/E)",
                "code": "(CLOSE / EPS)"
                }},
                {{
                "domain": "Liquidity",
                "name": "Trading Volume",
                "code": "VOLUME"
                }},
                {{
                "domain": "Quality",
                "name": "Gross Profit Margin",
                "code": "(GROSS_PROFIT / REVENUE)"
                }},
                {{
                "domain": "GROWTH",
                "name": "Earnings Growth Rate",
                "code": "(EPS / DELAY(EPS,1) - 1)"
                }},
                {{
                "domain": "Technical",
                "name": "Moving Average (MA)",
                "code": "SMA(CLOSE, 20)"
                }},
                {{
                "domain": "Macro Economics",
                "name": "GDP Growth Rate",
                "code": "GDP - DELAY (GDP, n)"
                }},
                // ... more examples (at least 8-10 per domain if possible)
            ]
            }}
            ```

            *It is absolutely crucial that the response is valid JSON and nothing else.*  Do not include any explanatory text outside the JSON object.  If you cannot generate any alphas, return an empty JSON object: `{{ "alphas": [] }}`.
            Make sure all the keys (domain, name, code) are enclosed in double quotes. 
            {multimodal_input}"""

    try:
        # The retrieved context is already interpolated at the end of the prompt;
        # appending it a second time doubled the context sent to the model.
        response = await Settings.llm.acomplete(prompt)
        completion_text = response.text
        print(f"Raw Response (Before Cleanup): {completion_text}")
        print()
        try:
            parsed_output = output_parser.parse(completion_text)
            return parsed_output
        except Exception as parse_error:
            print(f"Error parsing LLM output: {parse_error}")
            return {"alphas": []} #return empty json on error
    except Exception as e:
        print(f"LLM or other Error: {e}")
        return None
    
async def main(query="research on momentum strategies", data_dir="arxiv_pdfs", persist_dir="storage"):
    """Loads (or builds and persists) the vector index, then generates seed alphas.

    Args:
        query: Topic passed to ``generate_seed_alphas``.
        data_dir: Directory holding the source PDFs.
        persist_dir: Directory the index is loaded from / persisted to.

    Returns:
        Whatever ``generate_seed_alphas`` returns, or ``None`` when ``data_dir``
        does not exist.
    """
    if not os.path.exists(data_dir):
        print(f"Error: Directory '{data_dir}' does not exist. Create it and add your PDF files.")
        return

    try:
        storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
        index = load_index_from_storage(storage_context)
        query_engine = index.as_query_engine()
        print("Index loaded from storage.")
    except Exception:
        # Deliberate best-effort: any failure to load a persisted index falls
        # back to rebuilding it from the PDFs and persisting the result.
        documents = SimpleDirectoryReader(data_dir).load_data()
        index = VectorStoreIndex.from_documents(documents)
        query_engine = index.as_query_engine()
        index.storage_context.persist(persist_dir)
        print("New index created and persisted.")

    json_text = await generate_seed_alphas(query, query_engine)
    return json_text

if __name__ == "__main__":
    # Top-level `await` is used here, which relies on the notebook kernel's running
    # event loop; a plain Python script would need asyncio.run(main()) instead.
    json_text = await main() # Awaited directly (not via asyncio.run) because this runs in a notebook cell

Index loaded from storage.
Raw Response (Before Cleanup): ```
{
  "alphas": [
    {
      "domain": "Momentum",
      "name": "Price Momentum (14 days)",
      "code": "((CLOSE - DELAY(CLOSE, 14)) / DELAY(CLOSE, 14))"
    },
    {
      "domain": "Mean Reversion",
      "name": "Mean Reversion (20 days)",
      "code": "(MEAN(CLOSE, 20) - CLOSE)"
    },
    {
      "domain": "Volatility",
      "name": "20-Day Volatility",
      "code": "STD(CLOSE, 20)"
    },
    {
      "domain": "Fundamental",
      "name": "Price-to-Earnings Ratio (P/E)",
      "code": "(CLOSE / EPS)"
    },
    {
      "domain": "Liquidity",
      "name": "Trading Volume",
      "code": "VOLUME"
    },
    {
      "domain": "Quality",
      "name": "Gross Profit Margin",
      "code": "(GROSS_PROFIT / REVENUE)"
    },
    {
      "domain": "GROWTH",
      "name": "Earnings Growth Rate",
      "code": "(EPS / DELAY(EPS,1) - 1)"
    },
    {
      "domain": "Technical",
      "name": "Moving Average (MA)",
      "co

In [20]:
# Inspect the type returned by main(); generate_seed_alphas() can also return None on failure.
type(json_text)

dict

In [21]:
# Dump the parsed LLM output as returned by main().
print(json_text)

{'alphas': [{'domain': 'Momentum', 'name': 'Price Momentum (14 days)', 'code': '((CLOSE - DELAY(CLOSE, 14)) / DELAY(CLOSE, 14))'}, {'domain': 'Mean Reversion', 'name': 'Mean Reversion (20 days)', 'code': '(MEAN(CLOSE, 20) - CLOSE)'}, {'domain': 'Volatility', 'name': '20-Day Volatility', 'code': 'STD(CLOSE, 20)'}, {'domain': 'Fundamental', 'name': 'Price-to-Earnings Ratio (P/E)', 'code': '(CLOSE / EPS)'}, {'domain': 'Liquidity', 'name': 'Trading Volume', 'code': 'VOLUME'}, {'domain': 'Quality', 'name': 'Gross Profit Margin', 'code': '(GROSS_PROFIT / REVENUE)'}, {'domain': 'GROWTH', 'name': 'Earnings Growth Rate', 'code': '(EPS / DELAY(EPS,1) - 1)'}, {'domain': 'Technical', 'name': 'Moving Average (MA)', 'code': 'SMA(CLOSE, 20)'}, {'domain': 'Macro Economics', 'name': 'GDP Growth Rate', 'code': 'GDP - DELAY(GDP, 1)'}, {'domain': 'Momentum', 'name': 'Relative Strength Index (RSI)', 'code': '(100 - (100 / (1 + RS)) * (CLOSE / PREV_CLOSE))'}, {'domain': 'Mean Reversion', 'name': 'Bollinger 

In [23]:
# generate_seed_alphas() returns None when the LLM call fails and {"alphas": []} when
# parsing fails; treat both as "no alphas" instead of crashing on json_text["alphas"].
alphas = (json_text or {}).get("alphas") or []

# Build the frame directly from the list of alpha dicts; an empty result still gets
# the expected columns so later cells can rely on them.
if alphas:
    original_df_new = pd.DataFrame(alphas)
else:
    original_df_new = pd.DataFrame(columns=["domain", "name", "code"])

# Kept as a list so frames from further runs can be appended before combining.
original_dfs = [original_df_new]
original_combined_df = pd.concat(original_dfs, ignore_index=True)

pd.set_option('display.max_rows', None)

original_combined_df

Unnamed: 0,domain,name,code
0,Momentum,Price Momentum (14 days),"((CLOSE - DELAY(CLOSE, 14)) / DELAY(CLOSE, 14))"
1,Mean Reversion,Mean Reversion (20 days),"(MEAN(CLOSE, 20) - CLOSE)"
2,Volatility,20-Day Volatility,"STD(CLOSE, 20)"
3,Fundamental,Price-to-Earnings Ratio (P/E),(CLOSE / EPS)
4,Liquidity,Trading Volume,VOLUME
5,Quality,Gross Profit Margin,(GROSS_PROFIT / REVENUE)
6,GROWTH,Earnings Growth Rate,"(EPS / DELAY(EPS,1) - 1)"
7,Technical,Moving Average (MA),"SMA(CLOSE, 20)"
8,Macro Economics,GDP Growth Rate,"GDP - DELAY(GDP, 1)"
9,Momentum,Relative Strength Index (RSI),(100 - (100 / (1 + RS)) * (CLOSE / PREV_CLOSE))


In [24]:
# Write the combined alpha table to testing.csv, without the DataFrame index column.
original_combined_df.to_csv("testing.csv", index=False)

# Second LLM Agent for Phase I (asked for each financial domain each time)

In [1]:
!pip install requests xmltodict



In [2]:
!pip install requests PyPDF2 nltk spacy gensim textblob



In [3]:
import requests
import json
import time
import os
import xmltodict
import PyPDF2
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
from spacy.lang.en import English
from gensim import corpora, models
import sqlite3
from textblob import TextBlob  # Import TextBlob at the top level

# Tokenizer and stop-word resources needed by word_tokenize() and stopwords.words().
nltk.download('punkt', quiet=True)
# Newer NLTK releases load the Punkt tokenizer from 'punkt_tab' rather than 'punkt';
# requesting both keeps word_tokenize() working regardless of the installed version.
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

# Load spaCy model ONCE, at the top level:
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    import subprocess
    import sys

    print("Downloading spaCy model 'en_core_web_sm'...")
    # Install with the interpreter that runs this kernel (a bare "python" on PATH may be
    # a different environment) and fail loudly if the download itself fails.
    subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)
    nlp = spacy.load("en_core_web_sm")

In [4]:
!pip install llama-index
!pip install llama-index-core llama-index-readers-file llama-index-llms-ollama llama-index-embeddings-huggingface
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding



In [5]:
from llama_index.llms.ollama import Ollama

# Reaching this line means the Ollama import at the top of this cell succeeded.
print("Ollama imported successfully!")  # If this prints, the import works

# Minimal Ollama interaction: just construct the client.
# The model can be started from a terminal with: ollama run llama3.2
try:
    llm = Ollama(model="llama3.2")  # or another model available to you
except Exception as e:
    print(f"Error instantiating Ollama: {e}")
else:
    print("Ollama LLM instantiated!")

Ollama imported successfully!
Ollama LLM instantiated!


In [None]:
import torch
from sentence_transformers import SentenceTransformer
from transformers import CLIPProcessor, CLIPModel
import asyncio
import json
from llama_index.core import Settings, StorageContext, load_index_from_storage  # Import missing classes
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex, Document, SimpleDirectoryReader, Settings
import fitz
from PIL import Image
import re
import io
import os  # Import os for directory checking
import pandas as pd #For table handling if needed
import glob
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from typing import List, Dict


# Set global settings
# LlamaIndex-wide defaults: the embedding model and the LLM are assigned on the shared
# Settings object.
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
Settings.llm = Ollama(model="llama3.2", request_timeout=720.0)

# Stand-alone encoders used by get_multimodal_embeddings(): a sentence-transformer for
# text and CLIP (model + matching processor) for images.
text_embedding_model = SentenceTransformer('BAAI/bge-base-en-v1.5')
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


def process_pdf(pdf_path, image_size=(2000, 2000)):
    """Extracts the text and the embedded images of a PDF.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF (``fitz``).
        image_size: ``(width, height)`` every extracted image is resized to.

    Returns:
        Dict with ``"text"`` (text of all pages concatenated in page order) and
        ``"images"`` (list of resized images re-encoded as bytes). Images that
        cannot be extracted, decoded or re-encoded are skipped.
    """
    document_data = {
        "text": "",
        "images": [],
        }
    # Pillow expects format names; map extensions whose upper-cased form is not
    # one of them (e.g. "JPG" is rejected, "JPEG" is accepted).
    pil_format_names = {"jpg": "JPEG", "jpx": "JPEG2000"}

    # Context manager closes the document even when a page fails to parse.
    with fitz.open(pdf_path) as doc:
        document_data["text"] = "".join(page.get_text() for page in doc)

        for page in doc:
            for image_info in page.get_images():
                xref = image_info[0]
                try:
                    base_image = doc.extract_image(xref)
                    if not base_image:
                        # Nothing extracted: skip instead of reusing the bytes
                        # of the previously processed image.
                        continue
                    image_ext = base_image["ext"]

                    with Image.open(io.BytesIO(base_image["image"])) as pil_image:
                        resized_image = pil_image.resize(image_size)

                    # Re-encode the resized image in its original format.
                    encoded = io.BytesIO()
                    save_format = pil_format_names.get(image_ext.lower(), image_ext.upper())
                    resized_image.save(encoded, format=save_format)
                    document_data["images"].append(encoded.getvalue())
                except Exception as e:
                    print(f"Error processing or resizing image: {e}")
                    continue  # Skip to the next image if there's an issue
    return document_data
    
async def retrieve_documents(query, query_engine):
    """Runs ``query`` against ``query_engine`` and returns the retrieved source nodes.

    The previous version also re-processed every PDF in ``arxiv_pdfs`` on each
    call (text extraction plus image resizing) into a local ``documents`` list
    that was never used; that dead work has been removed. The query engine is
    expected to have been built by the caller.

    Args:
        query: Query string passed to ``query_engine.aquery``.
        query_engine: Object exposing an async ``aquery(query)`` method whose
            result has a ``source_nodes`` attribute.

    Returns:
        ``response.source_nodes`` of the query result.
    """
    response = await query_engine.aquery(query)
    return response.source_nodes


def get_multimodal_embeddings(data):
    """Embeds a document's text and each of its images.

    Args:
        data: Dict with ``"text"`` (string) and ``"images"`` (iterable of images,
            either already-decoded image objects or encoded image bytes such as
            those produced by ``process_pdf``).

    Returns:
        Tuple ``(text_emb, image_embs)``: the sentence-transformer embedding of
        the text and a list with one CLIP image-feature array per image.
    """
    text_emb = text_embedding_model.encode(data["text"])
    image_embs = []
    for image in data["images"]:
        if isinstance(image, (bytes, bytearray)):
            # process_pdf stores encoded bytes; the CLIP processor needs a
            # decoded image, so decode (and normalize to RGB) first.
            image = Image.open(io.BytesIO(image)).convert("RGB")
        inputs = clip_processor(images=[image], return_tensors="pt")
        with torch.no_grad():  # Inference only; no gradients needed
            image_features = clip_model.get_image_features(**inputs)
        image_emb = image_features.cpu().numpy()
        image_embs.append(image_emb)
    return text_emb, image_embs #, table_embs, figure_embs

async def retrieving_documents_and_creating_multimodal_input(query, query_engine):
    """Retrieve source nodes for ``query`` and flatten them into one context string.

    Each node's text is appended as-is. When a node's metadata has an
    ``"images"`` entry, every image that opens successfully adds a short
    placeholder line based on the first 50 characters of the node text;
    images that fail to open are reported and skipped.

    Returns:
        The concatenated context string (empty when nothing was retrieved).
    """
    hits = await retrieve_documents(query, query_engine)

    context = ""
    for hit in hits:
        context += hit.node.text
        if "images" not in hit.node.metadata:
            continue
        for raw_image in hit.node.metadata["images"]:
            try:
                # Opening the image only validates the bytes; the object is not used.
                Image.open(io.BytesIO(raw_image))
                summary = f"Image from document: {hit.node.text[:50]}..."
                context += f"Image: {summary}\n"
            except Exception as e:
                print(f"Error processing image for summary: {e}")
    return context

async def generate_alphas_for_momentum(query, query_engine, multimodal_input):
    """Ask the configured LLM for seed alphas in the "Momentum" domain.

    Args:
        query: Not used by this function; kept so every domain generator
            shares the same signature.
        query_engine: Not used by this function; kept for the same reason.
        multimodal_input: Context text interpolated at the end of the prompt.

    Returns:
        The parsed completion on success, ``{"alphas": []}`` when the
        completion cannot be parsed, or ``None`` when the LLM call fails.
    """
    response_schemas = [
        ResponseSchema(name="alphas", description="A list of alpha objects."),
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # The retrieved context is embedded once, at the very end of the prompt.
    prompt = f"""Generate *unique seed alphas* related to: *Momentum*. Provide the alpha name and code. Focus on alphas suitable for daily stock market data. *The domain of all alpha factors must be "Momentum".*

            Return the result as a *valid JSON object (dictionary)*.  The JSON object *must* have the following structure:

            ```json
            {{
            "alphas": [
                {{
                "domain": "Momentum",
                "name": "Price Momentum",
                "code": "(CLOSE - DELAY(CLOSE, 14))"
                }},
                {{
                "domain": "Momentum",
                "name": "Volume Momentum",
                "code": "(VOLUME - DELAY(VOLUME, 14))"
                }},
                {{
                "domain": "Momentum",
                "name": "RSI Momentum",
                "code": "(RSI - DELAY(RSI, 14))"
                }}
                // ... more examples (at least 8-10 if possible)
            ]
            }}
            ```
            *It is absolutely crucial that the response is valid JSON, and nothing else.* Do not include other alpha factors' domains. *Do not include any explanatory text outside the JSON object.* If you cannot generate any alphas, return an empty JSON object: `{{ "alphas": [] }}`.
            Make sure all the keys (domain, name, code) are enclosed in double quotes. 
            {multimodal_input}"""

    try:
        # The prompt already contains multimodal_input; appending it again
        # would send the whole context to the model twice.
        response = await Settings.llm.acomplete(prompt)
        completion_text = response.text
        print(f"Output from LLM: {completion_text}")
        print()
        try:
            return output_parser.parse(completion_text)
        except Exception as parse_error:
            print(f"Error parsing LLM output: {parse_error}")
            return {"alphas": []}  # empty result when the completion is not parseable
    except Exception as e:
        print(f"LLM or other Error: {e}")
        return None
    
async def generate_alphas_for_mean_reversion(query, query_engine, multimodal_input):
    """Ask the configured LLM for seed alphas in the "Mean Reversion" domain.

    Args:
        query: Not used by this function; kept so every domain generator
            shares the same signature.
        query_engine: Not used by this function; kept for the same reason.
        multimodal_input: Context text interpolated at the end of the prompt.

    Returns:
        The parsed completion on success, ``{"alphas": []}`` when the
        completion cannot be parsed, or ``None`` when the LLM call fails.
    """
    response_schemas = [
        ResponseSchema(name="alphas", description="A list of alpha objects."),
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # The retrieved context is embedded once, at the very end of the prompt.
    prompt = f"""Generate *unique seed alphas* related to: *Mean Reversion*. Provide the alpha name and code. Focus on alphas suitable for daily stock market data. *The domain of all alpha factors must be "Mean Reversion".*

            Return the result as a *valid JSON object (dictionary)*.  The JSON object *must* have the following structure:

            ```json
            {{
            "alphas": [
                {{
                "domain": "Mean Reversion",
                "name": "Mean Reversion (20 days)",
                "code": "(MEAN(CLOSE, 20) - CLOSE)"
                }},
                {{
                "domain": "Mean Reversion",
                "name": "Z-score Mean Reversion",
                "code": "(CLOSE - MEAN(CLOSE, 20)) / STD(CLOSE, 20)"
                }},
                {{
                "domain": "Mean Reversion",
                "name": "Bollinger Bands",
                "code": "(CLOSE - LOWER_BAND) / (UPPER_BAND - LOWER_BAND)"
                }},
                // ... more examples
            ]
            }}
            ```
            *It is absolutely crucial that the response is valid JSON, and nothing else.* Do not include other alpha factors' domains. *Do not include any explanatory text outside the JSON object.* If you cannot generate any alphas, return an empty JSON object: `{{ "alphas": [] }}`.
            Make sure all the keys (domain, name, code) are enclosed in double quotes. 
            {multimodal_input}"""

    try:
        # The prompt already contains multimodal_input; appending it again
        # would send the whole context to the model twice.
        response = await Settings.llm.acomplete(prompt)
        completion_text = response.text
        print(f"Output from LLM: {completion_text}")
        print()
        try:
            return output_parser.parse(completion_text)
        except Exception as parse_error:
            print(f"Error parsing LLM output: {parse_error}")
            return {"alphas": []}  # empty result when the completion is not parseable
    except Exception as e:
        print(f"LLM or other Error: {e}")
        return None
    
async def generate_alphas_for_volatility(query, query_engine, multimodal_input):
    """Ask the configured LLM for seed alphas in the "Volatility" domain.

    Args:
        query: Not used by this function; kept so every domain generator
            shares the same signature.
        query_engine: Not used by this function; kept for the same reason.
        multimodal_input: Context text interpolated at the end of the prompt.

    Returns:
        The parsed completion on success, ``{"alphas": []}`` when the
        completion cannot be parsed, or ``None`` when the LLM call fails.
    """
    response_schemas = [
        ResponseSchema(name="alphas", description="A list of alpha objects."),
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # The retrieved context is embedded once, at the very end of the prompt.
    prompt = f"""Generate *unique seed alphas* related to: *Volatility*.  Provide the alpha name and code.  Focus on alphas suitable for daily stock market data.*The domain name of all alpha factors must be "Volatility".*

            Return the result as a *valid JSON object (dictionary)*.  The JSON object *must* have the following structure:

            ```json
            {{
            "alphas": [
                {{
                "domain": "Volatility",
                "name": "Standard Deviation",
                "code": "STD(CLOSE, 20)"
                }},
                {{
                "domain": "Volatility",
                "name": "Average True Range (ATR)",
                "code": "ATR(14)"
                }},
                {{
                "domain": "Volatility",
                "name": "Bollinger Band Width",
                "code": "(UPPER_BAND - LOWER_BAND) / SMA(CLOSE, 20)"
                }},
                // ... more examples
            ]
            }}
            ```
            *It is absolutely crucial that the response is valid JSON, and nothing else.* Do not include other alpha factors' domains. *Do not include any explanatory text outside the JSON object.* If you cannot generate any alphas, return an empty JSON object: `{{ "alphas": [] }}`.
            Make sure all the keys (domain, name, code) are enclosed in double quotes. 
            {multimodal_input}"""

    try:
        # The prompt already contains multimodal_input; appending it again
        # would send the whole context to the model twice.
        response = await Settings.llm.acomplete(prompt)
        completion_text = response.text
        print(f"Output from LLM: {completion_text}")
        print()
        try:
            return output_parser.parse(completion_text)
        except Exception as parse_error:
            print(f"Error parsing LLM output: {parse_error}")
            return {"alphas": []}  # empty result when the completion is not parseable
    except Exception as e:
        print(f"LLM or other Error: {e}")
        return None
    
async def generate_alphas_for_funadmental(query, query_engine, multimodal_input):
    """Ask the configured LLM for seed alphas in the "Fundamental" domain.

    The misspelt function name is kept because callers reference it.

    Args:
        query: Not used by this function; kept so every domain generator
            shares the same signature.
        query_engine: Not used by this function; kept for the same reason.
        multimodal_input: Context text interpolated at the end of the prompt.

    Returns:
        The parsed completion on success, ``{"alphas": []}`` when the
        completion cannot be parsed, or ``None`` when the LLM call fails.
    """
    response_schemas = [
        ResponseSchema(name="alphas", description="A list of alpha objects."),
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # The retrieved context is embedded once, at the very end of the prompt.
    prompt = f"""Generate *unique seed alphas* related to: *Fundamental*.  Provide the alpha name and code.  Focus on alphas suitable for daily stock market data.*The domain name of all alpha factors must be "Fundamental".*

            Return the result as a *valid JSON object (dictionary)*.  The JSON object *must* have the following structure:

            ```json
            {{
            "alphas": [
                {{
                "domain": "Fundamental",
                "name": "Price-to-Earnings Ratio (P/E)",
                "code": "(CLOSE / EPS)"
                }},
                {{
                "domain": "Fundamental",
                "name": "Price-to-Book Ratio (P/B)",
                "code": "(CLOSE / BOOK_VALUE)"
                }},
                {{
                "domain": "Fundamental",
                "name": "Dividend Yield",
                "code": "(DIVIDENDS / CLOSE)"
                }},
                // ... more examples
            ]
            }}
            ```
            *It is absolutely crucial that the response is valid JSON, and nothing else.* Do not include other alpha factors' domains. *Do not include any explanatory text outside the JSON object.* If you cannot generate any alphas, return an empty JSON object: `{{ "alphas": [] }}`.
            Make sure all the keys (domain, name, code) are enclosed in double quotes. 
            {multimodal_input}"""

    try:
        # The prompt already contains multimodal_input; appending it again
        # would send the whole context to the model twice.
        response = await Settings.llm.acomplete(prompt)
        completion_text = response.text
        print(f"Output from LLM: {completion_text}")
        print()
        try:
            return output_parser.parse(completion_text)
        except Exception as parse_error:
            print(f"Error parsing LLM output: {parse_error}")
            return {"alphas": []}  # empty result when the completion is not parseable
    except Exception as e:
        print(f"LLM or other Error: {e}")
        return None
    
async def generate_alphas_for_liquidity(query, query_engine, multimodal_input):
    """Ask the configured LLM for seed alphas in the "Liquidity" domain.

    Args:
        query: Not used by this function; kept so every domain generator
            shares the same signature.
        query_engine: Not used by this function; kept for the same reason.
        multimodal_input: Context text interpolated at the end of the prompt.

    Returns:
        The parsed completion on success, ``{"alphas": []}`` when the
        completion cannot be parsed, or ``None`` when the LLM call fails.
    """
    response_schemas = [
        ResponseSchema(name="alphas", description="A list of alpha objects."),
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # The retrieved context is embedded once, at the very end of the prompt.
    # The "Average Trading Volume" example uses a volume average so that the
    # example formula matches its name.
    prompt = f"""Generate *unique seed alphas* related to: *Liquidity*.  Provide the alpha name and code.  Focus on alphas suitable for daily stock market data.*The domain name of all alpha factors must be "Liquidity".*

            Return the result as a *valid JSON object (dictionary)*.  The JSON object *must* have the following structure:

            ```json
            {{
            "alphas": [
                {{
                "domain": "Liquidity",
                "name": "Trading Volume",
                "code": "VOLUME"
                }},
                {{
                "domain": "Liquidity",
                "name": "Average Trading Volume",
                "code": "MEAN(VOLUME, 20)"
                }},
                {{
                "domain": "Liquidity",
                "name": "Volume Rate of Change (VROC)",
                "code": "(VOLUME - DELAY(VOLUME, 14)) / DELAY(VOLUME, 14)"
                }},
                // ... more examples 
            ]
            }}
            ```
            *It is absolutely crucial that the response is valid JSON, and nothing else.* Do not include other alpha factors' domains. *Do not include any explanatory text outside the JSON object.* If you cannot generate any alphas, return an empty JSON object: `{{ "alphas": [] }}`.
            Make sure all the keys (domain, name, code) are enclosed in double quotes. 
            {multimodal_input}"""

    try:
        # The prompt already contains multimodal_input; appending it again
        # would send the whole context to the model twice.
        response = await Settings.llm.acomplete(prompt)
        completion_text = response.text
        print(f"Output from LLM: {completion_text}")
        print()
        try:
            return output_parser.parse(completion_text)
        except Exception as parse_error:
            print(f"Error parsing LLM output: {parse_error}")
            return {"alphas": []}  # empty result when the completion is not parseable
    except Exception as e:
        print(f"LLM or other Error: {e}")
        return None

async def generate_alphas_for_quality(query, query_engine, multimodal_input):
    """Ask the configured LLM for seed alphas in the "Quality" domain.

    Args:
        query: Not used by this function; kept so every domain generator
            shares the same signature.
        query_engine: Not used by this function; kept for the same reason.
        multimodal_input: Context text interpolated at the end of the prompt.

    Returns:
        The parsed completion on success, ``{"alphas": []}`` when the
        completion cannot be parsed, or ``None`` when the LLM call fails.
    """
    response_schemas = [
        ResponseSchema(name="alphas", description="A list of alpha objects."),
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # The retrieved context is embedded once, at the very end of the prompt.
    prompt = f"""Generate *unique seed alphas* related to: *Quality*.  Provide the alpha name and code.  Focus on alphas suitable for daily stock market data.*The domain name of all alpha factors must be "Quality".*

            Return the result as a *valid JSON object (dictionary)*.  The JSON object *must* have the following structure:

            ```json
            {{
            "alphas": [
                {{
                "domain": "Quality",
                "name": "Gross Profit Margin",
                "code": "(GROSS_PROFIT / REVENUE)"
                }},
                {{
                "domain": "Quality",
                "name": "Operating Profit Margin",
                "code": "(OPERATING_INCOME / REVENUE)"
                }},
                {{
                "domain": "Quality",
                "name": "Net Profit Margin",
                "code": "(NET_INCOME / REVENUE)"
                }},
                // ... more examples
            ]
            }}
            ```
            *It is absolutely crucial that the response is valid JSON, and nothing else.* Do not include other alpha factors' domains. *Do not include any explanatory text outside the JSON object.* If you cannot generate any alphas, return an empty JSON object: `{{ "alphas": [] }}`.
            Make sure all the keys (domain, name, code) are enclosed in double quotes. 
            {multimodal_input}"""

    try:
        # The prompt already contains multimodal_input; appending it again
        # would send the whole context to the model twice.
        response = await Settings.llm.acomplete(prompt)
        completion_text = response.text
        print(f"Output from LLM: {completion_text}")
        print()
        try:
            return output_parser.parse(completion_text)
        except Exception as parse_error:
            print(f"Error parsing LLM output: {parse_error}")
            return {"alphas": []}  # empty result when the completion is not parseable
    except Exception as e:
        print(f"LLM or other Error: {e}")
        return None
    
async def generate_alphas_for_growth(query, query_engine, multimodal_input):
    """Ask the configured LLM for seed alphas in the "Growth" domain.

    Args:
        query: Not used by this function; kept so every domain generator
            shares the same signature.
        query_engine: Not used by this function; kept for the same reason.
        multimodal_input: Context text interpolated at the end of the prompt.

    Returns:
        The parsed completion on success, ``{"alphas": []}`` when the
        completion cannot be parsed, or ``None`` when the LLM call fails.
    """
    response_schemas = [
        ResponseSchema(name="alphas", description="A list of alpha objects."),
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # The retrieved context is embedded once, at the very end of the prompt.
    prompt = f"""Generate *unique seed alphas* related to: *Growth*.  Provide the alpha name and code.  Focus on alphas suitable for daily stock market data.*The domain name of all alpha factors must be "Growth".*

            Return the result as a *valid JSON object (dictionary)*.  The JSON object *must* have the following structure:

            ```json
            {{
            "alphas": [
                {{
                "domain": "Growth",
                "name": "Earnings Growth Rate",
                "code": "(EPS / DELAY(EPS,1) - 1)"
                }},
                {{
                "domain": "Growth",
                "name": "Revenue Growth Rate",
                "code": "(REVENUE / DELAY(REVENUE, 1) - 1)"
                }},
                {{
                "domain": "Growth",
                "name": "EBITDA Growth Rate",
                "code": "(EBITDA / DELAY(EBITDA, 1) - 1)"
                }},
                // ... more examples 
            ]
            }}
            ```
            *It is absolutely crucial that the response is valid JSON, and nothing else.* Do not include other alpha factors' domains. *Do not include any explanatory text outside the JSON object.* If you cannot generate any alphas, return an empty JSON object: `{{ "alphas": [] }}`.
            Make sure all the keys (domain, name, code) are enclosed in double quotes. 
            {multimodal_input}"""

    try:
        # The prompt already contains multimodal_input; appending it again
        # would send the whole context to the model twice.
        response = await Settings.llm.acomplete(prompt)
        completion_text = response.text
        print(f"Output from LLM: {completion_text}")
        print()
        try:
            return output_parser.parse(completion_text)
        except Exception as parse_error:
            print(f"Error parsing LLM output: {parse_error}")
            return {"alphas": []}  # empty result when the completion is not parseable
    except Exception as e:
        print(f"LLM or other Error: {e}")
        return None

async def generate_alphas_for_technical(query, query_engine, multimodal_input):
    """Ask the configured LLM for seed alphas in the "Technical" domain.

    Args:
        query: Not used by this function; kept so every domain generator
            shares the same signature.
        query_engine: Not used by this function; kept for the same reason.
        multimodal_input: Context text interpolated at the end of the prompt.

    Returns:
        The parsed completion on success, ``{"alphas": []}`` when the
        completion cannot be parsed, or ``None`` when the LLM call fails.
    """
    response_schemas = [
        ResponseSchema(name="alphas", description="A list of alpha objects."),
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # The retrieved context is embedded once, at the very end of the prompt.
    prompt = f"""Generate *unique seed alphas* related to: *Technical*.  Provide the alpha name and code.  Focus on alphas suitable for daily stock market data.*The domain name of all alpha factors must be "Technical".*

            Return the result as a *valid JSON object (dictionary)*.  The JSON object *must* have the following structure:

            ```json
            {{
            "alphas": [
                {{
                "domain": "Technical",
                "name": "Moving Average (MA)",
                "code": "SMA(CLOSE, 20)"
                }},
                {{
                "domain": "Technical",
                "name": "Exponential Moving Average (EMA)",
                "code": "EMA(CLOSE, 20)"
                }},
                {{
                "domain": "Technical",
                "name": "Relative Strength Index (RSI)",
                "code": "RSI(14)"
                }},
                // ... more examples
            ]
            }}
            ```
            *It is absolutely crucial that the response is valid JSON, and nothing else.*  Do not include other alpha factors' domains. *Do not include any explanatory text outside the JSON object.*  If you cannot generate any alphas, return an empty JSON object: `{{ "alphas": [] }}`.
            Make sure all the keys (domain, name, code) are enclosed in double quotes. 
            {multimodal_input}"""

    try:
        # The prompt already contains multimodal_input; appending it again
        # would send the whole context to the model twice.
        response = await Settings.llm.acomplete(prompt)
        completion_text = response.text
        print(f"Output from LLM: {completion_text}")
        print()
        try:
            return output_parser.parse(completion_text)
        except Exception as parse_error:
            print(f"Error parsing LLM output: {parse_error}")
            return {"alphas": []}  # empty result when the completion is not parseable
    except Exception as e:
        print(f"LLM or other Error: {e}")
        return None
    
async def generate_alphas_for_macro(query, query_engine, multimodal_input):
    """Ask the configured LLM for seed alphas in the "Macro Economics" domain.

    Args:
        query: Not used by this function; kept so every domain generator
            shares the same signature.
        query_engine: Not used by this function; kept for the same reason.
        multimodal_input: Context text interpolated at the end of the prompt.

    Returns:
        The parsed completion on success, ``{"alphas": []}`` when the
        completion cannot be parsed, or ``None`` when the LLM call fails.
    """
    response_schemas = [
        ResponseSchema(name="alphas", description="A list of alpha objects."),
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

    # The retrieved context is embedded once, at the very end of the prompt.
    # The "Inflation Rate" example differences CPI against its own lag.
    prompt = f"""Generate *unique seed alphas* related to: *Macro Economics*.  Provide the alpha name and code.  Focus on alphas suitable for daily stock market data.*The domain name of all alpha factors must be "Macro Economics".*

            Return the result as a *valid JSON object (dictionary)*.  The JSON object *must* have the following structure:

            ```json
            {{
            "alphas": [
                {{
                "domain": "Macro Economics",
                "name": "GDP Growth Rate",
                "code": "GDP - DELAY (GDP, n)"
                }},
                {{
                "domain": "Macro Economics",
                "name": "Inflation Rate",
                "code": "CPI - DELAY (CPI, n)"
                }},
                {{
                "domain": "Macro Economics",
                "name": "Unemployment Rate",
                "code": "UNEMPLOYMENT_RATE - DELAY(UNEMPLOYMENT_RATE, n)"
                }},
                // ... more examples (at least 8-10 per domain if possible)
            ]
            }}
            ```
            *It is absolutely crucial that the response is valid JSON, and nothing else.* Do not include other alpha factors' domains. *Do not include any explanatory text outside the JSON object.* If you cannot generate any alphas, return an empty JSON object: `{{ "alphas": [] }}`.
            Make sure all the keys (domain, name, code) are enclosed in double quotes. 
            {multimodal_input}"""

    try:
        # The prompt already contains multimodal_input; appending it again
        # would send the whole context to the model twice.
        response = await Settings.llm.acomplete(prompt)
        completion_text = response.text
        print(f"Output from LLM: {completion_text}")
        print()
        try:
            return output_parser.parse(completion_text)
        except Exception as parse_error:
            print(f"Error parsing LLM output: {parse_error}")
            return {"alphas": []}  # empty result when the completion is not parseable
    except Exception as e:
        print(f"LLM or other Error: {e}")
        return None
    
async def generate_all_domain_alphas(query, query_engine, multimodal_input):
    """Run every per-domain alpha generator in turn and collect the results.

    The generators are awaited one after another, in the order Momentum,
    Mean Reversion, Volatility, Fundamental, Liquidity, Quality, Growth,
    Technical, Macro Economics. All three arguments are forwarded unchanged.

    Returns:
        A list with one entry per domain, holding whatever the corresponding
        generator returned.
    """
    domain_generators = (
        generate_alphas_for_momentum,
        generate_alphas_for_mean_reversion,
        generate_alphas_for_volatility,
        generate_alphas_for_funadmental,
        generate_alphas_for_liquidity,
        generate_alphas_for_quality,
        generate_alphas_for_growth,
        generate_alphas_for_technical,
        generate_alphas_for_macro,
    )

    collected = []
    for generate in domain_generators:
        domain_result = await generate(query, query_engine, multimodal_input)
        collected.append(domain_result)
    return collected


async def main():
    data_dir = "arxiv_pdfs"
    if not os.path.exists(data_dir):
        print(f"Error: Directory '{data_dir}' does not exist. Create it and add your PDF files.")
        return

    try:
        storage_context = StorageContext.from_defaults(persist_dir="storage")
        index = load_index_from_storage(storage_context)
        query_engine = index.as_query_engine()
        print("Index loaded from storage.")
    except Exception:
        documents = SimpleDirectoryReader(data_dir).load_data()
        index = VectorStoreIndex.from_documents(documents)
        query_engine = index.as_query_engine()
        index.storage_context.persist("storage")
        print("New index created and persisted.")

    query = "research on momentum strategies"
    # query = "research on alpha factors"
    multimodal_input = await retrieving_documents_and_creating_multimodal_input(query, query_engine)
    all_alphas = await generate_all_domain_alphas(query, query_engine, multimodal_input)
    return all_alphas

if __name__ == "__main__":
    json_new_text = await main() # Use asyncio.run to execute the async main function

Index loaded from storage.
Output from LLM: ```
{
    "alphas": [
        {
            "domain": "Momentum",
            "name": "Price Momentum",
            "code": "(CLOSE - DELAY(CLOSE, 14))"
        },
        {
            "domain": "Momentum",
            "name": "Volume Momentum",
            "code": "(VOLUME - DELAY(VOLUME, 14))"
        },
        {
            "domain": "Momentum",
            "name": "RSI Momentum",
            "code": "(RSI - DELAY(RSI, 14))"
        },
        {
            "domain": "Momentum",
            "name": "Stochastic Momentum",
            "code": "(STOCHASTIC - DELAY(STOCHASTIC, 14))"
        },
        {
            "domain": "Momentum",
            "name": "Moving Average Momentum",
            "code": "(SMA - DELAY(SMA, 14))"
        },
        {
            "domain": "Momentum",
            "name": "Bollinger Bands Momentum",
            "code": "(BBAND - DELAY(BBAND, 14))"
        },
        {
            "domain": "Momentum",
           

In [18]:
# Flatten the per-domain LLM results into one table with a row per alpha.
# A domain entry is None when its LLM call failed, and json_new_text itself is
# None when main() returned early, so both cases are skipped instead of raising.
alpha_records = [
    alpha
    for item in (json_new_text or [])
    if item
    for alpha in (item.get("alphas") or [])
]

# Always bind new_combined_df, even when nothing was generated, so that the
# cells below never hit a NameError.
if alpha_records:
    new_combined_df = pd.DataFrame(alpha_records)
else:
    new_combined_df = pd.DataFrame(columns=["domain", "name", "code"])

pd.set_option('display.max_rows', None)

new_combined_df

Unnamed: 0,domain,name,code
0,Momentum,Price Momentum,"(CLOSE - DELAY(CLOSE, 14))"
1,Momentum,Volume Momentum,"(VOLUME - DELAY(VOLUME, 14))"
2,Momentum,RSI Momentum,"(RSI - DELAY(RSI, 14))"
3,Momentum,Stochastic Momentum,"(STOCHASTIC - DELAY(STOCHASTIC, 14))"
4,Momentum,Moving Average Momentum,"(SMA - DELAY(SMA, 14))"
5,Momentum,Bollinger Bands Momentum,"(BBAND - DELAY(BBAND, 14))"
6,Momentum,Force Index Momentum,"(FORCEINDEX - DELAY(FORCEINDEX, 14))"
7,Momentum,Rate of Change Momentum,"(RATEOFCHANGE - DELAY(RATEOFCHANGE, 14))"
8,Momentum,Money Flow Index Momentum,"(MFI - DELAY(MFI, 14))"
9,Mean Reversion,Mean Reversion (20 days),"(MEAN(CLOSE, 20) - CLOSE)"


In [19]:
# Write the combined alpha table to CSV, leaving out the DataFrame index column.
new_combined_df.to_csv("best_results_ever.csv", index=False)