In [None]:
!pip install yfinance
!pip install transformers
!pip install datasets
!pip install accelerate
!pip install faiss-cpu
!pip install py2neo

In [None]:
import yfinance as yf
import random
import json

# Ticker symbols used as the sampling pool for the balance-sheet download.
# The list has 100 entries but only 97 distinct symbols: "AMD", "JPM" and "GS"
# each appear twice, so sampling all 100 entries fetches those companies twice.
# NOTE(review): some symbols (e.g. "FB", "TWTR", "SNE") may no longer resolve on
# Yahoo Finance — verify before relying on full coverage.
company_tickers = [
    "AAPL", "TSLA", "GOOGL", "AMZN", "MSFT", "FB", "NFLX", "NVDA", "INTC", "AMD",
    "BABA", "BA", "WMT", "PFE", "MRNA", "JPM", "GS", "V", "MA", "PYPL",
    "KO", "PEP", "XOM", "CVX", "SPCE", "UBER", "LYFT", "TWTR", "SHOP", "SQ",
    "T", "VZ", "IBM", "ORCL", "CSCO", "ADBE", "CRM", "ZM", "DOCU", "SNOW",
    "DIS", "SBUX", "MCD", "NKE", "HD", "COST", "LOW", "TGT", "WBA", "CVS",
    "TXN", "QCOM", "MU", "AVGO", "LRCX", "AMD", "REGN", "GILD", "BIIB", "MRK",
    "LLY", "ABBV", "ABT", "BMY", "DHR", "MDT", "ISRG", "SYK", "BDX", "ZTS",
    "NVS", "AZN", "SNY", "ROG", "BIDU", "JD", "PDD", "NTES", "TME", "BILI",
    "TSM", "SNE", "NTDOY", "NSRGY", "HSBC", "RY", "TD", "BCS", "BNS", "BBVA",
    "DB", "CS", "UBS", "JPM", "BAC", "C", "MS", "WFC", "GS", "AXP"
]

# Extract financial data

# Balance-sheet row labels to try, in priority order. The first entry of each
# tuple is the label the notebook originally used; the others are fallbacks.
_ASSET_LABELS = ("Total Assets",)
_LIABILITY_LABELS = (
    "Total Liabilities",
    "Total Liabilities Net Minority Interest",
    # Retained from the original fallback as a last resort; long-term debt is
    # only a part of total liabilities, so it understates the real figure.
    "Long Term Debt",
)
_EQUITY_LABELS = ("Total Stockholder Equity", "Stockholders Equity", "Net Worth")


def _first_available(balance_sheet, labels):
    """Return the most recent usable value for the first matching row label.

    Args:
        balance_sheet: DataFrame indexed by row label, most recent period in
            the first column.
        labels: Row labels to try, in priority order.

    Returns:
        The value as a plain ``float``, or ``None`` when no label is present
        or every candidate value is missing/NaN.
    """
    import math

    for label in labels:
        if label not in balance_sheet.index:
            continue
        try:
            number = float(balance_sheet.loc[label].iloc[0])
        except (TypeError, ValueError):
            # Non-numeric cell (or an ambiguous duplicated label): try the next label.
            continue
        if not math.isnan(number):
            return number
    return None


def fetch_and_save_financial_data(ticker_list, num_companies=100):
    """Download balance-sheet figures for a random sample of tickers and save them.

    Args:
        ticker_list: Iterable of ticker symbols; duplicates are ignored.
        num_companies: Number of distinct tickers to sample. Values larger than
            the number of unique tickers are clamped instead of raising.

    Returns:
        A list of dicts with the keys ``ticker``, ``Total Assets``,
        ``Total Liabilities``, ``Total Stockholder Equity`` and ``Year``.
        Tickers with no usable figure at all are omitted. The same list is
        written to ``financial_data_100_companies.json``.
    """
    # dict.fromkeys removes duplicates while keeping the original order, so a
    # ticker listed twice is not downloaded (and stored) twice.
    unique_tickers = list(dict.fromkeys(ticker_list))
    sample_size = max(0, min(num_companies, len(unique_tickers)))
    if sample_size < num_companies:
        print(
            f"Requested {num_companies} companies but only {len(unique_tickers)} "
            f"unique tickers are available; fetching {sample_size}."
        )
    selected_tickers = random.sample(unique_tickers, sample_size)

    financial_data = []
    for ticker in selected_tickers:
        print(f"Fetching financial data for {ticker}...")
        try:
            balance_sheet = yf.Ticker(ticker).balance_sheet
        except Exception as exc:
            # A single failing download must not abort the whole run.
            print(f"Failed to fetch balance sheet for {ticker}: {exc}")
            continue

        if balance_sheet is None or balance_sheet.empty:
            print(f"No balance sheet data found for {ticker}")
            continue

        total_assets = _first_available(balance_sheet, _ASSET_LABELS)
        if total_assets is None:
            print(f"Total Assets not found for {ticker}")

        total_liabilities = _first_available(balance_sheet, _LIABILITY_LABELS)
        if total_liabilities is None:
            print(f"Total Liabilities not found for {ticker} (tried alternative fields).")

        total_equity = _first_available(balance_sheet, _EQUITY_LABELS)
        if total_equity is None:
            print(f"Total Stockholder Equity not found for {ticker} (tried alternative fields).")

        # The column label is the reporting date; it is stored as a string so
        # that the record can be serialised to JSON.
        try:
            year = str(balance_sheet.columns[0])
        except IndexError:
            year = "Unknown"

        # Keep the record only if at least one figure is present.
        if total_assets is None and total_liabilities is None and total_equity is None:
            continue

        financial_data.append({
            "ticker": ticker,
            "Total Assets": total_assets,
            "Total Liabilities": total_liabilities,
            "Total Stockholder Equity": total_equity,
            "Year": year,
        })

    # Saving the fetched data to a file
    with open("financial_data_100_companies.json", "w") as f:
        json.dump(financial_data, f)

    print(f"Financial data for {len(financial_data)} companies saved successfully.")
    return financial_data

# Fetch balance-sheet figures for a random sample drawn from `company_tickers`.
# The function also writes the result to financial_data_100_companies.json.
financial_data = fetch_and_save_financial_data(company_tickers, num_companies=100)


In [None]:
# Creating a question-answer dataset using the financial data
def prepare_question_answer_dataset(financial_data):
    """Turn per-company balance-sheet records into question/answer pairs.

    Args:
        financial_data: Iterable of dicts with the keys ``ticker``, ``Year``,
            ``Total Assets``, ``Total Liabilities`` and
            ``Total Stockholder Equity``.

    Returns:
        A list of ``{"question": ..., "answer": ...}`` dicts, up to three per
        company. A pair is skipped when its figure is missing (``None``/NaN),
        so the dataset never contains answers such as "were None USD".
    """
    import math

    # (record key, question template, answer template)
    templates = (
        (
            "Total Assets",
            "What are the total assets of {ticker} in {year}?",
            "The total assets of {ticker} in {year} were {value} USD.",
        ),
        (
            "Total Liabilities",
            "What are the total liabilities of {ticker} in {year}?",
            "The total liabilities of {ticker} in {year} were {value} USD.",
        ),
        (
            "Total Stockholder Equity",
            "What is the total shareholder equity of {ticker} in {year}?",
            "The total shareholder equity of {ticker} in {year} was {value} USD.",
        ),
    )

    dataset = []
    for company in financial_data:
        ticker = company['ticker']
        year = company['Year']
        for key, question_template, answer_template in templates:
            value = company.get(key)
            if value is None or (isinstance(value, float) and math.isnan(value)):
                # No figure available: a Q/A pair would teach a wrong answer.
                continue
            dataset.append({
                "question": question_template.format(ticker=ticker, year=year),
                "answer": answer_template.format(ticker=ticker, year=year, value=value),
            })

    return dataset

# Build the question/answer pairs from the fetched company records
financial_finetune_dataset = prepare_question_answer_dataset(financial_data)

# Persist the pairs as JSON so the fine-tuning cell can load them from disk
with open("financial_finetune_dataset.json", "w") as f:
    f.write(json.dumps(financial_finetune_dataset))

print("Financial dataset prepared and saved.")


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW, DataCollatorForLanguageModeling
import torch
from datasets import Dataset
import json
from accelerate import Accelerator

# Loading the fine-tuning dataset
# NOTE: this rebinds `financial_data` from the per-company records to the list
# of question/answer pairs; later cells that read `entry['question']` rely on it.
with open("financial_finetune_dataset.json", "r") as f:
    financial_data = json.load(f)

# Preparing the dataset for Hugging Face's format
dataset = Dataset.from_list(financial_data)

# Loading tokenizer and model
model_name = "NousResearch/LLaMA-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization below pads to a fixed length, which raises if the tokenizer has
# no padding token. Fall back to the end-of-sequence token in that case.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the weights in float16 and let `device_map="auto"` decide where the
# layers are placed (this is automatic device placement, not ZeRO-Offload).
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)

# Tokenizing the dataset
def tokenize_data(example):
    """Tokenize one example or a batch of examples as "Question: ... Answer: ...".

    Args:
        example: Mapping with ``question`` and ``answer``. Each is either a
            single string (non-batched ``map``) or a list of strings of equal
            length (``map(..., batched=True)``).

    Returns:
        The tokenizer output, truncated/padded to 512 tokens. For batched input
        it contains one encoded row per question/answer pair.
    """
    questions = example['question']
    answers = example['answer']

    if isinstance(questions, str):
        text = f"Question: {questions} Answer: {answers}"
    else:
        # With batched=True the columns arrive as lists. Formatting the lists
        # themselves would collapse the whole batch into one garbage string,
        # so build one prompt per pair instead.
        text = [f"Question: {q} Answer: {a}" for q, a in zip(questions, answers)]

    return tokenizer(
        text,
        truncation=True,
        max_length=512,
        padding="max_length"
    )

# Tokenizing the dataset
tokenized_dataset = dataset.map(tokenize_data, batched=True, remove_columns=["question", "answer"])

# Data collator to convert batch to tensors (causal LM: labels are the inputs)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Wrap the dataset in a DataLoader so that the collator builds the batches and
# Accelerate has an object it can prepare (it does not wrap a bare Dataset).
train_dataloader = torch.utils.data.DataLoader(
    tokenized_dataset,
    batch_size=1,
    shuffle=True,
    collate_fn=data_collator,
)

# Initializing Accelerator for memory-efficient training
accelerator = Accelerator()

# Initializing optimizer (torch's AdamW; the transformers one is deprecated)
# NOTE(review): the model weights are loaded in float16; optimising pure-fp16
# weights can be numerically unstable — confirm the loss stays finite.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Preparing model, optimizer, and data loader with Accelerator; the prepared
# loader yields batches that are already on the accelerator's device.
model, optimizer, train_dataloader = accelerator.prepare(model, optimizer, train_dataloader)

# Training loop
for epoch in range(5):
    model.train()
    skipped_batches = 0
    for batch in train_dataloader:
        input_ids = batch['input_ids']

        # Skip malformed rows that lack a sequence dimension or hold a single token.
        if input_ids.dim() < 2 or input_ids.shape[1] < 2:
            skipped_batches += 1
            continue

        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()

    if skipped_batches:
        print(f"Skipped {skipped_batches} malformed batches in epoch {epoch + 1}.")
    print(f"Epoch {epoch + 1} completed.")

# Saving the fine-tuned model; unwrap first so the underlying model is saved
# rather than any wrapper added by Accelerate.
accelerator.wait_for_everyone()
accelerator.unwrap_model(model).save_pretrained("finetuned_llama_model")
tokenizer.save_pretrained("finetuned_llama_model")
print("Fine-tuned LLaMA model saved.")


In [None]:
# By the time this cell runs, `financial_data` has been rebound to the list of
# question/answer pairs, which have no 'ticker' or 'Total Assets' keys. Reload
# the per-company records written by the data-fetching cell instead.
with open("financial_data_100_companies.json", "r") as f:
    company_records = json.load(f)

# Checking the contents of the records to see which keys are present
print(company_records[:5])

# Preparing financial documents for FAISS, handling missing fields
documents = []
for company in company_records:
    # A key may be absent or stored as JSON null; both are rendered as a placeholder.
    ticker = company.get('ticker') or 'Unknown'
    total_assets = company.get('Total Assets')
    total_liabilities = company.get('Total Liabilities')
    shareholder_equity = company.get('Total Stockholder Equity')

    total_assets = 'N/A' if total_assets is None else total_assets
    total_liabilities = 'N/A' if total_liabilities is None else total_liabilities
    shareholder_equity = 'N/A' if shareholder_equity is None else shareholder_equity

    document = f"{ticker} Financials: Total Assets {total_assets}, Total Liabilities {total_liabilities}, Shareholder Equity {shareholder_equity}"
    documents.append(document)

# NOTE(review): `setup_faiss` is not defined in the visible part of this
# notebook; it must be defined in an earlier cell for a fresh run to succeed.
index, embedding_model, embedding_tokenizer = setup_faiss(documents)


In [None]:
# Build one FAISS document per question/answer pair
documents = [
    f"Question: {entry['question']} Answer: {entry['answer']}"
    for entry in financial_data
]

# Embed the documents and build the index used by the retrieval cells
index, embedding_model, embedding_tokenizer = setup_faiss(documents)

print(f"Total documents indexed: {len(documents)}")


In [None]:
import torch
import faiss
from transformers import AutoModel, AutoTokenizer

#  Querying FAISS index for relevant financial documents based on user query
def query_faiss(query, index, embedding_model, embedding_tokenizer, documents, k=3):
    """Return the documents closest to ``query`` in the FAISS index.

    Args:
        query: User question as a string.
        index: FAISS index; position ``i`` is assumed to correspond to
            ``documents[i]`` (verify against the code that builds the index).
        embedding_model: Model whose output exposes ``last_hidden_state``.
        embedding_tokenizer: Tokenizer matching ``embedding_model``.
        documents: List of document strings.
        k: Maximum number of documents to return.

    Returns:
        Up to ``k`` documents, best match first. Empty when ``documents`` is
        empty or ``k`` is not positive.
    """
    if not documents or k <= 0:
        return []

    # Never ask for more neighbours than there are documents.
    top_k = min(k, len(documents))

    # Tokenize and encode the query
    encoded_query = embedding_tokenizer([query], return_tensors="pt", padding=True, truncation=True)

    # Moving inputs to the same device as the model
    encoded_query = {name: tensor.to(embedding_model.device) for name, tensor in encoded_query.items()}

    # Mean-pool the token embeddings into a single query vector; FAISS expects float32.
    with torch.no_grad():
        hidden_states = embedding_model(**encoded_query).last_hidden_state
    query_embedding = hidden_states.mean(dim=1).float().cpu().numpy()

    # Searching FAISS index to get top-k results
    _, indices = index.search(query_embedding, top_k)

    # FAISS pads missing neighbours with -1, which would otherwise silently
    # select the last document through negative indexing.
    return [documents[i] for i in indices[0] if 0 <= i < len(documents)]

# Sample question used to exercise the retriever
user_query = "What are the total assets of AAPL?"

# Look up the closest documents in the FAISS index
relevant_docs = query_faiss(
    user_query,
    index,
    embedding_model,
    embedding_tokenizer,
    documents,
)

# Show each hit with its 1-based rank
print("Relevant Financial Information:")
for i, doc in enumerate(relevant_docs, start=1):
    print(f"Result {i}: {doc}")


In [None]:
#  Adding Neo4j Integration
import os
from getpass import getpass

from py2neo import Graph, Node, Relationship

# Connection settings come from the environment so that no secret is stored in
# the notebook. The password is prompted for when NEO4J_PASSWORD is not set.
# NOTE: any password previously committed in this notebook should be rotated.
NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://8d1ce1f5.databases.neo4j.io")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")

graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

def query_neo4j(company_ticker):
    """Look up a company's stored financials in Neo4j.

    Args:
        company_ticker: Value matched against the ``name`` property of
            ``Company`` nodes.

    Returns:
        A summary string with total assets, total liabilities and shareholder
        equity from the first matching row, or a fixed "No structured
        financial data..." message when nothing matches.
    """
    # The ticker is passed as a query parameter rather than interpolated into
    # the Cypher text, so user-supplied input cannot alter the query.
    query = """
    MATCH (c:Company)-[:HAS_FINANCIAL_DATA]->(f:Financials)
    WHERE c.name = $ticker
    RETURN f.total_assets, f.total_liabilities, f.shareholder_equity
    """
    result = graph.run(query, {"ticker": company_ticker}).data()

    if not result:
        return "No structured financial data found for this company in Neo4j."

    financials = result[0]
    return f"Total Assets: {financials['f.total_assets']}, Total Liabilities: {financials['f.total_liabilities']}, Shareholder Equity: {financials['f.shareholder_equity']}"


In [None]:
# Full Pipeline
def generate_llm_response(query, retrieved_docs, llama_model, llama_tokenizer):
    """Answer ``query`` with the language model, using retrieved documents as context.

    Args:
        query: The user's question.
        retrieved_docs: Iterable of document strings placed in the prompt.
        llama_model: Causal language model exposing ``device`` and ``generate``.
        llama_tokenizer: Tokenizer matching ``llama_model``.

    Returns:
        The newly generated text only (the prompt is not echoed back), with
        surrounding whitespace removed.
    """
    # Combine the user query with retrieved FAISS documents
    context = "\n".join(retrieved_docs)
    full_prompt = f"User question: {query}\n\nFinancial Information:\n{context}\n\nAnswer:"

    # Put the inputs on whatever device the model lives on instead of assuming a GPU.
    inputs = llama_tokenizer(full_prompt, return_tensors="pt").to(llama_model.device)

    # Generate a response using the LLaMA model, controlling max_new_tokens
    with torch.no_grad():
        output = llama_model.generate(**inputs, max_new_tokens=150)

    # The output sequence starts with the prompt tokens; decode only what follows them.
    prompt_length = inputs["input_ids"].shape[1]
    response = llama_tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
    return response.strip()


def full_pipeline(user_query, faiss_index, embedding_model, embedding_tokenizer, documents, llama_model, llama_tokenizer):
    """Answer a financial question from Neo4j if possible, otherwise via FAISS + LLM.

    The last word of ``user_query`` (with surrounding punctuation removed and
    upper-cased) is treated as the company ticker.

    Args:
        user_query: The user's question; the ticker is expected to be its last word.
        faiss_index: FAISS index passed through to ``query_faiss``.
        embedding_model: Embedding model passed through to ``query_faiss``.
        embedding_tokenizer: Tokenizer passed through to ``query_faiss``.
        documents: Documents backing the FAISS index.
        llama_model: Language model passed to ``generate_llm_response``.
        llama_tokenizer: Tokenizer passed to ``generate_llm_response``.

    Returns:
        ``"Neo4j Financial Data: ..."`` when Neo4j has data for the ticker,
        otherwise ``"FAISS-based Financial Data: ..."`` with the model's answer.
    """
    import string

    # Strip any punctuation around the last word ("AAPL?", "AAPL.", "(AAPL)").
    # An empty or punctuation-only query yields no ticker instead of raising.
    words = user_query.split()
    company_ticker = words[-1].strip(string.punctuation).upper() if words else ""

    if company_ticker:
        neo4j_data = query_neo4j(company_ticker)
        # query_neo4j signals "nothing found" with this fixed message.
        if "No structured financial data" not in neo4j_data:
            return f"Neo4j Financial Data: {neo4j_data}"

    relevant_docs = query_faiss(user_query, faiss_index, embedding_model, embedding_tokenizer, documents)
    final_response = generate_llm_response(user_query, relevant_docs, llama_model, llama_tokenizer)
    return f"FAISS-based Financial Data: {final_response}"


In [None]:
user_query = "Should I invest in AAPL?"

# Route the question through Neo4j first, then FAISS + the fine-tuned model
final_response = full_pipeline(
    user_query=user_query,
    faiss_index=index,
    embedding_model=embedding_model,
    embedding_tokenizer=embedding_tokenizer,
    documents=documents,
    llama_model=model,
    llama_tokenizer=tokenizer,
)

# Show the answer
print("Final Response:")
print(final_response)


In [None]:
import yfinance as yf

def get_ground_truth(ticker):
    """Fetch the latest balance-sheet figures for ``ticker`` from Yahoo Finance.

    Args:
        ticker: Ticker symbol, e.g. ``"AAPL"``.

    Returns:
        A dict with the keys ``Total Assets``, ``Total Liabilities`` and
        ``Total Stockholder Equity`` (each ``None`` when unavailable or NaN),
        or ``None`` when no balance sheet is returned at all.
    """
    import math

    balance_sheet = yf.Ticker(ticker).balance_sheet
    if balance_sheet is None or balance_sheet.empty:
        return None

    def _latest(labels):
        # First usable value (most recent column) among equivalent row labels.
        for label in labels:
            if label not in balance_sheet.index:
                continue
            value = balance_sheet.loc[label].iloc[0]
            try:
                if math.isnan(float(value)):
                    continue
            except (TypeError, ValueError):
                continue
            return value
        return None

    # The first label in each tuple is the one originally used; the second is
    # an alternative label for the same figure.
    return {
        "Total Assets": _latest(("Total Assets",)),
        "Total Liabilities": _latest(("Total Liabilities", "Total Liabilities Net Minority Interest")),
        "Total Stockholder Equity": _latest(("Total Stockholder Equity", "Stockholders Equity")),
    }

# Example: fetch the reference balance-sheet figures for AAPL and display them.
# `get_ground_truth` returns None when no balance sheet is available.
ground_truth = get_ground_truth("AAPL")
print(ground_truth)


In [None]:
from nltk.translate.bleu_score import sentence_bleu

# Reference figures pulled from Yahoo Finance
ground_truth = get_ground_truth("AAPL")
ground_truth_answer = f"The total assets of AAPL in 2021 were {ground_truth['Total Assets']} USD."

# Hand-written stand-in for a LLaMA-generated answer
generated_response = "The total assets of AAPL in 2021 were 350 billion USD."

# BLEU works on whitespace tokens: a list of reference token lists and one
# candidate token list
reference = [ground_truth_answer.split()]
candidate = generated_response.split()

bleu_score = sentence_bleu(reference, candidate)
print(f"BLEU score: {bleu_score}")
