# Install and import libraries

In [None]:
# Install required packages
!pip install faiss-cpu sentence-transformers tqdm transformers accelerate



In [None]:
import os
import json
import glob
import numpy as np
import pandas as pd
import torch
import faiss
import tqdm
import gc
import time
import datetime
from typing import List, Dict, Any, Tuple, Optional
from dataclasses import dataclass
from transformers import LlamaForCausalLM, LlamaTokenizer, AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).

# Configuration

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/MyDrive/MSDS490_Project
# import os
# print(os.getcwd())

In [None]:
# Set device configuration: use the CUDA GPU when PyTorch reports one as
# available, otherwise fall back to the CPU. The choice is printed so it shows
# up in the notebook output.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Memory management utility functions
def clear_gpu_memory():
    """Free unreachable Python objects, then release cached GPU memory.

    Garbage collection runs first on purpose: tensors that are only kept alive
    by reference cycles are destroyed by ``gc.collect()``, and only after that
    can ``torch.cuda.empty_cache()`` hand their cached blocks back to the
    driver. The CUDA call is skipped when no GPU is available.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

def get_gpu_memory_usage():
    """Report the GPU memory currently allocated by PyTorch, in MB.

    Returns:
        ``torch.cuda.memory_allocated()`` converted from bytes to MB, or ``0``
        when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return 0
    allocated_bytes = torch.cuda.memory_allocated()
    return allocated_bytes / 1024**2

Using device: cuda


# Main Code

## Document Class

In [None]:
@dataclass
class Document:
    """Class to represent a document in the RAG system.

    A document is a piece of text together with a dictionary describing where
    it came from. The loaders in this file fill the metadata with keys such as
    ``'source'``, ``'company'`` and ``'file'``, plus whatever extra fields the
    input data provides.
    """
    # Text content of the document or chunk.
    text: str
    # Descriptive fields about the text; keys are strings, values are arbitrary.
    metadata: Dict[str, Any]

## Financial Data Loader

In [None]:
class FinancialDataLoader:
    """Load and preprocess financial data from various sources.

    Four source folders are looked up under ``base_dir`` -- ``10-K``, ``PR``,
    ``Earnings_Transcripts`` and ``SEC_Data`` -- each expected to hold one
    sub-folder per company. All loaders are best-effort: a file that cannot be
    read or parsed is reported on stdout and skipped, never raised.
    """

    def __init__(self, base_dir: str):
        """Initialize the data loader.

        Args:
            base_dir: Base directory containing the data folders
        """
        self.base_dir = base_dir
        self.tenk_dir = os.path.join(base_dir, "10-K")
        self.pr_dir = os.path.join(base_dir, "PR")
        self.earnings_dir = os.path.join(base_dir, "Earnings_Transcripts")
        self.sec_dir = os.path.join(base_dir, "SEC_Data")

        # A company is any sub-folder name found in at least one source folder.
        self.companies = set()
        for directory in (self.tenk_dir, self.pr_dir, self.earnings_dir, self.sec_dir):
            if not os.path.exists(directory):
                continue
            for entry in glob.glob(os.path.join(directory, "*")):
                if os.path.isdir(entry):
                    self.companies.add(os.path.basename(entry))
        # Sorted only for a stable, readable message; the attribute stays a set.
        print(f"Found {len(self.companies)} companies: {', '.join(sorted(self.companies))}")

    def _iter_company_files(self, source_dir: str, pattern: str):
        """Yield ``(company, file_path)`` for every match of *pattern* in each company folder of *source_dir*."""
        for company in tqdm.tqdm(self.companies):
            company_dir = os.path.join(source_dir, company)
            if not os.path.exists(company_dir):
                continue
            for file_path in glob.glob(os.path.join(company_dir, pattern)):
                yield company, file_path

    @staticmethod
    def _read_json(file_path: str):
        """Parse *file_path* as UTF-8 encoded JSON and return the decoded value."""
        # Do not rely on the platform default encoding: it differs between
        # systems and would mis-decode non-ASCII text in UTF-8 JSON files.
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    @staticmethod
    def _iter_chunks(data):
        """Yield ``(position_metadata, chunk)`` for each dict chunk of a parsed JSON payload.

        A list payload yields its dict items with ``{'chunk_id': index}``; a
        single dict payload yields itself with no positional metadata; any
        other payload yields nothing.
        """
        if isinstance(data, list):
            for i, chunk in enumerate(data):
                if isinstance(chunk, dict):
                    yield {'chunk_id': i}, chunk
        elif isinstance(data, dict):
            yield {}, data

    @staticmethod
    def _format_metric_value(value) -> str:
        """Render a metric value with thousands separators when its type supports them."""
        try:
            return f"{value:,}"
        except (TypeError, ValueError):
            # e.g. None or a string value: keep the entry instead of failing the file.
            return str(value)

    def load_10k_data(self) -> List[Document]:
        """Load 10-K report data.

        Reads every ``*.json`` file in each company's 10-K folder. A chunk
        becomes a document only if it is a dict with a ``'text'`` key; its
        remaining keys are copied into the metadata after the defaults, so
        they win on a name clash.

        Returns:
            One Document per accepted chunk.
        """
        documents = []

        print("Loading 10-K data...")
        for company, file_path in self._iter_company_files(self.tenk_dir, "*.json"):
            try:
                data = self._read_json(file_path)
                filename = os.path.basename(file_path)

                for position, chunk in self._iter_chunks(data):
                    if 'text' not in chunk:
                        continue
                    documents.append(Document(
                        text=chunk['text'],
                        metadata={
                            'source': '10-K',
                            'company': company,
                            'file': filename,
                            **position,
                            **{k: v for k, v in chunk.items() if k != 'text'},
                        },
                    ))
            except Exception as e:
                # Deliberately broad: one bad file must not stop the whole load.
                print(f"Error loading 10-K file {file_path}: {e}")

        print(f"Loaded {len(documents)} 10-K document chunks")
        return documents

    def load_pr_data(self) -> List[Document]:
        """Load press release data.

        Every regular file in each company's PR folder is read as text (UTF-8,
        undecodable bytes dropped) and becomes one document.

        Returns:
            One Document per press release file.
        """
        documents = []

        print("Loading press release data...")
        for company, file_path in self._iter_company_files(self.pr_dir, "*"):
            # "*" also matches sub-folders, which are not press releases.
            if not os.path.isfile(file_path):
                continue
            try:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()

                documents.append(Document(
                    text=content,
                    metadata={
                        'source': 'Press Release',
                        'company': company,
                        'file': os.path.basename(file_path),
                    },
                ))
            except Exception as e:
                print(f"Error loading PR file {file_path}: {e}")

        print(f"Loaded {len(documents)} press release documents")
        return documents

    def load_earnings_data(self) -> List[Document]:
        """Load earnings transcript data.

        Reads every ``*.json`` file in each company's transcript folder. The
        text lives under ``'content'`` (not ``'text'``). A chunk's optional
        ``'metadata'`` dict and its other top-level keys are merged into the
        document metadata, in that order.

        Returns:
            One Document per accepted chunk.
        """
        documents = []

        print("Loading earnings transcript data...")
        for company, file_path in self._iter_company_files(self.earnings_dir, "*.json"):
            try:
                data = self._read_json(file_path)
                filename = os.path.basename(file_path)

                for position, chunk in self._iter_chunks(data):
                    if 'content' not in chunk:
                        continue

                    # A null or otherwise non-dict 'metadata' value cannot be
                    # unpacked; ignore it rather than losing the whole file.
                    chunk_metadata = chunk.get('metadata')
                    if not isinstance(chunk_metadata, dict):
                        chunk_metadata = {}

                    documents.append(Document(
                        text=chunk['content'],
                        metadata={
                            'source': 'Earnings Transcript',
                            'company': company,
                            'file': filename,
                            **position,
                            **chunk_metadata,
                            **{k: v for k, v in chunk.items()
                               if k not in ('content', 'metadata')},
                        },
                    ))
            except Exception as e:
                print(f"Error loading earnings file {file_path}: {e}")

        print(f"Loaded {len(documents)} earnings transcript chunks")
        return documents

    def load_sec_data(self) -> List[Document]:
        """Load SEC financial metric data.

        Each ``*.json`` file is expected to map metric names to lists of entry
        dicts. Entries that have both ``'end'`` and ``'val'`` are grouped by
        the part of ``'end'`` before the first ``-`` (the year for ISO dates),
        and every (metric, year) group is rendered as one readable text
        document so it can be embedded and retrieved.

        Returns:
            One Document per metric and year.
        """
        documents = []

        print("Loading SEC financial data...")
        for company, file_path in self._iter_company_files(self.sec_dir, "*.json"):
            try:
                data = self._read_json(file_path)
                if not isinstance(data, dict):
                    continue
                filename = os.path.basename(file_path)

                for metric, values in data.items():
                    if not isinstance(values, list):
                        continue

                    # Group by year for better context
                    by_year = {}
                    for entry in values:
                        if isinstance(entry, dict) and 'end' in entry and 'val' in entry:
                            year = str(entry['end']).split('-')[0]
                            by_year.setdefault(year, []).append(entry)

                    for year, entries in by_year.items():
                        lines = [f"Financial Metric: {metric} for {company} in {year}\n\n"]
                        for entry in entries:
                            value = self._format_metric_value(entry['val'])
                            form = entry.get('form', 'N/A')
                            filed = entry.get('filed', 'N/A')
                            lines.append(
                                f"As of {entry['end']}, {metric} was {value} "
                                f"reported in {form} filed on {filed}.\n"
                            )

                        documents.append(Document(
                            text="".join(lines),
                            metadata={
                                'source': 'SEC Financial Data',
                                'company': company,
                                'metric': metric,
                                'year': year,
                                'file': filename,
                            },
                        ))
            except Exception as e:
                print(f"Error loading SEC file {file_path}: {e}")

        print(f"Loaded {len(documents)} SEC financial data documents")
        return documents

    def load_all_data(self) -> List[Document]:
        """Load all available data.

        Runs the four loaders in the order 10-K, press releases, earnings
        transcripts, SEC data, prints a per-source count and returns the
        concatenated documents in that same order.
        """
        all_docs = []
        loaders = (
            self.load_10k_data,
            self.load_pr_data,
            self.load_earnings_data,
            self.load_sec_data,
        )
        for load in loaders:
            all_docs.extend(load())

        # Count documents by source
        source_counts = {}
        for doc in all_docs:
            source = doc.metadata.get('source', 'Unknown')
            source_counts[source] = source_counts.get(source, 0) + 1

        print("\nDocument counts by source:")
        for source, count in source_counts.items():
            print(f"  {source}: {count} documents")

        print(f"\nTotal documents loaded: {len(all_docs)}")
        return all_docs

## Vector Storage

In [None]:
class VectorStore:
    """FAISS vector store for document retrieval.

    Documents are embedded with a SentenceTransformer model and stored in a
    flat L2 FAISS index. Row ``i`` of the index corresponds to
    ``self.documents[i]``, so the two must always be replaced together.
    """

    def __init__(self, embedding_model_name: str = "all-MiniLM-L6-v2"):
        """Initialize the vector store.

        Args:
            embedding_model_name: Name of the SentenceTransformer model to use

        Raises:
            Exception: Whatever SentenceTransformer raises while loading; the
                error is printed and re-raised unchanged.
        """
        try:
            self.embedding_model = SentenceTransformer(embedding_model_name)
            self.embedding_dim = self.embedding_model.get_sentence_embedding_dimension()
            print(f"Loaded embedding model: {embedding_model_name} with dimension {self.embedding_dim}")
        except Exception as e:
            print(f"Error loading embedding model: {e}")
            raise

        # Keep the exact identifier so save() persists a name load() can reuse.
        self.embedding_model_name = embedding_model_name
        self.index = None
        self.documents = []

    def add_documents(self, documents: List[Document], batch_size: int = 64):
        """Embed *documents* and build a fresh index from them.

        Despite the name, this replaces any previously stored documents and
        index rather than appending to them.

        Args:
            documents: List of documents to add
            batch_size: Batch size for processing embeddings

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        self.documents = documents
        total_batches = (len(documents) + batch_size - 1) // batch_size
        all_embeddings = []

        print(f"Computing embeddings for {len(documents)} documents in {total_batches} batches...")
        batch_starts = range(0, len(documents), batch_size)
        for batch_number, start in enumerate(batch_starts, start=1):
            batch = [doc.text for doc in documents[start:start + batch_size]]
            print(f"Processing batch {batch_number}/{total_batches}...")
            all_embeddings.append(self.embedding_model.encode(batch, show_progress_bar=True))

            # Memory management: release cached GPU memory every 10 batches.
            if batch_number % 10 == 0:
                clear_gpu_memory()
                print(f"GPU memory usage: {get_gpu_memory_usage():.2f} MB")

        if all_embeddings:
            embeddings = np.vstack(all_embeddings).astype('float32')
        else:
            # np.vstack rejects an empty list; an empty store is still valid.
            embeddings = np.zeros((0, self.embedding_dim), dtype='float32')
        print(f"Generated embeddings shape: {embeddings.shape}")

        # Create FAISS index
        self.index = faiss.IndexFlatL2(self.embedding_dim)
        if len(embeddings) > 0:
            self.index.add(embeddings)
        print("FAISS index created successfully")

    def save(self, directory: str):
        """Save the vector store to disk.

        Writes ``index.faiss``, ``documents.json`` and ``config.json`` into
        *directory*, creating it if necessary.

        Args:
            directory: Directory to save the vector store
        """
        os.makedirs(directory, exist_ok=True)

        # Save FAISS index
        faiss.write_index(self.index, os.path.join(directory, "index.faiss"))

        # Save documents as JSON
        with open(os.path.join(directory, "documents.json"), 'w', encoding='utf-8') as f:
            json.dump(
                [{'text': doc.text, 'metadata': doc.metadata} for doc in self.documents],
                f,
            )

        # Save the embedding model name exactly as it was given to __init__;
        # guessing it from the model object can produce a name that load()
        # cannot resolve.
        with open(os.path.join(directory, "config.json"), 'w', encoding='utf-8') as f:
            json.dump({
                'embedding_model': self.embedding_model_name,
                'embedding_dim': self.embedding_dim
            }, f)

        print(f"Vector store saved to {directory}")

    @classmethod
    def load(cls, directory: str):
        """Load a vector store from disk.

        Args:
            directory: Directory containing the saved vector store

        Returns:
            Loaded VectorStore instance
        """
        # Load config
        with open(os.path.join(directory, "config.json"), 'r', encoding='utf-8') as f:
            config = json.load(f)

        # Create instance with saved embedding model
        vector_store = cls(embedding_model_name=config['embedding_model'])

        # Load FAISS index
        vector_store.index = faiss.read_index(os.path.join(directory, "index.faiss"))

        # Load documents
        with open(os.path.join(directory, "documents.json"), 'r', encoding='utf-8') as f:
            docs_data = json.load(f)
        vector_store.documents = [
            Document(text=item['text'], metadata=item['metadata'])
            for item in docs_data
        ]

        print(f"Loaded vector store from {directory}")
        print(f"Index contains {vector_store.index.ntotal} vectors")
        print(f"Loaded {len(vector_store.documents)} documents")
        if vector_store.index.ntotal != len(vector_store.documents):
            # search() maps index rows to documents by position.
            print("Warning: index size and document count differ; search results may be misaligned")

        return vector_store

    def search(self, query: str, top_k: int = 5) -> List[Tuple[Document, float]]:
        """Search for similar documents to the query.

        Args:
            query: Query string
            top_k: Number of results to return

        Returns:
            List of (document, score) tuples in the order FAISS returns them.
            The score is the distance reported by the index, so smaller means
            closer. Fewer than ``top_k`` items are returned when the index
            holds fewer vectors.
        """
        if top_k <= 0:
            return []

        # Encode query
        query_embedding = self.embedding_model.encode([query])[0].reshape(1, -1).astype('float32')

        # Search
        distances, indices = self.index.search(query_embedding, top_k)

        # FAISS pads missing neighbours with label -1; without the lower bound
        # a -1 would silently select the last document via negative indexing.
        return [
            (self.documents[idx], float(distance))
            for idx, distance in zip(indices[0], distances[0])
            if 0 <= idx < len(self.documents)
        ]


## Prompt Builder

In [None]:
class PromptBuilder:
    """Build prompts for different audience types."""

    # One system/human template pair per audience. The human templates take
    # the placeholders {companies}, {query} and {context}.
    AUDIENCE_TEMPLATES = {
        "analyst": {
            "system": """You are an AI assistant specialized in financial analysis. You analyze corporate financial data, SEC filings, earnings transcripts, and press releases to provide detailed analytical insights. Focus on trends, metrics, comparative analysis, and factual reporting. Use precise financial terminology and cite your sources clearly.""",
            "human": """Analyze the following information about {companies} focused on {query}. Provide a comprehensive analysis with specific metrics, trends, and risk factors. Include inline citations to the source documents for all claims.

Context information:
{context}"""
        },
        "executive": {
            "system": """You are an AI assistant that provides concise executive summaries of financial information. You analyze corporate financial data, SEC filings, earnings transcripts, and press releases to extract strategic insights. Focus on high-level implications, market positioning, and business impact. Be direct, concise, and action-oriented in your analysis.""",
            "human": """Provide an executive summary about {companies} focused on {query}. Highlight key strategic insights, competitive positioning, and business implications. Be concise but thorough, and include inline citations to the source documents.

Context information:
{context}"""
        },
        "investor": {
            "system": """You are an AI assistant that analyzes financial information for investors. You review corporate financial data, SEC filings, earnings transcripts, and press releases to provide investment-focused insights. Focus on performance metrics, future outlook, risk assessment, and potential investment implications. Be balanced in your assessment and cite your sources clearly.""",
            "human": """Provide an investor-focused analysis of {companies} regarding {query}. Address performance trends, growth potential, risk factors, and competitive position. Include relevant metrics and inline citations to the source documents.

Context information:
{context}"""
        }
    }

    @staticmethod
    def _date_suffix(metadata) -> str:
        """Return ``" (YYYY)"`` for a document's metadata, or ``""`` if no year is known.

        The first 4-digit, all-numeric ``_``-separated part of the file name
        wins; otherwise the ``'year'`` metadata field is used. The year
        fallback applies whether or not a file name is present.
        """
        filename = metadata.get('file')
        if filename is not None:
            for part in str(filename).split('_'):
                if part.isdigit() and len(part) == 4:
                    return f" ({part})"
        if 'year' in metadata:
            return f" ({metadata['year']})"
        return ""

    @classmethod
    def build_prompt(cls, query: str, retrieved_docs: List[Tuple[Document, float]],
                    audience: str = "analyst") -> Tuple[str, str]:
        """Build a prompt for the given query, retrieved documents, and audience type.

        Args:
            query: User query
            retrieved_docs: List of (document, score) tuples from vector search
            audience: Type of audience (analyst, executive, or investor);
                unknown values fall back to "analyst"

        Returns:
            Tuple of (system_prompt, user_prompt)
        """
        if audience not in cls.AUDIENCE_TEMPLATES:
            audience = "analyst"  # Default to analyst

        # Companies mentioned by the retrieved documents, for the prompt header
        companies = {
            doc.metadata['company']
            for doc, _ in retrieved_docs
            if 'company' in doc.metadata
        }
        companies_str = ", ".join(sorted(companies)) if companies else "the companies"

        # Each context entry is the document text followed by its citation tag
        context_parts = []
        for doc, _ in retrieved_docs:
            source_type = doc.metadata.get('source', 'Unknown')
            company = doc.metadata.get('company', 'Unknown')
            citation = f"[{source_type} - {company}{cls._date_suffix(doc.metadata)}]"
            context_parts.append(f"{doc.text}\n\n{citation}")

        context = "\n\n".join(context_parts)

        # Get templates for the audience
        templates = cls.AUDIENCE_TEMPLATES[audience]
        system_prompt = templates["system"]
        human_prompt = templates["human"].format(
            companies=companies_str,
            query=query,
            context=context
        )

        return system_prompt, human_prompt

## Mistral Model

In [None]:
class MistralModel:
    """Wrapper for Mistral model to generate responses."""

    def __init__(self, model_name: str = "mistralai/Mistral-7B-Instruct-v0.1",
                max_length: int = 2048, device_map: str = "auto"):
        """Initialize the Mistral model.

        Args:
            model_name: Name of the Mistral model to load
            max_length: Stored on the instance as ``max_length``; generation
                is bounded by ``max_new_tokens`` in ``generate`` and does not
                read this value
            device_map: Device mapping strategy for model loading

        Raises:
            Exception: Whatever the tokenizer or model loader raises; the
                error is printed and re-raised unchanged.
        """
        try:
            print(f"Loading Mistral model: {model_name}")
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                # Half precision on GPU, full precision on CPU.
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                device_map=device_map,
                low_cpu_mem_usage=True
            )
            self.max_length = max_length
            print(f"Model loaded successfully on {self.model.device}")
        except Exception as e:
            print(f"Error loading model: {e}")
            raise

    def generate(self, system_prompt: str, user_prompt: str,
                 max_new_tokens: int = 700) -> str:
        """Generate a response for the given prompts.

        Args:
            system_prompt: System prompt
            user_prompt: User prompt
            max_new_tokens: Upper bound on the number of generated tokens

        Returns:
            Generated response (only the newly generated text, stripped)
        """
        # Format prompt for Mistral
        prompt = f"<s>[INST] {system_prompt}\n\n{user_prompt} [/INST]"

        # The prompt text already starts with the BOS marker, so the tokenizer
        # must not prepend a second one.
        inputs = self.tokenizer(
            prompt, return_tensors="pt", add_special_tokens=False
        ).to(self.model.device)

        # Report prompt size for tuning the amount of retrieved context
        input_token_count = inputs.input_ids.shape[1]
        print(f"Input token count: {input_token_count}")
        char_to_token_ratio = len(prompt) / input_token_count
        print(f"Character-to-token ratio: {char_to_token_ratio:.2f} characters per token")
        print(f"Total prompt length: {len(prompt)} characters")

        # Clear memory before generation
        clear_gpu_memory()
        print(f"GPU memory before generation: {get_gpu_memory_usage():.2f} MB")

        # Generate response
        with torch.no_grad():
            output = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=0.7,
                top_p=0.9,
                repetition_penalty=1.1,
                do_sample=True,
                # Passed explicitly so generate() does not have to infer it.
                pad_token_id=self.tokenizer.eos_token_id
            )

        # The output sequence starts with the prompt tokens. Decoding only the
        # tail avoids searching the text for "[/INST]", which would cut at the
        # wrong place if the retrieved context itself contains that marker.
        generated_tokens = output[0][input_token_count:]
        response = self.tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

        # Clean up memory
        del inputs, output, generated_tokens
        clear_gpu_memory()
        print(f"GPU memory after generation: {get_gpu_memory_usage():.2f} MB")

        return response

## Logger

In [None]:
class Logger:
    """Log system interactions to text files."""

    # Number of characters of each context document written to the log.
    _SNIPPET_CHARS = 500

    def __init__(self, log_dir: str):
        """Initialize the logger.

        Creates *log_dir* if needed and starts a new log file whose name
        contains the current timestamp (second resolution).

        Args:
            log_dir: Directory to store log files
        """
        self.log_dir = log_dir
        os.makedirs(log_dir, exist_ok=True)

        # Create a unique log file name with timestamp
        current_time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        self.log_file = os.path.join(log_dir, f"chat_log_{current_time}.txt")

        # Explicit UTF-8 so prompts and responses with non-ASCII characters
        # can be written regardless of the platform default encoding.
        with open(self.log_file, 'w', encoding='utf-8') as f:
            f.write("=== FINANCIAL ANALYSIS SYSTEM LOG ===\n")
            f.write(f"Started: {current_time}\n")
            f.write(f"{'='*40}\n\n")

    def log_interaction(self, query: str, audience: str, system_prompt: str,
                        user_prompt: str, response: str, context_docs: List,
                        response_time: float, max_context_chars: int = 2000):
        """Log a complete interaction to the log file.

        Args:
            query: The user's query
            audience: The selected audience type
            system_prompt: The system prompt used
            user_prompt: The user prompt generated
            response: The model's response
            context_docs: The retrieved context documents
            response_time: Time taken to generate response (seconds)
            max_context_chars: Maximum characters to include from context
        """
        current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # Format context for logging (truncated if too long)
        context_str = self._format_context(context_docs, max_context_chars)

        entry = "".join([
            # Timestamp and query details
            f"TIMESTAMP: {current_time}\n",
            f"QUERY: {query}\n",
            f"AUDIENCE: {audience}\n",
            f"RESPONSE TIME: {response_time:.2f} seconds\n\n",
            "SYSTEM PROMPT:\n",
            f"{system_prompt}\n\n",
            "CONTEXT (RETRIEVED DOCUMENTS):\n",
            f"{context_str}\n\n",
            "USER PROMPT:\n",
            f"{user_prompt}\n\n",
            "RESPONSE:\n",
            f"{response}\n\n",
            # Separator for readability
            f"{'='*80}\n\n",
        ])

        with open(self.log_file, 'a', encoding='utf-8') as f:
            f.write(entry)

    @staticmethod
    def _date_info(metadata) -> str:
        """Return ``"(YYYY)"`` for a document's metadata, or ``""`` if no year is known."""
        filename = metadata.get('file')
        if filename is not None:
            for part in str(filename).split('_'):
                if part.isdigit() and len(part) == 4:
                    return f"({part})"
        if 'year' in metadata:
            return f"({metadata['year']})"
        return ""

    def _format_context(self, context_docs: List, max_chars: int) -> str:
        """Format context documents for logging, with truncation.

        Each document contributes a summary line and at most the first
        ``_SNIPPET_CHARS`` characters of its text. Documents are added until
        the next one would exceed *max_chars*; the first document is always
        included.

        Args:
            context_docs: List of (Document, score) tuples
            max_chars: Maximum characters to include

        Returns:
            Formatted context string
        """
        context_parts = []
        total_chars = 0
        omitted = 0

        for i, (doc, score) in enumerate(context_docs):
            source_type = doc.metadata.get('source', 'Unknown')
            company = doc.metadata.get('company', 'Unknown')
            date_info = self._date_info(doc.metadata)

            # NOTE(review): the score is labelled "Relevance", but results from
            # VectorStore.search carry a distance (smaller is closer).
            doc_summary = f"[Doc {i+1}] {source_type} - {company} {date_info} (Relevance: {score:.4f})"

            snippet = doc.text[:self._SNIPPET_CHARS]
            ellipsis = "..." if len(doc.text) > len(snippet) else ""

            # Budget what is actually written (the snippet), not the full
            # document length; 20 accounts for formatting.
            additional_chars = len(doc_summary) + len(snippet) + 20

            if i > 0 and total_chars + additional_chars > max_chars:
                omitted = len(context_docs) - i
                break

            context_parts.append(f"{doc_summary}\n{snippet}{ellipsis}\n")
            total_chars += additional_chars

        # Add truncation notice if needed
        if omitted:
            context_parts.append(f"\n[... {omitted} more documents truncated for readability ...]\n")

        return "\n".join(context_parts)

## Analysis System

In [None]:
class FinancialAnalysisSystem:
    """Retrieval-augmented financial analysis pipeline.

    Combines a vector store (document retrieval), a prompt builder and a
    language model behind two calls: ``initialize()`` and ``query()``.
    """

    def __init__(self, base_dir: str, vector_store_dir: str = None):
        """Store configuration only; components are created by ``initialize()``.

        Args:
            base_dir: Base directory containing the data folders
            vector_store_dir: Directory to load/save the vector store
        """
        self.base_dir = base_dir
        self.vector_store_dir = vector_store_dir
        # Both stay None until initialize() has run
        self.vector_store = None
        self.model = None

    def initialize(self, rebuild_vector_store: bool = False,
                  model_name: str = "mistralai/Mistral-7B-Instruct-v0.1"):
        """Load or build the vector store, then load the language model.

        Args:
            rebuild_vector_store: Whether to rebuild the vector store
            model_name: Name of the model to load
        """
        store_dir = self.vector_store_dir
        # Reuse a saved store only when a directory is configured, it exists,
        # and the caller did not ask for a rebuild.
        reuse_saved = bool(
            store_dir and os.path.exists(store_dir) and not rebuild_vector_store
        )

        if reuse_saved:
            print(f"Loading existing vector store from {store_dir}")
            self.vector_store = VectorStore.load(store_dir)
        else:
            print("Building new vector store")
            loader = FinancialDataLoader(self.base_dir)
            docs = loader.load_all_data()

            self.vector_store = VectorStore()
            self.vector_store.add_documents(docs)

            # Persist the freshly built store when a directory was given
            if store_dir:
                print(f"Saving vector store to {store_dir}")
                self.vector_store.save(store_dir)

        print(f"Initializing Mistral model: {model_name}")
        self.model = MistralModel(model_name=model_name)

        print("System initialization complete")

    def query(self, query: str, audience: str = "analyst", top_k: int = 10) -> str:
        """Answer a query using retrieved documents as context.

        Args:
            query: User query
            audience: Type of audience (analyst, executive, or investor)
            top_k: Number of documents to retrieve

        Returns:
            Generated response

        Raises:
            ValueError: If the vector store or model has not been set up.
        """
        if not (self.vector_store and self.model):
            raise ValueError("System not initialized. Call initialize() first.")

        print(f"Processing query: '{query}' for audience: {audience}")

        # Step 1: retrieval
        print(f"Retrieving top {top_k} documents...")
        hits = self.vector_store.search(query, top_k=top_k)

        # Step 2: prompt construction
        print("Building prompt...")
        prompts = PromptBuilder.build_prompt(
            query=query,
            retrieved_docs=hits,
            audience=audience,
        )
        system_prompt, user_prompt = prompts

        # Step 3: generation
        print("Generating response...")
        return self.model.generate(system_prompt, user_prompt)

In [None]:
# Enhanced FinancialAnalysisSystem with logging
class EnhancedFinancialAnalysisSystem(FinancialAnalysisSystem):
    """FinancialAnalysisSystem variant that logs every query/response pair."""

    def __init__(self, base_dir: str, vector_store_dir: str = None):
        """Set up the base system and attach a logger.

        Args:
            base_dir: Base directory containing the data folders
            vector_store_dir: Directory to load/save the vector store
        """
        super().__init__(base_dir, vector_store_dir)

        # Logs go to a "Chat_Logs" folder under the base directory
        chat_log_dir = os.path.join(base_dir, "Chat_Logs")
        self.logger = Logger(chat_log_dir)
        print(f"Logging enabled. Logs will be saved to: {chat_log_dir}")

    def query(self, query: str, audience: str = "analyst", top_k: int = 10) -> str:
        """Answer a query, time the whole pipeline and log the interaction.

        Args:
            query: User query
            audience: Type of audience (analyst, executive, or investor)
            top_k: Number of documents to retrieve

        Returns:
            Generated response

        Raises:
            ValueError: If the vector store or model has not been set up.
        """
        if not (self.vector_store and self.model):
            raise ValueError("System not initialized. Call initialize() first.")

        print(f"Processing query: '{query}' for audience: {audience}")

        # The timer covers retrieval, prompt building and generation
        started_at = time.time()

        print(f"Retrieving top {top_k} documents...")
        hits = self.vector_store.search(query, top_k=top_k)

        print("Building prompt...")
        prompts = PromptBuilder.build_prompt(
            query=query,
            retrieved_docs=hits,
            audience=audience,
        )
        system_prompt, user_prompt = prompts

        print("Generating response...")
        answer = self.model.generate(system_prompt, user_prompt)

        elapsed = time.time() - started_at
        print(f"Response generated in {elapsed:.2f} seconds")

        # Record everything needed to reproduce or audit this exchange
        interaction = {
            "query": query,
            "audience": audience,
            "system_prompt": system_prompt,
            "user_prompt": user_prompt,
            "response": answer,
            "context_docs": hits,
            "response_time": elapsed,
        }
        self.logger.log_interaction(**interaction)

        return answer



# Run system

In [None]:
# Example usage function with enhanced system
def run_enhanced_financial_analysis_system():
    """Run the enhanced financial analysis system with logging.

    Interactive entry point: picks the base directory (Google Drive in Colab,
    otherwise the current working directory), asks whether to rebuild the
    vector store and which model to use, initializes the system, then loops
    over user queries until the user types 'quit'.
    """
    # Mount Google Drive (required for Colab)
    try:
        from google.colab import drive
        drive.mount('/content/drive')
        base_dir = "/content/drive/MyDrive/MSDS490_Project"
    except Exception:
        # Fallback for non-Colab environments (ImportError) or a failed mount.
        # Unlike a bare ``except``, this lets KeyboardInterrupt and SystemExit
        # propagate so the user can still abort.
        base_dir = os.getcwd()

    vector_store_dir = os.path.join(base_dir, "vector_store")

    print("Initializing Enhanced Financial Analysis System with Logging...")
    system = EnhancedFinancialAnalysisSystem(base_dir, vector_store_dir)

    # Initialize with or without rebuilding the vector store
    rebuild = input("Rebuild vector store? (y/n): ").strip().lower() == 'y'

    # Select model
    model_name = "mistralai/Mistral-7B-Instruct-v0.1"  # Default
    model_selection = input("Select model (1: Mistral-7B-Instruct, 2: TinyLlama-1.1B): ").strip()
    if model_selection == "2":
        model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

    system.initialize(rebuild_vector_store=rebuild, model_name=model_name)

    # Menu choice -> audience name; built once, outside the loop
    audience_map = {
        "1": "analyst",
        "2": "executive",
        "3": "investor",
    }
    default_top_k = 10

    # Interactive query loop
    while True:
        query = input("\nEnter your query (or 'quit' to exit): ")
        if query.strip().lower() == 'quit':
            break
        if not query.strip():
            # Nothing to search for; ask again instead of running the pipeline
            print("Please enter a query.")
            continue

        audience = input("Select audience (1: Analyst, 2: Executive, 3: Investor): ").strip()
        audience_type = audience_map.get(audience, "analyst")

        # A typo here must not kill the session: fall back to the default
        raw_top_k = input("Number of documents to retrieve (3-20): ").strip()
        try:
            top_k = int(raw_top_k) if raw_top_k else default_top_k
        except ValueError:
            print(f"Invalid number '{raw_top_k}'; using {default_top_k}.")
            top_k = default_top_k
        top_k = max(3, min(20, top_k))  # Constrain between 3 and 20

        print("\nProcessing your query...\n")
        response = system.query(query, audience=audience_type, top_k=top_k)

        print("\n" + "="*80)
        print("RESPONSE:")
        print("="*80)
        print(response)
        print("="*80)

In [None]:
# # Example usage function
# def run_financial_analysis_system():
#     """Run the financial analysis system with example queries."""

#     # Mount Google Drive (required for Colab)
#     from google.colab import drive
#     drive.mount('/content/drive')

#     # Initialize the system
#     base_dir = "/content/drive/MyDrive/MSDS490_Project"
#     vector_store_dir = "/content/drive/MyDrive/MSDS490_Project/vector_store"

#     print("Initializing Financial Analysis System...")
#     system = FinancialAnalysisSystem(base_dir, vector_store_dir)

#     # Initialize with or without rebuilding the vector store
#     rebuild = input("Rebuild vector store? (y/n): ").lower() == 'y'

#     # Select model
#     # Select model
#     model_name = "mistralai/Mistral-7B-Instruct-v0.1"  # Default
#     model_selection = input("Select model (1: Mistral-7B-Instruct, 2: TinyLlama-1.1B): ")
#     if model_selection == "2":
#         model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

#     system.initialize(rebuild_vector_store=rebuild, model_name=model_name)

#     # Interactive query loop
#     while True:
#         query = input("\nEnter your query (or 'quit' to exit): ")
#         if query.lower() == 'quit':
#             break

#         audience = input("Select audience (1: Analyst, 2: Executive, 3: Investor): ")
#         audience_map = {
#             "1": "analyst",
#             "2": "executive",
#             "3": "investor"
#         }
#         audience_type = audience_map.get(audience, "analyst")

#         top_k = int(input("Number of documents to retrieve (5-20): ") or "10")
#         top_k = max(5, min(20, top_k))  # Constrain between 5 and 20

#         print("\nProcessing your query...\n")
#         response = system.query(query, audience=audience_type, top_k=top_k)

#         print("\n" + "="*80)
#         print("RESPONSE:")
#         print("="*80)
#         print(response)
#         print("="*80)

In [None]:
# Execute the enhanced system.
# The guard starts the interactive session only when this code runs as the main
# program (as it does in a notebook cell or script), not when it is imported.
if __name__ == "__main__":
    run_enhanced_financial_analysis_system()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Initializing Enhanced Financial Analysis System with Logging...
Logging enabled. Logs will be saved to: /content/drive/MyDrive/MSDS490_Project/Chat_Logs
Rebuild vector store? (y/n): n
Select model (1: Mistral-7B-Instruct, 2: TinyLlama-1.1B): 1
Loading existing vector store from /content/drive/MyDrive/MSDS490_Project/vector_store
Loaded embedding model: sentence-transformers/all-MiniLM-L6-v2 with dimension 384
Loaded vector store from /content/drive/MyDrive/MSDS490_Project/vector_store
Index contains 119936 vectors
Loaded 119936 documents
Initializing Mistral model: mistralai/Mistral-7B-Instruct-v0.1
Loading Mistral model: mistralai/Mistral-7B-Instruct-v0.1


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Model loaded successfully on cuda:0
System initialization complete

Enter your query (or 'quit' to exit): Based on future growth projections, would it be better to pursue a job at RTX or Tesla?
Select audience (1: Analyst, 2: Executive, 3: Investor): 3
Number of documents to retrieve (3-20): 3

Processing your query...

Processing query: 'Based on future growth projections, would it be better to pursue a job at RTX or Tesla?' for audience: investor
Retrieving top 3 documents...
Building prompt...
Generating response...
Input token count: 637
Character-to-token ratio: 3.71 characters per token
Total prompt length: 2364 characters


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


GPU memory before generation: 13242.27 MB
GPU memory after generation: 13242.26 MB
Response generated in 111.15 seconds

RESPONSE:
Based on the available data, both RTX and TSLA have shown promising growth trends, but there are differences in their performance metrics, future outlook, risk factors, and competitive position.

Performance Metrics:
RTX has outperformed its peers in terms of local U.S. job creation in 2022. According to the Russell 1000, RTX created 8,967 jobs, which is significantly more than any other company in the same index. Additionally, RTX ranks first in employee-giving and volunteering, indicating a positive impact on the communities where they operate. On the other hand, TSLA reported a significant increase in revenue, with net sales increasing by 47% YoY to $15.3 billion in Q3 2022. However, their gross margin decreased from 25.2% to 19.5%, indicating higher production costs.

Future Outlook:
Both RTX and TSLA have expressed optimism about their future growth pr

In [None]:
# Import the main module
# from financial_analysis_rag import run_financial_analysis_system

# Execute the system
# if __name__ == "__main__":
#     run_financial_analysis_system()