In [6]:
# Import required libraries
import os
import faiss
import numpy as np
import pandas as pd
import time
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer

# Define paths
# Relative folder name passed to load_transcripts(), which reads the *.txt files in it.
transcripts_dir = "Transcripts" 

# Step 1: Load and preprocess transcripts
def load_transcripts(directory):
    """Read every ``.txt`` file in ``directory`` and return their contents.

    Files are visited in sorted filename order so the returned list (and any
    chunk / embedding order derived from it) is identical on every run;
    ``os.listdir`` on its own yields entries in arbitrary order.

    Args:
        directory: Path of the folder holding the transcript files.

    Returns:
        list[str]: Full text of each ``.txt`` file, ordered by filename.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    documents = []
    for filename in sorted(os.listdir(directory)):
        if filename.endswith(".txt"):
            with open(os.path.join(directory, filename), "r", encoding="utf-8") as f:
                documents.append(f.read())
    return documents

def split_into_chunks(text, chunk_size=1000, overlap=200):
    """Split ``text`` into overlapping character windows.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so each
    chunk repeats the last ``overlap`` characters of the previous one. Chunking
    stops as soon as a chunk reaches the end of ``text``; this avoids emitting a
    short tail chunk that is entirely contained in the chunk before it.

    Args:
        text: String to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Characters shared between neighbouring chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        list[str]: The chunks in document order (empty for empty ``text``).

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check an overlap larger than the chunk size would silently
            produce no chunks at all.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, "
            f"chunk_size={chunk_size}"
        )

    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + chunk_size])
        # This chunk already covers the rest of the text; any further window
        # would only repeat part of it.
        if start + chunk_size >= len(text):
            break
    return chunks

print("Loading and preprocessing transcripts...")
transcripts = load_transcripts(transcripts_dir)
chunks = [chunk for text in transcripts for chunk in split_into_chunks(text)]

# Fail fast with a clear message: an empty corpus would otherwise only surface
# later, when the embedding array is indexed to build the FAISS index.
if not chunks:
    raise ValueError(
        f"No transcript text found in {transcripts_dir!r}; expected at least one non-empty .txt file."
    )

In [None]:
# Step 2: Create embeddings and store in FAISS
print("Creating embeddings and storing in FAISS...")
# The same model object is reused later to embed queries, so documents and
# queries are encoded by one model.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Presumably one embedding row per entry of `chunks`, in the same order
# (the code below reads shape[1] as the vector width) — TODO confirm.
embeddings = embedding_model.encode(chunks, convert_to_numpy=True)

# Create FAISS index
# Vectors are added in `chunks` order; retrieve_relevant_chunks() relies on this
# by using the positions returned by the index to look up chunks[i].
dimension = embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(embeddings)

In [None]:
# Step 3: Define retrieval function
def retrieve_relevant_chunks(query, k=5):
    """Return up to ``k`` stored chunks closest to ``query`` in embedding space.

    The query is embedded with the module-level ``embedding_model`` and looked
    up in the module-level ``faiss_index``; the positions returned by the index
    are used to fetch the text from the module-level ``chunks`` list.

    Args:
        query: Natural-language question to search for.
        k: Maximum number of neighbours to request from the index.

    Returns:
        list[dict]: One ``{"text": ..., "distance": ...}`` entry per hit, in the
        order returned by the index. Fewer than ``k`` entries are returned when
        the index holds fewer than ``k`` vectors.
    """
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
    distances, indices = faiss_index.search(query_embedding, k)
    # FAISS fills missing neighbours with label -1; indexing chunks[-1] would
    # silently return the last chunk as a bogus hit, so those slots are dropped.
    return [
        {"text": chunks[idx], "distance": dist}
        for idx, dist in zip(indices[0], distances[0])
        if idx >= 0
    ]


In [None]:
# Step 4: Define LLM interface
class LLMInterface:
    """Small wrapper around a Hugging Face ``text2text-generation`` pipeline."""

    def __init__(self, model_name="google/flan-t5-small"):
        """Load the tokenizer and seq2seq model for ``model_name`` and build the pipeline.

        Args:
            model_name: Model identifier passed to ``from_pretrained``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        self.generator = pipeline("text2text-generation", model=self.model, tokenizer=self.tokenizer)

    def generate_response(self, query, context_chunks):
        """Answer ``query`` using the text of ``context_chunks`` as context.

        Args:
            query: The question to answer.
            context_chunks: Iterable of dicts, each with a ``"text"`` entry
                (the shape returned by ``retrieve_relevant_chunks``).

        Returns:
            str: The ``generated_text`` of the first pipeline result.
        """
        context = "\n\n".join(chunk["text"] for chunk in context_chunks)
        # The question goes BEFORE the context: the call below uses
        # truncation=True, which cuts an over-long prompt at its end. With the
        # question last, a long context would push it past the model's input
        # limit and the model would never see it. The prompt is also assembled
        # from plain string pieces so no source-code indentation leaks into it.
        prompt = (
            "You are an expert financial analyst. Based on the following context "
            "from earnings call transcripts, answer the question:\n"
            f"Question: {query}\n"
            "Context:\n"
            f"{context}"
        )
        response = self.generator(prompt, max_length=300, truncation=True)
        return response[0]['generated_text']

# Initialize LLM
# Module-level instance with the default model; generate_score_and_reasoning()
# and the Step 6 cell call llm.generate_response() on it.
llm = LLMInterface()

In [None]:
# Step 5: Define analytics
def analyze_response(query, context_chunks, response):
    """Summarise one retrieval + generation round as a flat dict of metrics.

    Args:
        query: The question that was asked.
        context_chunks: Retrieved chunks; each item must expose a ``"text"`` entry.
        response: The generated answer text.

    Returns:
        dict: ``query``, ``context_size`` (number of chunks),
        ``retrieved_characters`` (total length of all chunk texts),
        ``response_length`` and ``response``, in that order.
    """
    chunk_count = len(context_chunks)

    # Total size of the retrieved context, in characters.
    char_total = 0
    for item in context_chunks:
        char_total += len(item["text"])

    return {
        "query": query,
        "context_size": chunk_count,
        "retrieved_characters": char_total,
        "response_length": len(response),
        "response": response,
    }


In [None]:
# Step 6: Run retrieval, generation, and analytics
query = "What are the key risks mentioned in the earnings calls?"

# Durations are measured with time.perf_counter(): it is monotonic and
# high-resolution, whereas time.time() is a wall clock that can jump and may be
# too coarse to register a fast index lookup.
print("Retrieving relevant chunks...")
start_retrieval = time.perf_counter()
context_chunks = retrieve_relevant_chunks(query)
end_retrieval = time.perf_counter()

print("Generating response...")
start_generation = time.perf_counter()
response = llm.generate_response(query, context_chunks)
end_generation = time.perf_counter()

# Print response
print("\nGenerated Response:")
print(response)

# Print analytics (timings are elapsed seconds)
analytics = analyze_response(query, context_chunks, response)
analytics["retrieval_time"] = end_retrieval - start_retrieval
analytics["generation_time"] = end_generation - start_generation
print("\nAnalytics:")
for key, value in analytics.items():
    print(f"{key}: {value}")

# AIS Score Calculation

In [None]:
import pandas as pd

# Define sub-criteria and queries
# Maps each sub-criterion name to the question used for retrieval and generation.
# calculate_ais() reuses the keys verbatim to build its "<name> Score" and
# "<name> Reasoning" result entries, and averages over len(sub_criteria).
# NOTE(review): none of these questions ask the model to emit a "Score:" value,
# which generate_score_and_reasoning() looks for — confirm scores are ever produced.
sub_criteria = {
    "Growth Potential": "What are the company's growth opportunities mentioned in the earnings calls?",
    "Management Quality": "What insights about management effectiveness are mentioned in the earnings calls?",
    "Earnings Quality": "What are the key factors influencing the company's earnings quality?",
    "Business Risks": "What risks were highlighted in the earnings calls?",
}

# Function to generate a score for a sub-criterion
def generate_score_and_reasoning(query, context_chunks):
    """Ask the LLM about ``query`` and pull a numeric score out of its answer.

    The answer is scanned for the first ``Score:`` marker that is followed by a
    number (e.g. ``Score: 8``, ``Score: 7.5``, ``Score: 8/10``). Trailing
    punctuation or a ``/10`` suffix is ignored, and a missing or non-numeric
    score yields ``None`` instead of raising.

    Args:
        query: Question forwarded to ``llm.generate_response``.
        context_chunks: Retrieved chunks forwarded to ``llm.generate_response``.

    Returns:
        dict: ``{"score": float | None, "reasoning": <full model response>}``.
    """
    import re  # local import keeps this cell runnable on its own

    response = llm.generate_response(query, context_chunks)

    # Whitespace-splitting the text after "Score:" and calling float() on the
    # first token breaks on "8/10" or "7," (ValueError) and on a bare trailing
    # "Score:" (IndexError); a regex captures just the leading number.
    match = re.search(r"Score:\s*([-+]?(?:\d+(?:\.\d*)?|\.\d+))", response)
    score = float(match.group(1)) if match else None
    return {"score": score, "reasoning": response}

# Function to calculate AIS for a single transcript
def calculate_ais(transcript_id, transcript_text):
    """Score one transcript on every entry of ``sub_criteria`` and average the scores.

    Only the part of the function visible in this excerpt is described; no
    ``return`` statement appears in it.
    NOTE(review): confirm the definition is complete and returns ``results``.

    Args:
        transcript_id: Identifier stored under the ``"Transcript ID"`` key.
        transcript_text: Raw transcript text; split with ``split_into_chunks``.
    """
    results = {"Transcript ID": transcript_id}
    transcript_chunks = split_into_chunks(transcript_text)

    # Encode and store chunks in FAISS for the specific transcript
    # NOTE(review): this appends to the shared module-level faiss_index, which
    # already holds the vectors added in Step 2, so retrieval below searches all
    # vectors rather than only this transcript's. The module-level `chunks` list
    # is not extended here, so positions of the newly added vectors have no
    # matching entry in `chunks` — verify this is intended.
    transcript_embeddings = embedding_model.encode(transcript_chunks, convert_to_numpy=True)
    faiss_index.add(transcript_embeddings)

    for criterion, query in sub_criteria.items():
        # Retrieve relevant chunks
        context_chunks = retrieve_relevant_chunks(query)
        
        # Generate score and reasoning
        result = generate_score_and_reasoning(query, context_chunks)
        results[f"{criterion} Score"] = result["score"]
        results[f"{criterion} Reasoning"] = result["reasoning"]
    
    # Calculate final AIS (average of sub-criteria scores)
    # NOTE(review): None scores are left out of the sum but the divisor is still
    # len(sub_criteria), so a missing score counts as 0 in the average — confirm.
    results["Final AIS"] = sum(results[f"{criterion} Score"] for criterion in sub_criteria if results[f"{criterion} Score"] is not None) / len(sub_criteria)
