In [3]:
import pdfplumber
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
import os
from sklearn.metrics.pairwise import cosine_similarity

# Pull settings from a local .env file (if any) into the process environment.
load_dotenv()

# Shared OpenAI client used by every helper in this notebook; the key is read
# from the OPENAI_API_KEY environment variable rather than hard-coded.
openai_api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=openai_api_key)


In [9]:
def get_embedding(text, model='text-embedding-3-small'):
    """Return the embedding vector for ``text`` from the OpenAI embeddings API.

    Args:
        text: The string to embed. ``None`` and empty / whitespace-only
            strings are replaced by a fixed placeholder sentence before the
            request is made.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``embedding`` attribute of the first item in the API response.
    """
    # Treat a missing page (None) the same as a blank one instead of crashing
    # on ``None.strip()``.
    if text is None or not text.strip():
        text = "No content available on this page."

    response = client.embeddings.create(input=[text], model=model)
    return response.data[0].embedding


def chunk_page_texts(page_texts, chunk_size=300, chunk_overlap=50):
    """Split per-page text into overlapping, whitespace-tokenised chunks.

    Each page is split on whitespace and cut into windows of ``chunk_size``
    words; consecutive windows start ``chunk_size - chunk_overlap`` words
    apart. Chunks never span two pages.

    Args:
        page_texts: Iterable of page strings, in page order. A ``None`` entry
            is treated as an empty page.
        chunk_size: Maximum number of words per chunk; must be positive.
        chunk_overlap: Number of words shared by consecutive chunks; must be
            smaller than ``chunk_size``.

    Returns:
        A list of ``{"page": <1-based page number>, "chunk_text": <str>}``
        dicts. Pages without any words contribute no chunks.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap`` is
            not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if chunk_overlap >= chunk_size:
        # A non-positive step would never advance the window and the loop
        # below would append chunks forever.
        raise ValueError("chunk_overlap must be smaller than chunk_size")

    step = chunk_size - chunk_overlap
    chunks = []
    for page_num, text in enumerate(page_texts, start=1):
        # ``text or ""`` guards against a None page entry.
        words = (text or "").split()
        for start in range(0, len(words), step):
            end = start + chunk_size
            chunks.append({
                "page": page_num,
                "chunk_text": " ".join(words[start:end]),
            })
            # This window already reached the end of the page; another
            # iteration would only re-emit words from the overlap region.
            if end >= len(words):
                break
    return chunks

def get_similarities(query_text, rag_db, top_x=3):
    """Return the text of the chunks most similar to ``query_text``.

    The query is embedded with ``get_embedding`` and compared, by cosine
    similarity, against the ``embedding`` column of ``rag_db``.

    Args:
        query_text: The query string to embed.
        rag_db: DataFrame with an ``embedding`` column (one vector per row)
            and a ``chunk_text`` column.
        top_x: Maximum number of chunks to return. If the table has fewer
            rows, all of them are returned.

    Returns:
        The selected ``chunk_text`` values, most similar first, joined by a
        blank line. An empty string when ``top_x`` is not positive or
        ``rag_db`` has no rows.
    """
    # Nothing to rank: skip the embedding request entirely.
    if top_x <= 0 or len(rag_db) == 0:
        return ""

    query_embedding = get_embedding(query_text)

    # One pairwise call scores the query against every stored vector; row 0
    # of the result holds the similarities for the single query.
    scores = cosine_similarity([query_embedding], list(rag_db['embedding']))[0]

    # Sort on the score only (stable), so ties keep their original row order
    # and chunk texts are never compared with each other.
    ranked = sorted(
        zip(scores, rag_db['chunk_text']),
        key=lambda pair: pair[0],
        reverse=True,
    )

    # Slicing tolerates top_x larger than the number of rows.
    return "\n\n".join(chunk_text for _, chunk_text in ranked[:top_x])

def llm_response(question, df, top_k=5, model="gpt-4", temperature=0.7):
    """Answer ``question`` using chunks retrieved from ``df`` as context.

    Retrieves the most similar chunks with ``get_similarities``, builds the
    prompt with ``create_prompt`` and sends it to the chat completions API.

    Args:
        question: The user's question.
        df: Chunk table passed through to ``get_similarities``.
        top_k: Number of chunks to retrieve as context.
        model: Chat model name passed to the API.
        temperature: Sampling temperature passed to the API.

    Returns:
        The message content of the first choice in the API response.
    """
    # Retrieve the context passages for this question.
    context_str = get_similarities(question, df, top_k)

    # Combine question and context into the user prompt.
    prompt = create_prompt(question, context_str)

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        temperature=temperature,
    )

    return response.choices[0].message.content

def create_prompt(question, content):
    """Build the user prompt sent to the chat model.

    The prompt states the analyst role, then the user question, then the
    retrieved context. The line layout (leading newline and four-space
    indentation on every line) is kept as it was; only the spelling of
    "Financial" in the role sentence is corrected.

    Args:
        question: The user's question, inserted under "User Question:".
        content: The retrieved context text, inserted under "Context:".

    Returns:
        The formatted prompt string.
    """
    # Written line by line so the whitespace-only lines stay visible and are
    # not stripped by an editor.
    return (
        "\n"
        "    You are a Chartered Financial Analyst. Answer the User Question based on the provided Context.\n"
        "    \n"
        "    User Question:\n"
        f"    {question}\n"
        "    \n"
        "    Context:\n"
        f"    {content}\n"
        "    "
    )


In [5]:
# Read the chunk table from a JSON Lines file (one record per line).
# NOTE(review): get_similarities reads the 'embedding' and 'chunk_text'
# columns of this frame - confirm the file provides both.
CHUNKS_PATH = "tsla_chunks.json"
df = pd.read_json(CHUNKS_PATH, lines=True, orient="records")

In [None]:
# Candidate questions about the filing. Change the index below to ask a
# different one.
QUESTIONS = [
    "What is the name of the company that filed the document?",
    "What else do you know about the company that filed the document?",
    "What fiscal years are included the reporting period for the financial statements?",
    "How has the company’s revenue mix changed across segments or geographies compared to the prior year?",
    "What forward-looking statements or guidance does the company provide about future performance or strategy?",
    "How does the company describe its competitive advantages or market position in the Business section?",
    "Does the company disclose any material weaknesses in internal controls over financial reporting?",
    "What are the most significant risk factors identified by the company?",
]

# Ask the selected question and show the model's answer.
question = QUESTIONS[0]
answer = llm_response(question, df)
print(answer)

Tesla is a company with a strong focus on corporate governance and business ethics. They have established an ESG Sustainability Council consisting of leaders from across their company, and maintain a worldwide employee headcount of 140,473 as of December 31, 2023. Tesla has a competitive edge in attracting and retaining high-quality employees, and they have an environment that fosters growth opportunities, with nearly two-thirds (65%) of their managers being promoted from internal, non-manager positions. 

Tesla has grown by 35% over the past two years, offering career development and meaningful contributions to a sustainable future. They retain employees by providing excellent health benefits, stock ownership opportunities, and continuous development training for leaders. 

Intellectual property is a priority for Tesla, with a focus on innovative approach and proprietary designs. They protect their intellectual property rights through patents, trademarks, copyrights, trade secrets, an