In [3]:
import pandas as pd 
import numpy as np
import os
from openai import OpenAI
from dotenv import load_dotenv

In [4]:
# Run text embedding on pdfs 
load_dotenv()
open_ai_key = os.getenv("OPENAI_APIKEY")
client = OpenAI(
    api_key=open_ai_key,
)

In [31]:
from PyPDF2 import PdfReader
def pdf_to_string(pdf_path):
    # Create a PDF reader object
    pdf_reader = PdfReader(pdf_path)
    
    # Initialize an empty string to store the text
    text = ""
    
    # Iterate through all pages and extract text
    for page in pdf_reader.pages:
        text += page.extract_text()
    
    # Heuristics to fix processing
    processed_text = text.strip()
    processed_text = processed_text.strip().replace("\n", " ").replace("\t", " ").replace(" ", " ")
    processed_text = processed_text.replace("●", "")
    return processed_text.strip()

base_path = "/Users/snagaraj/omnis/econ43"
econ_notes_path = os.path.join(base_path, "ECON 43 Notes.pdf")
econ43_notes_str = pdf_to_string(econ_notes_path)

In [35]:
def chunk_str(input_str, chunk_size=8192):
    """
    Chunk str into pieces of approximately chunk_size words each
    """
    words = input_str.split()
    chunked_strs = []
    for i in range(0, len(words), chunk_size):
        chunk = ' '.join(words[i:i + chunk_size])
        chunked_strs.append(chunk)
    return chunked_strs

In [36]:
chunk_strs = chunk_str(econ43_notes_str)
for chunk_str in chunk_strs:
    response = client.embeddings.create(
        input=chunk_str,
        model="text-embedding-3-small"
    )
    print(response.data[0].embedding)

[0.015901580452919006, 0.04561557620763779, 0.041809845715761185, 0.09485059231519699, 0.00958752166479826, 0.04510992020368576, -0.02656029537320137, 0.02813049405813217, 0.004321370739489794, 0.025362687185406685, 0.06429827213287354, 0.015595524571835995, 0.008795768953859806, -0.011796443723142147, 0.009733895771205425, 0.00720561109483242, 0.012075886130332947, 0.019387951120734215, -0.01272126380354166, 0.014876958914101124, -0.00038298522122204304, -0.006603480316698551, -0.036034710705280304, -0.015276161953806877, 0.016819747164845467, 0.022568266838788986, -0.024444520473480225, 0.031537026166915894, 0.01677982695400715, -0.007631427608430386, 0.08489713817834854, -0.010166365653276443, 0.011829710565507412, -0.036886341869831085, -0.004833681043237448, 0.0537060871720314, -0.00544246518984437, 0.007358639035373926, 0.05434481427073479, 0.005332684610038996, 0.031137822195887566, -0.0003515895805321634, -0.05956106260418892, 0.017445163801312447, -0.01677982695400715, 0.03611

In [44]:
ID_MODEL_SYSTEM_PROMPT = """
You are a helpful assistant specializing in identifying parts of text documents that best correspond to the answer for a query.
You specialize in thinking deeply about the answer to a given question and then returning the exact sentences word for word that 
best contain the answer to the question from the given context. The context is preceded by a section header called CONTEXT.
"""

# Define the user message
question = "What constitutes a well diversified portfolio in investing?"

# Add context
context = f"CONTEXT:\n {econ43_notes_str}"
INPUT_MSG = question + context

# Make the API call to o3-mini
id_response = client.chat.completions.create(
    model="o1-mini",
    messages = [
    {"role": "user", "content": f"instructions {ID_MODEL_SYSTEM_PROMPT}\n, question: {INPUT_MSG}"}]
   # messages=[
   #     {"role": "system", "content": ID_MODEL_SYSTEM_PROMPT},
   #     {"role": "user", "content": INPUT_MSG}
   # ],
   # reasoning_effort="low"  # Options: "low", "medium", "high"
)

print(id_response.choices[0].message.content)

```
Well diversified portfolio has at least 12 positions from different industries with no more than 10% in any single investment ○ Try to have real estate, VC, or PE investments (assets other than stocks and bonds)
```


In [51]:
id_response_content = id_response.choices[0].message.content
# Post-processing
id_response_content = id_response_content.replace("\n", " ").replace('`', "")

In [56]:
ID_GENERATION_SYSTEM_PROMPT = """
You are a helpful teaching assistant who generates answers to students questions with a kind and helpful tone.
The way you answer questions is as follows:
You should first provide any necessary background on the student's question at the level of a high school or college student.
Then, you should answer the question directly using your knowledge. Try to integrate the document context that is passed in somewhere in your answer. 
The context is preceded by a section header called CONTEXT.
"""

#To correctly reference the document context, you will add an HTML hyperlinks in correspondence of the key concepts discussed. 
#For example, if you have a document describing XYZ and URL to the document, you would discuss XYZ and you will write important words or expressions of the discussion in the form of HTML hyperlink. 
#In other words, if WORD is an important word of the summary that describes a document having link URL, you will write <a href=URL>WORD</a> instead of WORD in the summary.

# Define the user message
question = "What constitutes a well diversified portfolio in investing?"

# Add context
context = f"CONTEXT:\n {id_response_content}"
INPUT_MSG = question + context

# Make the API call to o3-mini
generated_response = client.chat.completions.create(
    model="gpt-4o",
    messages = [
    {"role": "user", "content": f"instructions {ID_GENERATION_SYSTEM_PROMPT}\n, question: {INPUT_MSG}"}],
)

generated_response_content = generated_response.choices[0].message.content
generated_response_content

'Investing in a well-diversified portfolio is an essential strategy to manage risk and optimize returns over the long term. The idea behind diversification is to spread your investments across a range of asset types and industries so that your overall portfolio is less affected by the poor performance of a single investment or sector.\n\n### Background on Diversification\nDiversification is one of the fundamental principles of investing. It operates on the idea of not "putting all your eggs in one basket." By holding a mix of different types of investments, you can potentially reduce the impact of volatility and exposure to market downturns. This concept is rooted in the understanding that different asset classes and industries often react differently to the same economic events.\n\n### What Constitutes a Well-Diversified Portfolio?\n\n1. **Variety of Industries**: A well-diversified portfolio should consist of at least 12 different positions from various industries. By doing so, you m

In [None]:
# Video testing
import re

def parse_text_from_timestamps(data: str) -> str:
    """
    Removes timestamps from a given formatted text.
    
    Parameters:
    data (str): The input text with timestamps.
    
    Returns:
    List: The cleaned text without timestamps as a list 
    """
    return re.sub(r'\d{2}:\d{2} - \d{2}:\d{2}: ', '', data).strip().split("\n")

def parse_timestamps(data: str) -> list:
    """
    Extracts timestamps from a given formatted text.
    
    Parameters:
    data (str): The input text with timestamps.
    
    Returns:
    list: A list of extracted timestamps.
    """
    return re.findall(r'\d{2}:\d{2} - \d{2}:\d{2}', data)

# Example usage
data = """
00:00 - 00:06: Under the eerie glow of full moon, Elias stepped cautiously into the abandoned lighthouse.
00:06 - 00:12: It's towering frame groaning against the window the villagers spoke of strange lights flickering inside despite the
"""

clean_text_list = parse_text_from_timestamps(data)
print(clean_text_list)

timestamps = parse_timestamps(data)
print(timestamps)


['Under the eerie glow of full moon, Elias stepped cautiously into the abandoned lighthouse.', "It's towering frame groaning against the window the villagers spoke of strange lights flickering inside despite the"]
['00:00 - 00:06', '00:06 - 00:12']
