In [None]:
%pip install langchain-core langchain-openai python-dotenv

In [10]:
import os
from pprint import pprint
from typing import Any, Dict
from langchain_core.prompts import ChatPromptTemplate

from langchain_openai import ChatOpenAI

from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.rate_limiters import InMemoryRateLimiter
from langchain_core.callbacks import BaseCallbackHandler
from operator import itemgetter
from dotenv import load_dotenv


# Load environment variables from .env file
load_dotenv()

# Gather every .txt file found in the parent directory as a record of the
# form {"file_name": <name without extension>, "file_content": <file text>}.
text_contents = []
# os.listdir() returns entries in arbitrary order; sorting makes the batch
# order (and therefore the order of the results) reproducible between runs.
for filename in sorted(os.listdir("..")):
    if filename.endswith(".txt"):
        # Decode explicitly as UTF-8 rather than with the platform default
        # encoding, which differs between operating systems.
        with open(os.path.join("..", filename), "r", encoding="utf-8") as file:
            text_contents.append(
                {
                    "file_name": os.path.splitext(filename)[0],
                    "file_content": file.read(),
                }
            )

# Shared in-memory rate limiter.
#   requests_per_second   -> 0.2, i.e. at most one request every 5 seconds.
#   check_every_n_seconds -> wake up every 100 ms to check whether a request
#                            is allowed.
#   max_bucket_size       -> controls the maximum burst size.
rate_limiter = InMemoryRateLimiter(
    requests_per_second=1 / 5,
    check_every_n_seconds=0.1,
    max_bucket_size=5,
)


class LoggingHandler(BaseCallbackHandler):
    """Callback handler that prints a short trace of chain start/end events.

    Every message is prefixed with ``chain_name`` so that output coming from
    different handlers can be told apart. Payloads are shortened before being
    printed so that long texts do not flood the output.
    """

    # Maximum number of characters (or items) shown for each payload value.
    _PREVIEW_LENGTH = 50

    def __init__(self, chain_name: str):
        """Store the label used to prefix every printed message."""
        super().__init__()
        self.chain_name = chain_name

    @classmethod
    def _preview(cls, value: Any) -> Any:
        """Return a shortened version of *value* that is safe to print.

        Strings, bytes, lists and tuples are sliced directly. Any other value
        (numbers, ``None``, dicts, arbitrary objects) cannot be sliced, so its
        ``repr`` is truncated instead of raising.
        """
        if isinstance(value, (str, bytes, list, tuple)):
            return value[: cls._PREVIEW_LENGTH]
        return repr(value)[: cls._PREVIEW_LENGTH]

    @classmethod
    def _print_payload(cls, payload: Any) -> None:
        """Print a truncated view of a dict or str payload; ignore other types."""
        if isinstance(payload, dict):
            pprint({key: cls._preview(value) for key, value in payload.items()})
        elif isinstance(payload, str):
            print(cls._preview(payload))

    def on_chain_start(
        self, serialized: Dict[str, Any], inputs: Dict[str, Any], **kwargs
    ) -> None:
        """Print a start message followed by a truncated view of *inputs*."""
        print(f"Chain {self.chain_name} started.")
        self._print_payload(inputs)

    def on_chain_end(self, outputs: Dict[str, Any], **kwargs) -> None:
        """Print an end message followed by a truncated view of *outputs*."""
        print(f"Chain {self.chain_name} ended.")
        self._print_payload(outputs)


# Requires env variable OPENAI_API_KEY to be set

# Smaller model: used on its own and as the fallback of `model` below.
mini_model = ChatOpenAI(model="gpt-4o-mini")
mini_model = mini_model.with_config(
    {"callbacks": [LoggingHandler("gpt-4o-mini")]}
)

# Main model, built up one wrapper at a time:
# rate-limited client -> logging callbacks -> retry -> fallback to mini_model.
model = ChatOpenAI(model="gpt-4o", rate_limiter=rate_limiter)
model = model.with_config({"callbacks": [LoggingHandler("gpt-4o")]})
model = model.with_retry(stop_after_attempt=10)
model = model.with_fallbacks([mini_model])

# Prompt used to turn a lesson transcript into Markdown study notes.
# `{input}` is the only template variable; it is placed between triple quotes
# in the user message.
notes_template = ChatPromptTemplate(
    [
        # System message: role and the formatting rules for the notes.
        (
            "system",
            """
    Act as an expert teacher in a course.
    You will be given a video transcript from a lesson of the course. The transcript will be provided between triple quotes.
    Create detailed study notes with the contents of the transcript. Include all the contents of the transcript in the notes.
    The notes should be in Markdown format. Follow these rules:
    - The title should be a level 1 header. It should be the name of the lesson. The title should summarize the main topic of the lesson. The title should not contain the text "Study Notes" or "Lesson".
    - Use titles, lists and tables when necessary. 
    - Do not, under any circumstance, add separators (---) between sections.
    - Use bold text delimiters (**) to highlight important concepts
    - Do not, under any circumstance, use bold text delimiters inside the titles and headers.
    - You may add brief additional information to the notes if you think it is necessary. For example, you may add explanations, examples, or references.
    """,
        ),
        # User message: the transcript itself, delimited by triple quotes.
        (
            "user",
            """
    Transcript:
    \"\"\"
    {input}
    \"\"\"
    """,
        ),
    ]
)


# Prompt used to derive a snake_case file name (without extension) from some
# file content. It contains one worked example (a user/ai message pair)
# before the real request; `{input}` is the only template variable.
file_name_template = ChatPromptTemplate(
    [
        # System message: the naming rules.
        (
            "system",
            """
    Create the name of the file based on the content of the file. 
    The name of the file should be in snake_case and it should not include the extension. The content of the file will be provided between triple quotes
    """,
        ),
        # Example request ...
        (
            "user",
            """
    File content:
    \"\"\"
    Example file content
    \"\"\"
    """,
        ),
        # ... and the example answer showing the expected output shape.
        ("ai", "example_file_name"),
        # Actual request: the content to name, delimited by triple quotes.
        (
            "user",
            """
    File content:
    \"\"\"
    {input}
    \"\"\"
    """,
        ),
    ]
)

# Prompt used to turn study notes into flashcards, returned as two-column
# CSV text (question, answer) with no header row. `{input}` is the only
# template variable; it is placed between triple quotes in the user message.
flashcards_template = ChatPromptTemplate(
    [
        # System message: what to ask about and the exact CSV formatting rules.
        (
            "system",
            """
    Create flashcards based on the contents of the file. The file contains student notes from a course. 
    The file content will be provided between triple quotes. 
    The flashcards should be about specific details of the contents of the file, and also about contents of the file that could appear in an exam.
    Your answer should be a CSV file with two columns. The first column is the question. The second column is the answer. Follow these rules:
    - The CSV separator should be a comma. 
    - The content of each cell should be between double quotes.
    - The content of a cell should not contain double quotes.
    - The first row should not contain the column names: Question and Answer. Skip the first row.
    - Do not include any other commentary or code delimiters (```) in your answer.
    """,
        ),
        # User message: the notes, delimited by triple quotes.
        (
            "user",
            """
    File content:
    \"\"\"
    {input}
    \"\"\"
    """,
        ),
    ]
)

def _sanitize_file_name(raw_name: str) -> str:
    """Make model-generated text safe to embed in a file name.

    The model is asked for a snake_case name but its answer is free text: it
    may contain whitespace, quotes, newlines or path separators. Every
    character that is not alphanumeric, ``_`` or ``-`` is replaced by ``_``,
    the result is capped at 100 characters, and an empty result becomes
    ``"untitled"``.
    """
    cleaned = "".join(
        char if char.isalnum() or char in "_-" else "_"
        for char in raw_name.strip()
    )
    cleaned = cleaned.strip("_")[:100]
    return cleaned or "untitled"


# One single-purpose chain per prompt. Notes and flashcards go through
# `model`; the file name goes through `mini_model`.
notes_chain = notes_template | model | StrOutputParser()
# The model output ends up in a file path, so it is sanitized as the last step.
file_name_chain = (
    file_name_template | mini_model | StrOutputParser() | _sanitize_file_name
)
flashcards_chain = flashcards_template | model | StrOutputParser()

# Full pipeline. Input: {"file_name": ..., "file_content": ...}.
# Output: {"notes_file_name", "flashcards_file_name", "notes", "flashcards"}.
chain = (
    # Two plain dicts joined with `|` would be merged as dicts; starting from
    # a Runnable makes each following dict a step of the chain instead.
    RunnablePassthrough()
    # Step 1: generate the notes from the transcript.
    | {
        "file_name": itemgetter("file_name"),
        "notes": itemgetter("file_content") | notes_chain,
    }
    # Step 2: from the notes, derive a descriptive file name and the flashcards.
    | {
        "file_name": itemgetter("file_name"),
        # RunnablePassthrough() is what lets `itemgetter(...) | lambda` be
        # composed at all: neither an itemgetter nor a plain function supports
        # `|` by itself. Only the first 250 characters of the notes are used
        # to name the file.
        "new_file_name": itemgetter("notes")
        | RunnablePassthrough()
        | (lambda x: x[:250])
        | file_name_chain,
        "notes": itemgetter("notes"),
        "flashcards": itemgetter("notes") | flashcards_chain,
    }
    # Step 3: build the output file names.
    | {
        "notes_file_name": lambda x: f"notes_{x['file_name']}_{x['new_file_name']}.md",
        "flashcards_file_name": lambda x: f"flashcards_{x['file_name']}_{x['new_file_name']}.csv",
        "notes": itemgetter("notes"),
        "flashcards": itemgetter("flashcards"),
    }
)


# Run the pipeline once per input record.
results = chain.batch(text_contents)

# Write the notes (.md) and the flashcards (.csv) of every result into the
# parent directory. The text is model-generated and may contain non-ASCII
# characters, so it is encoded explicitly as UTF-8 instead of relying on the
# platform default encoding.
for result in results:
    with open(
        os.path.join("..", result["notes_file_name"]), "w", encoding="utf-8"
    ) as file:
        file.write(result["notes"])
    with open(
        os.path.join("..", result["flashcards_file_name"]), "w", encoding="utf-8"
    ) as file:
        file.write(result["flashcards"])

Chain started.
Chain started.
Chain ended.
Chain ended.
Chain started.
Chain started.
Chain ended.
Chain ended.
