# Topic Modelling With Language Models

In [26]:
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain.chains import create_extraction_chain

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Pinecone
import pinecone

from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate
)

import os
from dotenv import load_dotenv

# Load variables from a local .env file (presumably where OPENAI_API_KEY and the
# Pinecone settings read via os.getenv() later in this notebook are kept).
load_dotenv()

True

### Create the LLMs and get data

In [3]:
# Settings shared by both chat models: temperature 0, the API key taken from
# the environment, and a request_timeout of 180.
_shared_chat_kwargs = dict(
    temperature=0,
    openai_api_key=os.getenv('OPENAI_API_KEY'),
    request_timeout=180,
)

# GPT-3.5 snapshot; the chains built later in this notebook use this model.
llm3 = ChatOpenAI(model_name='gpt-3.5-turbo-0613', **_shared_chat_kwargs)

# GPT-4 snapshot, configured identically.
llm4 = ChatOpenAI(model_name='gpt-4-0613', **_shared_chat_kwargs)

In [6]:
# Transcripts available for this walkthrough. Only the first entry is loaded
# below. (The variable name's spelling is kept as-is for compatibility.)
transcipt_paths = [
    './transcripts/mfm_pod_steph.txt',
    './transcripts/mfm_pod_alex.txt',
    './transcripts/mfm_pod_rob.txt'
]

# Take the path from the list instead of repeating the literal, and pin the
# encoding so the text decodes the same way on every platform
# (assumes the transcript files are UTF-8 -- TODO confirm).
with open(transcipt_paths[0], encoding='utf-8') as file:
    transcript = file.read()

In [8]:
# Peek at the first 200 characters to confirm the transcript was read
print(transcript[:200])

Shaan Puri (0:00:00-0:00:03): D to see hearing AIDS. I think that's actually going to be a big deal. 

Sam Parr (0:00:03-0:00:05): And they're profitable. 

Shaan Puri (0:00:05-0:00:08): I mean, I'm j


Next, we split the text into chunks. We do this for two reasons:

1. A smaller context lets the LLM devote more of its attention to each piece of content (a better attention-to-context ratio)
2. The full text may be too long to fit in a single prompt anyway

In [10]:
# Part 1 splitter: 10,000-character chunks with a 2,200-character overlap,
# breaking on paragraph, line, then word boundaries.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " "],
    chunk_size=10000,
    chunk_overlap=2200,
)

# Only the leading slice of the transcript is processed in this walkthrough.
transcript_subsection_charactetrs = 23250
docs = text_splitter.create_documents(
    [transcript[:transcript_subsection_charactetrs]]
)

# Report how many chunks were produced and the token count of the first one.
first_doc_tokens = llm3.get_num_tokens(docs[0].page_content)
print(f"You have {len(docs)} docs. First doc is {first_doc_tokens} tokens")

You have 3 docs. First doc is 2801 tokens


### Part 1 - Extract Topic Titles and Short Description

In [11]:
# Map-stage system prompt: instructions plus worked examples for pulling
# "Topic: Brief Description" bullets out of a single transcript chunk.
template = """
You are a helpful assistant that helps retrieve topics talked about in a podcast transcript
- Your goal is to extract the topic names and brief 1-sentence description of the topic
- Topics include:
  - Themes
  - Business Ideas
  - Interesting Stories
  - Money making businesses
  - Quick stories about people
  - Mental Frameworks
  - Stories about an industry
  - Analogies mentioned
  - Advice or words of caution
  - Pieces of news or current events
- Provide a brief description of the topics after the topic name. Example: 'Topic: Brief Description'
- Use the same words and terminology that is said in the podcast
- Do not respond with anything outside of the podcast. If you don't see any topics, say, 'No Topics'
- Do not respond with numbers, just bullet points
- Do not include anything about 'Marketing Against the Grain'
- Only pull topics from the transcript. Do not use the examples
- Make your titles descriptive but concise. Example: 'Shaan's Experience at Twitch' should be 'Shaan's Interesting Projects At Twitch'
- A topic should be substantial, more than just a one-off comment

% START OF EXAMPLES
 - Sam’s Elisabeth Murdoch Story: Sam got a call from Elizabeth Murdoch when he had just launched The Hustle. She wanted to generate video content.
 - Shaan’s Rupert Murdoch Story: When Shaan was running Blab he was invited to an event organized by Rupert Murdoch during CES in Las Vegas.
 - Revenge Against The Spam Calls: A couple of businesses focused on protecting consumers: RoboCall, TrueCaller, DoNotPay, FitIt
 - Wildcard CEOs vs. Prudent CEOs: However, Munger likes to surround himself with prudent CEO’s and says he would never hire Musk.
 - Chess Business: Priyav, a college student, expressed his doubts on the MFM Facebook group about his Chess training business, mychesstutor.com, making $12.5K MRR with 90 enrolled.
 - Restaurant Refiller: An MFM Facebook group member commented on how they pay AirMark $1,000/month for toilet paper and toilet cover refills for their restaurant. Shaan sees an opportunity here for anyone wanting to compete against AirMark.
 - Collecting: Shaan shared an idea to build a mobile only marketplace for a collectors’ category; similar to what StockX does for premium sneakers.
% END OF EXAMPLES
"""

# System message: the instructions above.
system_message_prompt_map = SystemMessagePromptTemplate.from_template(template)

# Human message: the chunk text is passed through under a "Transcript:" label.
human_template = "Transcript: {text}"
human_message_prompt_map = HumanMessagePromptTemplate.from_template(human_template)

# Full chat prompt used for the map step (system message first, then human).
chat_prompt_map = ChatPromptTemplate.from_messages(
    messages=[
        system_message_prompt_map,
        human_message_prompt_map,
    ]
)

In [12]:
# Combine-stage system prompt: receives the bullet lists produced for each
# chunk and merges/deduplicates them into a single list.
# Fixes typos in the instructions sent to the model ("vound" -> "found",
# "exract" -> "extract", "bullet topics of topics" -> "bullet points of topics").
#
# NOTE: `template`, `system_message_prompt_map` and `human_message_prompt_map`
# are rebound here. The map-stage prompt was already captured in
# `chat_prompt_map`, so that object is unaffected; the names are kept unchanged
# for compatibility.
template="""
You are a helpful assistant that helps retrieve topics talked about in a podcast transcript
- You will be given a series of bullet points of topics found
- Your goal is to extract the topic names and brief 1-sentence description of the topic
- Deduplicate any bullet points you see
- Only pull topics from the transcript. Do not use the examples

% START OF EXAMPLES
 - Sam’s Elisabeth Murdoch Story: Sam got a call from Elizabeth Murdoch when he had just launched The Hustle. She wanted to generate video content.
 - Shaan’s Rupert Murdoch Story: When Shaan was running Blab he was invited to an event organized by Rupert Murdoch during CES in Las Vegas.
% END OF EXAMPLES
"""
system_message_prompt_map = SystemMessagePromptTemplate.from_template(template)

human_template="Transcript: {text}" # Simply just pass the text as a human message
human_message_prompt_map = HumanMessagePromptTemplate.from_template(human_template)

# Full chat prompt used for the combine (reduce) step.
chat_prompt_combine = ChatPromptTemplate.from_messages(messages=[system_message_prompt_map, human_message_prompt_map])

In [15]:
# Map-reduce summarisation chain on llm3: `chat_prompt_map` is the map prompt
# and `chat_prompt_combine` is the combine prompt.
map_reduce_prompts = {
    "map_prompt": chat_prompt_map,
    "combine_prompt": chat_prompt_combine,
}
chain = load_summarize_chain(llm=llm3, chain_type="map_reduce", **map_reduce_prompts)

In [16]:
# Run the map-reduce chain over the chunks; the result is printed in the next cell
topics_found = chain.run({'input_documents': docs})

In [17]:
# Show the combined bullet list of topics
print(topics_found)

- Fractional real estate: Taking advantage of the high vacancy rates in commercial real estate by converting unused spaces into different types of businesses, such as yoga studios, Pilates studios, rage rooms, etc. (Converting unused spaces into different types of businesses)
- Temple Immersive: A club in San Francisco that transforms into a yoga class during the day and a nightclub during the night, utilizing unused real estate. (Club in San Francisco that transforms into a yoga class during the day and a nightclub during the night)
- Rage rooms: Rooms where people can go to release their anger and frustration by smashing objects like plates and electronics. (Rooms where people can release anger and frustration by smashing objects)
- Escape rooms: A business idea where participants are locked in a room and must solve puzzles and find clues to escape within a certain time limit. Raleigh Williams created an escape room business and sold it for $26 million. (Business idea where participa

In [18]:
# Schema describing one extracted topic. It is passed to
# `create_extraction_chain` to turn the free-text bullet list into records.
schema = {
    "properties": {
        # The title of the topic
        "topic_name": {
            "type": "string",
            "description" : "The title of the topic listed"
        },
        # The description
        "description": {
            "type": "string",
            "description" : "The description of the topic listed"
        },
        # Category label, restricted to the values listed in "enum"
        "tag": {
            "type": "string",
            "description" : "The type of content being described",
            "enum" : ['Business Models', 'Life Advice', 'Health & Wellness', 'Stories']
        }
    },
    # Every entry here must be a key of "properties". The original listed
    # "topic", which is not a declared property; the field is "topic_name".
    "required": ["topic_name", "description"],
}

In [19]:
# Rebinds `chain`: from here on it is the extraction chain built from `schema` on llm3
chain = create_extraction_chain(schema, llm3)

In [20]:
# Feed the free-text topic list through the extraction chain
topics_structured = chain.run(topics_found)

In [21]:
# Display the extracted records (a list of dicts, per the output below)
topics_structured

[{'topic_name': 'Fractional real estate',
  'description': 'Taking advantage of the high vacancy rates in commercial real estate by converting unused spaces into different types of businesses, such as yoga studios, Pilates studios, rage rooms, etc.',
  'tag': 'Business Models'},
 {'topic_name': 'Temple Immersive',
  'description': 'A club in San Francisco that transforms into a yoga class during the day and a nightclub during the night, utilizing unused real estate.',
  'tag': 'Business Models'},
 {'topic_name': 'Rage rooms',
  'description': 'Rooms where people can go to release their anger and frustration by smashing objects like plates and electronics.',
  'tag': 'Health & Wellness'},
 {'topic_name': 'Escape rooms',
  'description': 'A business idea where participants are locked in a room and must solve puzzles and find clues to escape within a certain time limit. Raleigh Williams created an escape room business and sold it for $26 million.',
  'tag': 'Business Models'},
 {'topic_na

### Part 2 - Expand on the topics we found

In [22]:
# Part 2 splitter: smaller 4,000-character chunks with an 800-character
# overlap. This rebinds `text_splitter` and `docs` from Part 1.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000,
    chunk_overlap=800,
)

# Re-chunk the same leading slice of the transcript.
docs = text_splitter.create_documents(
    [transcript[:transcript_subsection_charactetrs]]
)

# Report how many chunks were produced and the token count of the first one.
first_doc_tokens = llm3.get_num_tokens(docs[0].page_content)
print(f"You have {len(docs)} docs. First doc is {first_doc_tokens} tokens")

You have 8 docs. First doc is 776 tokens


For Question & Answer retrieval, we need embeddings for our documents so that we can later pull out the chunks most similar to each topic and use them as context.

In [23]:
# Embedding client, using the OpenAI key read from the environment
embeddings = OpenAIEmbeddings(openai_api_key=os.getenv('OPENAI_API_KEY'))

In [24]:
# Initialise the Pinecone client; the API key and environment name are read
# from the PINECONE_API_KEY and PINECONE_ENV environment variables.
pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),
    environment=os.getenv('PINECONE_ENV')
)

In [27]:
# Name of the Pinecone index to write to (presumably it must already exist -- TODO confirm)
index_name = 'langchaintest'

# Embed the Part 2 chunks and store them in the index.
# NOTE(review): re-running this cell presumably inserts the same chunks again,
# which would produce duplicate matches at retrieval time -- confirm.
docsearch = Pinecone.from_documents(docs, embeddings, index_name=index_name)

In [28]:
# System prompt for expanding a topic into a short summary. `{context}` is the
# placeholder for the retrieved transcript chunks.
# Fixes a typo in the instruction sent to the model ("You goal" -> "Your goal").
system_template = """
You will be given text from a podcast transcript which contains many topics.
Your goal is to write a summary (5 sentences or less) about a topic the user chooses
Do not respond with information that isn't relevant to the topic that the user gives you
----------------
{context}"""

# The human message carries the topic to summarise in `{question}`.
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}"),
]

# This will pull the two messages together and get them ready to be sent to the LLM through the retriever
CHAT_PROMPT = ChatPromptTemplate.from_messages(messages)

In [29]:
# Retrieval QA chain for topic summaries: retrieved chunks are inserted into
# CHAT_PROMPT ("stuff" chain type) and answered by llm3.
qa = RetrievalQA.from_chain_type(
    llm=llm3,
    chain_type="stuff",
    # The number of chunks to retrieve has to go through `search_kwargs`;
    # a bare `k=4` keyword on `as_retriever` does not configure the search.
    retriever=docsearch.as_retriever(search_kwargs={"k": 4}),
    chain_type_kwargs={
        'prompt': CHAT_PROMPT
    },
)

In [30]:
# Expand the first five topics: each one is sent to the retrieval chain as
# "name: description" and the returned summary is printed under that headline.
for topic in topics_structured[:5]:
    headline = f"{topic['topic_name']}: {topic['description']}"

    # Same query text as a triple-quoted block: newline, 8 spaces, headline,
    # newline, 4 spaces.
    query = f"\n        {headline}\n    "

    expanded_topic = qa.run(query)

    print(headline)
    print(expanded_topic)
    print("\n\n")

Fractional real estate: Taking advantage of the high vacancy rates in commercial real estate by converting unused spaces into different types of businesses, such as yoga studios, Pilates studios, rage rooms, etc.
One interesting opportunity in the current commercial real estate market is the concept of fractional real estate. With high vacancy rates in office buildings, there is potential to convert these unused spaces into different types of businesses. This could include transforming a club into a yoga studio or a rage room, where people can go to release their anger by smashing things. By repurposing these spaces, it allows for more efficient use of real estate and the potential for new and unique businesses to thrive.



Temple Immersive: A club in San Francisco that transforms into a yoga class during the day and a nightclub during the night, utilizing unused real estate.
Temple Immersive is a club in San Francisco that takes advantage of unused real estate by transforming into a 

### Part 3 - Chapters with Timestamps

In [31]:
# System prompt for the chapter step: ask only for the first timestamp at
# which a topic comes up. `{context}` is the placeholder for retrieved chunks.
# This rebinds `system_template`, `messages` and `CHAT_PROMPT` from Part 2.
system_template = (
    "\n"
    "What is the first timestamp when the speakers started talking about a topic the user gives?\n"
    "Only respond with the timestamp, nothing else. Example: 0:18:24\n"
    "----------------\n"
    "{context}"
)

# System instructions first, then the user's topic in `{question}`.
timestamp_system_message = SystemMessagePromptTemplate.from_template(system_template)
timestamp_human_message = HumanMessagePromptTemplate.from_template("{question}")
messages = [timestamp_system_message, timestamp_human_message]

CHAT_PROMPT = ChatPromptTemplate.from_messages(messages)

In [32]:
# Retrieval QA chain for timestamps: same setup as the summary chain, but it
# picks up the timestamp CHAT_PROMPT defined in the previous cell.
qa = RetrievalQA.from_chain_type(
    llm=llm3,
    chain_type="stuff",
    # The number of chunks to retrieve has to go through `search_kwargs`;
    # a bare `k=4` keyword on `as_retriever` does not configure the search.
    retriever=docsearch.as_retriever(search_kwargs={"k": 4}),
    chain_type_kwargs={
        'prompt': CHAT_PROMPT
    },
)

In [33]:
# Collect one "<timestamp> - <topic name>" line per extracted topic.
topic_timestamps = []

for topic in topics_structured:
    name = topic['topic_name']

    # Ask the retrieval chain when this topic first comes up.
    query = f"{name} - {topic['description']}"
    timestamp = qa.run(query)

    topic_timestamps.append(f"{timestamp} - {name}")

In [34]:
# Print the chapter lines in sorted (string) order, one per line.
print(*sorted(topic_timestamps), sep="\n")

0:12:32 - Fractional real estate
0:13:08 - Temple Immersive
0:16:11 - Rage rooms
0:17:09 - Escape rooms
0:17:09 - Out-of-home entertainment
