<img src="https://drive.google.com/uc?export=view&id=1wYSMgJtARFdvTt5g7E20mE4NmwUFUuog" width="200">

[![Gen AI Experiments](https://img.shields.io/badge/Gen%20AI%20Experiments-GenAI%20Bootcamp-blue?style=for-the-badge&logo=artificial-intelligence)](https://github.com/buildfastwithai/gen-ai-experiments)
[![Gen AI Experiments GitHub](https://img.shields.io/github/stars/buildfastwithai/gen-ai-experiments?style=for-the-badge&logo=github&color=gold)](http://github.com/buildfastwithai/gen-ai-experiments)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1MGi72aFeRsmgI717Fl_YqApposi4wz3X?usp=sharing)






## Master Generative AI in 8 Weeks
**What You'll Learn:**
- Master cutting-edge AI tools & frameworks
- 6 weeks of hands-on, project-based learning
- Weekly live mentorship sessions

[Start Your Journey](https://www.buildfastwithai.com/genai-course)


# Getting Started with LangExtract: Unlocking Text Analysis with AI

### Install requirements

In [None]:
!pip -q install langextract

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/62.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.4/62.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/76.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.7/76.7 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h

### Import API keys

In [None]:
import os
from google.colab import userdata

# Environment variable to populate -> name of the Colab secret holding its value.
_SECRET_FOR_ENV_VAR = {
    "LANGEXTRACT_API_KEY": 'GOOGLE_API_KEY',
    "OPENAI_API_KEY": 'OPENAI_API_KEY',
}

# Copy each secret into the process environment so later cells can read it
# (the OpenAI cell reads OPENAI_API_KEY back via os.environ).
for env_var, secret_name in _SECRET_FOR_ENV_VAR.items():
    os.environ[env_var] = userdata.get(secret_name)

### Define Your Extraction Task
First, create a prompt that clearly describes what you want to extract. Then, provide a high-quality example to guide the model.

In [None]:
import textwrap
import langextract as lx

# Step 1: the task description handed to the model.
prompt = textwrap.dedent("""\
Extract characters, emotions, and relationships in order of appearance.
Use exact text for extractions. Do not paraphrase or overlap entities.
Provide meaningful attributes for each entity to add context.""")

# Step 2: one worked example that shows the model the expected classes,
# exact-text spans and attribute style.
example_text = (
    "ROMEO. But soft! What light through yonder window breaks? It is"
    " the east, and Juliet is the sun."
)

# (extraction_class, extraction_text, attributes) for each span, in order of appearance.
example_spans = [
    ("character", "ROMEO", {"emotional_state": "wonder"}),
    ("emotion", "But soft!", {"feeling": "gentle awe"}),
    ("relationship", "Juliet is the sun", {"type": "metaphor"}),
]

examples = [
    lx.data.ExampleData(
        text=example_text,
        extractions=[
            lx.data.Extraction(
                extraction_class=cls,
                extraction_text=span,
                attributes=attrs,
            )
            for cls, span, attrs in example_spans
        ],
    )
]

### Run the Extraction
Provide your input text and the prompt materials to the `lx.extract` function.



In [None]:
# Step 3: send the input text, the prompt and the worked example to the model.
input_text = "Lady Juliet gazed longingly at the stars, her heart aching for Romeo"

result = lx.extract(
    model_id="gemini-2.5-pro",
    text_or_documents=input_text,
    prompt_description=prompt,
    examples=examples,
)

[94m[1mLangExtract[0m: model=[92mgemini-2.5-pro[0m, current=[92m68[0m chars, processed=[92m68[0m chars:  [00:12]

[92m✓[0m Extraction processing complete
[92m✓[0m Extracted [1m4[0m entities ([1m2[0m unique types)
  [96m•[0m Time: [1m12.94s[0m
  [96m•[0m Speed: [1m5[0m chars/sec
  [96m•[0m Chunks: [1m1[0m





In [None]:
# Display the raw AnnotatedDocument returned by lx.extract for the Juliet sentence.
result

AnnotatedDocument(extractions=[Extraction(extraction_class='character', extraction_text='Lady Juliet', char_interval=CharInterval(start_pos=0, end_pos=11), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=1, group_index=0, description=None, attributes={'emotional_state': 'longing'}), Extraction(extraction_class='emotion', extraction_text='gazed longingly', char_interval=CharInterval(start_pos=12, end_pos=27), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=2, group_index=1, description=None, attributes={'feeling': 'yearning'}), Extraction(extraction_class='emotion', extraction_text='heart aching', char_interval=CharInterval(start_pos=46, end_pos=58), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=3, group_index=2, description=None, attributes={'feeling': 'sorrow'}), Extraction(extraction_class='character', extraction_text='Romeo', char_interval=CharInterval(start_pos=63, end_pos=68), alignme

### Visualize the Results


In [None]:
from IPython.display import HTML

# Save the results to a JSONL file so lx.visualize can load them back.
lx.io.save_annotated_documents([result], output_name="extraction_results.jsonl", output_dir=".")

# lx.visualize returns an IPython HTML object inside a notebook; the markup
# lives in `.data` (not `str(obj)`). Outside a notebook it can return the HTML
# as a plain string, so fall back to the value itself in that case.
html_object = lx.visualize("extraction_results.jsonl")
html_string = html_object.data if hasattr(html_object, "data") else html_object

# Custom CSS for better contrast.
# NOTE(review): these selectors assume the generated markup tags spans as
# `.entity.<extraction_class>` - confirm against the HTML lx.visualize emits.
custom_css = """
<style>
body {
    background-color: white !important;
    color: black !important;
}
.entity.character {
    background-color: #DDEEFF !important;
    color: black !important;
}
.entity.emotion {
    background-color: #C0F5C0 !important;
    color: black !important;
}
.entity.relationship {
    background-color: #FFEDB3 !important;
    color: black !important;
}
</style>
"""

# Prepend the CSS to the HTML string.
patched_html = custom_css + html_string

# Save the patched HTML. The encoding is explicit so the file is written the
# same way on every platform, whatever the locale's default encoding is.
with open("visualization.html", "w", encoding="utf-8") as f:
    f.write(patched_html)

# Display inline (last expression of the cell).
HTML(patched_html)

[94m[1mLangExtract[0m: Saving to [92mextraction_results.jsonl[0m: 1 docs [00:00, 979.06 docs/s]

[92m✓[0m Saved [1m1[0m documents to [92mextraction_results.jsonl[0m



[94m[1mLangExtract[0m: Loading [92mextraction_results.jsonl[0m: 100%|██████████| 1.17k/1.17k [00:00<00:00, 4.24MB/s]

[92m✓[0m Loaded [1m1[0m documents from [92mextraction_results.jsonl[0m





### Custom example

In [None]:
# Sample news article used as the input text for the custom extraction example
# below (it is passed to lx.extract as `input_text`).
ARTICLE = """OpenAI’s Sam Altman teases GPT-5 as ChatGPT platform nears 700 million active users
By: Tech Desk

After months of wait, GPT‑5 may have made its first public appearance. On Sunday, August 3, OpenAI CEO Sam Altman posted a screenshot on X showing an interaction with the highly anticipated large language model (LLM).

The post showed Altman asking GPT‑5, ‘What is the most thought‑provoking show about AI?’ to which the model responded with high praise for the science fiction series Pantheon.

turns out yes! pic.twitter.com/yVsZXKSmKR
— Sam Altman (@sama) August 3, 2025

According to GPT‑5, Pantheon is “cerebral, emotional, and philosophically intense” and holds a “100% critic rating on Rotten Tomatoes.”

Also read | Can’t afford ChatGPT Plus? OpenAI may launch a cheaper Go plan

GPT‑5 is said to have a larger context window than its predecessors. As a result, it is capable of handling more agentic tasks and comes with multimodal capabilities. GPT‑5 is also widely rumoured to be adept at coding.

The upcoming launch of GPT‑5 comes at a time when OpenAI is facing significant pressure from its competitors who have launched new iterations of AI models over the past few months such as Google’s Gemini 2.5 Pro, XAI’s Grok 4, and Anthropic’s Claude Opus 4.

However, ChatGPT continues to witness substantial growth with the AI chatbot platform on track to reach 700 million weekly active users this week, according to Nick Turley, a vice president at OpenAI and head of the ChatGPT app.

This week, ChatGPT is on track to reach 700M weekly active users — up from 500M at the end of March and 4× since last year. Every day, people and teams are learning, creating, and solving harder problems. Big week ahead. Grateful to the team for making ChatGPT more useful and…
— Nick Turley (@nickaturley) August 4, 2025

Its popularity surged in March this year after OpenAI integrated an image generation feature into ChatGPT that kicked off the viral Ghibli trend on social media.

Also Read | OpenAI’s new image generator for ChatGPT triggers Ghibli fest: 10 best posts on X

OpenAI chief operating officer Brad Lightcap stated that more than 130 million users generated over 700 million images after launching the image generation feature. Its active user base grew to 500 million that month.

Furthermore, the number of ChatGPT subscribers has also increased. According to Lightcap, paying corporate customers rose from 3 million in June to 5 million last week.

ChatGPT users utilise the app more than 12 days each month on average, ranking second only to Google and X, according to data from Sensor Tower. It also found that users spent an average of 16 minutes per day on the AI chatbot app in the first half of 2025."""



### Define Your Extraction Task
First, create a prompt that clearly describes what you want to extract. Then, provide a high-quality example to guide the model.

In [None]:
# Step 1: describe the entity types to pull out of the article.
prompt = textwrap.dedent("""\
Extract people's name, ai models, products and company names in order of appearance.
Use exact text for extractions. Do not paraphrase or overlap entities.
Provide meaningful related entities for each entity to add context.""")

# Step 2: one worked example covering all four extraction classes.
example_text = (
    "David Ha from Sakana AI labs has trained many models"
    " including the early 'WM1' and his company makes a product called 'AI Scientist' ."
)

# (extraction_class, extraction_text, attributes) for each span, in order of appearance.
example_spans = [
    ("person_name", "David Ha", {"company": "Sakana AI"}),
    ("company_name", "Sakana AI", {"employee": "David Ha"}),
    ("ai_model", "WM1", {"company": "Sakana AI"}),
    ("product", "'AI Scientist'", {"company": "Sakana AI"}),
]

examples = [
    lx.data.ExampleData(
        text=example_text,
        extractions=[
            lx.data.Extraction(
                extraction_class=cls,
                extraction_text=span,
                attributes=attrs,
            )
            for cls, span, attrs in example_spans
        ],
    )
]



### Run the Extraction
Provide your input text and the prompt materials to the `lx.extract` function.



In [None]:
# Step 3: run the extraction over the whole article.
input_text = ARTICLE

result = lx.extract(
    model_id="gemini-2.0-flash",
    text_or_documents=input_text,
    prompt_description=prompt,
    examples=examples,
)

[94m[1mLangExtract[0m: model=[92mgemini-2.0-flash[0m, current=[92m2,721[0m chars, processed=[92m2,721[0m chars:  [00:09]

[92m✓[0m Extraction processing complete
[92m✓[0m Extracted [1m27[0m entities ([1m4[0m unique types)
  [96m•[0m Time: [1m9.57s[0m
  [96m•[0m Speed: [1m285[0m chars/sec
  [96m•[0m Chunks: [1m3[0m





In [None]:
# Display the raw AnnotatedDocument returned by lx.extract for the article.
result

AnnotatedDocument(extractions=[Extraction(extraction_class='company_name', extraction_text='OpenAI', char_interval=CharInterval(start_pos=0, end_pos=6), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=1, group_index=0, description=None, attributes={'employee': 'Sam Altman'}), Extraction(extraction_class='person_name', extraction_text='Sam Altman', char_interval=CharInterval(start_pos=9, end_pos=19), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=2, group_index=1, description=None, attributes={'company': 'OpenAI'}), Extraction(extraction_class='ai_model', extraction_text='GPT-5', char_interval=CharInterval(start_pos=27, end_pos=32), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=3, group_index=2, description=None, attributes={'company': 'OpenAI'}), Extraction(extraction_class='product', extraction_text='ChatGPT', char_interval=CharInterval(start_pos=36, end_pos=43), alignment_status=<Alignm

### Visualize the Results


In [None]:
from IPython.display import HTML

# Save the results to a JSONL file so lx.visualize can load them back.
lx.io.save_annotated_documents([result], output_name="extraction_results.jsonl", output_dir=".")

# lx.visualize returns an IPython HTML object inside a notebook; the markup
# lives in `.data` (not `str(obj)`). Outside a notebook it can return the HTML
# as a plain string, so fall back to the value itself in that case.
html_object = lx.visualize("extraction_results.jsonl")
html_string = html_object.data if hasattr(html_object, "data") else html_object

# Custom CSS for better contrast.
# NOTE(review): the per-class rules name character/emotion/relationship, while
# this section extracts person_name/company_name/ai_model/product; the selector
# scheme itself (`.entity.<class>`) is also unverified - confirm against the
# HTML lx.visualize emits.
custom_css = """
<style>
body {
    background-color: white !important;
    color: black !important;
}
.entity.character {
    background-color: #DDEEFF !important;
    color: black !important;
}
.entity.emotion {
    background-color: #C0F5C0 !important;
    color: black !important;
}
.entity.relationship {
    background-color: #FFEDB3 !important;
    color: black !important;
}
</style>
"""

# Prepend the CSS to the HTML string.
patched_html = custom_css + html_string

# Save the patched HTML. The article contains non-ASCII punctuation (curly
# quotes, dashes), so the encoding is explicit rather than locale-dependent.
with open("visualization.html", "w", encoding="utf-8") as f:
    f.write(patched_html)

# Display inline (last expression of the cell).
HTML(patched_html)

[94m[1mLangExtract[0m: Saving to [92mextraction_results.jsonl[0m: 1 docs [00:00, 353.98 docs/s]

[92m✓[0m Saved [1m1[0m documents to [92mextraction_results.jsonl[0m



[94m[1mLangExtract[0m: Loading [92mextraction_results.jsonl[0m: 100%|██████████| 9.71k/9.71k [00:00<00:00, 9.52MB/s]

[92m✓[0m Loaded [1m1[0m documents from [92mextraction_results.jsonl[0m





### Print out the people mentioned

In [None]:
# Keep only the extractions labelled as people, preserving document order.
people = [ex for ex in result.extractions if ex.extraction_class == "person_name"]

# One field per line, followed by a separator line, for every person found.
for person in people:
    print(
        person.extraction_class,
        person.extraction_text,
        person.attributes,
        person.char_interval,
        "====================",
        sep="\n",
    )

person_name
Sam Altman
{'company': 'OpenAI'}
CharInterval(start_pos=9, end_pos=19)
person_name
Nick Turley
{'company': 'OpenAI'}
CharInterval(start_pos=1427, end_pos=1438)
person_name
Brad Lightcap
{'company': 'OpenAI'}
CharInterval(start_pos=2109, end_pos=2122)


## Companies mentioned

In [None]:
# Every company mention, in document order.
company_mentions = [
    ex for ex in result.extractions if ex.extraction_class == "company_name"
]

# Print each mention with its character span, followed by a separator line.
for mention in company_mentions:
    print(mention.extraction_text, mention.char_interval, "====================", sep="\n")

# Distinct company names across all mentions.
unique_companies = {mention.extraction_text for mention in company_mentions}
print(unique_companies)

OpenAI
CharInterval(start_pos=0, end_pos=6)
OpenAI
CharInterval(start_pos=1058, end_pos=1064)
Google
CharInterval(start_pos=1196, end_pos=1202)
XAI
CharInterval(start_pos=1221, end_pos=1224)
Anthropic
CharInterval(start_pos=1239, end_pos=1248)
OpenAI
CharInterval(start_pos=1867, end_pos=1873)
Google
CharInterval(start_pos=2561, end_pos=2567)
X
CharInterval(start_pos=2572, end_pos=2573)
{'Anthropic', 'OpenAI', 'XAI', 'X', 'Google'}


## Using OpenAI Models


### Define Your Extraction Task
First, create a prompt that clearly describes what you want to extract. Then, provide a high-quality example to guide the model.

In [None]:
import langextract as lx

# Input: a short clinical note in which two medications are mentioned in an
# interleaved fashion.
input_text = """
The patient was prescribed Lisinopril and Metformin last month.
He takes the Lisinopril 10mg daily for hypertension, but often misses
his Metformin 500mg dose which should be taken twice daily for diabetes.
"""

# Task description: every extracted detail carries a 'medication_group'
# attribute naming the medication it belongs to.
prompt_description = """
Extract medications with their details, using attributes to group related information:

1. Extract entities in the order they appear in the text
2. Each entity must have a 'medication_group' attribute linking it to its medication
3. All details about a medication should share the same medication_group value
"""

# Worked example text containing two medications.
example_text = "Patient takes Aspirin 100mg daily for heart health and Simvastatin 20mg at bedtime."

# (extraction_class, extraction_text, medication_group) in order of appearance.
example_spans = [
    # First medication group
    ("medication", "Aspirin", "Aspirin"),
    ("dosage", "100mg", "Aspirin"),
    ("frequency", "daily", "Aspirin"),
    ("condition", "heart health", "Aspirin"),
    # Second medication group
    ("medication", "Simvastatin", "Simvastatin"),
    ("dosage", "20mg", "Simvastatin"),
    ("frequency", "at bedtime", "Simvastatin"),
]

examples = [
    lx.data.ExampleData(
        text=example_text,
        extractions=[
            lx.data.Extraction(
                extraction_class=cls,
                extraction_text=span,
                attributes={"medication_group": group},  # Group identifier
            )
            for cls, span, group in example_spans
        ],
    )
]


### Run the Extraction
Provide your input text and the prompt materials to the `lx.extract` function.



In [None]:
# Run the medication extraction with an OpenAI model.
#
# The prompt passed here is `prompt_description`, the medication prompt defined
# for this section. The bare name `prompt` still holds the people/companies
# prompt from the news-article example, so passing it would send the wrong
# instructions to the model.
result = lx.extract(
    text_or_documents=input_text,
    prompt_description=prompt_description,
    examples=examples,
    language_model_type=lx.inference.OpenAILanguageModel,
    model_id="gpt-4o",
    # Key was placed in the environment by the API-key setup cell.
    api_key=os.environ.get('OPENAI_API_KEY'),
    # NOTE(review): per the LangExtract README, OpenAI models need fenced
    # output and no schema constraints - confirm for the installed version.
    fence_output=True,
    use_schema_constraints=False
)

[94m[1mLangExtract[0m: model=[92mgpt-4o[0m, current=[92m206[0m chars, processed=[92m206[0m chars:  [00:02]

[92m✓[0m Extraction processing complete
[92m✓[0m Extracted [1m8[0m entities ([1m4[0m unique types)
  [96m•[0m Time: [1m2.83s[0m
  [96m•[0m Speed: [1m73[0m chars/sec
  [96m•[0m Chunks: [1m1[0m





In [None]:
# Display the raw AnnotatedDocument returned by lx.extract for the medication note.
result

AnnotatedDocument(extractions=[Extraction(extraction_class='medication', extraction_text='Lisinopril', char_interval=CharInterval(start_pos=28, end_pos=38), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=1, group_index=0, description=None, attributes={'medication_group': 'Lisinopril'}), Extraction(extraction_class='medication', extraction_text='Metformin', char_interval=CharInterval(start_pos=43, end_pos=52), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=2, group_index=1, description=None, attributes={'medication_group': 'Metformin'}), Extraction(extraction_class='dosage', extraction_text='10mg', char_interval=CharInterval(start_pos=89, end_pos=93), alignment_status=<AlignmentStatus.MATCH_EXACT: 'match_exact'>, extraction_index=3, group_index=2, description=None, attributes={'medication_group': 'Lisinopril'}), Extraction(extraction_class='frequency', extraction_text='daily', char_interval=CharInterval(start_pos=94, end_

### Visualize the Results


In [None]:
from IPython.display import HTML

# Save the results to a JSONL file so lx.visualize can load them back.
lx.io.save_annotated_documents([result], output_name="extraction_results.jsonl", output_dir=".")

# lx.visualize returns an IPython HTML object inside a notebook; the markup
# lives in `.data` (not `str(obj)`). Outside a notebook it can return the HTML
# as a plain string, so fall back to the value itself in that case.
html_object = lx.visualize("extraction_results.jsonl")
html_string = html_object.data if hasattr(html_object, "data") else html_object

# Custom CSS for better contrast.
# NOTE(review): the per-class rules name character/emotion/relationship, while
# this section extracts medication/dosage/frequency/condition; the selector
# scheme itself (`.entity.<class>`) is also unverified - confirm against the
# HTML lx.visualize emits.
custom_css = """
<style>
body {
    background-color: white !important;
    color: black !important;
}
.entity.character {
    background-color: #DDEEFF !important;
    color: black !important;
}
.entity.emotion {
    background-color: #C0F5C0 !important;
    color: black !important;
}
.entity.relationship {
    background-color: #FFEDB3 !important;
    color: black !important;
}
</style>
"""

# Prepend the CSS to the HTML string.
patched_html = custom_css + html_string

# Save the patched HTML. The encoding is explicit so the file is written the
# same way on every platform, whatever the locale's default encoding is.
with open("visualization.html", "w", encoding="utf-8") as f:
    f.write(patched_html)

# Display inline (last expression of the cell).
HTML(patched_html)

[94m[1mLangExtract[0m: Saving to [92mextraction_results.jsonl[0m: 1 docs [00:00, 566.26 docs/s]

[92m✓[0m Saved [1m1[0m documents to [92mextraction_results.jsonl[0m



[94m[1mLangExtract[0m: Loading [92mextraction_results.jsonl[0m: 100%|██████████| 2.38k/2.38k [00:00<00:00, 3.72MB/s]

[92m✓[0m Loaded [1m1[0m documents from [92mextraction_results.jsonl[0m



