In [1]:
from langchain.document_loaders import TextLoader

loaders = TextLoader('searchgpt_news.txt')
loaders.load()

[Document(metadata={'source': 'searchgpt_news.txt'}, page_content="A new way to search\n\nGetting answers on the web can take a lot of effort, often requiring multiple attempts to get relevant results. We believe that by enhancing the conversational capabilities of our models with real-time information from the web, finding what youâ€™re looking for can be faster and easier. \n\nDesigned to give you an answer\n\nSearchGPT will quickly and directly respond to your questions with up-to-date information from the web while giving you clear links to relevant sources. \n\nYouâ€™ll be able to ask follow-up questions, like you would in a conversation with a person, with the shared context building with each query. \n\nPartnering with publishers and creators\n\nWe are committed to a thriving ecosystem of publishers and creators. We hope to help users discover publisher sites and experiences, while bringing more choice to search. For decades, search has been a foundational way for publishers and

In [2]:
type(loaders)

langchain_community.document_loaders.text.TextLoader

In [3]:
loaders.file_path

'searchgpt_news.txt'

In [4]:
!pip3 install unstructured libmagic python-magic python-magic-bin

In [5]:
from langchain.document_loaders import UnstructuredURLLoader

In [6]:
loaders = UnstructuredURLLoader(
    urls=[
        "https://techcrunch.com/2024/08/08/soundhound-acquires-amelia-ai-for-80m-after-it-raised-189m/",
        "https://techcrunch.com/2024/08/07/youtube-is-testing-a-feature-that-lets-creators-use-google-gemini-to-brainstorm-video-ideas/"
    ]
)

In [7]:
data = loaders.load()

In [8]:
len(data)

2

In [9]:
data[0].page_content[:500]

'AI\n\nSoundHound acquires Amelia AI for $80M after it raised $189M+\n\nIngrid Lunden\n\n5:59 AM PDT • August 8, 2024\n\nComment\n\nSoundHound, an AI company that makes voice interface tech used by car companies, restaurants and tech firms, is doubling down on enterprise services by playing consolidator in a crowded market. The company said on Thursday that it is acquiring Amelia AI, which makes an AI agent that businesses can customize for internal or customer use.\n\nSoundHound is paying $80 million in cas'

In [10]:
data[0].metadata

{'source': 'https://techcrunch.com/2024/08/08/soundhound-acquires-amelia-ai-for-80m-after-it-raised-189m/'}

In [11]:
text = """
We are committed to a thriving ecosystem of publishers and creators. We hope to help users discover publisher sites and experiences, while bringing more choice to search. For decades, search has been a foundational way for publishers and creators to reach users. Now, we’re using AI to enhance this experience by highlighting high quality content in a conversational interface with multiple opportunities for users to engage.

SearchGPT is designed to help users connect with publishers by prominently citing and linking to them in searches. Responses have clear, in-line, named attribution and links so users know where information is coming from and can quickly engage with even more results in a sidebar with source links.

We’ve partnered with publishers to build this experience and continue to seek their feedback. In addition to launching the SearchGPT prototype, we are also launching a way for publishers to manage how they appear in SearchGPT, so publishers have more choices. Importantly, SearchGPT is about search and is separate from training OpenAI’s generative AI foundation models. Sites can be surfaced in search results even if they opt out of generative AI training. To read more about publisher controls and OpenAI’s bots, see here(opens in a new window). 

"""

In [12]:
len(text)

1279

In [13]:
words = text.split(" ")

In [14]:
chunks = []
s = ""
for word in words:
    s += word + " "
    if len(s) > 200:
        chunks.append(s)
        s = ""
chunks.append(s) 

In [15]:
chunks[:2]

['\nWe are committed to a thriving ecosystem of publishers and creators. We hope to help users discover publisher sites and experiences, while bringing more choice to search. For decades, search has been ',
 'a foundational way for publishers and creators to reach users. Now, we’re using AI to enhance this experience by highlighting high quality content in a conversational interface with multiple opportunities ']

In [16]:
from langchain.text_splitter import CharacterTextSplitter

splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size = 200,
    chunk_overlap = 0
)

chunks = splitter.split_text(text)
len(chunks)

Created a chunk of size 425, which is longer than the specified 200
Created a chunk of size 298, which is longer than the specified 200


3

In [17]:
chunks

['We are committed to a thriving ecosystem of publishers and creators. We hope to help users discover publisher sites and experiences, while bringing more choice to search. For decades, search has been a foundational way for publishers and creators to reach users. Now, we’re using AI to enhance this experience by highlighting high quality content in a conversational interface with multiple opportunities for users to engage.',
 'SearchGPT is designed to help users connect with publishers by prominently citing and linking to them in searches. Responses have clear, in-line, named attribution and links so users know where information is coming from and can quickly engage with even more results in a sidebar with source links.',
 'We’ve partnered with publishers to build this experience and continue to seek their feedback. In addition to launching the SearchGPT prototype, we are also launching a way for publishers to manage how they appear in SearchGPT, so publishers have more choices. Impor

In [18]:
for chunk in chunks:
    print(len(chunk))

425
298
548


In [19]:
len(text.split("\n"))

8

In [20]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    separators = ["\n\n","\n"," "],
    chunk_size = 200,
    chunk_overlap= 0,
    length_function = len
)

In [21]:
chunks = splitter.split_text(text)
len(chunks)

8

In [22]:
chunks

['We are committed to a thriving ecosystem of publishers and creators. We hope to help users discover publisher sites and experiences, while bringing more choice to search. For decades, search has been',
 'a foundational way for publishers and creators to reach users. Now, we’re using AI to enhance this experience by highlighting high quality content in a conversational interface with multiple',
 'opportunities for users to engage.',
 'SearchGPT is designed to help users connect with publishers by prominently citing and linking to them in searches. Responses have clear, in-line, named attribution and links so users know where',
 'information is coming from and can quickly engage with even more results in a sidebar with source links.',
 'We’ve partnered with publishers to build this experience and continue to seek their feedback. In addition to launching the SearchGPT prototype, we are also launching a way for publishers to manage',
 'how they appear in SearchGPT, so publishers have mor

In [23]:
first_split = text.split("\n\n")[0]
len(first_split)

426

In [24]:
second_split = first_split.split("\n")
len(second_split)

2

In [25]:
second_split

['',
 'We are committed to a thriving ecosystem of publishers and creators. We hope to help users discover publisher sites and experiences, while bringing more choice to search. For decades, search has been a foundational way for publishers and creators to reach users. Now, we’re using AI to enhance this experience by highlighting high quality content in a conversational interface with multiple opportunities for users to engage.']

In [27]:
for split in second_split:
    print(len(split))

0
425


In [30]:
second_split[1]

'We are committed to a thriving ecosystem of publishers and creators. We hope to help users discover publisher sites and experiences, while bringing more choice to search. For decades, search has been a foundational way for publishers and creators to reach users. Now, we’re using AI to enhance this experience by highlighting high quality content in a conversational interface with multiple opportunities for users to engage.'