# Install Dependencies

In [None]:
pip install -qU langchain-text-splitters tiktoken langchain_experimental langchain_openai langchain-elasticsearch

# Sample passage (source: https://en.wikipedia.org/wiki/2024_Summer_Olympics)

In [None]:
from google.colab import userdata
import os

# Sample passage that every chunking cell in this notebook splits.
# The string is fed to the splitters exactly as written, so any edit here
# (including whitespace) changes the chunks they produce. Bracketed numbers
# such as [197] are citation markers carried over from the Wikipedia text.
text = """Paris 2024 featured the debut of breaking as an Olympic sport, and was the final Olympic Games held during the IOC presidency of Thomas Bach. The 2024 Games were expected to cost €9 billion.
The opening ceremony was held outside of a stadium for the first time in modern Olympic history, as athletes were paraded by boat along the Seine. Paris 2024 was the first Olympics in history to reach full gender parity on the field of play, with equal numbers of male and female athletes.
The United States topped the medal table for the fourth consecutive Summer Games and 19th time overall, with 40 gold and 126 total medals. China tied with the United States on gold (40), but finished second due to having fewer silvers; the nation won 91 medals overall.
This is the first time a gold medal tie among the two most successful nations has occurred in Summer Olympic history. Japan finished third with 20 gold medals and sixth in the overall medal count.
Australia finished fourth with 18 gold medals and fifth in the overall medal count.The host nation, France, finished fifth with 16 gold and 64 total medals, and fourth in the overall medal count.
Dominica, Saint Lucia, Cape Verde and Albania won their first-ever Olympic medals, the former two both being gold, with Botswana and Guatemala also winning their first-ever gold medals. The Refugee Olympic Team also won their first-ever medal, a bronze in boxing.
At the conclusion of the games, despite some controversies throughout relating to politics, logistics and conditions in the Olympic Village, the Games were considered a success by the press, Parisians and observers.
The Paris Olympic Games broke all-time records for ticket sales, with the Games selling 9,556,792 tickets (12,132,647 in total for Paris 2024 if you add in the Paralympic Games).
In the 2024 Paris Olympics, several new events and formats have been introduced. Formula Kite made its debut, described as the "Formula One of the Olympics", featuring high-speed foil racing with separate events for men and women.
Kayak cross also debuted, where four athletes race against each other on a course with multiple gates, marking the first head-to-head race in Olympic canoe slalom history.
Sport climbing returned with a new format, splitting into bouldering and lead combined events in addition to a speed event. 3x3 basketball, which debuted in Tokyo, was back with finals scheduled for August 5 at Place de La Concorde.
Changes in other sports included the introduction of men's participation in artistic swimming, a new women's weight class in boxing, and the addition of a marathon race walk mixed relay in track and field.
A TGM Research survey shows that Coca-Cola is globally the most connected brand with the 2024 Olympics, with 23% of people mentioning it. Nike comes in second with 16%, despite not being an official sponsor of the Olympic Games.[197] Belgian beverage company AB InBev became the first Worldwide Olympic Partner during the Games,[198] while two Japanese companies will not renew their sponsorships after 2024; automobile manufacturer Toyota, with the company reportedly unhappy with how the IOC has used its sponsorship money,[199][200] and Panasonic, under continuous management considerations regarding sponsorship, with the company also looking to expand its businesses outside consumer electronics.[201][202]

Under an agreement as "Premium" sponsor reportedly valued at €150 million ($163 million), French luxury goods conglomerate LVMH has been involved in aspects of the Games, with its brand Louis Vuitton having provided the trunks used to store the Olympic torch and medals, and the outfits and trays for medal presenters.
Former IOC marketing head Michael Payne raised concerns that the prominent use of LVMH goods as part of the Olympics (and in particular, the opening ceremony, which also featured the aforementioned items as props, and performers Aya Nakamura and Lady Gaga wearing Dior haute couture) could cause conflicts with other official sponsors, noting that "the direction of stylish sponsor product placement may not be wrong but needs exceptionally careful management.
LVMH got a massive free global ad last night and other partners are all going to be asking, how did that work?
"""

def show_chunks(chunks):
    """Print each chunk on its own line, prefixed by its zero-based index.

    Args:
        chunks: Iterable of chunks to display. Items are formatted with
            ``str()`` semantics, so non-string chunks are printed too
            instead of raising ``TypeError`` on concatenation.

    Returns:
        None. The output goes to stdout as ``"<index> <chunk>"`` lines.
    """
    # enumerate replaces the hand-maintained counter; the f-string keeps the
    # exact "<index> <chunk>" layout of the previous string concatenation.
    for index, chunk in enumerate(chunks):
        print(f"{index} {chunk}")

# Recursive Character Chunking

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Character-based recursive splitting. Lengths are measured with len(), so
# the chunk size and overlap below are expressed in characters.
char_splitter_options = {
    "chunk_size": 100,
    "chunk_overlap": 2,
    "length_function": len,
    "is_separator_regex": False,
}
text_splitter = RecursiveCharacterTextSplitter(**char_splitter_options)

# Split the sample passage and print the numbered chunks.
chunks = text_splitter.split_text(text)
show_chunks(chunks)

# Token Based Chunking

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Token-based splitting: the splitter is built through the tiktoken factory
# for the "gpt-4" model instead of the plain constructor.
build_token_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder
text_splitter = build_token_splitter(
    model_name="gpt-4",
    chunk_size=100,
    chunk_overlap=0,
)

# Split the sample passage and print the numbered chunks.
chunks = text_splitter.split_text(text)
show_chunks(chunks)

# Semantic Chunking

In [None]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

# Export the Colab secret as an environment variable before the embeddings
# client is constructed (presumably where OpenAIEmbeddings looks for it).
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

# Semantic chunking: the chunker is driven by an embedding model and uses
# the "interquartile" breakpoint threshold type.
embedding_model = OpenAIEmbeddings()
text_splitter = SemanticChunker(
    embedding_model,
    breakpoint_threshold_type="interquartile",
)

# Split the sample passage and print the numbered chunks.
chunks = text_splitter.split_text(text)
show_chunks(chunks)

# Push into Elasticsearch

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Import from the langchain-elasticsearch package that the install cell
# actually installs, rather than the legacy `langchain.vectorstores` path
# (the `langchain` package itself is never installed by this notebook).
from langchain_elasticsearch import ElasticsearchStore

# Names shared by the vector store, the ingest pipeline and the index.
# Defining them once keeps the three pieces from drifting apart.
INDEX_NAME = "olympic-2024"
PIPELINE_ID = "elser-ingest"
ELSER_MODEL_ID = ".elser_model_2_linux-x86_64"
TEXT_FIELD = "text"
VECTOR_FIELD = "vectors"

# Larger, overlapping character chunks for indexing.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=50,
    length_function=len,
    is_separator_regex=False,
)

# create_documents (rather than split_text) yields Document objects, which is
# what the vector store's add_documents call below expects.
chunks = text_splitter.create_documents([text])

# Vector store bound to the Elastic Cloud deployment; credentials come from
# Colab secrets. Retrieval uses the sparse ELSER model.
es = ElasticsearchStore(
    es_cloud_id=userdata.get('ES_CLOUD_ID'),
    es_api_key=userdata.get('ES_API_KEY'),
    index_name=INDEX_NAME,
    query_field=TEXT_FIELD,
    vector_query_field=VECTOR_FIELD,
    strategy=ElasticsearchStore.SparseVectorRetrievalStrategy(
        model_id=ELSER_MODEL_ID
    ),
)

# Ingest pipeline that runs ELSER on the text field and writes the result to
# the vector field. put_pipeline creates or replaces the pipeline, so it is
# safe to re-run.
es.client.ingest.put_pipeline(
    id=PIPELINE_ID,
    processors=[
        {
            "inference": {
                "model_id": ELSER_MODEL_ID,
                "input_output": [
                    {
                        "input_field": TEXT_FIELD,
                        "output_field": VECTOR_FIELD,
                    }
                ],
            }
        }
    ],
)

# indices.create fails when the index already exists, which made this cell
# crash on every re-run; only create the index the first time.
if not es.client.indices.exists(index=INDEX_NAME):
    es.client.indices.create(
        index=INDEX_NAME,
        mappings={
            "properties": {
                TEXT_FIELD: {"type": "text"},
                VECTOR_FIELD: {"type": "sparse_vector"},
            }
        },
        # Every document indexed here goes through the ELSER pipeline.
        settings={"index": {"default_pipeline": PIPELINE_ID}},
    )

# Index through the store configured above. The previous from_documents call
# was a classmethod invoked on the instance: it built a second store with a
# dense ApproxRetrievalStrategy pointing at the sparse ELSER model and then
# discarded it.
# NOTE: re-running this cell indexes the chunks again (duplicates).
es.add_documents(chunks)

es