In [None]:
import os
import glob
import pandas as pd
import json
from langchain.memory import ConversationTokenBufferMemory
from langchain.agents.tools import Tool
from langchain.llms.base import LLM
from langchain import PromptTemplate, LLMChain
from langchain.agents import load_tools, initialize_agent, AgentExecutor, BaseSingleActionAgent, AgentType
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import PDFMinerLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
os.chdir("/notebooks/learn-langchain")
from langchain_app.models.text_generation_web_ui import (
    build_text_generation_web_ui_client_llm,
)
from langchain.chains.mapreduce import MapReduceChain
from langchain.prompts import PromptTemplate
from langchain.docstore.document import Document
from langchain.callbacks import AimCallbackHandler, StdOutCallbackHandler

# Суммаризация текста

# Суммаризация текста без использования метода Summary

In [None]:
# Client for the text-generation web UI backend; the only generation setting
# passed explicitly is the cap on newly generated tokens.
generation_parameters = {"max_new_tokens": 300}
llm = build_text_generation_web_ui_client_llm(parameters=generation_parameters)

# Prompt with a single placeholder, {question}, that receives the text to condense.
template = (
    "\n"
    "Write a concise summary of the following:\n"
    "\n"
    "{question}\n"
    "\n"
    "CONCISE SUMMARY:\n"
)
prompt = PromptTemplate(input_variables=["question"], template=template)

# Chain that fills the prompt and sends it to the LLM in one call.
llm_chain = LLMChain(llm=llm, prompt=prompt)

question = """

Updates to Fraud and Consumer Dispute Rules
Global (excluding Brazil) | Acquirers, Issuers, Processors, Agents Visa, Interlink, Plus Networks; V PAY
Overview: Visa has updated dispute rule language and made additional revisions to dispute rules for clarity and consistency based on client feedback. The updated rules will be effective for disputes processed on or after 14 October 2023, unless otherwise specified.
To promote more efficient dispute resolution for clients, Visa is eliminating rule language that is outdated or no longer required, standardizing rules to make them easier to interpret and use and making modifications to increase flexibility.
As a result, the following changes are being made to the Visa Rules sections
below effective for disputes processed on or after 14 October 2023, unless
otherwise specified. Refer to the advance copies of the changes in the Additional Resources section below for more information.
• Use of Compelling Evidence / Allowable Compelling Evidence (disputes involving Europe region)
• Pre-Arbitration Processing Requirements for Dispute Condition 10.1: EMV Liability Shift—Counterfeit Fraud, Dispute Condition 10.2: EMV Liability Shift—Non-Counterfeit Fraud, Dispute Condition 10.3: Other Fraud— Card-Present Environment and Dispute Condition 10.4: Other Fraud—Card-Absent Environment
• Dispute Condition 10.4: Other Fraud—Card-Absent Environment
• Dispute Reasons for Dispute Condition 12.6: Duplicate Processing / Paid By Other Means
• Invalid Disputes for Dispute Condition 13.3: Not as Described or Defective Merchandise / Services
In addition, Visa has updated the rules language related to dispute / pre-arbitration processing requirements and supporting documentation / certification, and has also made miscellaneous rule updates to the following dispute conditions and compliance processing requirements:
• Issuer Responsibilities to Cardholders for Dispute Resolution
• Allowable Compelling Evidence
• Minimum Dispute Amounts
      Mark Your Calendar:
• Updated dispute rule language effective (14 October 2023)
 Article ID: AI12944
• Invalid Disputes for Dispute Condition 10.3: Other Fraud—Card-Present Environment
• Invalid Disputes for Dispute Condition 10.4: Other Fraud—Card-Absent Environment
• Dispute Condition 13.7: Cancelled Merchandise / Services
Allowable Compelling Evidence (Disputes Involving Europe Region)
Currently an acquirer in the Europe region is allowed to present compelling evidence that is outside of the Allowable Compelling Evidence list. To alleviate confusion and streamline the process, the Europe region will align with the rest of the world. Effective for pre-arbitration attempts processed on or after 15 October 2023, only the items in Table 11.6—Allowable Compelling Evidence in the Visa Rules will qualify as compelling evidence (ID#: 0030221).
Pre-Arbitration Processing Requirements for Dispute Conditions 10.1, 10.2, 10.3 and 10.4
The Visa Rules allow certain delayed transactions, such as a transaction related to beverages from the mini fridge that occurred during a hotel stay or trip, or a parking violation that occurred while a cardholder was renting a car. Because these charges are billed after the cardholder’s departure or car rental return, an imprint for these charges cannot be obtained. However, an imprint obtained at any time during the stay / car rental (including at the time of check-in or vehicle check-out) can be used to demonstrate that the cardholder participated. Therefore, effective for pre-arbitration attempts processed on or after 14 October 2023, for a delayed transaction, the acquirer may supply evidence that the transaction relates to a prior stay, trip or rental period and evidence that an imprint was obtained during the same stay, trip or rental period to support their pre-arbitration attempt.
Dispute Condition 10.4: Other Fraud—Card-Absent Environment
Currently the Canada Domestic, U.S. Domestic and UK Domestic segments each have separate rules pertaining to the Address Verification Service (AVS). To align AVS rules and simplify the dispute process in these three countries, effective for pre-arbitration attempts processed on or after 14 October 2023, a dispute will not be allowed when the transaction received an authorization and the acquirer attempted to authenticate the cardholder through AVS and received a result code of U, unless the transaction was attempted with a Visa Commercial card or a card type where the cardholder is anonymous.
As announced in the 16 June 2022 edition of the Visa Business News, the new remedy for Dispute Condition 10.4: Other Fraud—Card-Absent Environment requires an acquirer to provide a detailed description of the merchandise or services purchased. In an effort to provide the issuer with as much information / evidence to demonstrate cardholder participation, effective for pre-arbitration attempts processed on or after 14 October 2023, the acquirer will be required to supply a detailed description of the merchandise or services purchased for both the disputed transactions and the two previous undisputed transactions.

"""
llm_chain.run(question)

# Testing different parameters of the model for text summarization

In [None]:
# Folder scanned for the PDF files used in this experiment.
input_directory = "/notebooks/files/test/"

# Every *.pdf directly inside that folder.
pdf_files = glob.glob(os.path.join(input_directory, "*.pdf"))

# Generation settings for the medium-size summary (med_sum): at most 200 new tokens.
med_sum_parameters = dict(
    max_new_tokens=200,
    do_sample=True,
    temperature=0.7,
    top_p=0.4,
    typical_p=1,
    repetition_penalty=1.18,
    top_k=40,
    min_length=0,
    no_repeat_ngram_size=0,
    num_beams=1,
    penalty_alpha=0,
    length_penalty=1,
    early_stopping=False,
    seed=-1,
    add_bos_token=True,
    truncation_length=2048,
    ban_eos_token=False,
    skip_special_tokens=True,
)
llm_med_sum = build_text_generation_web_ui_client_llm(parameters=med_sum_parameters)

# Splitter used before summarization, configured with chunk_size=1500 and chunk_overlap=100.
text_splitter_med_sum = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=1500)

# Summarization chain of type "map_reduce" driven by the client above.
chain_med_sum = load_summarize_chain(llm_med_sum, verbose=True, chain_type="map_reduce")

# One summary per PDF, in the order glob returned the files.
results = []

for file_path in pdf_files:
    # Load the PDF and take the text of the first returned document.
    pages = PDFMinerLoader(file_path).load()
    text = pages[0].page_content

    # 1. GENERATE MEDIUM SIZE SUMMARY: split the text, then summarize the chunks.
    docs = text_splitter_med_sum.create_documents([text])
    results.append(chain_med_sum.run(docs))

# Show every collected summary, one print call per file.
for summary in results:
    print(summary)

# Testing different parameters of the model for extracting JSON

In [None]:
# Batch-extract article metadata from every PDF in the test folder into `df`.
#
# For each file two prompts are sent to the same LLM client:
#   * a 5-field prompt (category, title, geo, audience, publication date) fed
#     with the first chunk produced by the chunk_size=400 splitter, and
#   * a 3-field prompt (effective date, calendar date, article id) fed with the
#     first chunk produced by the chunk_size=3000 splitter.
# Both answers are parsed as JSON, merged and stored as one DataFrame row.

# Initialize the DataFrame with required columns
df = pd.DataFrame(columns=[
    "effective_date",
    "mark_your_calendar_date",
    "article_id",
    "category",
    "title",
    "geo",
    "audience",
    "publication_date"
])

# Define the path to the folder with PDF files
input_directory = "/notebooks/files/test/"

# Collect every *.pdf directly inside that folder
pdf_files = glob.glob(os.path.join(input_directory, "*.pdf"))

# Nothing from here down to the loop depends on the file being processed, so
# the client, splitters, templates, prompts and chains are built once instead
# of once per PDF.

# LLM client used for both JSON extraction prompts
llm_extract_json = build_text_generation_web_ui_client_llm(
    parameters={
        "max_new_tokens": 100,
        "do_sample": True,
        "temperature": 0.1,
        "top_p": 0.4,
        "typical_p": 1,
        "repetition_penalty": 1.2,
        "top_k": 40,
        "min_length": 0,
        "no_repeat_ngram_size": 0,
        "num_beams": 1,
        "penalty_alpha": 0,
        "length_penalty": 1,
        "early_stopping": False,
        "seed": -1,
        "add_bos_token": True,
        "truncation_length": 2048,
        "ban_eos_token": False,
        "skip_special_tokens": True,
    }
)

# Text splitter feeding the 5-field prompt
text_extract_json_5 = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=0
)

# Text splitter feeding the 3-field prompt
text_extract_json_3 = RecursiveCharacterTextSplitter(
    chunk_size=3000,
    chunk_overlap=0
)

# Prompt template for the 3-field extraction.
# NOTE: the leading spaces inside the template literals below are part of the
# prompt text that was sent originally and are kept unchanged on purpose.
template_json_3 = """
    Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

    ### Instruction: 
    Extract the following information from the text in input.
    1. Effective date
    2. Mark your calendar date
    3. Article ID

    Format instructions: JSON
    If there is no information in the text: empty string
    The output looks like this:
    {output3}
    
    ### Input: 
    {question}

    ### Response:
    """

# Prompt template for the 5-field extraction (includes a worked example)
template_json_5 = """
    Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

    ### Instruction: 
    Extract the following information from the text in input.
    1. Category
    2. Title
    3. GEO
    4. Audience
    5. Publication date
    
    Format instructions: JSON
    If there is no information in the text: empty string
    # Input example: 
    Visa Business News
    Visa Rules 13 April 2023 Introduction of Visa Rules Waiver Extension Assessments
    AP, CEMEA, Europe, LAC | Acquirers, Issuers, Processors, Agents Visa, Interlink Networks; V PAY
    Overview: Effective 14 October 2023, Visa will introduce waiver extension assessments for approved Visa Rules waiver extensions.
    The Visa Rules are designed to minimize risks and provide a common, convenient
    # Response example:
    {output5}
    
    ### Input: 
    {question}

    ### Response:
    """

# Example outputs injected into the templates as ordinary variables, so their
# literal braces are not interpreted as template placeholders.
output3 = """
    {"effective_date": "yyyy-mm-dd", "mark_your_calendar_date": "yyyy-mm-dd", "article_id": ""}
    """
output5 = """
    {"category": "Visa Rules", "title": "Introduction of Visa Rules Waiver Extension Assessments", "geo": "AP, CEMEA, Europe, LAC", "audience": "Acquirers, Issuers, Processors, Agents", "publication_date": "2023-04-13"}
    """

# Create the prompts
prompt_json_5 = PromptTemplate(template=template_json_5, input_variables=['question', 'output5'])
prompt_json_3 = PromptTemplate(template=template_json_3, input_variables=['question', 'output3'])

# Create the chains that generate the JSON extracts
llm_chain_json_5 = LLMChain(llm=llm_extract_json, prompt=prompt_json_5)
llm_chain_json_3 = LLMChain(llm=llm_extract_json, prompt=prompt_json_3)

# Rows are collected here and appended to `df` once after the loop, instead of
# re-copying the whole frame with pd.concat for every file.
new_rows = []

# Process every PDF file found
for file_path in pdf_files:

    loader = PDFMinerLoader(file_path)
    document = loader.load()
    text = document[0].page_content

    docs_json_5 = text_extract_json_5.create_documents([text])
    docs_json_3 = text_extract_json_3.create_documents([text])

    # Guard against a PDF that yields no chunks, which would otherwise fail
    # on the [0] lookups below.
    if not docs_json_5 or not docs_json_3:
        print(f"Skipping {file_path}: no text chunks were produced")
        continue

    # Model output is free text and is not guaranteed to be valid JSON; one
    # bad answer should not abort the whole batch.
    try:
        question = docs_json_5[0].page_content
        json_5 = json.loads(llm_chain_json_5.run({'question': question, 'output5': output5}))

        question = docs_json_3[0].page_content
        json_3 = json.loads(llm_chain_json_3.run({'question': question, 'output3': output3}))
    except json.JSONDecodeError as err:
        print(f"Skipping {file_path}: model output is not valid JSON ({err})")
        continue

    # Valid JSON can still be a list or a scalar; only objects can be merged.
    if not isinstance(json_5, dict) or not isinstance(json_3, dict):
        print(f"Skipping {file_path}: model output is not a JSON object")
        continue

    print(json_5, json_3)

    # On a key clash the 5-field answer wins, as it is unpacked last.
    combined_json = {**json_3, **json_5}
    new_rows.append(combined_json)

# Append all collected rows to the existing DataFrame in a single concat
if new_rows:
    df = pd.concat([df, pd.DataFrame.from_records(new_rows)], ignore_index=True)

# Print the DataFrame
df

In [None]:
print(json_3)

In [None]:
df

--------------------

# Суммаризация PDF файлов

## Суммаризация текста и сохранение результатов в текстовый файл

In [None]:
# Folder that holds the PDF files to summarize.
input_directory = "/notebooks/files"

# All *.pdf files directly inside that folder.
pdf_files = glob.glob(os.path.join(input_directory, "*.pdf"))

# Generation settings that are identical for the three LLM clients below.
# Each client receives its own copy merged with its specific overrides.
_shared_generation_settings = {
    "do_sample": True,
    "top_p": 0.1,
    "typical_p": 1,
    "min_length": 32,
    "no_repeat_ngram_size": 0,
    "num_beams": 1,
    "penalty_alpha": 0,
    "length_penalty": 1,
    "early_stopping": False,
    "seed": -1,
    "add_bos_token": True,
    "truncation_length": 2048,
    "ban_eos_token": False,
    "skip_special_tokens": True,
}

# LLM client for the medium-size summary (med_sum): at most 300 new tokens.
llm_med_sum = build_text_generation_web_ui_client_llm(parameters={
    **_shared_generation_settings,
    "max_new_tokens": 300,
    "temperature": 0.7,
    "repetition_penalty": 1.18,
    "top_k": 40,
})

# LLM client for extracting the Article ID: at most 10 new tokens.
# NOTE(review): min_length (32) is larger than max_new_tokens (10) here —
# confirm this combination is intended.
llm_id = build_text_generation_web_ui_client_llm(parameters={
    **_shared_generation_settings,
    "max_new_tokens": 10,
    "temperature": 0.001,
    "repetition_penalty": 1.2,
    "top_k": 1,
})

# LLM client for extracting the focus audience: at most 30 new tokens.
# NOTE(review): min_length (32) is larger than max_new_tokens (30) here as
# well — confirm.
llm_focus = build_text_generation_web_ui_client_llm(parameters={
    **_shared_generation_settings,
    "max_new_tokens": 30,
    "temperature": 0.001,
    "repetition_penalty": 1.2,
    "top_k": 1,
})

# Splitter for summary generation: chunk_size=1500, chunk_overlap=100.
text_splitter_med_sum = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=1500)

# Splitter for Article ID extraction: chunk_size=20, no overlap.
text_splitter_id = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=20)

# Splitter for focus-audience extraction: chunk_size=300, no overlap.
text_splitter_focus = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=300)


# Earlier variant of the focus-audience prompt, kept for reference (disabled):
# template_focus = """
# Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
#
# ### Instruction:
# Extract a list of ecommerce entities, which mentioned separated by commas in the provided text, after the article headline. EXAMPLES: Acquirers, Processors, Issuers, Agents. In the response provide only focus audience separated by comma and nothing more.
#
# ### Input: {question}
#
# ### Response:
# """
#
# prompt_focus = PromptTemplate(template=template_focus, input_variables=["question"])

# Template asking for the article title and the listed entities in tabular form.
template_focus = """
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Extract from headline of an article the following information: 1. Article title 2. List of ecommerce entities, which mentioned separated by commas. Provide information in tabular form EXAMPLE: 
| **Article Headline** | **List of Ecommerce Entities** |
| Article title        | Acquirers, Processors, Issuers, Agents |

### Input: {question}

### Response:
"""

# Template asking for the value that follows the phrase 'Article ID:'.
template_id = """
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Find an alphanumeric combination, following after exact phrase:'Article ID:'. In the response provide only alphanumeric combination and nothing more.

### Input: {question}

### Response:
"""

# Both prompts take a single variable, {question}.
prompt_focus = PromptTemplate(input_variables=["question"], template=template_focus)
prompt_id = PromptTemplate(input_variables=["question"], template=template_id)

# Chain for generating the Article ID.
chain_id = LLMChain(llm=llm_id, prompt=prompt_id)

# Chain for generating the focus audience.
chain_focus = LLMChain(llm=llm_focus, prompt=prompt_focus)

# Chain of type "map_reduce" for generating the summary.
chain_med_sum = load_summarize_chain(llm_med_sum, verbose=True, chain_type="map_reduce")

# Summarize every PDF and store the summary next to it as a .txt file.
for file_path in pdf_files:
    # Load the PDF file; the text of the first returned document is used.
    loader = PDFMinerLoader(file_path)
    document = loader.load()
    text = document[0].page_content

    # 1. MEDIUM-SIZE SUMMARY GENERATION

    # Split the text into chunks with text_splitter_med_sum
    docs = text_splitter_med_sum.create_documents([text])
    # Send the chunks to the summarization chain
    docs_med_sum = chain_med_sum.run(docs)

    # Swap the file extension for .txt and save the result. The encoding is
    # explicit so that non-ASCII characters in the summary are written the
    # same way regardless of the platform's default encoding.
    output_file_path = os.path.splitext(file_path)[0] + ".txt"
    with open(output_file_path, "w", encoding="utf-8") as output_file:
        output_file.write(docs_med_sum)

    # Steps 2 and 3 are disabled. Their print statements are kept with them:
    # printing `article_id` / `focus` while the assignments are commented out
    # raised NameError on a fresh kernel.
    #
    # # 2. ARTICLE ID GENERATION
    # docs = text_splitter_id.create_documents([text])
    # question = docs[-1]
    # # Send to the chain that extracts the Article ID
    # article_id = chain_id.run(question)
    # print(f"Article ID: {article_id}")
    #
    # # 3. FOCUS AUDIENCE GENERATION
    # docs = text_splitter_focus.create_documents([text])
    # question = docs[0]
    # # Send to the chain that extracts the focus audience
    # focus = chain_focus.run(question)
    # print(f"Focus audience: {focus}")

    print(f"Обработан файл: {file_path}")


In [None]:
# Load a single PDF with langchain.document_loaders.PDFMinerLoader and keep
# the text of the first returned document.
file_path = '/notebooks/files/AI12944 - Updates to Fraud and Consumer Dispute Rules.pdf'
loader = PDFMinerLoader(file_path)
document = loader.load()
text = document[0].page_content

# Generation settings for this run: at most 200 new tokens, top_k=1.
summary_parameters = dict(
    max_new_tokens=200,
    do_sample=True,
    temperature=0.001,
    top_p=0.1,
    typical_p=1,
    repetition_penalty=1.2,
    top_k=1,
    min_length=32,
    no_repeat_ngram_size=0,
    num_beams=1,
    penalty_alpha=0,
    length_penalty=1,
    early_stopping=False,
    seed=-1,
    add_bos_token=True,
    truncation_length=2048,
    ban_eos_token=False,
    skip_special_tokens=True,
)
llm = build_text_generation_web_ui_client_llm(parameters=summary_parameters)

# Split the text with chunk_size=2500 and no overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=2500)
docs = text_splitter.create_documents([text])

# Run a "map_reduce" summarization chain over the chunks; as the last
# expression of the cell its result is what the notebook displays.
chain = load_summarize_chain(llm, verbose=True, chain_type="map_reduce")
chain.run(docs)

In [None]:
# Re-split the previously loaded `text` with chunk_size=1700 (no overlap)
# and print the first chunk for inspection.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=1700)
docs = text_splitter.create_documents([text])

content = docs[0].page_content
print(content)

In [None]:
# Split the previously loaded `text` with chunk_size=100 (no overlap) and
# keep the text of the last chunk as the model input.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=100)
docs = text_splitter.create_documents([text])

content = docs[-1].page_content

# Generation settings for the Article ID lookup: at most 10 new tokens,
# min_length=1, top_k=1.
article_id_parameters = dict(
    max_new_tokens=10,
    do_sample=True,
    temperature=0.001,
    top_p=0.1,
    typical_p=1,
    repetition_penalty=1.2,
    top_k=1,
    min_length=1,
    no_repeat_ngram_size=0,
    num_beams=1,
    penalty_alpha=0,
    length_penalty=1,
    early_stopping=False,
    seed=-1,
    add_bos_token=True,
    truncation_length=2048,
    ban_eos_token=False,
    skip_special_tokens=True,
)
llm = build_text_generation_web_ui_client_llm(parameters=article_id_parameters)

# Instruction-style prompt asking for the value that follows 'Article ID:'.
template = """
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Find an alphanumeric combination, following after exact phrase:'Article ID:'. In the response provide only alphanumeric combination and nothing more.

### Input: {question}


### Response:
"""
prompt = PromptTemplate(input_variables=["question"], template=template)

# Build the chain and run it on the last chunk; the result is the cell output.
llm_chain = LLMChain(llm=llm, prompt=prompt)
llm_chain.run(content)

In [None]:
# Generation settings for per-chunk concise summaries: at most 100 new tokens, top_k=1.
concise_summary_parameters = dict(
    max_new_tokens=100,
    do_sample=True,
    temperature=0.001,
    top_p=0.1,
    typical_p=1,
    repetition_penalty=1.2,
    top_k=1,
    min_length=32,
    no_repeat_ngram_size=0,
    num_beams=1,
    penalty_alpha=0,
    length_penalty=1,
    early_stopping=False,
    seed=-1,
    add_bos_token=True,
    truncation_length=2048,
    ban_eos_token=False,
    skip_special_tokens=True,
)
llm = build_text_generation_web_ui_client_llm(parameters=concise_summary_parameters)

# Prompt with a single placeholder, {question}, that receives the text to condense.
template = (
    "\n"
    "Write a concise summary of the following:\n"
    "\n"
    "{question}\n"
    "\n"
    "CONCISE SUMMARY:\n"
)
prompt = PromptTemplate(input_variables=["question"], template=template)

# Chain that fills the prompt and sends it to the LLM in one call.
llm_chain = LLMChain(llm=llm, prompt=prompt)

class Document:
    """Minimal container pairing a text payload with its metadata.

    NOTE(review): defining this class rebinds the name ``Document``, which
    the top of the file imports from ``langchain.docstore.document`` —
    confirm the shadowing is intended.
    """

    def __init__(self, page_content, metadata):
        # Store both constructor arguments unchanged as public attributes.
        self.page_content, self.metadata = page_content, metadata


def process_documents(docs, llm_chain):
    """Run ``llm_chain`` on the text of every document and collect the outputs.

    Args:
        docs: iterable of objects exposing a ``page_content`` attribute.
        llm_chain: object exposing ``run(text)``; called once per document,
            in iteration order.

    Returns:
        A list holding the value returned by ``llm_chain.run`` for each
        document, in the same order as ``docs``.
    """
    return [llm_chain.run(doc.page_content) for doc in docs]

# Summarize every chunk currently held in `docs` with the chain built above.
results = process_documents(docs, llm_chain)

# Merge the per-chunk summaries into one string, separated by single spaces,
# and show it.
combined_results = " ".join(results)
print(combined_results)



In [None]:
# Client whose responses are capped at 200 new tokens.
llm = build_text_generation_web_ui_client_llm(parameters={
    "max_new_tokens": 200,
    })

# Rebuild the chain so the run at the end of this cell actually uses the new
# client. Without this, `llm_chain` still wraps the LLM it was created with
# earlier and the `llm` defined above is never used. `prompt` is the prompt
# object built most recently (the concise-summary prompt when the cells are
# run in order).
llm_chain = LLMChain(prompt=prompt, llm=llm)

question = """

DISPUTE RESOLUTION:
Visa Business News 

Dispute Resolution 

20 April 2023 

This article originally appeared in the Visa Europe has introduced new allowable compelling evidence in cases where the cardholder disputes a transaction that is not eligible for dispute under the standard Visa Rules. This evidence must be provided by the issuer within 45 days of receipt of the dispute from the acquirer. If the issuer fails to provide this evidence within the required timeframe, the dispute will automatically be declined. The following are the types of compelling evidence that may be submitted Effective for pre-arbitration attempts processed on or after 15 October 2023, only the items in Table 11.6—Allowable Compelling Evidence in the Visa Rules will qualify as compelling evidence for dispute conditions 10.1, 10.2, 10.3 and 10.4. In addition, for delayed transactions, the acquirer may provide evidence that the transaction relates Effective for pre-arbitration attempts processed on or after 14 October 2023, a duplicate processing dispute will 
not be allowed when the acquirer has not provided the merchant name and address where the goods or services were 
purchased. The acquirer must also provide the date the goods or services were purchased, the amount charged, and 
the method of payment used for the original purchase.  

Dispute Condition 
- Effective for disputes processed on or after 14 October 2023, an issuer will be allowed to dispute a single transaction that was processed more than once only if the transaction was processed with the same payment credential on the same transaction date and for the same transaction amount.

- Effective for disputes processed on or after 14 October 2023, the dispute rights rule will be updated to clarify that the cardholder (or 
For disputes related to MCC code 4722, the issuer must wait 30 calendar days from the date the merchant cancelled the service prior to processing a dispute.

Effective for disputes processed on or after 14 October 2023, the issuer will not be required to wait 30 calendar days for a dispute related to non-receipt of travel services from a provider that is insolvent or bankrupt, 
The rule update will require issuers and acquirers to conduct a thorough investigation before processing a dispute to ensure that the dispute is valid according to the Visa rules. The minimum dispute amount for fuel dispenser transactions will also be removed. The rule update will clarify the use of imprints in disputes and will remove obsolete language related to unattended transactions and Visa Easy Payment Service.  


"""
summary = llm_chain.run(question)
print(summary)

In [None]:

# Send a short free-form question through the current `llm_chain`.
question = "What is the meaning of life?"
llm_chain.run(question)
# IPython shell escape (not plain Python): fetch the sample text that the
# code below reads as ./state_of_the_union.txt.
!wget https://raw.githubusercontent.com/hwchase17/langchainjs/main/examples/state_of_the_union.txt
def search_context(src, phrase, buffer=100):
    """Return the words surrounding the first occurrence of *phrase* in a file.

    The file is split on whitespace and *phrase* is matched against whole
    whitespace-delimited tokens (case-sensitive, punctuation included). A
    phrase of several words matches the first place where those words occur
    consecutively.

    Args:
        src: Path of the text file to search; read as UTF-8.
        phrase: Word, or whitespace-separated sequence of words, to look for.
        buffer: Number of words to keep before and after the match.

    Returns:
        The matched words plus up to ``buffer`` words on each side, joined
        with single spaces.

    Raises:
        ValueError: If *phrase* contains no words or does not occur in the file.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(src, 'r', encoding='utf-8') as f:
        words = f.read().split()

    target = phrase.split()
    if not target:
        raise ValueError("phrase must contain at least one word")

    span = len(target)
    # First position where the token sequence matches; for a single word this
    # is the same position list.index() would return.
    index = next(
        (i for i in range(len(words) - span + 1) if words[i:i + span] == target),
        None,
    )
    if index is None:
        raise ValueError(f"{phrase!r} not found in {src}")

    start_index = max(0, index - buffer)
    end_index = min(len(words), index + span + buffer)
    return ' '.join(words[start_index:end_index])

# Save the words around "Ketanji" from the downloaded speech to a small file.
fragment = './fragment.txt'

# Build the excerpt before opening the output file: opening with 'w'
# truncates it immediately, so a failure inside search_context (for example
# the word not being found) would otherwise leave an empty fragment behind.
_txt = search_context('./state_of_the_union.txt', "Ketanji")
with open(fragment, 'w') as fo:
    fo.write(_txt)

!cat $fragment
# TextLoader is used below but was never imported in the visible part of this
# file, which raised NameError; it lives in the same package the file already
# imports PDFMinerLoader from.
from langchain.document_loaders import TextLoader
from langchain.embeddings import LlamaCppEmbeddings

# NOTE(review): GPT4ALL_MODEL_PATH is not defined anywhere in the visible
# part of this file — it must be set (path to the local model file) before
# this line runs; confirm where it is supposed to come from.
llama_embeddings = LlamaCppEmbeddings(model_path=GPT4ALL_MODEL_PATH)

# Loader for the excerpt file written above.
loader = TextLoader('./fragment.txt')

