In [None]:
import os
import glob
import pandas as pd
import json
from langchain.memory import ConversationTokenBufferMemory
from langchain.agents.tools import Tool
from langchain.llms.base import LLM
from langchain import PromptTemplate, LLMChain
from langchain.agents import load_tools, initialize_agent, AgentExecutor, BaseSingleActionAgent, AgentType
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import PDFMinerLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
# Switch the working directory to the project checkout before importing the
# project-local `langchain_app` package. Presumably the import below relies on
# the current directory being on the module search path (TODO confirm).
# NOTE(review): this also changes how every relative path later in the
# notebook is resolved.
os.chdir("/notebooks/learn-langchain")
from langchain_app.models.text_generation_web_ui import (
    build_text_generation_web_ui_client_llm,
)
from langchain.chains.mapreduce import MapReduceChain
from langchain.prompts import PromptTemplate
from langchain.docstore.document import Document
from langchain.callbacks import AimCallbackHandler, StdOutCallbackHandler

# Testing different parameters of the model for text summarization

In [None]:
# Folder that holds the PDF files to summarize
input_directory = "/notebooks/files/test/"

# Collect the paths of every PDF file found directly in that folder
pdf_files = glob.glob(os.path.join(input_directory, "*.pdf"))

# LLM client used for the Medium Size Summary (med_sum), configured with
# max_new_tokens=200
llm_med_sum = build_text_generation_web_ui_client_llm(
    parameters=dict(
        max_new_tokens=200,
        do_sample=True,
        temperature=0.1,
        top_p=0.4,
        typical_p=1,
        repetition_penalty=1.18,
        top_k=40,
        min_length=0,
        no_repeat_ngram_size=0,
        num_beams=1,
        penalty_alpha=0,
        length_penalty=1,
        early_stopping=False,
        seed=-1,
        add_bos_token=True,
        truncation_length=2048,
        ban_eos_token=False,
        skip_special_tokens=True,
    )
)

# Text splitter for summary generation: chunk_size=1500, chunk_overlap=100
text_splitter_med_sum = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)

# Map-reduce summarization chain (verbose output enabled)
chain_med_sum = load_summarize_chain(
    llm_med_sum,
    chain_type="map_reduce",
    verbose=True,
)

# Collected summaries, one entry per processed PDF
results = []

for file_path in pdf_files:
    # Take the text of the first document returned by the PDF loader
    text = PDFMinerLoader(file_path).load()[0].page_content

    # 1. GENERATE MEDIUM SIZE SUMMARY
    # Split the text into chunks, summarize them, and keep the result
    docs = text_splitter_med_sum.create_documents([text])
    docs_med_sum = chain_med_sum.run(docs)
    results.append(docs_med_sum)

# Print every collected summary
for summary in results:
    print(summary)

# Testing different parameters of the model for extracting JSON

In [None]:
# LLM client used for JSON extraction, configured with max_new_tokens=100
llm_extract_json = build_text_generation_web_ui_client_llm(
    parameters=dict(
        max_new_tokens=100,
        do_sample=True,
        temperature=0.1,
        top_p=0.4,
        typical_p=1,
        repetition_penalty=1.2,
        top_k=40,
        min_length=0,
        no_repeat_ngram_size=0,
        num_beams=1,
        penalty_alpha=0,
        length_penalty=1,
        early_stopping=False,
        seed=-1,
        add_bos_token=True,
        truncation_length=2048,
        ban_eos_token=False,
        skip_special_tokens=True,
    )
)

# LLM client used for the Medium Size Summary (med_sum), configured with
# max_new_tokens=300 and min_length=32
llm_med_sum = build_text_generation_web_ui_client_llm(
    parameters=dict(
        max_new_tokens=300,
        do_sample=True,
        temperature=0.1,
        top_p=0.1,
        typical_p=1,
        repetition_penalty=1.18,
        top_k=40,
        min_length=32,
        no_repeat_ngram_size=0,
        num_beams=1,
        penalty_alpha=0,
        length_penalty=1,
        early_stopping=False,
        seed=-1,
        add_bos_token=True,
        truncation_length=2048,
        ban_eos_token=False,
        skip_special_tokens=True,
    )
)

# Text splitter for the 5-field JSON extraction: chunk_size=400, no overlap
text_extract_json_5 = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=0)

# Text splitter for the 3-field JSON extraction: chunk_size=3000, no overlap
text_extract_json_3 = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=0)

# Text splitter for summary generation: chunk_size=1500, chunk_overlap=100
text_splitter_med_sum = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)

# Map-reduce summarization chain built on the summary LLM
chain_med_sum = load_summarize_chain(
    llm_med_sum,
    chain_type="map_reduce",
)

# Example outputs injected into the prompts through the `output3` / `output5`
# template variables. Presumably they are passed as variables rather than
# written inline because their literal braces would otherwise be read as
# template placeholders (TODO confirm).
output3 = """
{"effective_date": "yyyy-mm-dd", "mark_your_calendar_date": "yyyy-mm-dd", "article_id": ""}
"""
output5 = """
{"category": "Visa Rules", "title": "Introduction of Visa Rules Waiver Extension Assessments", "geo": "AP, CEMEA, Europe, LAC", "audience": "Acquirers, Issuers, Processors, Agents", "publication_date": "2023-04-13"}
"""

# Prompt template for extracting 3 fields (effective date, mark-your-calendar
# date, article ID) as JSON
template_json_3 = """
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction: 
Extract the following information from the text in input.
1. Effective date
2. Mark your calendar date
3. Article ID

Format instructions: JSON
If there is no information in the text: empty string
The output looks like this:
{output3}

### Input: 
{question}

### Response:
"""

# Prompt template for extracting 5 fields (category, title, GEO, audience,
# publication date) as JSON; it embeds one worked input/response example
template_json_5 = """
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction: 
Extract the following information from the text in input.
1. Category
2. Title
3. GEO
4. Audience
5. Publication date

Format instructions: JSON
If there is no information in the text: empty string
# Input example: 
Visa Business News
Visa Rules 13 April 2023 Introduction of Visa Rules Waiver Extension Assessments
AP, CEMEA, Europe, LAC | Acquirers, Issuers, Processors, Agents Visa, Interlink Networks; V PAY
Overview: Effective 14 October 2023, Visa will introduce waiver extension assessments for approved Visa Rules waiver extensions.
The Visa Rules are designed to minimize risks and provide a common, convenient
# Response example:
{output5}

### Input: 
{question}

### Response:
"""

# Prompt objects: each takes the input text (`question`) and its example output
prompt_json_3 = PromptTemplate(
    template=template_json_3,
    input_variables=['question', 'output3'],
)
prompt_json_5 = PromptTemplate(
    template=template_json_5,
    input_variables=['question', 'output5'],
)

# Chains that run the JSON-extraction LLM with the prompts above
llm_chain_json_3 = LLMChain(llm=llm_extract_json, prompt=prompt_json_3)
llm_chain_json_5 = LLMChain(llm=llm_extract_json, prompt=prompt_json_5)

# Columns expected in the result table, in display order
result_columns = [
    "effective_date",
    "mark_your_calendar_date",
    "article_id",
    "category",
    "title",
    "geo",
    "audience",
    "publication_date",
    "medium_summary",
]

# Folder that holds the PDF files to process
input_directory = "/notebooks/files/test/"

# Collect the paths of every PDF file found directly in that folder
pdf_files = glob.glob(os.path.join(input_directory, "*.pdf"))

# One record (dict) per processed PDF. The DataFrame is built once after the
# loop instead of being re-concatenated on every iteration.
records = []

for file_path in pdf_files:
    # Load the PDF and take the text of the first returned document
    loader = PDFMinerLoader(file_path)
    document = loader.load()
    text = document[0].page_content

    # Split THIS file's text for the two JSON extractions. Doing the split
    # inside the loop matters: splitting once before the loop reused whatever
    # `text` a previously executed cell left behind, so every file received the
    # same extracted fields (and a fresh kernel failed with NameError).
    docs_json_5 = text_extract_json_5.create_documents([text])
    docs_json_3 = text_extract_json_3.create_documents([text])

    # Only the first chunk of each split is sent to the extraction chains.
    # json.loads raises if the model's reply is not valid JSON.
    question = docs_json_5[0].page_content
    json_5 = json.loads(llm_chain_json_5.run({'question': question, 'output5': output5}))

    question = docs_json_3[0].page_content
    json_3 = json.loads(llm_chain_json_3.run({'question': question, 'output3': output3}))

    # Merge both extractions; on a key clash the 5-field values win
    combined_json = {**json_3, **json_5}

    # Split the text into chunks for summarization and generate the summary
    docs = text_splitter_med_sum.create_documents([text])
    docs_med_sum = chain_med_sum.run(docs)

    records.append({**combined_json, "medium_summary": docs_med_sum})

# Build the table in one go: expected columns first (missing ones become NaN),
# followed by any unexpected keys the model returned.
df = pd.DataFrame.from_records(records)
extra_columns = [column for column in df.columns if column not in result_columns]
df = df.reindex(columns=result_columns + extra_columns)

# Display the DataFrame
df

--------------------

# Суммаризация PDF файлов

## Суммаризация текста и сохранение результатов в текстовый файл

In [None]:
# Folder that holds the PDF files to summarize
input_directory = "/notebooks/files"

# Collect the paths of every PDF file found directly in that folder
pdf_files = glob.glob(os.path.join(input_directory, "*.pdf"))

# LLM client used for the Medium Size Summary (med_sum), configured with
# max_new_tokens=300 and min_length=32
llm_med_sum = build_text_generation_web_ui_client_llm(parameters={
    "max_new_tokens": 300,
    "do_sample": True,
    "temperature": 0.1,
    "top_p": 0.1,
    "typical_p": 1,
    "repetition_penalty": 1.18,
    "top_k": 40,
    "min_length": 32,
    "no_repeat_ngram_size": 0,
    "num_beams": 1,
    "penalty_alpha": 0,
    "length_penalty": 1,
    "early_stopping": False,
    "seed": -1,
    "add_bos_token": True,
    "truncation_length": 2048,
    "ban_eos_token": False,
    "skip_special_tokens": True,
})

# Text splitter for summary generation: chunk_size=1500, chunk_overlap=100
text_splitter_med_sum = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=100
)

# Map-reduce summarization chain
chain_med_sum = load_summarize_chain(llm_med_sum, chain_type="map_reduce")


for file_path in pdf_files:
    # Load the PDF and take the text of the first returned document
    loader = PDFMinerLoader(file_path)
    document = loader.load()
    text = document[0].page_content

    # 1. GENERATE MEDIUM SIZE SUMMARY

    # Split the text into chunks
    docs = text_splitter_med_sum.create_documents([text])
    # Send the chunks through the chain to generate the summary
    docs_med_sum = chain_med_sum.run(docs)

    # Save the summary next to the PDF, with the extension replaced by .txt.
    # The encoding is explicit so that non-ASCII characters in the summary are
    # written the same way regardless of the platform's default locale.
    output_file_path = os.path.splitext(file_path)[0] + ".txt"
    with open(output_file_path, "w", encoding="utf-8") as output_file:
        output_file.write(docs_med_sum)

    print(f"Обработан файл: {file_path}")


In [None]:
# Load one PDF with langchain.document_loaders.PDFMinerLoader and take the text
# of the first returned document
file_path = '/notebooks/files/AI12944 - Updates to Fraud and Consumer Dispute Rules.pdf'
text = PDFMinerLoader(file_path).load()[0].page_content

# LLM client for this experiment: max_new_tokens=200, temperature=0.001, top_k=1
llm = build_text_generation_web_ui_client_llm(
    parameters=dict(
        max_new_tokens=200,
        do_sample=True,
        temperature=0.001,
        top_p=0.1,
        typical_p=1,
        repetition_penalty=1.2,
        top_k=1,
        min_length=32,
        no_repeat_ngram_size=0,
        num_beams=1,
        penalty_alpha=0,
        length_penalty=1,
        early_stopping=False,
        seed=-1,
        add_bos_token=True,
        truncation_length=2048,
        ban_eos_token=False,
        skip_special_tokens=True,
    )
)

# Split the text with chunk_size=2500 and no overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=0)
docs = text_splitter.create_documents([text])

# Run the map-reduce summarization chain (verbose output enabled); the summary
# is the cell's displayed result
chain = load_summarize_chain(
    llm,
    chain_type="map_reduce",
    verbose=True,
)
chain.run(docs)

In [None]:
# Re-split the previously loaded `text` with chunk_size=1700 and no overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1700, chunk_overlap=0)
docs = text_splitter.create_documents([text])

# Show the content of the first chunk
first_chunk = docs[0]
content = first_chunk.page_content
print(content)