In [None]:
import os
import glob
import pandas as pd
import json
import re
import wandb
from datetime import datetime
from wandb.integration.langchain import WandbTracer
from langchain.memory import ConversationTokenBufferMemory
from langchain.agents.tools import Tool
from langchain.llms.base import LLM
from langchain import PromptTemplate, LLMChain
from langchain.agents import load_tools, initialize_agent, AgentExecutor, BaseSingleActionAgent, AgentType
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import PDFMinerLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
os.chdir("/notebooks/learn-langchain")
from langchain_app.models.text_generation_web_ui import (
    build_text_generation_web_ui_client_llm,
)
from langchain.chains.mapreduce import MapReduceChain
from langchain.prompts import PromptTemplate
from langchain.docstore.document import Document
from langchain.callbacks import WandbCallbackHandler, StdOutCallbackHandler

# The W&B API key must come from the environment (export WANDB_API_KEY before
# starting Jupyter, or run `wandb login` once) -- never from the notebook itself.
# NOTE: the key that used to be hardcoded here is exposed in the file history
# and should be revoked and rotated.
if not os.environ.get("WANDB_API_KEY"):
    print("WANDB_API_KEY is not set: export it or run `wandb login` before tracing to W&B.")

# Extract JSON, summarise the article, and save the results to a CSV file

In [None]:
# Timestamp label for this session, formatted as month.day.year_hour.minute.second
session_group = datetime.now().strftime("%m.%d.%Y_%H.%M.%S")

# W&B tracer configured for the "wandb_prompts_quickstart" project
wandb_callback = WandbTracer({"project": "wandb_prompts_quickstart"})

# Callback list: stdout handler first, then the W&B tracer
callbacks = [StdOutCallbackHandler(), wandb_callback]

In [None]:
# Define a function to clean and load JSON
def clean_and_load_json(json_string):
    """Extract and parse the JSON contained in a (possibly noisy) string.

    The text between the first ``{`` and the last ``}`` is parsed first; when
    there is no such span the whole string is parsed as-is. If that span is
    not valid JSON (for example because the model kept generating text that
    contains braces after the object), the first complete JSON object at the
    start of the span is returned instead.

    Args:
        json_string: Text that contains a JSON value, optionally surrounded
            by other text such as a Markdown code fence.

    Returns:
        The decoded Python value.

    Raises:
        ValueError: If no JSON value could be decoded. The message contains
            the candidate string that failed to parse.
    """
    match = re.search(r'{.*}', json_string, re.DOTALL)
    cleaned_json_string = match.group(0) if match else json_string
    try:
        return json.loads(cleaned_json_string)
    except json.JSONDecodeError as e:
        if match:
            # The greedy span can swallow trailing text that merely contains a
            # closing brace; fall back to the first complete object.
            try:
                first_object, _ = json.JSONDecoder().raw_decode(cleaned_json_string)
                return first_object
            except json.JSONDecodeError:
                pass
        raise ValueError(f"Couldn't decode the cleaned string as JSON: {cleaned_json_string}") from e

# Sample input for a quick check of clean_and_load_json: a JSON object
# wrapped in a Markdown ```json code fence
json_string = '''```json
{
   "category":"Visa Business News",
   "title":"Acceptance",
   "geo":"Germany",
   "audience":"Sales",
   "publication_date":"2023-05-04T00:00:00Z"
}
```'''
# Parse the sample and print the resulting object
json_object = clean_and_load_json(json_string)
print(json_object)

In [None]:
# LLM client used by the JSON-extraction chains (max_new_tokens=100)
llm_extract_json = build_text_generation_web_ui_client_llm(
    parameters={
        "max_new_tokens": 100,
        "do_sample": True,
        "temperature": 0.01,
        "top_p": 0.4,
        "typical_p": 1,
        "repetition_penalty": 1.2,
        "top_k": 40,
        "min_length": 0,
        "no_repeat_ngram_size": 0,
        "num_beams": 1,
        "penalty_alpha": 0,
        "length_penalty": 1,
        "early_stopping": False,
        "seed": -1,
        "add_bos_token": True,
        "truncation_length": 2048,
        "ban_eos_token": False,
        "skip_special_tokens": True,
    }
)

# LLM client used for the medium-size summary (med_sum); max_new_tokens=300
llm_med_sum = build_text_generation_web_ui_client_llm(
    parameters={
        "max_new_tokens": 300,
        "do_sample": True,
        "temperature": 0.01,
        "top_p": 0.1,
        "typical_p": 1,
        "repetition_penalty": 1.18,
        "top_k": 40,
        "min_length": 32,
        "no_repeat_ngram_size": 0,
        "num_beams": 1,
        "penalty_alpha": 0,
        "length_penalty": 1,
        "early_stopping": False,
        "seed": -1,
        "add_bos_token": True,
        "truncation_length": 2048,
        "ban_eos_token": False,
        "skip_special_tokens": True,
    }
)

# Splitter for the 5-field JSON extraction (chunk_size=400, no overlap)
text_extract_json_5 = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=0)

# Splitter for the 3-field JSON extraction (chunk_size=3000, no overlap)
text_extract_json_3 = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=0)

# Splitter for the summary input (chunk_size=1500, chunk_overlap=100)
text_splitter_med_sum = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)

# Map-reduce summarization chain driven by the summary LLM
chain_med_sum = load_summarize_chain(llm_med_sum, chain_type="map_reduce")

# Location of the CSV file that accumulates the extracted data
csv_filepath = '/notebooks/files/extracted_data.csv'

# Continue from the existing CSV when there is one; otherwise start from an
# empty DataFrame that already carries the expected columns
df = (
    pd.read_csv(csv_filepath)
    if os.path.exists(csv_filepath)
    else pd.DataFrame(columns=[
        "effective-date",
        "mark-your-calendar-date",
        "article-id",
        "category",
        "title",
        "geo",
        "audience",
        "publication_date",
        "medium_summary",
    ])
)

# Folder that holds the PDF files to process
input_directory = "/notebooks/files/"

# Paths of every "*.pdf" file found directly in that folder
pdf_files = glob.glob(os.path.join(input_directory, "*.pdf"))

# Replace characters that are unsafe in file names
def sanitize_filename(filename):
    r"""Return ``filename`` with each of the characters \ / * ? : " < > |
    replaced by an underscore."""
    forbidden_chars = re.compile(r'[\\/*?:"<>|]')
    return forbidden_chars.sub("_", filename)

# Define a function to clean and load JSON
def clean_and_load_json(json_string):
    """Extract and parse the JSON contained in a (possibly noisy) string.

    The text between the first ``{`` and the last ``}`` is parsed first; when
    there is no such span the whole string is parsed as-is. If that span is
    not valid JSON (for example because the model kept generating text that
    contains braces after the object), the first complete JSON object at the
    start of the span is returned instead.

    Args:
        json_string: Text that contains a JSON value, optionally surrounded
            by other text such as a Markdown code fence.

    Returns:
        The decoded Python value.

    Raises:
        ValueError: If no JSON value could be decoded. The message contains
            the candidate string that failed to parse.
    """
    match = re.search(r'{.*}', json_string, re.DOTALL)
    cleaned_json_string = match.group(0) if match else json_string
    try:
        return json.loads(cleaned_json_string)
    except json.JSONDecodeError as e:
        if match:
            # The greedy span can swallow trailing text that merely contains a
            # closing brace; fall back to the first complete object.
            try:
                first_object, _ = json.JSONDecoder().raw_decode(cleaned_json_string)
                return first_object
            except json.JSONDecodeError:
                pass
        raise ValueError(f"Couldn't decode the cleaned string as JSON: {cleaned_json_string}") from e


# ---------------------------------------------------------------------------
# Templates, prompts and chains do not depend on the file being processed,
# so they are built once, before the loop.
# The leading spaces inside the template strings are part of the prompt text
# and are kept exactly as they were.
# ---------------------------------------------------------------------------

# Template for the prompt that extracts a JSON with 3 fields
template_json_3 = """
    Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

    ### Instruction: 
    Extract the following information from the text in input:
    1. Article ID
    2. Effective date
    3. Mark your calendar date

    You must format your output as a JSON value that adheres to a given "JSON Schema" instance.
    "JSON Schema" is a declarative language that allows you to annotate and validate JSON documents.
    As an example, for the schema {output3_1} the object {output3_2} is a well-formatted instance of the schema.
    The object {output3_3} is not well-formatted.
    Here is the output schema: {output3_4}
    Your output will be parsed and type-checked according to the provided schema instance, so make sure all fields in your output match exactly!

    ### Input: 
    {question}

    ### Response:
    JSON:
    """

# Template for the prompt that extracts a JSON with 5 fields
template_json_5 = """
    Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

    ### Instruction: 
    Extract the following information from the text in input.
    1. Category
    2. Title
    3. GEO
    4. Audience
    5. Publication date

    You must format your output as a JSON value that adheres to a given "JSON Schema" instance.
    "JSON Schema" is a declarative language that allows you to annotate and validate JSON documents.
    As an example, for the schema {output5_1} the object {output5_2} is a well-formatted instance of the schema.
    The object {output5_3} is not well-formatted.
    Here is the output schema: {output5_4}
    Your output will be parsed and type-checked according to the provided schema instance, so make sure all fields in your output match exactly!

    ### Input: 
    {question}

    ### Response:
    JSON:
    """

# Template for the prompt that asks the model to repair the 5-field JSON
template_eval_json_5 = """
    Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

    ### Instruction: 
    Evaluate the JSON in the Input below.
    Remove unnecessary characters or add missing characters to make the JSON valid.

    ### Input: 
    {question}

    ### Response:
    JSON:
    """

# Sub-templates substituted into the prompts above.
# NOTE(review): output3_4 and output5_4 are not valid JSON themselves (they
# nest objects without keys). They are kept verbatim because changing the
# prompt text changes what the model returns -- confirm before correcting.
output3_1 = '{"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}}'
output3_2 = '{"foo": ["bar", "baz"]}'
output3_3 = '{"properties": {"foo": ["bar", "baz"]}}'
output3_4 = '{"type" : "object", "properties" : {"article-id" : {"type" : "string"}, {"effective-date" : {"type" : "date"}, {"mark-your-calendar-date" : {"type" : "date"} } } } }'
output5_1 = '{"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}}'
output5_2 = '{"foo": ["bar", "baz"]}'
output5_3 = '{"properties": {"foo": ["bar", "baz"]}}'
output5_4 = '{"type" : "object", "properties" : {"category" : {"type" : "string"}, {"title" : {"type" : "string"}, {"geo" : {"type" : "string"}, {"audience" : {"type" : "string"}, {"publication_date" : {"type" : "date"} } } } } } }'

# Prompts
prompt_json_5 = PromptTemplate(
    template=template_json_5,
    input_variables=['question', 'output5_1', 'output5_2', 'output5_3', 'output5_4'],
)
prompt_json_3 = PromptTemplate(
    template=template_json_3,
    input_variables=['question', 'output3_1', 'output3_2', 'output3_3', 'output3_4'],
)
prompt_eval_json_5 = PromptTemplate(template=template_eval_json_5, input_variables=['question'])

# Chains for generating (and repairing) the JSON extracts
llm_chain_json_5 = LLMChain(llm=llm_extract_json, prompt=prompt_json_5)
llm_chain_json_3 = LLMChain(llm=llm_extract_json, prompt=prompt_json_3)
llm_chain_eval_json_5 = LLMChain(llm=llm_extract_json, prompt=prompt_eval_json_5)


def rename_file(df, file_path, fields):
    """Rename ``file_path`` after values taken from the last row of ``df``.

    The new name is the sanitized, non-missing values of ``fields`` joined
    with dashes, followed by the ``._pdf`` suffix. When none of the fields
    has a value, the original base name is used. The file stays in its
    directory. If the target name is already taken, a numeric suffix is
    appended so that an existing file is never overwritten.

    Args:
        df: DataFrame whose last row describes the file.
        file_path: Path of the file to rename.
        fields: Column names whose values make up the new file name.
    """
    # Get the last row of the DataFrame
    row = df.iloc[-1]

    # Keep the sanitized value of every field that is present and not NaN
    file_name_components = [
        sanitize_filename(str(row[field]))
        for field in fields
        if field in row.index and not pd.isna(row[field])
    ]

    # If there are no valid file name components, use the original file name
    if not file_name_components:
        file_name_components.append(os.path.splitext(os.path.basename(file_path))[0])

    # NOTE(review): the '._pdf' suffix keeps the renamed file out of the
    # "*.pdf" glob, presumably so processed files are not picked up again.
    stem = '-'.join(file_name_components)
    directory = os.path.dirname(file_path)
    new_path = os.path.join(directory, stem + '._pdf')

    # Never overwrite a file that already has the target name
    counter = 1
    while os.path.exists(new_path):
        new_path = os.path.join(directory, f"{stem}-{counter}._pdf")
        counter += 1

    os.rename(file_path, new_path)


# Process every PDF file from the list
for file_path in pdf_files:

    # Load the PDF and take the text of the first document the loader returns
    loader = PDFMinerLoader(file_path)
    document = loader.load()
    text = document[0].page_content

    # Split the text for the 5-field and the 3-field JSON extraction
    docs_json_5 = text_extract_json_5.create_documents([text])
    docs_json_3 = text_extract_json_3.create_documents([text])

    # Without any chunk there is nothing to send to the model; indexing [0]
    # below would raise IndexError, so skip the file instead
    if not docs_json_5 or not docs_json_3:
        print(f"Skipping {file_path}: no text could be extracted")
        continue

    # Run the chain for the 5-field extract on the first chunk, then ask the
    # model to repair the returned JSON
    question = docs_json_5[0].page_content
    json_5 = llm_chain_json_5.run({'question': question,
                                   'output5_1': output5_1,
                                   'output5_2': output5_2,
                                   'output5_3': output5_3,
                                   'output5_4': output5_4
                                   })
    json_5 = llm_chain_eval_json_5.run({'question': json_5})

    # Run the chain for the 3-field extract on the first chunk
    question = docs_json_3[0].page_content
    json_3 = llm_chain_json_3.run({'question': question,
                                   'output3_1': output3_1,
                                   'output3_2': output3_2,
                                   'output3_3': output3_3,
                                   'output3_4': output3_4
                                   })

    # Print the file path
    print(file_path)

    # Print the raw results
    print(json_5, json_3)

    # Parse with the tolerant helper: the raw model output may be wrapped in
    # a code fence or surrounded by extra text, which json.loads rejects
    json_3 = clean_and_load_json(json_3)
    json_5 = clean_and_load_json(json_5)

    # Combine two JSONs into one (5-field values win on duplicate keys)
    combined_json = {**json_3, **json_5}

    # Split text into chunks for summary generation
    docs = text_splitter_med_sum.create_documents([text])

    # Generate the summary; a missing summary becomes an empty string
    med_sum = chain_med_sum.run(docs)
    med_sum = '' if med_sum is None else str(med_sum)

    # Create a new row for the DataFrame
    new_row = pd.DataFrame.from_records([combined_json])
    new_row['medium_summary'] = [med_sum]

    # Add the new row to the existing DataFrame
    df = pd.concat([df, new_row], ignore_index=True)

    # Save after every file so finished work survives a later failure
    df.to_csv(csv_filepath, index=False)

    # Rename the file at the end of processing each file
    rename_file(df, file_path, ['article-id', 'title'])

    # Print the new row of the DataFrame
    print(new_row)



--------------------

# Testing different parameters of the model for text summarization

In [None]:
# Folder that holds the PDF files used for this experiment
input_directory = "/notebooks/files/test/"

# Paths of every "*.pdf" file found directly in that folder
pdf_files = glob.glob(os.path.join(input_directory, "*.pdf"))

# LLM client for the medium-size summary (med_sum); max_new_tokens=200
llm_med_sum = build_text_generation_web_ui_client_llm(
    parameters={
        "max_new_tokens": 200,
        "do_sample": True,
        "temperature": 0.1,
        "top_p": 0.4,
        "typical_p": 1,
        "repetition_penalty": 1.18,
        "top_k": 40,
        "min_length": 0,
        "no_repeat_ngram_size": 0,
        "num_beams": 1,
        "penalty_alpha": 0,
        "length_penalty": 1,
        "early_stopping": False,
        "seed": -1,
        "add_bos_token": True,
        "truncation_length": 2048,
        "ban_eos_token": False,
        "skip_special_tokens": True,
    }
)

# Splitter for the summary input (chunk_size=1500, chunk_overlap=100)
text_splitter_med_sum = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)

# Map-reduce summarization chain with verbose output enabled
chain_med_sum = load_summarize_chain(llm_med_sum, chain_type="map_reduce", verbose=True)

# Collect one summary per PDF file
results = []

for file_path in pdf_files:
    # Load the PDF file and take the text of the first document the loader returns
    loader = PDFMinerLoader(file_path)
    document = loader.load()
    text = document[0].page_content

    #  1. GENERATE MEDIUM SIZE SUMMARY

    # Split the text into chunks with the summary splitter configured above
    docs = text_splitter_med_sum.create_documents([text])

    # Send the chunks through the summarization chain
    docs_med_sum = chain_med_sum.run(docs)

    # Save the summary to the list
    results.append(docs_med_sum)

# Print all summaries, one per file
for result in results:
    print(result)

# Summarizing PDF files

## Summarizing the text and saving the results to a text file

In [None]:
# Folder that holds the PDF files
input_directory = "/notebooks/files"

# Paths of every "*.pdf" file found directly in that folder
pdf_files = glob.glob(os.path.join(input_directory, "*.pdf"))

# LLM client for the medium-size summary (med_sum); max_new_tokens=300
llm_med_sum = build_text_generation_web_ui_client_llm(
    parameters={
        "max_new_tokens": 300,
        "do_sample": True,
        "temperature": 0.1,
        "top_p": 0.1,
        "typical_p": 1,
        "repetition_penalty": 1.18,
        "top_k": 40,
        "min_length": 32,
        "no_repeat_ngram_size": 0,
        "num_beams": 1,
        "penalty_alpha": 0,
        "length_penalty": 1,
        "early_stopping": False,
        "seed": -1,
        "add_bos_token": True,
        "truncation_length": 2048,
        "ban_eos_token": False,
        "skip_special_tokens": True,
    }
)

# Splitter for the summary input (chunk_size=1500, chunk_overlap=100)
text_splitter_med_sum = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)

# Map-reduce summarization chain
chain_med_sum = load_summarize_chain(llm_med_sum, chain_type="map_reduce")


for file_path in pdf_files:
    # Load the PDF file and take the text of the first document the loader returns
    loader = PDFMinerLoader(file_path)
    document = loader.load()
    text = document[0].page_content

    # 1. GENERATE THE MEDIUM-SIZE SUMMARY

    # Split the text into chunks with the summary splitter
    docs = text_splitter_med_sum.create_documents([text])
    # Send the chunks through the summarization chain
    docs_med_sum = chain_med_sum.run(docs)

    # Replace the file extension with .txt and save the summary next to the PDF.
    # UTF-8 is set explicitly so non-ASCII text in the summary is written the
    # same way on every platform instead of depending on the locale default.
    output_file_path = os.path.splitext(file_path)[0] + ".txt"
    with open(output_file_path, "w", encoding="utf-8") as output_file:
        output_file.write(docs_med_sum)

    print(f"Обработан файл: {file_path}")


In [None]:
# Load a single PDF document as text with langchain.document_loaders.PDFMinerLoader
file_path = '/notebooks/files/AI12944 - Updates to Fraud and Consumer Dispute Rules.pdf'
loader = PDFMinerLoader(file_path)
document = loader.load()

# Text of the first document the loader returns
text = document[0].page_content

# LLM client for this experiment (max_new_tokens=200, top_k=1)
llm = build_text_generation_web_ui_client_llm(
    parameters={
        "max_new_tokens": 200,
        "do_sample": True,
        "temperature": 0.001,
        "top_p": 0.1,
        "typical_p": 1,
        "repetition_penalty": 1.2,
        "top_k": 1,
        "min_length": 32,
        "no_repeat_ngram_size": 0,
        "num_beams": 1,
        "penalty_alpha": 0,
        "length_penalty": 1,
        "early_stopping": False,
        "seed": -1,
        "add_bos_token": True,
        "truncation_length": 2048,
        "ban_eos_token": False,
        "skip_special_tokens": True,
    }
)

# Split the text (chunk_size=2500, no overlap)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=0)
docs = text_splitter.create_documents([text])

# Run the map-reduce summarization chain; the result is the cell's output
chain = load_summarize_chain(llm, chain_type="map_reduce", verbose=True)
chain.run(docs)