In [None]:
import os
import glob
import pandas as pd
import json
import re
# import wandb
from datetime import datetime
# from wandb.integration.langchain import WandbTracer
from langchain.callbacks import ClearMLCallbackHandler
from langchain.memory import ConversationTokenBufferMemory
from langchain.agents.tools import Tool
from langchain.llms.base import LLM
from langchain import PromptTemplate, LLMChain
from langchain.agents import load_tools, initialize_agent, AgentExecutor, BaseSingleActionAgent, AgentType
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import PDFMinerLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
os.chdir("/notebooks/learn-langchain")
from langchain_app.models.text_generation_web_ui import (
    build_text_generation_web_ui_client_llm,
)
from langchain.chains.mapreduce import MapReduceChain
from langchain.prompts import PromptTemplate
from langchain.docstore.document import Document

# from langchain.callbacks import WandbCallbackHandler, StdOutCallbackHandler
from langchain.callbacks import StdOutCallbackHandler

# Set the Wandb API key (never commit a real key; export WANDB_API_KEY in the environment instead)
# os.environ["WANDB_API_KEY"] = "<your-wandb-api-key>"

## Test CTransformers with MPT-30B-Instruct (GGML)

In [None]:
from langchain.llms import CTransformers

# Generation settings handed to CTransformers through its `config` argument.
config = dict(max_new_tokens=50, repetition_penalty=1.1)

# Load the local GGML checkpoint (model_type 'mpt') with gpu_layers=50.
llm = CTransformers(
    model='/notebooks/text-generation-webui/models/mpt-30B-instruct-GGML/mpt-30b-instruct.ggmlv0.q4_1.bin',
    model_type='mpt',
    gpu_layers=50,
    config=config,
)

In [None]:
# Smoke test: send one short prompt to the model loaded above and print its completion.
print(llm('AI is going to'))

# Extract JSON, summarise the article, and save the results to a CSV file

In [None]:
# Define a function to clean and load JSON
def clean_and_load_json(json_string):
    """Extract and parse the JSON object embedded in an LLM response.

    The response may wrap the object in a Markdown code fence or surround it
    with free text. Everything before the first ``{`` is discarded and the
    first complete JSON value starting there is decoded; text after that value
    is ignored, even if it contains braces. If the string contains no
    ``{...}`` span at all, the whole string is parsed as JSON.

    Args:
        json_string: Raw text returned by the model.

    Returns:
        The decoded JSON value (normally a dict).

    Raises:
        ValueError: If no valid JSON value can be decoded. The message contains
            the candidate text that was tried.
    """
    match = re.search(r'{.*}', json_string, re.DOTALL)
    cleaned_json_string = match.group(0) if match else json_string
    try:
        if match:
            # raw_decode stops at the end of the first complete value, so a
            # trailing "... {note}" after the object no longer breaks parsing.
            parsed, _ = json.JSONDecoder().raw_decode(cleaned_json_string)
            return parsed
        return json.loads(cleaned_json_string)
    except json.JSONDecodeError as e:
        raise ValueError(f"Couldn't decode the cleaned string as JSON: {cleaned_json_string}") from e

# Quick demonstration on a fenced JSON answer such as an LLM would return
json_string = '''```json
{
   "category":"Visa Business News",
   "title":"Acceptance",
   "geo":"Germany",
   "audience":"Sales",
   "publication_date":"2023-05-04T00:00:00Z"
}
```'''
json_object = clean_and_load_json(json_string)
print(json_object)

In [None]:
# LLM client intended for JSON extraction (temperature 0.1, up to 150 new tokens)
llm_extract_json = build_text_generation_web_ui_client_llm(
    parameters=dict(
        max_new_tokens=150,
        do_sample=True,
        temperature=0.1,
        top_p=0.4,
        typical_p=1,
        repetition_penalty=1.2,
        top_k=40,
        min_length=0,
        no_repeat_ngram_size=0,
        num_beams=1,
        penalty_alpha=0,
        length_penalty=1,
        early_stopping=False,
        seed=-1,
        add_bos_token=True,
        truncation_length=2048,
        ban_eos_token=False,
        skip_special_tokens=True,
    )
)

# LLM client for the medium-size summary (med_sum): max_new_tokens=300, min_length=32
llm_med_sum = build_text_generation_web_ui_client_llm(
    parameters=dict(
        max_new_tokens=300,
        do_sample=True,
        temperature=0.9,
        top_p=0.1,
        typical_p=1,
        repetition_penalty=1.18,
        top_k=40,
        min_length=32,
        no_repeat_ngram_size=0,
        num_beams=1,
        penalty_alpha=0,
        length_penalty=1,
        early_stopping=False,
        seed=-1,
        add_bos_token=True,
        truncation_length=2048,
        ban_eos_token=False,
        skip_special_tokens=True,
    )
)

# Splitter feeding the 5-field JSON extraction (400-character chunks, no overlap)
text_extract_json_5 = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=0)

# Splitter feeding the 3-field JSON extraction (3000-character chunks, no overlap)
text_extract_json_3 = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=0)

# Splitter feeding the summary chain (1500-character chunks, 100 characters of overlap)
text_splitter_med_sum = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)

# Map-reduce summarisation chain on top of the summary LLM
chain_med_sum = load_summarize_chain(llm_med_sum, chain_type="map_reduce")

# CSV file that accumulates the extracted rows
csv_filepath = '/notebooks/files/extracted_data.csv'

# Start from an empty frame with the expected columns, or resume from an existing CSV
if not os.path.exists(csv_filepath):
    df = pd.DataFrame(columns=[
        "effective-date",
        "mark-your-calendar-date",
        "article-id",
        "category",
        "title",
        "geo",
        "audience",
        "publication_date",
        "medium_summary",
    ])
else:
    df = pd.read_csv(csv_filepath)

# Folder that holds the PDF files to process
input_directory = "/notebooks/files/"

# Every *.pdf directly inside that folder
pdf_files = glob.glob(os.path.join(input_directory, "*.pdf"))

# Define a function to sanitize a file name
def sanitize_filename(filename):
    """Replace backslash, slash, *, ?, :, double quote, <, > and | with underscores."""
    forbidden = '\\/*?:"<>|'
    return filename.translate({ord(ch): "_" for ch in forbidden})

# Define a function to clean and load JSON
def clean_and_load_json(json_string):
    """Extract and parse the JSON object embedded in an LLM response.

    Everything before the first ``{`` is discarded and the first complete JSON
    value starting there is decoded; text after that value is ignored, even if
    it contains braces. If the string contains no ``{...}`` span at all, the
    whole string is parsed as JSON.

    Args:
        json_string: Raw text returned by the model.

    Returns:
        The decoded JSON value (normally a dict).

    Raises:
        ValueError: If no valid JSON value can be decoded. The message contains
            the candidate text that was tried.
    """
    match = re.search(r'{.*}', json_string, re.DOTALL)
    cleaned_json_string = match.group(0) if match else json_string
    try:
        if match:
            # raw_decode stops at the end of the first complete value, so a
            # trailing "... {note}" after the object no longer breaks parsing.
            parsed, _ = json.JSONDecoder().raw_decode(cleaned_json_string)
            return parsed
        return json.loads(cleaned_json_string)
    except json.JSONDecodeError as e:
        raise ValueError(f"Couldn't decode the cleaned string as JSON: {cleaned_json_string}") from e


# Process every PDF path collected in pdf_files, one file per iteration
for file_path in pdf_files:

    # Load the PDF; the loader result is indexed and only element 0's page_content is used
    loader = PDFMinerLoader(file_path)
    document = loader.load()
    text = document[0].page_content

    # Create a template for the prompt extract to JSON with 3 fields.
    # Placeholders: {output3_1}..{output3_4} (schema examples) and {question} (article text).
    # The few-shot answer must agree with the sample text: "effective 8 June 2023" is the
    # effective date and the first "Mark Your Calendar" deadline is 15 August 2023
    # (the two dates were previously swapped, which taught the model the wrong mapping).
    template_json_3 = """
    Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

    ### Instruction: 
    Extract the following information from the text in input:
    1. Article ID
    2. Effective date
    3. Mark your calendar date

    You must format your output as a JSON value that adheres to a given "JSON Schema" instance.
    "JSON Schema" is a declarative language that allows you to annotate and validate JSON documents.
    As an example, for the schema {output3_1} the object {output3_2} is a well-formatted instance of the schema.
    The object {output3_3} is not well-formatted.
    Here is the output schema: {output3_4}
    Your output will be parsed and type-checked according to the provided schema instance, so make sure all fields in your output match exactly!

    As an example, this text:

    As published in the DCC Guide—DCC Program Requirements (a Visa Supplemental Requirement; see Additional Resources), acquirers registered in the DCC Compliance Program must register their DCC-enabled merchants with Visa annually. Merchant registration information for the period ending 31 July 2023 is due 15 August 2023. This requirement applies to merchants offering DCC in either a card-present or card-not-present environment. 
    Article ID: AI13088
    Also effective 8 June 2023 edition of the Visa Business News, Visa announced updates to the ATM Locator Update System, including an updated ATM locator template,

    Mark Your Calendar:
    • DeadlineforDCCCompliance Program annual merchant registration (15 August 2023)
    • Deadlinetoderegisterasan acquirer from the DCC program prior to fiscal year 2024 billing (30 September 2023)

    Acquirers will receive an email confirming receipt of their submission and will only be contacted in the event of a problem with their registration. Further details about the DCC merchant registration process can be found in the DCC Guide—DCC Program Requirements.

    Results in the json:
        
    "article-id": "AI13088",
    "effective-date": "2023-06-08",
    "mark-your-calendar-date": "2023-08-15"

    ### Input: 
    {question}

    ### Response: 
    Valid JSON document that adheres to the schema instance: ```json
    """

    # Create a template for the prompt extract to JSON with 5 fields
    # (category, publication date, title, geo, audience).
    # Placeholders: {output5_1}..{output5_4} and {question}.
    template_json_5 = """
    Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

    ### Instruction: 
    Extract the following information from the text in input.
    1. Category
    2. Publication date
    3. Article title
    4. GEO
    5. Article audience

    You must format your output as a JSON value that adheres to a given "JSON Schema" instance.
    "JSON Schema" is a declarative language that allows you to annotate and validate JSON documents.
    As an example, for the schema {output5_1} the object {output5_2} is a well-formatted instance of the schema.
    The object {output5_3} is not well-formatted.
    Here is the output schema: {output5_4}
    Your output will be parsed and type-checked according to the provided schema instance, so make sure all fields in your output match exactly!

    As an example, this text:
    Visa Business News
    Risk Products 1 June 2023 Card-Not-Present Token Risk Management Tools and Best Practices
    Europe | Acquirers, Issuers, Processors Visa Network; Europe Processing
    Overview: Visa is reminding clients of the tools and best practices available for managing token provisioning and transaction risk for the four main token use cases in the card-not-present environment.

    Results in the json:
    
    "category": "Risk Products",
    "publication_date": "2023-06-01",
    "title": "Card-Not-Present Token Risk Management Tools and Best Practices",
    "geo": "Europe",
    "audience": "Acquirers, Issuers, Processors"
    

    ### Input: 
    {question}

    ### Response:
    Valid JSON document that adheres to the schema instance: ```json
    """

    # Template for the second pass, which asks the model to re-format a raw answer as valid JSON.
    # Placeholders: {output} (schema), {wrong_json} / {correct_json} (before/after example), {question} (raw answer).
    template_eval_json = """
    Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

    ### Instruction: 
    Format a json string in Input.

    You must format your output as a JSON value that adheres to a given "JSON Schema" instance.
    "JSON Schema" is a declarative language that allows you to annotate and validate JSON documents.

    Here is the output schema: {output}

    For example, this invalid JSON:
    {wrong_json}

    After formatting valid JSON:
    {correct_json}

    Your output will be parsed and type-checked according to the provided schema instance, so make sure all fields in your output match exactly!
    ### Input:
    {question}

    ### Response: 
    Valid JSON document that adheres to the schema instance: ```json
    """

    # Create sub-templates for the chain.
    # output*_1: example schema, output*_2: conforming object, output*_3: non-conforming object,
    # output*_4: the schema the model must follow.
    # The example schemas previously ended with one closing brace too many, making them invalid JSON.
    output3_1 = '{"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}'
    output3_2 = '{"foo": ["bar", "baz"]}'
    output3_3 = '{"properties": {"foo": ["bar", "baz"]}}'
    output3_4 = '{"type" : "object", "properties" : {"article-id" : {"type" : "string", "maxLength": 7}, "effective-date" : {"type" : "string", "format" : "date"}, "mark-your-calendar-date" : {"type" : "string", "format" : "date"}}}'
    output5_1 = '{"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}'
    output5_2 = '{"foo": ["bar", "baz"]}'
    output5_3 = '{"properties": {"foo": ["bar", "baz"]}}'
    output5_4 = '{"type": "object", "properties": {"category": {"type": "string"}, "title": {"type": "string"}, "geo": {"type": "string"}, "audience": {"type": "string"}, "publication_date": {"type": "string", "format": "date"}}}'

    # Deliberately malformed example shown to the model (missing comma, wrong keys, wrong date format)
    wrong_json_3 = """
    {
            "article-id": "AI13012"
            "effective_date": "27/5/23",
            "mark_your_calendar_dates": [
                "VBS login page user experience change (27 May 23)",
                "MFA enablement window for all VBS users (25 Jun 23)"
            ]
        }
    ```
    """
    # Corrected counterpart. The calendar key is hyphenated so that it matches the
    # "mark-your-calendar-date" key of the schema and of the CSV column (it used underscores before).
    correct_json_3 = """
    {
        "audience": "Issuers",
        "article-id": "AI13012",
        "effective-date": "2023-05-27",
        "mark-your-calendar-date": ["VBS login page user experience change (2023-05-27)", "MFA enablement window for all VBS users (2023-06-25)"]
    }
    """

    # Deliberately malformed 5-field example (missing comma, trailing comma, wrong date format)
    wrong_json_5 = """
    {
            "category": "Commercial Solutions",
            "title": "Multi-Factor Authentication Will Be Enabled in June 2023 to Protect Visa Business Solutions Services",
            "geo": "Global"
            "audience": "Issuers",
            "publication_date": "25/6/23",
        }
    ```
    """
    # Corrected counterpart of wrong_json_5
    correct_json_5 = """
    {
        "category": "Commercial Solutions",
        "title": "Multi-Factor Authentication Will Be Enabled in June 2023 to Protect Visa Business Solutions Services",
        "geo": "Global",
        "audience": "Issuers",
        "publication_date": "2023-06-25"
    }
    """

    # Create the prompts; input_variables list every placeholder used by the matching template
    prompt_json_5 = PromptTemplate(
        template=template_json_5,
        input_variables=['question', 'output5_1', 'output5_2', 'output5_3', 'output5_4'],
    )
    prompt_json_3 = PromptTemplate(
        template=template_json_3,
        input_variables=['question', 'output3_1', 'output3_2', 'output3_3', 'output3_4'],
    )
    prompt_eval_json = PromptTemplate(
        template=template_eval_json,
        input_variables=['question', 'output', 'wrong_json', 'correct_json'],
    )

    # Create the chains for generating a JSON extract.
    # They run on llm_extract_json, the client configured in this cell for JSON extraction,
    # instead of the ambient global `llm`, whose value depends on which other cell ran last.
    llm_chain_json_5 = LLMChain(llm=llm_extract_json, prompt=prompt_json_5)
    llm_chain_json_3 = LLMChain(llm=llm_extract_json, prompt=prompt_json_3)
    llm_chain_eval_json = LLMChain(llm=llm_extract_json, prompt=prompt_eval_json)

    # ---- 5-field extraction ----
    # Chunk the text; only the first chunk is sent to the model
    docs_json_5 = text_extract_json_5.create_documents([text])
    question = docs_json_5[0].page_content

    # Pass 1 extracts the fields, pass 2 asks the model to re-format that raw answer as JSON
    json_5 = llm_chain_json_5.run(dict(
        question=question,
        output5_1=output5_1,
        output5_2=output5_2,
        output5_3=output5_3,
        output5_4=output5_4,
    ))
    json_5 = llm_chain_eval_json.run(dict(
        question=json_5,
        output=output5_4,
        wrong_json=wrong_json_5,
        correct_json=correct_json_5,
    ))

    # ---- 3-field extraction ----
    # Same two-pass procedure on the first chunk produced by the 3-field splitter
    docs_json_3 = text_extract_json_3.create_documents([text])
    question = docs_json_3[0].page_content

    json_3 = llm_chain_json_3.run(dict(
        question=question,
        output3_1=output3_1,
        output3_2=output3_2,
        output3_3=output3_3,
        output3_4=output3_4,
    ))
    json_3 = llm_chain_eval_json.run(dict(
        question=json_3,
        output=output3_4,
        wrong_json=wrong_json_3,
        correct_json=correct_json_3,
    ))

    # Show which file was handled and the raw model answers before they are parsed
    print(file_path)
    print(json_5, json_3)

    # Parse both answers; clean_and_load_json raises ValueError on undecodable text
    json_3 = clean_and_load_json(json_3)
    json_5 = clean_and_load_json(json_5)

    # Merge into one record; on duplicate keys the 5-field values win
    combined_json = {**json_3, **json_5}

    # Split text into chunks for summary generation
    docs = text_splitter_med_sum.create_documents([text])

    # Run the map-reduce summary chain over the chunks
    med_sum = chain_med_sum.run(docs)

    # Normalise the summary to a string (None becomes an empty string)
    med_sum = '' if med_sum is None else str(med_sum)

    # Build a one-row frame from the extracted fields plus the summary and a timestamp
    new_row = pd.DataFrame.from_records([combined_json])
    new_row['medium_summary'] = [med_sum]
    new_row['added'] = datetime.now()

    # Append the row to the accumulated DataFrame
    df = pd.concat([df, new_row], ignore_index=True)

    # Save after every file so that finished work survives a failure on a later file.
    # Uses csv_filepath (the same path the frame was loaded from) instead of repeating the literal.
    df.to_csv(csv_filepath, index=False)

    def rename_file(df, file_path, fields):
        """Rename a processed PDF after the values in the last row of ``df``.

        The new name is the sanitized, dash-joined values of ``fields`` taken from
        the last row, with the extension ``._pdf``. Fields that are absent from the
        frame or hold NaN are skipped; if nothing usable remains, the original
        base name is kept. A numeric suffix is appended when the target name is
        already taken, so an earlier processed file is never overwritten.

        Args:
            df: DataFrame whose last row describes ``file_path``.
            file_path: Path of the PDF to rename.
            fields: Column names, in order, used to build the new file name.
        """
        # Get the last row of the DataFrame
        row = df.iloc[-1]

        # Keep only fields that exist in the frame and hold a value
        file_name_components = [
            sanitize_filename(str(row[field]))
            for field in fields
            if field in row.index and not pd.isna(row[field])
        ]

        # If there are no valid file name components, use the original file name
        if not file_name_components:
            file_name_components.append(os.path.splitext(os.path.basename(file_path))[0])

        directory = os.path.dirname(file_path)
        stem = '-'.join(file_name_components)
        new_path = os.path.join(directory, stem + '._pdf')

        # os.rename can silently replace an existing destination, so pick a free name first
        counter = 1
        while os.path.exists(new_path):
            new_path = os.path.join(directory, f"{stem}-{counter}._pdf")
            counter += 1

        # Rename the file
        os.rename(file_path, new_path)

    # Call the function to rename the file at the end of processing each file
    rename_file(df, file_path, ['article-id', 'title'])

    # Print the new row of a DataFrame
    print(new_row)

--------------------

## Testing the LangChain output parser

In [None]:
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain.chains import LLMChain, TransformChain, SequentialChain
from pydantic import BaseModel, Field
from typing import Optional
from datetime import date

# LLM client for JSON extraction; same settings as before except truncation_length=8192
llm_extract_json = build_text_generation_web_ui_client_llm(
    parameters=dict(
        max_new_tokens=150,
        do_sample=True,
        temperature=0.1,
        top_p=0.4,
        typical_p=1,
        repetition_penalty=1.2,
        top_k=40,
        min_length=0,
        no_repeat_ngram_size=0,
        num_beams=1,
        penalty_alpha=0,
        length_penalty=1,
        early_stopping=False,
        seed=-1,
        add_bos_token=True,
        truncation_length=8192,
        ban_eos_token=False,
        skip_special_tokens=True,
    )
)

# Alias used by the chain built in this cell
model = llm_extract_json

# Define your desired data structure.
class Article(BaseModel):
    """Fields extracted from an article; every field may be missing.

    The ``None`` defaults are explicit so that the fields stay optional no matter
    how the installed pydantic version treats an ``Optional[...]`` annotation
    that has no default.
    """
    # Article identifier, at most 7 characters
    article_id: Optional[str] = Field(None, max_length=7)
    # Dates are kept as plain strings; no date parsing or validation is applied here
    effective_date: Optional[str] = None
    mark_your_calendar_date: Optional[str] = None

# Set up a parser + inject instructions into the prompt template.
parser = PydanticOutputParser(pydantic_object=Article)

# {format_instructions} is pre-filled from the parser; only {query} is supplied at run time
prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()}
)

# And a query intended to prompt a language model to populate the data structure.
article_query = """ 
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

    ### Instruction: 
    Extract the following information from the text in input:
    1. Article ID
    2. Effective date
    3. Mark your calendar date

    ### Input:    

    Visa Business News
    Acceptance
    DCC Compliance Program: Annual DCC-Enabled Merchant / ATM Registration Requirements
    Global | Acquirers, Processors, Agents Visa, Plus Networks; V PAY
    8 June 2023
    Overview: Acquirers registered in the Dynamic Currency Conversion (DCC) Compliance Program for POS, e-commerce, and/or ATM must register their DCC-enabled merchants for the period ending 31 July 2023 by 15 August 2023 and must update the ATM Locator quarterly with their DCC-enabled ATMs. Acquirers must advise Visa by 30 September 2023 if they wish to deregister from the DCC Compliance Program for fiscal year 2024.
    As published in the DCC Guide—DCC Program Requirements (a Visa Supplemental Requirement; see Additional Resources), acquirers registered in the DCC Compliance Program must register their DCC-enabled merchants with Visa annually. Merchant registration information for the period ending 31 July 2023 is due 15 August 2023. This requirement applies to merchants offering DCC in either a card-present or card-not-present environment. Acquirers registered in the DCC Compliance Program for ATMs must provide a list of ATM locations enabled for DCC every quarter through the existing Visa ATM Locator update process.
    Providing Accurate Information
    Merchant / ATM registration details are used for DCC audit1 purposes, so acquirers must verify their accuracy. Inaccurate data, including the registration of merchants or ATMs that are not DCC-enabled, may result in non-compliance assessments (NCAs).
    For DCC-enabled merchants, the merchant name field in the merchant registration record must match the merchant name field in the BASE II clearing record, including spaces (as applicable). In addition, each merchant record must contain the exact street address of the merchant’s physical location where DCC is offered. Corporate head office addresses and/or post office box numbers are not acceptable. E-commerce merchants must list their web address.
    For quarterly ATM Locator updates, the batch file template must be updated with “Y” in column L (DYNAMIC CUR CONV) to indicate if the ATM is DCC-enabled.
    How to Register DCC Merchants and ATMs
    All DCC-registered acquirers that have successfully certified their DCC solutions with Visa must submit a complete list of their DCC-enabled merchants as of 31 July 2023 by 15 August 2023. Merchant records must be submitted
    Mark Your Calendar:
    • DeadlineforDCCCompliance Program annual merchant registration (15 August 2023)
    • Deadlinetoderegisterasan acquirer from the DCC program prior to fiscal year 2024 billing (30 September 2023)
    Article ID: AI13088
    to Visa according to the Merchant Registration Data Elements document (see Additional Resources) and emailed to DCCProgram@visa.com with “Acquirer Name (Country) - Merchant Registration” in the subject line.
    Acquirers that do not submit their merchant registration list by 15 August 2023 may be subject to NCAs. Acquirers will receive an email confirming receipt of their submission and will only be contacted in the event of a problem with their registration. Further details about the DCC merchant registration process can be found in the DCC Guide—DCC Program Requirements.
    Note: The DCC Guide—DCC Program Requirements applies to clients in all Visa regions.
    Also in the 8 June 2023 edition of the Visa Business News, Visa announced updates to the ATM Locator Update System, including an updated ATM locator template, which will take effect 2 July 2023. Refer to this article in Additional Resources for details about these updates to the quarterly ATM Locator update process.
    Acquirer Deregistration from the DCC Compliance Program
    If an acquirer wishes to stop offering DCC at its merchants or ATMs, it must deregister by sending an email to DCCProgram@visa.com with the date by which it plans to stop offering DCC.
    For the upcoming DCC program fee billing period, 1 October 2023 – 30 September 2024, acquirers must notify Visa of their intent to deregister from the DCC Compliance Program no later than 30 September 2023 to avoid assessment of the annual fee.
    1 Visa may, at its discretion, audit physical merchant locations, e-commerce merchants and ATMs to confirm compliance with the DCC rules. A DCC audit will seek to determine if the transaction is compliant with Visa disclosure, active choice, transaction receipt and DCC indicator requirements. Acquirers will be notified of the results of the audit and of any corrective action required or NCAs that may apply.
        Additional Resources
        Documents & Publications
    “New Global ATM Locator Update System,” Visa Business News, 8 June 2023
    Online Resources
    Refer to the Dynamic Currency Conversion page at Visa Online for more information and the following documentation:
    • DCC Guide—DCC Program Requirements
    • Merchant Registration Data Elements
    Visit the Visa ATM Solutions Resources page at Visa Online to view the Visa Global ATM Locator Update System Training document.
    Note: For Visa Online resources, you will be prompted to log in.
        For More Information
        AP, CEMEA, LAC: Contact your Visa representative or email DCCProgram@visa.com. Canada and U.S.: Contact eSupport@visa.com.
    Article ID: AI13088

    
    Europe: Contact Visa customer support on your country-specific number, or email CustomerSupport@visa.com or DCCProgram@visa.com.
    Third party agents: Contact your acquirer, processor or Visa representative.
        Notice: This Visa communication is furnished to you solely in your capacity as a customer of Visa Inc. (through its operating companies of Visa U.S.A Inc., Visa International Service Association, Visa Worldwide Pte. Ltd, Visa Europe Ltd., Visa International Servicios de Pago España, S.R.L.U. and Visa Canada Corporation) or its authorized agent, or as a participant in the Visa payments system. By accepting this Visa communication, you acknowledge that the information contained herein (the "Information") is confidential and subject to the confidentiality restrictions contained in the Visa Rules, which limit your use of the Information. You agree to keep the Information confidential and not to use the Information for any purpose other than in your capacity as a customer of Visa Inc. or as a participant in the Visa payments system. You may disseminate this Information to a merchant participating in the Visa payments system if: (i) you serve the role of “acquirer” within the Visa payments system; (ii) you have a direct relationship with such merchant which includes an obligation to keep Information confidential; and (iii) the Information is designated as “affects merchants” demonstrated by display of the storefront icon on the communication. A merchant receiving such Information must maintain the confidentiality of such Information and disseminate and use it on a “need to know” basis and only in their capacity as a participant in the Visa payments system. Except as otherwise provided, the Information may only be disseminated within your organization on a need-to-know basis to enable your participation in the Visa payments system. Visa is not responsible for errors in or omissions from this publication.
    Article ID: AI13088

    ### Response:
"""

# Step 1: the LLM produces raw text, exposed under the key "json_string"
llm_chain = LLMChain(
    prompt=prompt,
    llm=model,
    output_key="json_string",
)

def parse_output(inputs: dict) -> dict:
    """Parse the raw LLM text stored under "json_string" with the pydantic parser.

    Returns a dict with the single key "result" holding whatever ``parser.parse``
    returns. Any exception raised by ``parser.parse`` propagates to the caller.
    """
    text = inputs["json_string"]
    return {"result": parser.parse(text)}

# Step 2: turn the raw text into the parsed object
transform_chain = TransformChain(
    input_variables=["json_string"],
    output_variables=["result"],
    transform=parse_output
)

chain = SequentialChain(
    input_variables=["query"],
    output_variables=["result"],
    chains=[llm_chain, transform_chain],
)

# Call the chain with a dict so that a dict of inputs and outputs comes back.
# chain.run(...) returns the single output value directly, so indexing its
# return value with ["result"] would raise a TypeError.
result = chain({"query": article_query})
parsed_output = result["result"]

# parsed_output is now the object produced by the parser (an Article), which can be serialized to JSON.
# json_output = parsed_output.json()


# Testing different parameters of the model for text summarization

In [None]:
# Define the path to the folder with PDF files
input_directory = "/notebooks/files/test/"

# Collect every *.pdf in that folder
pdf_files = glob.glob(os.path.join(input_directory, "*.pdf"))

# LLM client whose generation parameters are being tested for the medium-size summary (med_sum)
llm_med_sum = build_text_generation_web_ui_client_llm(parameters={
    "max_new_tokens": 200,
    "do_sample": True,
    "temperature": 0.1,
    "top_p": 0.4,
    "typical_p": 1,
    "repetition_penalty": 1.18,
    "top_k": 40,
    "min_length": 0,
    "no_repeat_ngram_size": 0,
    "num_beams": 1,
    "penalty_alpha": 0,
    "length_penalty": 1,
    "early_stopping": False,
    "seed": -1,
    "add_bos_token": True,
    "truncation_length": 2048,
    "ban_eos_token": False,
    "skip_special_tokens": True,
})

# Create an instance of a text splitter for summary generation
text_splitter_med_sum = RecursiveCharacterTextSplitter(
    chunk_size=5000,
    chunk_overlap=100
)

# Create a chain for generating summary.
# It is built on llm_med_sum so that the parameters configured above are the ones exercised;
# it was previously wired to the unrelated global `llm`, which left those parameters unused.
chain_med_sum = load_summarize_chain(llm_med_sum, chain_type="map_reduce", verbose=True)

# Create an empty list to save the results
results = []

for file_path in pdf_files:
    # Load PDF file
    loader = PDFMinerLoader(file_path)
    document = loader.load()
    text = document[0].page_content

    #  1. GENERATE MEDIUM SIZE SUMMARY

    # Split text into chunks (size and overlap come from text_splitter_med_sum above)
    docs = text_splitter_med_sum.create_documents([text])

    # Send documents for processing to generate summary
    docs_med_sum = chain_med_sum.run(docs)

    # Save results to list
    results.append(docs_med_sum)

# Print all results
for result in results:
    print(result)

# Summarizing PDF files

## Summarize the text and save the results to a text file

In [None]:
!pip install nltk

In [None]:
# Download spaCy's en_core_web_sm pipeline using the interpreter of the "textgen" conda environment.
# NOTE(review): presumably needed by the SpacyTextSplitter used further down -- confirm.
!/root/miniconda3/envs/textgen/bin/python -m spacy download en_core_web_sm

In [None]:
import nltk
# Fetch the "punkt" resource through NLTK's downloader
nltk.download('punkt')

In [None]:
import spacy
from langchain.text_splitter import NLTKTextSplitter
from langchain.text_splitter import SpacyTextSplitter

# Define the path to the folder with PDF files
input_directory = "/notebooks/files/test"

# Collect every *.pdf in that folder
pdf_files = glob.glob(os.path.join(input_directory, "*.pdf"))

# LLM client for the medium-size summary (med_sum): max_new_tokens=250, min_length=32
llm_med_sum = build_text_generation_web_ui_client_llm(parameters={
    "max_new_tokens": 250,
    "do_sample": True,
    "temperature": 0.1,
    "top_p": 0.1,
    "typical_p": 1,
    "repetition_penalty": 1.18,
    "top_k": 40,
    "min_length": 32,
    "no_repeat_ngram_size": 0,
    "num_beams": 1,
    "penalty_alpha": 0,
    "length_penalty": 1,
    "early_stopping": False,
    "seed": -1,
    "add_bos_token": True,
    "truncation_length": 2048,
    "ban_eos_token": False,
    "skip_special_tokens": True,
})

# Create an instance of a text splitter for summary generation (spaCy-based, default settings)
text_splitter_med_sum = SpacyTextSplitter(
)

# Create a chain for generating summary
chain_med_sum = load_summarize_chain(llm_med_sum, chain_type="map_reduce")

# Start processing files
for file_path in pdf_files:
    # Load PDF file
    loader = PDFMinerLoader(file_path)
    document = loader.load()
    text = document[0].page_content

    # GENERATE MEDIUM SIZE SUMMARY

    # Split text into chunks
    docs = text_splitter_med_sum.create_documents([text])

    # Send documents for processing to generate summary
    docs_med_sum = chain_med_sum.run(docs)

    # Guard against a missing summary so that write() always receives a string
    summary_text = '' if docs_med_sum is None else str(docs_med_sum)

    # Replace the file extension with .txt and save the result.
    # The encoding is explicit so that non-ASCII characters in the summary do not
    # depend on the locale's default encoding.
    output_file_path = os.path.splitext(file_path)[0] + ".txt"
    with open(output_file_path, "w", encoding="utf-8") as output_file:
        output_file.write(summary_text)

    print(f"Обработан файл: {file_path}")


In [None]:
# Take one PDF document and extract its text with langchain.document_loaders.PDFMinerLoader
file_path = '/notebooks/files/AI12944 - Updates to Fraud and Consumer Dispute Rules.pdf'
loader = PDFMinerLoader(file_path)
document = loader.load()

# Only the first returned document's text is used
text = document[0].page_content

# LLM client with temperature=0.001 and top_k=1; note that this rebinds the global name `llm`
llm = build_text_generation_web_ui_client_llm(
    parameters=dict(
        max_new_tokens=200,
        do_sample=True,
        temperature=0.001,
        top_p=0.1,
        typical_p=1,
        repetition_penalty=1.2,
        top_k=1,
        min_length=32,
        no_repeat_ngram_size=0,
        num_beams=1,
        penalty_alpha=0,
        length_penalty=1,
        early_stopping=False,
        seed=-1,
        add_bos_token=True,
        truncation_length=2048,
        ban_eos_token=False,
        skip_special_tokens=True,
    )
)

# 2500-character chunks without overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=0)
docs = text_splitter.create_documents([text])

# Map-reduce summary; the value of the last expression is displayed as the cell output
chain = load_summarize_chain(llm, chain_type="map_reduce", verbose=True)
chain.run(docs)