In [1]:
!pip install -qqq langchain accelerate bitsandbytes
!pip install -qqq transformers==4.33.2
!pip install -qqq optimum==1.13.1
!pip install -qqq auto-gptq==0.4.2 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ --progress-bar off

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import accelerate
from langchain import HuggingFacePipeline

from langchain.document_loaders import UnstructuredHTMLLoader, PyPDFLoader, DirectoryLoader

from langchain.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

import warnings
warnings.filterwarnings("ignore")

# Pick the GPU when one is visible to torch, otherwise fall back to the CPU,
# and print the choice so the reader can see where the notebook is running.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print(DEVICE)



cuda


# LLM

In [3]:
# Hugging Face Hub id of the checkpoint used throughout the notebook.
model_name = "TheBloke/Llama-2-7b-Chat-GPTQ"

# Options for the model load, gathered in one place.
# NOTE(review): trust_remote_code=True permits running code that ships with
# the model repository - confirm it is actually needed for this checkpoint.
model_load_options = {
    "torch_dtype": torch.float16,
    "device_map": "auto",
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_options)

# Tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

Downloading config.json:   0%|          | 0.00/789 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

In [4]:
# Text-generation pipeline around the model and tokenizer loaded above.
# Sampling is enabled (do_sample=True), so generations can differ between runs.
text_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    max_new_tokens=1024,  # upper bound on newly generated tokens per call
    top_p=0.95,
    do_sample=True,
    repetition_penalty=1.1,
)

# Wrap the pipeline so LangChain chains can use it as an LLM.
# No `model_kwargs` temperature is passed here: when a ready-made pipeline is
# supplied, the generation settings are the ones configured on that pipeline,
# and a temperature of 0 would contradict do_sample=True anyway.
llm = HuggingFacePipeline(pipeline=text_pipeline)

# Loading documents

In [5]:
# List the files of the mixed dataset that the loaders below read from.
!ls ../input/mixed-dataset

 Autoencoders.html  'Imbalanced Classification with Python.pdf'   cities.csv


## CSV

In [6]:
# Location of the cities table inside the mixed dataset.
file_path = "../input/mixed-dataset/cities.csv"

# Read the CSV through LangChain's CSV loader; the result is a list of Documents.
csv_loader = CSVLoader(file_path)
cities_data = csv_loader.load()

In [7]:
# Inspect the first Document returned by the CSV loader.
cities_data[0]

Document(page_content='station_id: 41515\ncity_name: Asadabad\ncountry: Afghanistan\nstate: Kunar\niso2: AF\niso3: AFG\nlatitude: 34.8660000397\nlongitude: 71.1500045859', metadata={'source': '../input/mixed-dataset/cities.csv', 'row': 0})

In [8]:
# Read the same CSV with pandas to view it as a table.
# `file_path` still points at cities.csv here; it is reassigned to the PDF in
# the next section, so this cell must run before that one.
import pandas as pd
df = pd.read_csv(file_path)
df

Unnamed: 0,station_id,city_name,country,state,iso2,iso3,latitude,longitude
0,41515,Asadabad,Afghanistan,Kunar,AF,AFG,34.866000,71.150005
1,38954,Fayzabad,Afghanistan,Badakhshan,AF,AFG,37.129761,70.579247
2,41560,Jalalabad,Afghanistan,Nangarhar,AF,AFG,34.441527,70.436103
3,38947,Kunduz,Afghanistan,Kunduz,AF,AFG,36.727951,68.872530
4,38987,Qala i Naw,Afghanistan,Badghis,AF,AFG,34.983000,63.133300
...,...,...,...,...,...,...,...,...
1240,67475,Kasama,Zambia,Northern,ZM,ZMB,-10.199598,31.179947
1241,68030,Livingstone,Zambia,Southern,ZM,ZMB,-17.860009,25.860013
1242,67633,Mongu,Zambia,Western,ZM,ZMB,-15.279598,23.120025
1243,67775,Harare,Zimbabwe,Harare,ZW,ZWE,-17.817790,31.044709


## PDF

In [9]:
# From here on `file_path` refers to the PDF, no longer to the CSV.
file_path = "../input/mixed-dataset/Imbalanced Classification with Python.pdf"

# Load the PDF and split it with the loader's default splitter.
pdf_loader = PyPDFLoader(file_path)
pdf_data = pdf_loader.load_and_split()

In [10]:
# Inspect the first Document produced from the PDF.
pdf_data[0]

Document(page_content='Imbalanced Classiﬁcation \nwith Python\nChoose Better Metrics, Balance \nSkewed Classes, and Apply \nCost-Sensitive Learning\nJason Brownlee', metadata={'source': '../input/mixed-dataset/Imbalanced Classification with Python.pdf', 'page': 0})

## Loading directories of Mixed Data

In [11]:
# Folder that holds the HTML, PDF and CSV files listed earlier.
folder_path = '../input/mixed-dataset/'

# No glob or loader class is given, so the DirectoryLoader defaults decide
# which files are picked up and how each one is parsed.
directory_loader_options = {
    "use_multithreading": True,
    "show_progress": True,
}
mixed_loader = DirectoryLoader(folder_path, **directory_loader_options)

# Load every file and split the result with the default splitter.
mixed_data = mixed_loader.load_and_split()

100%|██████████| 3/3 [00:00<00:00, 1131.46it/s]


# Text Splitter

We can customize the way we partition the data.

We will compare two splitters. The character text splitter splits only on a double newline ("\n\n"), whereas the recursive character text splitter tries a list of separators in turn (["\n\n", "\n", " ", ""]). The recursive approach keeps each chunk close to, and no larger than, the requested chunk size.

By default, when you call `load_and_split`, it actually uses the recursive character text splitter.

In [12]:
# Both splitters get identical size settings so that only the splitting
# strategy differs between the two runs.
chunking = {"chunk_size": 1000, "chunk_overlap": 0}

# Splits on a single separator ("\n\n" per the text above).
splitter1 = CharacterTextSplitter(**chunking)

# Falls back through a list of separators (["\n\n", "\n", " ", ""] per the text above).
splitter2 = RecursiveCharacterTextSplitter(**chunking)

# Re-split the PDF once with each splitter, in the same order as the splitters.
pdf_data1, pdf_data2 = [
    pdf_loader.load_and_split(text_splitter=splitter)
    for splitter in (splitter1, splitter2)
]

In [13]:
# Character count of the fifth chunk from the CharacterTextSplitter run.
len(pdf_data1[4].page_content)

3206

In [14]:
# Character count of the fifth chunk from the RecursiveCharacterTextSplitter run.
len(pdf_data2[4].page_content)

955

In [15]:
# Rebuild the directory loader and reload the mixed dataset.
# NOTE(review): this repeats the earlier "Loading directories of Mixed Data"
# cell with the same arguments and no custom splitter - confirm whether a
# `text_splitter` was meant to be passed here.
mixed_loader = DirectoryLoader(
    path=folder_path,
    use_multithreading=True,
    show_progress=True
)

mixed_data = mixed_loader.load_and_split()

100%|██████████| 3/3 [00:00<00:00, 1064.45it/s]


# Summarizing

## The “Stuff” Chain

With LangChain, it is not difficult to summarize text of any length. To summarize text with an LLM, there are a few strategies.

If the whole text fits in the context window, then you can simply feed the raw data and get the result. LangChain refers to that strategy as the “stuff” chain type.

In [16]:
# "Stuff" strategy: all documents are sent to the LLM in a single prompt.
chain = load_summarize_chain(llm, chain_type='stuff')

# Summarize only the first PDF chunk so the prompt stays small.
first_pdf_chunk = pdf_data1[:1]
chain.run(first_pdf_chunk)

'\nIn this article, Jason Brownlee discusses imbalanced classification in machine learning and how it can affect model performance. He emphasizes the importance of choosing better metrics, balancing skewed classes, and applying cost-sensitive learning to improve accuracy for both classes. The author provides code examples using Python and its scikit-learn library to demonstrate these techniques. By addressing these issues, the article helps readers achieve better results with their imbalanced datasets.'

In [17]:
# Summarize the first five city Documents with the stuff chain.
chain.run(cities_data[:5])

'\nThere are five station entries in the provided text. The stations are located in different provinces of Afghanistan, including Kunar, Badakhshan, Nangarhar, Kunduz, and Badghis. Each station entry includes the station ID, city name, country, state, and latitude and longitude coordinates. "AF" is used as the ISO 3166-1 alpha-2 code for Afghanistan, and "AFG" is used as the ISO 3166-1 alpha-3 code."'

In [18]:
# Show the five Documents that were just summarized, for comparison.
cities_data[:5]

[Document(page_content='station_id: 41515\ncity_name: Asadabad\ncountry: Afghanistan\nstate: Kunar\niso2: AF\niso3: AFG\nlatitude: 34.8660000397\nlongitude: 71.1500045859', metadata={'source': '../input/mixed-dataset/cities.csv', 'row': 0}),
 Document(page_content='station_id: 38954\ncity_name: Fayzabad\ncountry: Afghanistan\nstate: Badakhshan\niso2: AF\niso3: AFG\nlatitude: 37.1297607616\nlongitude: 70.5792471913', metadata={'source': '../input/mixed-dataset/cities.csv', 'row': 1}),
 Document(page_content='station_id: 41560\ncity_name: Jalalabad\ncountry: Afghanistan\nstate: Nangarhar\niso2: AF\niso3: AFG\nlatitude: 34.4415269155\nlongitude: 70.4361034738', metadata={'source': '../input/mixed-dataset/cities.csv', 'row': 2}),
 Document(page_content='station_id: 38947\ncity_name: Kunduz\ncountry: Afghanistan\nstate: Kunduz\niso2: AF\niso3: AFG\nlatitude: 36.7279506623\nlongitude: 68.8725296619', metadata={'source': '../input/mixed-dataset/cities.csv', 'row': 3}),
 Document(page_content=

## Custom prompt

In [19]:
# Print the default prompt template that the stuff chain sends to the LLM.
print(chain.llm_chain.prompt.template)

Write a concise summary of the following:


"{text}"


CONCISE SUMMARY:


In [20]:
# Custom prompt asking for the summary in German. The text is identical to the
# triple-quoted original, including the leading and trailing newline.
template = (
    "\n"
    "Write a concise summary of the following in GERMAN:\n"
    "\n"
    "\"{text}\"\n"
    "\n"
    "CONCISE SUMMARY IN GERMAN:\n"
)

prompt = PromptTemplate.from_template(template)

# `chain` is rebound here: from now on it is the stuff chain with the German prompt.
chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)

In [21]:
# Summarize the first two city Documents with the German-prompt chain.
chain.run(cities_data[:2])

'Asadabad ist eine Stadt in Afghanistan mit der Station-ID 41515. Die Stadt liegt im Bezirk Kunar und hat eine Breite von 34,866 Grad und eine Länge von 71,15 Grad.\nFayzabad ist eine andere Stadt in Afghanistan mit der Station-ID 38954. Diese Stadt befindet sich im Bezirk Badakhshan und hat eine Breite von 37,1297 Grad und eine Länge von 70,5793 Grad.'

In [22]:
# Show the two Documents that were just summarized, for comparison.
cities_data[:2]

[Document(page_content='station_id: 41515\ncity_name: Asadabad\ncountry: Afghanistan\nstate: Kunar\niso2: AF\niso3: AFG\nlatitude: 34.8660000397\nlongitude: 71.1500045859', metadata={'source': '../input/mixed-dataset/cities.csv', 'row': 0}),
 Document(page_content='station_id: 38954\ncity_name: Fayzabad\ncountry: Afghanistan\nstate: Badakhshan\niso2: AF\niso3: AFG\nlatitude: 37.1297607616\nlongitude: 70.5792471913', metadata={'source': '../input/mixed-dataset/cities.csv', 'row': 1})]

#### Unfortunately, the Stuff chain breaks when the data is too large, because the number of tokens sent to the LLM then exceeds the context window.

## The Map-Reduce Chain

When we want to summarize a lot of data, we can use the Map-reduce strategy. We break down the data into multiple chunks, summarize each chunk, and summarize the concatenated summaries in a final "combine" step.

In [23]:
# Map-reduce strategy with the library's default prompts.
mr_chain = load_summarize_chain(llm, chain_type='map_reduce')

In [24]:
# Summarize the first three city Documents with the map-reduce chain.
mr_chain.run(cities_data[:3])

Downloading vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

'\nThis summary describes information about two locations in Afghanistan. Location 1 has a station ID of 38954, is located in Asadabad, Kunar, Afghanistan, and has latitude coordinates of 34.866° north and longitude coordinates of 71.15° east. Location 2 has a station ID of 41560, is located in Jalalabad, Nangarhar, Afghanistan, and has latitude coordinates of 34.4415° north and longitude coordinates of 70.4361° east.'

## Custom Prompts
The Map-reduce chain has a prompt for the map step, and a prompt for the combine step.
We can change the behavior of this chain by using custom prompts

In [25]:
# Print the prompt template of the chain's `llm_chain`
# (presumably the per-document "map" prompt; the combine prompt is printed next).
print(mr_chain.llm_chain.prompt.template)

Write a concise summary of the following:


"{text}"


CONCISE SUMMARY:


In [26]:
# Print the prompt template used by the combine step.
print(mr_chain.combine_document_chain.llm_chain.prompt.template)

Write a concise summary of the following:


"{text}"


CONCISE SUMMARY:


In [27]:
# Prompt for the map step. Written as explicit string pieces so the trailing
# space after "themes" (part of the original prompt) stays visible.
map_template = (
    "The following is a set of documents\n"
    "\n"
    "{text}\n"
    "\n"
    "Based on this list of docs, please identify the main themes \n"
    "Helpful Answer:"
)
map_prompt = PromptTemplate.from_template(map_template)

# Prompt for the combine step; the trailing spaces before the line breaks are
# likewise part of the original prompt text.
combine_template = (
    "The following is a set of summaries:\n"
    "\n"
    "{text}\n"
    "\n"
    "Take these and distill it into a final, consolidated list of the main themes. \n"
    "Return that list as a comma separated list. \n"
    "Helpful Answer:"
)
combine_prompt = PromptTemplate.from_template(combine_template)

# Map-reduce chain that uses the two custom prompts; verbose=True prints the
# formatted prompts while the chain runs.
custom_prompts = {"map_prompt": map_prompt, "combine_prompt": combine_prompt}
mr_custom_chain = load_summarize_chain(
    llm,
    chain_type='map_reduce',
    verbose=True,
    **custom_prompts,
)

# Run on a single chunk of the recursively split PDF.
mr_custom_chain.run(pdf_data2[59:60])



[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mThe following is a set of documents

Often in cases where the imbalance is caused by a sampling bias or measurement error, the
imbalance can be corrected by improved sampling methods, and/or correcting the measurement
error. This is because the training dataset is not a fair representation of the problem domain
that is being addressed.
The imbalance might be a property of the problem domain. For example, the natural
occurrence or presence of one class may dominate other classes. This may be because the process
that generates observations in one class is more expensive in time, cost, computation, or other
resources. As such, it is often infeasible or intractable to simply collect more samples from the
domain in order to improve the class distribution. Instead, a model is required to learn the
diﬀerence between the classes. Now that we are familiar wi

'\nThe final, consolidated list of main themes from the provided documents is:\nUnderstanding class imbalance, its causes, and its impact on machine learning models; Strategies for handling class imbalance; Importance of proper documentation; Transparency and reproducibility of results.'

## The “Refine” chain
Another strategy to summarize text data is the Refine chain. We begin the summary with the first chunk and refine it little by little with each of the following chunks.

In [28]:
# Refine strategy with the library's default prompts; verbose=True prints the
# formatted prompts while the chain runs.
ref_chain = load_summarize_chain(llm, chain_type='refine', verbose=True)

In [29]:
# Run the refine chain on the first city Document only.
ref_chain.run(cities_data[:1])



[1m> Entering new RefineDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mWrite a concise summary of the following:


"station_id: 41515
city_name: Asadabad
country: Afghanistan
state: Kunar
iso2: AF
iso3: AFG
latitude: 34.8660000397
longitude: 71.1500045859"


CONCISE SUMMARY:[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


"\nThe given text is an entry in a database, containing information about a weather station located in Asadabad, Afghanistan. The station's ID is 41515, and it is located in the province of Kunar, which is in the country of Afghanistan (AF). The station's coordinates are latitude 34.8660000397° and longitude 71.1500045859°."

This chain has a tendency to fail, as shown above. This is because the LLM sometimes fails to understand the prompt. We have an initial prompt for the first summary, and a prompt for the refine step. But the LLM sometimes fails to execute that refine step correctly, breaking the whole chain. We can create custom prompts to modify the behavior of that chain:

## Custom Prompts
The Refine chain has a prompt for the initial summary, and a prompt for the refine step.
We can change the behavior of this chain by using custom prompts.

In [30]:
# Print the default prompt template used for the initial summary.
print(ref_chain.initial_llm_chain.prompt.template)

Write a concise summary of the following:


"{text}"


CONCISE SUMMARY:


In [31]:
# Print the default prompt template used for each refine step.
print(ref_chain.refine_llm_chain.prompt.template)

Your job is to produce a final summary.
We have provided an existing summary up to a certain point: {existing_answer}
We have the opportunity to refine the existing summary (only if needed) with some more context below.
------------
{text}
------------
Given the new context, refine the original summary.
If the context isn't useful, return the original summary.


In [32]:
# Prompt for the first chunk: extract themes instead of writing a summary.
# The text matches the triple-quoted original, including the leading newline.
initial_template = (
    "\n"
    "Extract the most relevant themes from the following:\n"
    "\n"
    "\n"
    "\"{text}\"\n"
    "\n"
    "\n"
    "THEMES:"
)
initial_prompt = PromptTemplate.from_template(initial_template)

# Prompt for every following chunk: update the existing list of themes.
refine_template = (
    "\n"
    "Your job is to extract the most relevant themes\n"
    "We have provided an existing list of themes up to a certain point: {existing_answer}\n"
    "We have the opportunity to refine the existing list(only if needed) with some more context below.\n"
    "------------\n"
    "{text}\n"
    "------------\n"
    "Given the new context, refine the original list\n"
    "If the context isn't useful, return the original list and ONLY the original list.\n"
    "Return that list as a comma separated list.\n"
    "\n"
    "LIST:"
)
refine_prompt = PromptTemplate.from_template(refine_template)

# Refine chain that uses the two custom prompts; verbose=True prints the
# formatted prompts while the chain runs.
custom_prompts = {"question_prompt": initial_prompt, "refine_prompt": refine_prompt}
ref_custom_chain = load_summarize_chain(
    llm,
    chain_type='refine',
    verbose=True,
    **custom_prompts,
)

# Run on a single chunk of the recursively split PDF.
ref_custom_chain.run(pdf_data2[59:60])



[1m> Entering new RefineDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Extract the most relevant themes from the following:


"Often in cases where the imbalance is caused by a sampling bias or measurement error, the
imbalance can be corrected by improved sampling methods, and/or correcting the measurement
error. This is because the training dataset is not a fair representation of the problem domain
that is being addressed.
The imbalance might be a property of the problem domain. For example, the natural
occurrence or presence of one class may dominate other classes. This may be because the process
that generates observations in one class is more expensive in time, cost, computation, or other
resources. As such, it is often infeasible or intractable to simply collect more samples from the
domain in order to improve the class distribution. Instead, a model is required to learn the
diﬀerence between the classes. Now that w

'\n\n1. The imbalance could be due to sampling or measurement errors and can be resolved by improving the sampling methods or measurements.\n2. The imbalance might arise from properties of the problem domain, such as dominance of one class over others.\n3. Imbalanced classifications are challenging due to inherent differences between the classes.'

# END