In [1]:
import os

openai_api_key = os.getenv('OPENAI_API_KEY')

In [2]:
# Download poppler from here - https://github.com/oschwartz10612/poppler-windows/releases/


from pdf2image import convert_from_path


def pdf_to_img_converter(pdf_path: str, output_img_folder_path: str) -> None:

    os.makedirs(output_img_folder_path, exist_ok=True)

    images = convert_from_path(pdf_path, poppler_path=r"C:/Program Files (x86)/poppler-23.07.0/Library/bin")

    for i, image in enumerate(images):
        image.save(os.path.join(output_img_folder_path, f"image_{str(i)}.jpg"))

In [3]:
pdf_path = 'D:/text-summarization-using-langchain/Machine_learning_notes.pdf'
output_img_folder_path = 'D:/text-summarization-using-langchain/pdf_images/'

pdf_to_img_converter(pdf_path=pdf_path, output_img_folder_path=output_img_folder_path)

In [4]:
import pytesseract
import cv2


pytesseract.pytesseract.tesseract_cmd = 'C:/Program Files/Tesseract-OCR/tesseract.exe'


def img_to_txt_converter(output_txt_folder_path: str, img_pdf_folder_path: str) -> None:

    os.makedirs(output_txt_folder_path, exist_ok=True)

    for i, resume in enumerate(os.listdir(img_pdf_folder_path)):
        img = cv2.imread(img_pdf_folder_path + "/" + resume)
        img = cv2.resize(img, (1000, 1100))
        text = pytesseract.image_to_string(img)

        fname = os.path.join(output_txt_folder_path, f"txt_{str(i)}.txt")
        with open(fname, "w", encoding='utf-8') as file:
            file.write(text)

    

In [5]:
img_pdf_folder_path = 'D:/text-summarization-using-langchain/pdf_images/'
output_txt_folder_path = 'D:/text-summarization-using-langchain/pdf_text/'

img_to_txt_converter(output_txt_folder_path=output_txt_folder_path, img_pdf_folder_path=img_pdf_folder_path)

In [6]:
def concatenate_txt_files(txt_files_path: str, output_txt_file_path: str) -> None:

    read_files = os.listdir(txt_files_path)
    with open(output_txt_file_path, "wb") as outfile:
        for f in read_files:
            with open(txt_files_path + "/" + f, "rb") as infile:
                outfile.write(infile.read())

        outfile.close()


concatenated_file_path = 'D:/text-summarization-using-langchain/output.txt'

concatenate_txt_files(txt_files_path=output_txt_folder_path, output_txt_file_path=concatenated_file_path)


In [7]:
from langchain import OpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [8]:
article = 'D:/text-summarization-using-langchain/output.txt'

with open(article, 'r') as file:
    essay = file.read()

In [9]:
llm = OpenAI(temperature=0, openai_api_key=openai_api_key)

llm.get_num_tokens(essay)

63126

In [10]:
text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n"], chunk_size=8000, chunk_overlap=1000)

docs = text_splitter.create_documents([essay])

In [11]:
num_docs = len(docs)

num_tokens_first_doc = llm.get_num_tokens(docs[0].page_content)

print (f"Now we have {num_docs} documents and the first one has {num_tokens_first_doc} tokens")

Now we have 33 documents and the first one has 1988 tokens


In [12]:
summary_chain = load_summarize_chain(llm=llm, chain_type='map_reduce',
#                                      verbose=True # Set verbose=True if you want to see the prompts being used
                                    )

In [13]:
output = summary_chain.run(docs)
output

' This article covers topics related to Machine Learning and Data Mining, such as Linear and Nonlinear Regression, Quadraties, Basic Probability Theory, Probability Density Functions, Estimation, Hidden Markov Models, and Bayesian Methods. It provides an overview of the different types of Machine Learning, the Markov property, Bayesian approaches to model selection, and the importance of avoiding overfitting. It also discusses methods such as Monte Carlo, Principal Components Analysis (PCA), the Method of Lagrange Multipliers, K-means and K-medoids clustering, Support Vector Machines (SVMs), AdaBoost, the Expectation-Maximization algorithm, K-Nearest Neighbors regression, and probability theory.'

In [14]:
from langchain import PromptTemplate


map_prompt = """
Write a concise summary of the following:
"{text}"
CONCISE SUMMARY:
"""
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

In [15]:
combine_prompt = """
Write a concise summary of the following text delimited by triple backquotes.
Return your response in bullet points which covers the key points of the text.
```{text}```
BULLET POINT SUMMARY:
"""
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])

In [16]:
summary_chain = load_summarize_chain(llm=llm,
                                     chain_type='map_reduce',
                                     map_prompt=map_prompt_template,
                                     combine_prompt=combine_prompt_template,
#                                      verbose=True
                                    )

In [17]:
output = summary_chain.run(docs)

'- Machine Learning and Data Mining topics include Linear Regression, Nonlinear Regression, Quadraties, Basic Probability Theory, Probability Density Functions, Estimation, Hidden Markov Models, and Bayesian Methods.\n- Monte Carlo methods are used to approximate expected values and sample from distributions.\n- Principal Components Analysis (PCA) is an unsupervised learning algorithm used to reduce the dimensionality of high-dimensional data.\n- Lagrange Multipliers method is used to optimize a function with multiple constraints.\n- K-means and K-medoids clustering are two common approaches to clustering data.\n- Mixtures of Gaussians is a generalization of K-means clustering that can handle oblong clusters and overlapping clusters.\n- The Viterbi algorithm and the Forward-Backward algorithm are two dynamic programming approaches used to compute the most likely sequence of states given a data set and a known HMM model.\n- Support Vector Machines (SVMs) are a type of optimization probl

In [18]:
print(output)

- Machine Learning and Data Mining topics include Linear Regression, Nonlinear Regression, Quadraties, Basic Probability Theory, Probability Density Functions, Estimation, Hidden Markov Models, and Bayesian Methods.
- Monte Carlo methods are used to approximate expected values and sample from distributions.
- Principal Components Analysis (PCA) is an unsupervised learning algorithm used to reduce the dimensionality of high-dimensional data.
- Lagrange Multipliers method is used to optimize a function with multiple constraints.
- K-means and K-medoids clustering are two common approaches to clustering data.
- Mixtures of Gaussians is a generalization of K-means clustering that can handle oblong clusters and overlapping clusters.
- The Viterbi algorithm and the Forward-Backward algorithm are two dynamic programming approaches used to compute the most likely sequence of states given a data set and a known HMM model.
- Support Vector Machines (SVMs) are a type of optimization problem that 