In [None]:
from langchain import OpenAI
from langchain import PromptTemplate

# Loaders
from langchain.schema import Document

# Splitters
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Model
from langchain.chat_models import ChatOpenAI

# Embedding Support
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings

# Summarizer we'll use for Map Reduce
from langchain.chains.summarize import load_summarize_chain

# Data Science
import numpy as np
from sklearn.cluster import KMeans

from langchain.document_loaders import PyPDFLoader

In [None]:
# LLM wrapper used by this notebook for token counting.
# The API key is deliberately not written into the notebook: when
# `openai_api_key` is omitted, LangChain reads it from the OPENAI_API_KEY
# environment variable, so set that before starting the kernel.
llm = OpenAI(temperature=0)

In [None]:
# Point the PDF loader at the book file
loader = PyPDFLoader("Eliyahu M. Goldratt, Jeff Cox - The Goal_ A Process of Ongoing Improvement-North River Press (2004).pdf")

# Load the pages and keep list positions 1 through 392, trimming the opening and closing parts
pages = loader.load()[1:393]

# Stitch the page texts into one string, then turn every tab into a space
text = "".join(page.page_content for page in pages).replace('\t', ' ')

In [None]:
# Measure the size of the whole book in tokens, as counted by the LLM wrapper
num_tokens = llm.get_num_tokens(text)
print(f"This book has {num_tokens} tokens in it")

This book has 208142 tokens in it


In [None]:
# Break the book into large chunks that overlap their neighbours.
# NOTE(review): tabs were already replaced with spaces when `text` was assembled,
# so the "\t" separator is unlikely to ever match - confirm whether " " was intended.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=15000,
    chunk_overlap=3000,
    separators=["\n\n", "\n", "\t"],
)
docs = text_splitter.create_documents([text])

In [None]:
# Report how many chunks the splitter produced
num_documents = len(docs)
print(f"Now our book is split up into {num_documents} documents")

Now our book is split up into 67 documents


In [None]:
# Embed the text of every chunk.
# No key is hardcoded here: with `openai_api_key` omitted, LangChain falls back
# to the OPENAI_API_KEY environment variable.
embeddings = OpenAIEmbeddings()
vectors = embeddings.embed_documents([x.page_content for x in docs])

In [None]:
# Group the chunk embeddings into a fixed number of clusters.
# random_state is pinned so every run starts from the same seed.
num_clusters = 17
kmeans = KMeans(
    random_state=42,
    n_clusters=num_clusters,
).fit(vectors)

In [None]:
# Display the cluster label that KMeans assigned to each chunk embedding
kmeans.labels_

array([ 5, 16,  4, 14, 16, 16, 16, 11, 11,  4,  1,  5,  5,  5,  5,  1,  5,
        9,  9,  9,  7,  9,  1,  1,  8,  3,  3,  3,  3,  0,  0,  0,  0,  0,
        0,  3,  3,  1,  4,  1,  3,  5,  4,  4,  4,  4,  4,  8, 12, 13, 13,
        5,  8,  8, 10, 10,  8,  8,  2,  2,  6,  6,  6, 15, 15,  6,  6],
      dtype=int32)

In [None]:
# For every cluster, pick the chunk whose embedding lies nearest to the centroid.
# np.linalg.norm gives the Euclidean distance from each embedding to that
# centroid, and np.argmin returns the list position of the smallest distance.
closest_indices = [
    np.argmin(np.linalg.norm(vectors - kmeans.cluster_centers_[i], axis=1))
    for i in range(num_clusters)
]

In [None]:
# Copy the representative positions and arrange them in ascending order,
# leaving `closest_indices` itself untouched
selected_indices = list(closest_indices)
selected_indices.sort()
selected_indices

[3, 5, 7, 11, 18, 20, 22, 27, 31, 44, 48, 50, 54, 56, 58, 61, 63]

In [None]:
# Chat model used to summarize the individual chunks.
# The key is intentionally not embedded in the notebook: with `openai_api_key`
# omitted, LangChain reads it from the OPENAI_API_KEY environment variable.
llm3 = ChatOpenAI(
    temperature=0,
    max_tokens=2000,
    model='gpt-3.5-turbo',
)

In [None]:
# Prompt for the per-chunk step: one passage goes in, a multi-paragraph summary comes out.
# The passage is substituted for the {text} placeholder.
map_prompt = """
You will be given a single passage of a book. This section will be enclosed in triple backticks (```)
Your goal is to give a summary of this section so that a reader will have a full understanding of what happened.
Your response should be at least three paragraphs and fully encompass what was said in the passage.

```{text}```
FULL SUMMARY:
"""
map_prompt_template = PromptTemplate(
    input_variables=["text"],
    template=map_prompt,
)

In [None]:
# Build a "stuff" summarize chain that applies the per-chunk prompt with the chunk-level chat model
map_chain = load_summarize_chain(
    llm=llm3,
    chain_type="stuff",
    prompt=map_prompt_template,
)

In [None]:
# Look up the chunk documents sitting at the selected positions
selected_docs = [docs[chunk_idx] for chunk_idx in selected_indices]

In [None]:

# Collect one summary per selected chunk
summary_list = []

# Walk over the selected docs, keeping a running counter for the progress message
for i, doc in enumerate(selected_docs):
    # Summarize this chunk; it is handed to the chain as a one-element document list
    chunk_summary = map_chain.run([doc])
    summary_list.append(chunk_summary)

    # Progress line: summary number, original chunk position, and the first 250 characters
    print(f"Summary #{i} (chunk #{selected_indices[i]}) - Preview: {chunk_summary[:250]} \n")

Summary #0 (chunk #3) - Preview: The passage describes the protagonist's hometown of Bearington, a factory town that has been losing major employers over the years. The protagonist reflects on the decline of the town, with old brick buildings, vacant storefronts, and industrial plan 

Summary #1 (chunk #5) - Preview: In this passage, Alex Rogo reflects on a series of events that have unfolded in his life, starting with a wild night out with a colleague, Bill, and ending with a shocking revelation at work. Peach, a key figure in Alex's career, informs him that the 

Summary #2 (chunk #7) - Preview: In this passage, the protagonist, Alex, attends a meeting at his workplace where he begins to question the purpose and goals of the company. Feeling disillusioned by the lack of clarity and understanding among his colleagues, Alex abruptly leaves the 

Summary #3 (chunk #11) - Preview: In this passage, Alex visits his mother's house in search of his old address book. Despite his mother's atte

In [None]:
# Display the complete list of chunk summaries
summary_list

["The passage describes the protagonist's hometown of Bearington, a factory town that has been losing major employers over the years. The protagonist reflects on the decline of the town, with old brick buildings, vacant storefronts, and industrial plants closing down. The protagonist manages a manufacturing plant in Bearington and faces challenges with efficiency and meeting orders. Despite shipping a late order with a lot of effort, the protagonist realizes that the plant is on the verge of closure due to poor performance and pressure from corporate management.\n\nThe protagonist and his colleague, Donovan, celebrate shipping the order but acknowledge the high cost and inefficiencies of their actions. They discuss the challenges of running the plant and the pressure from corporate to improve performance. The protagonist worries about the future of the plant and the livelihood of its employees. He reflects on the company's strategic plan, which seems to be failing, and questions why th

In [None]:
# Join the chunk summaries with newlines and wrap the result in a single Document
summaries = Document(page_content="\n".join(summary_list))

# Check how many tokens the combined summaries add up to
print(f"Your total summary has {llm.get_num_tokens(summaries.page_content)} tokens")

Your total summary has 7293 tokens


In [None]:
# Chat model used for the final pass over the combined summaries.
# The key is intentionally not embedded in the notebook: with `openai_api_key`
# omitted, LangChain reads it from the OPENAI_API_KEY environment variable.
llm4 = ChatOpenAI(
    temperature=0,
    max_tokens=4096,
    model='gpt-4-turbo-preview',
    request_timeout=120,
)

In [None]:
# Prompt for the final step: the chunk summaries go in, one verbose book summary comes out.
# The summaries are substituted for the {text} placeholder.
combine_prompt = """
You will be given a series of summaries from a book. The summaries will be enclosed in triple backticks (```)
Your goal is to give a verbose summary of what happened in the story.
The reader should be able to grasp what happened in the book.
Please write 2 sentences per chunk

```{text}```
VERBOSE SUMMARY:
"""
combine_prompt_template = PromptTemplate(
    input_variables=["text"],
    template=combine_prompt,
)

In [None]:
# Build the "stuff" summarize chain for the final pass over the combined summaries.
# Pass verbose=True as well if you want to see the inner workings.
reduce_chain = load_summarize_chain(
    llm=llm4,
    chain_type="stuff",
    prompt=combine_prompt_template,
)

In [None]:
# Run the reduce chain over the single document holding all chunk summaries
output = reduce_chain.run([summaries])

In [None]:
# Print the text returned by the reduce chain
print(output)

"The Goal: A Process of Ongoing Improvement" by E.M. Goldratt is a compelling narrative that follows the journey of Alex Rogo, a plant manager struggling to save his manufacturing plant and his marriage. The story begins with Alex arriving at work to find his parking space taken by his boss, Bill Peach, setting the stage for a series of challenges that highlight the power dynamics and urgent issues facing the plant, including a potential walkout, harassment claims, and missing parts for a critical customer order. Bill Peach's early arrival and demand for immediate action on the late order throw the plant into chaos, with Alex left to manage the fallout, including confrontations and productivity halts.

As Alex navigates these professional challenges, he also faces personal turmoil with his wife, Julie, who expresses dissatisfaction with their life in Bearington and desires to leave. This tension in their marriage is juxtaposed with the crisis at the plant, where a key machine breaks do