# Imports

In [None]:
%pip install tabulate numpy pandas langchain openai chromadb pypdf tiktoken faiss-cpu Flask unstructured Cython pdfminer.six termcolor tabulate tqdm reportlab


In [43]:
# Document loaders
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders import Docx2txtLoader
from langchain.document_loaders import UnstructuredMarkdownLoader
# Splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document
# Embeddings and models
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.vectorstores import FAISS, Chroma
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
# Chains
from langchain.chains import RetrievalQA
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.chains import SimpleSequentialChain
from langchain.prompts.prompt import PromptTemplate
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains import ConversationalRetrievalChain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.summarize import load_summarize_chain
# Utils
import os
from termcolor import colored
import textwrap
from prettytable import PrettyTable
import ast
from tqdm.auto import tqdm
from tabulate import tabulate
import pandas as pd
import numpy as np
import string






# Import API Key
# SECURITY: a secret must never be stored in the notebook itself. The key that
# was previously hard-coded on this line has been exposed and should be revoked.
# The key is taken from the environment; if it is not set, it is requested
# interactively (without echo) so the notebook still runs on a fresh machine.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

# Folder Path Definition

In [28]:
# Directory whose files are handed to the qa_* functions below as `folder_path`.
# This is a machine-specific absolute path: adjust it before running.
folder_path = r"C:\Users\cesar\OneDrive\Desktop\test2"

# ```qa_single_file```
This iterates over each file seperately, asking each one the same question. Good for literature overview.

In [273]:
def qa_single_file(folder_path, chain_type, chunk_size, query, k, own_knowledge = False, show_pages=False):
    """Ask every supported file in a folder the same multi-key question.

    Each file is loaded, split into overlapping chunks, embedded into its own
    FAISS index and queried through a ``RetrievalQA`` chain. The prompt asks
    the model to answer as a dictionary keyed by the items named in ``query``;
    that dictionary becomes the single table row of the file.

    Args:
        folder_path: Directory to scan. Only ``.pdf``, ``.csv``, ``.docx`` and
            ``.md`` files (matched case-insensitively) are processed.
        chain_type: Forwarded to ``RetrievalQA.from_chain_type``.
        chunk_size: Chunk size for the text splitter; neighbouring chunks
            overlap by half of it.
        query: Question text naming the keys the model should answer.
        k: Number of chunks the retriever returns.
        own_knowledge: If true, the prompt allows the model to fall back on
            its own knowledge; otherwise it is told to use the context only.
        show_pages: If true, print the number of chunks and every chunk.

    Returns:
        dict: Maps ``(file_name, sources)`` to a ``PrettyTable`` with one row.
        ``sources`` is a tuple of ``(source file name, "page: <n>")`` pairs
        for the retrieved chunks. If the model's answer cannot be parsed as a
        dictionary, the raw text is reported in a single ``"answer"`` column.
    """

    # Define output table.
    tables = {}

    # Loader class for each supported (lower-case) file extension.
    loaders = {
        '.pdf': PyPDFLoader,
        '.csv': CSVLoader,
        '.docx': Docx2txtLoader,
        '.md': UnstructuredMarkdownLoader,
    }

    # Wraptext function for prettytable
    def wrap_text(text, width=40):
        return textwrap.fill(text, width=width)

    def source_of(doc):
        # Not every loader records a "page" entry. Fall back to the CSV "row"
        # index and then to 0, so non-PDF files do not raise KeyError and the
        # label keeps its "page: <int>" shape.
        meta = doc.metadata
        position = meta.get('page', meta.get('row', 0))
        return (os.path.basename(meta["source"]), f"page: {position}")

    # Read-in and split the documents.
    # Sorted so that the tables come out in a reproducible order.
    for file_name in sorted(os.listdir(folder_path)):
        loader_cls = loaders.get(os.path.splitext(file_name)[1].lower())
        if loader_cls is None:
            continue  # Skip files with other extensions

        file_path = os.path.join(folder_path, file_name)
        documents = loader_cls(file_path).load()

        # The splitter expects an integer overlap, hence floor division.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_size // 2)
        chunks = text_splitter.split_documents(documents)

        if show_pages:
            print(len(chunks))
            for chunk in chunks:
                print(chunk)

        if not chunks:
            # No index can be built from zero chunks (e.g. a file without
            # extractable text); skip it instead of aborting the whole run.
            print(f"Skipping {file_name}: no text could be extracted.")
            continue

        # Vectorstores
        embeddings = OpenAIEmbeddings()

        db = FAISS.from_documents(chunks, embeddings)
        # FAISS vectorstores can also be merged and saved to disk

        # Retriever
        # Amount of returned documents k
        retriever = db.as_retriever(
            search_type="similarity", search_kwargs={"k": k})

        # Chains
        if own_knowledge:
            prompt_template = """Use the following pieces of context to find an answer to all the keys given in the question. \
                Give your answer in the form of a dictionary with the keys given in the question. \
                If the answer does not become clear from the context, you can also use your own knowledge. \
                If you use your own knowledge, please indicate this clearly in your answer. \

            Context:
            {context}

            {question}
            Helpful answer:"""
        else:
            prompt_template = """Use the following pieces of context to find an answer to all the keys given in the question. \
                Give your answer in the form of a dictionary with the keys given in the question. \
                Do NOT use your own knowledge and give the best possible answer from the context.\
            
            Context:
            {context}

            {question}
            Helpful answer:"""

        PROMPT = PromptTemplate(
            template=prompt_template, input_variables=["context","question"]
        )

        # Define Chain
        qa = RetrievalQA.from_chain_type(
            llm=OpenAI(temperature=0),
            chain_type=chain_type,
            retriever=retriever,
            return_source_documents=True,
            chain_type_kwargs={"prompt": PROMPT}
        )

        # Run Chain with parameters
        result = qa(query)

        # Convert string representation of dictionary to an actual dictionary.
        # The model is only *asked* for a dictionary; if it answers in another
        # form, keep the raw text instead of losing every table built so far.
        raw_answer = result['result'].strip()
        try:
            result_dict = ast.literal_eval(raw_answer)
        except (ValueError, SyntaxError, TypeError):
            result_dict = None
        if not isinstance(result_dict, dict):
            result_dict = {"answer": raw_answer}

        # Get Sources
        sources = tuple(source_of(doc) for doc in result['source_documents'])

        # Build the one-row table for this file.
        table = PrettyTable(["Filename", "Sources"] + list(result_dict.keys()))
        table.add_row(
            [
                wrap_text(file_name),
                wrap_text(', '.join(f'{name} {page}' for name, page in sources)),
            ]
            + [wrap_text(str(value)) for value in result_dict.values()]
        )
        tables[(file_name, sources)] = table

    # Return output tables
    return tables

### Query

In [84]:
# Comma-separated keys: the qa_single_file prompt asks the model for a dictionary with these keys.
query = "model specification, model estimation, model evaluation, model deployment, benchmark models"

### Print Results
Use smaller chunk sizes to catch more different part in one prompt. Use at least as many chunks (k) as there are keys in the query.

In [106]:
# Generate Output Table
# One table per file; every file is asked the same multi-key query.
output_table = qa_single_file(
    folder_path=folder_path,
    chain_type="stuff",
    chunk_size=500,
    query=query,
    k=5,
    own_knowledge=True,
    show_pages=False,
)


[32mFilename: file1.pdf[0m
+-----------+----------------------------------------+---------------------------------------+------------------------------------------+------------------------------------------+------------------------------------------+-----------------------------------------+
|  Filename |                Sources                 |          model specification          |             model estimation             |             model evaluation             |             model deployment             |             benchmark models            |
+-----------+----------------------------------------+---------------------------------------+------------------------------------------+------------------------------------------+------------------------------------------+-----------------------------------------+
| file1.pdf | file1.pdf page: 8, file1.pdf page: 13, |    Support vector regression (SVR),   | The weights that were generated by these | Validation is an important step tha

In [24]:
# Print Output Table
# Each entry is shown as a green "Filename" header, the table itself and a blank line.
for key in output_table:
    header = colored(f"Filename: {key[0]}", "green")
    print(header, output_table[key], "", sep="\n")

NameError: name 'output_table' is not defined

# ```qa_single_file_iterated```
This iterates over each file **and key** seperately to be even more accurate with the single answers, asking each one the same question. Good for literature overview.

In [29]:
def qa_single_file_iterated(folder_path, chain_type, chunk_size, queries, k, num_iterations, own_knowledge = False, show_pages=False):
    """Ask every supported file each query separately, several times, and condense the answers.

    Every file gets its own FAISS index. For each query the retrieval chain
    is run repeatedly, retrieving ``k``, ``k - 1``, ... chunks so that the
    rounds see different context. The answers of all rounds are joined and
    then summarised by a summarize chain.

    Args:
        folder_path: Directory to scan. Only ``.pdf``, ``.csv``, ``.docx`` and
            ``.md`` files (matched case-insensitively) are processed.
        chain_type: Forwarded to ``RetrievalQA.from_chain_type``.
        chunk_size: Chunk size for the text splitter; neighbouring chunks
            overlap by half of it.
        queries: Iterable of query strings; each one is asked on its own.
        k: Number of chunks retrieved in the first round.
        num_iterations: Number of rounds per query. At most ``k`` rounds are
            run, because a round must retrieve at least one chunk.
        own_knowledge: If true, the prompt allows the model to fall back on
            its own knowledge; otherwise it is told to use the context only.
        show_pages: If true, print the number of chunks and every chunk.

    Returns:
        dict: ``{file_name: {query: {...}}}``, containing only files that
        were actually processed. The innermost dictionary has the keys:

        - ``"combined_results"``: all round answers joined by spaces.
        - ``"condensed_result"``: the summary of the combined answers.
        - ``"combined_sources"``: maps ``(source file name, "page: <n>")`` to
          the text of one retrieved chunk from that page.
    """

    # Define output table.
    tables = {}

    # Loader class for each supported (lower-case) file extension.
    loaders = {
        '.pdf': PyPDFLoader,
        '.csv': CSVLoader,
        '.docx': Docx2txtLoader,
        '.md': UnstructuredMarkdownLoader,
    }

    def source_of(doc):
        # Not every loader records a "page" entry. Fall back to the CSV "row"
        # index and then to 0, so non-PDF files do not raise KeyError and the
        # label keeps its "page: <int>" shape.
        meta = doc.metadata
        position = meta.get('page', meta.get('row', 0))
        return (doc.page_content, os.path.basename(meta["source"]), f"page: {position}")

    # Read-in and split the documents.
    # Sorted so that the results come out in a reproducible order.
    for file_name in tqdm(sorted(os.listdir(folder_path)), desc="Processing files",  colour="green", leave=False):
        loader_cls = loaders.get(os.path.splitext(file_name)[1].lower())
        if loader_cls is None:
            continue  # Skip files with other extensions

        file_path = os.path.join(folder_path, file_name)
        documents = loader_cls(file_path).load()

        # The splitter expects an integer overlap, hence floor division.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_size // 2)
        chunks = text_splitter.split_documents(documents)

        if show_pages:
            print(len(chunks))
            for chunk in chunks:
                print(chunk)

        if not chunks:
            # No index can be built from zero chunks (e.g. a file without
            # extractable text); skip it instead of aborting the whole run.
            print(f"Skipping {file_name}: no text could be extracted.")
            continue

        # Register the file only once it is known to be processed. Skipped
        # files must not leave an empty entry behind, because consumers index
        # every entry by query.
        tables[file_name] = {}

        # Vectorstores
        embeddings = OpenAIEmbeddings()

        db = FAISS.from_documents(chunks, embeddings)
        # FAISS vectorstores can also be merged and saved to disk

        # Chains
        if own_knowledge:
            prompt_template = """Use the following pieces of context to find an answer to the given question. \
                If the answer does not become clear from the context, you can also use your own knowledge. \
                If you use your own knowledge, please indicate this clearly in your answer. \

            Context:
            {context}

            Question:
            Based on the context, how does the context define and apply: {question}?
            Helpful answer:"""
        else:
            prompt_template = """Use the following pieces of context to find an answer to the given question. \
                Do NOT use your own knowledge and give the best possible answer from the context.\
            
            Context:
            {context}

            Question:
            Based on the context, how does the context define and apply: {question}?
            Helpful answer:"""

        PROMPT = PromptTemplate(
            template=prompt_template, input_variables=["context","question"]
        )

        chain_type_kwargs = {"prompt": PROMPT}

        # Define summary chain
        summary_splitter = CharacterTextSplitter()
        qa_condense = load_summarize_chain(llm=OpenAI(temperature=0), chain_type="stuff")

        # Application of Chains
        # Iterate over each query
        for query in queries:

            extended_answers = []
            unique_sources = set()

            # k - i has to stay positive, so never run more rounds than k.
            for i in range(min(num_iterations, k)):

                # QA chain that is adaptable
                # Amount of returned documents k-i -> makes it adaptable. Otherwise, it would always return k documents and the output would be the same.
                retriever = db.as_retriever(
                    search_type="similarity", search_kwargs={"k": k - i})

                # Define retrieval chain
                qa = RetrievalQA.from_chain_type(
                    llm=OpenAI(temperature=0),
                    chain_type=chain_type,
                    retriever=retriever,
                    return_source_documents=True,
                    chain_type_kwargs=chain_type_kwargs
                )

                # Run Chain with parameters
                result = qa(query)

                # Append result to extended_answers
                extended_answers.append(result['result'])

                # Add sources to the unique_sources set
                unique_sources.update(source_of(doc) for doc in result['source_documents'])

            # Combine extended_answers
            combined_result = ' '.join(extended_answers)

            # Summarise the combined answers; only the first three pieces of
            # the split text are passed to the summarize chain.
            texts = summary_splitter.split_text(combined_result)
            docs = [Document(page_content=t) for t in texts[:3]]

            condensed_result = str(qa_condense.run(docs))

            # Combine unique_sources. Several chunks can share one page, and
            # only one of them is kept per page; iterating in sorted order
            # makes that choice reproducible between runs.
            combined_sources = {
                (name, page): content
                for content, name, page in sorted(unique_sources)
            }

            # Store the results in the tables dictionary
            tables[file_name][query] = {
                "combined_results": combined_result,
                "condensed_result": condensed_result,
                "combined_sources": combined_sources
            }

    # Return output tables
    return tables

### Query

In [30]:
# Each entry is asked on its own for every file by qa_single_file_iterated.
queries = ["model specification", "model estimation"]

### Print Results
Use smaller chunk sizes to catch more different part in one prompt. Use at least as many chunks (k) as there are keys in the query.

In [33]:
# Generate Output Table
# Every file is asked each entry of `queries` separately, in two rounds per query.
output_table = qa_single_file_iterated(
    folder_path=folder_path,
    chain_type="stuff",
    chunk_size=500,
    queries=queries,
    k=8,
    num_iterations=2,
    own_knowledge=False,
    show_pages=False,
)

                                                               4.64s/it]

In [34]:
# Wrap long cell text so it fits into a narrow table column
def wrap_text(text, width=40):
    """Return ``text`` re-wrapped into newline-separated lines of at most ``width`` characters."""
    return textwrap.fill(text, width=width)

# Create a table with predefined row names
row_names = ["combined_results", "condensed_result", "combined_sources"]

# After processing all files, create and print the tables
for file_name, queries_results in output_table.items():
    # A file without any stored answers has nothing to tabulate; indexing it
    # by query below would raise KeyError.
    if not queries_results:
        continue

    # Create a table with 'Type' as the leftmost column
    table_data = [["Type"] + queries]

    # Add rows to the table
    for result_type in row_names:
        row = [result_type.capitalize()]
        for query in queries:
            cell = queries_results[query][result_type]
            if result_type == "combined_sources":
                # Wrap only the chunk text. Wrapping the whole cell would
                # collapse the line breaks and merge the coloured
                # "file page" header into the chunk it belongs to.
                cell_text = "\n\n".join(
                    f"{colored(name, 'red')} {colored(page, 'red')}\n{wrap_text(content)}"
                    for (name, page), content in cell.items()
                )
            else:
                cell_text = wrap_text(cell)
            row.append(cell_text)
        table_data.append(row)

    # Print the table for the current file_name with a separator between rows
    print(f"File: {file_name}\n{tabulate(table_data, headers='firstrow', tablefmt='grid')}\n{'=' * 80}")

File: file1.pdf
+------------------+------------------------------------------+------------------------------------------+
| Type             | model specification                      | model estimation                         |
| Combined_results | The context defines model                | Model estimation is the process of       |
|                  | specification as the combination of      | using regression techniques to compare   |
|                  | regression techniques with the cuckoo    | actual data with predicted data in order |
|                  | search algorithm, inspired by the        | to measure the performance of the        |
|                  | autoregressive moving average (ARMA)     | system. This is done by calculating the  |
|                  | model, to predict the exchange market.   | MSE, RMSE, MAE, and R-squared (R2)       |
|                  | The model is validated against 14886     | values.  Model estimation is the process |
|                  | 

# ```qa_all_at_once_iterated```
Searches for the answer through all documents. Can also take the chat history into consideration. Does iterated prompts to get better results.

In [77]:
def qa_all_at_once(folder_path, chain_type, chunk_size, query, k, own_knowledge = False, show_pages=False):
    """Answer one query from a single index built over all supported files.

    All files are loaded and split into overlapping chunks, the chunks are
    embedded into one FAISS index, and the query is answered once through a
    ``RetrievalQA`` chain.

    Args:
        folder_path: Directory to scan. Only ``.pdf``, ``.csv``, ``.docx`` and
            ``.md`` files (matched case-insensitively) are processed.
        chain_type: Forwarded to ``RetrievalQA.from_chain_type``.
        chunk_size: Chunk size for the text splitter; neighbouring chunks
            overlap by half of it.
        query: Question text inserted into the prompt; it may already contain
            the chat history.
        k: Number of chunks the retriever returns.
        own_knowledge: If true, the prompt allows the model to fall back on
            its own knowledge; otherwise it is told to use the context only.
        show_pages: If true, print the number of chunks and every chunk.

    Returns:
        The chain output, unchanged. It is built with
        ``return_source_documents=True``; the notebook reads its ``"result"``
        and ``"source_documents"`` entries.

    Raises:
        ValueError: If the folder yields no chunks at all, so no index can be
            built.
    """

    # Loader class for each supported (lower-case) file extension.
    loaders = {
        '.pdf': PyPDFLoader,
        '.csv': CSVLoader,
        '.docx': Docx2txtLoader,
        '.md': UnstructuredMarkdownLoader,
    }

    # The splitter expects an integer overlap, hence floor division.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_size // 2)

    # Read-in and split the documents.
    # Sorted so that the chunks are indexed in a reproducible order.
    all_pages = []
    for file_name in sorted(os.listdir(folder_path)):
        loader_cls = loaders.get(os.path.splitext(file_name)[1].lower())
        if loader_cls is None:
            continue  # Skip files with other extensions

        documents = loader_cls(os.path.join(folder_path, file_name)).load()
        all_pages.extend(text_splitter.split_documents(documents))

    if show_pages:
        print(len(all_pages))
        for page in all_pages:
            print(page)

    if not all_pages:
        # Fail with a clear message instead of an obscure error from the index builder.
        raise ValueError(f"No supported documents with extractable text found in {folder_path!r}")

    # Vectorstores
    embeddings = OpenAIEmbeddings()

    db = FAISS.from_documents(all_pages, embeddings)
    # FAISS vectorstores can also be merged and saved to disk

    # Retriever
    # Amount of returned documents k
    retriever = db.as_retriever(
        search_type="similarity", search_kwargs={"k": k})

    # Chains
    if own_knowledge:
        prompt_template = """Use the following pieces of chat history and context to answer the question at the end. \
            If the answer does not become clear from the context, you can also use your own knowledge. \
            If you use your own knowledge, please indicate this clearly in your answer. \

        Context:
        {context}

        {question}
        Helpful answer:"""
    else:
        prompt_template = """Use the following pieces of chat history and context to answer the question at the end. \
            Do NOT use your own knowledge and give the best possible answer from the context.\
        
        Context:
        {context}

        {question}
        Helpful answer:"""

    PROMPT = PromptTemplate(
        template=prompt_template, input_variables=["context","question"]
    )

    # Define Chain
    qa = RetrievalQA.from_chain_type(
        llm=OpenAI(temperature=0),
        chain_type=chain_type,
        retriever=retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": PROMPT}
    )

    # Run Chain with parameters
    return qa(query)

### Queries

In [79]:
# Maps each question asked so far to the answer it received; filled in after every call below.
chat_history = {}

In [81]:
# Define the Query
query = "What is a LSTM model?"

# Update the query with the Chat History
# The previous questions and answers are placed in front of the new question.
query_with_context = (
    "This is the chat history: \n "
    f"{chat_history} \n"
    "And this is the current question: \n "
    f"{query}."
)
print(query_with_context)

This is the chat history: 
 {} 
And this is the current question: 
 What is a LSTM model?.


### Print Results
In case you do not want the chat history to be part of the prompt, change ```query=query```.

In [82]:
# Get Results
# The query passed in already contains the chat history.
result = qa_all_at_once(
    folder_path=folder_path,
    chain_type="stuff",
    chunk_size=1000,
    query=query_with_context,
    k=4,
    own_knowledge=True,
    show_pages=False,
)

# Append Queries and Answers to Chat History
# Stored under the plain question, not under the history-extended prompt.
chat_history[query] = result['result']




--------------------------------------------------------------------
[32mAnswer:[0m
--------------------------------------------------------------------

 A Long Short-Term Memory (LSTM) model is a type of Recurrent Neural Network
(RNN) that is capable of learning long-term dependencies. It is composed of four
layers: an input layer, a memory unit, a cell state, and an output layer. The
key component of the LSTM is the cell state, which runs straight down the entire
timesteps with only minor but important interactions. LSTM can add or remove
information from the cell state using several gates, each of which is made of a
sigmoid neural network layer. These sigmoid layers produce output numbers
between 0 and 1, which represent how much information is kept or removed from
the cell state. LSTM models can be trained using an optimization algorithm like
gradient descent on a set of training sequences.

--------------------------------------------------------------------
[32mSources:[0m


In [22]:
# Define Sources
# Not every loader records a "page" entry. Fall back to the CSV "row" index
# and then to 0, so sources from non-PDF files do not raise KeyError and the
# label stays sortable by its number.
sources = [
    (
        os.path.basename(doc.metadata["source"]),
        f"page: {doc.metadata.get('page', doc.metadata.get('row', 0))}",
    )
    for doc in result['source_documents']
]

# Sort sources by filename and page number
sorted_sources = sorted(sources, key=lambda x: (x[0], int(x[1].split(" ")[1])))

# Print Answer and Sources
print("\n--------------------------------------------------------------------")
print(colored("Answer:", "green"))
print("--------------------------------------------------------------------\n")
print(textwrap.fill(result['result'], width=80))
print("\n--------------------------------------------------------------------")
print(colored("Sources:", "green"))
print("--------------------------------------------------------------------\n")
for source in sorted_sources:
    print(source)


NameError: name 'result' is not defined

# ```qa_all_at_once_iterated```
Searches for the answer through all documents. Can also take the chat history into consideration. Does iterated prompts to get better results.

In [11]:
def qa_all_at_once_iterated(folder_path, chain_type, chunk_size, query, k, num_iterations, own_knowledge = False, show_pages=False):
    """Answer one query over all supported files in several rounds and condense the answers.

    All files are loaded, split into overlapping chunks and embedded into one
    FAISS index. The retrieval chain is then run repeatedly, retrieving
    ``k``, ``k - 1``, ... chunks so that the rounds see different context.
    The answers of all rounds are joined and summarised by a summarize chain.

    Args:
        folder_path: Directory to scan. Only ``.pdf``, ``.csv``, ``.docx`` and
            ``.md`` files (matched case-insensitively) are processed.
        chain_type: Forwarded to ``RetrievalQA.from_chain_type``.
        chunk_size: Chunk size for the text splitter; neighbouring chunks
            overlap by half of it.
        query: Question text inserted into the prompt; it may already contain
            the chat history.
        k: Number of chunks retrieved in the first round.
        num_iterations: Number of rounds. At most ``k`` rounds are run,
            because a round must retrieve at least one chunk.
        own_knowledge: If true, the prompt allows the model to fall back on
            its own knowledge; otherwise it is told to use the context only.
        show_pages: If true, print the number of chunks and every chunk.

    Returns:
        tuple: ``(combined_result, condensed_result, combined_sources)``:
        the round answers joined by spaces, their summary, and a dictionary
        mapping ``(source file name, "page: <n>")`` to the text of one
        retrieved chunk from that page.

    Raises:
        ValueError: If the folder yields no chunks at all, so no index can be
            built.
    """

    # Loader class for each supported (lower-case) file extension.
    loaders = {
        '.pdf': PyPDFLoader,
        '.csv': CSVLoader,
        '.docx': Docx2txtLoader,
        '.md': UnstructuredMarkdownLoader,
    }

    def source_of(doc):
        # Not every loader records a "page" entry. Fall back to the CSV "row"
        # index and then to 0, so non-PDF files do not raise KeyError and the
        # label keeps the "page: <int>" shape that the printing cell sorts by.
        meta = doc.metadata
        position = meta.get('page', meta.get('row', 0))
        return (doc.page_content, os.path.basename(meta["source"]), f"page: {position}")

    # The splitter expects an integer overlap, hence floor division.
    chunk_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_size // 2)

    # Read-in and split the documents.
    # Sorted so that the chunks are indexed in a reproducible order.
    all_pages = []
    for file_name in sorted(os.listdir(folder_path)):
        loader_cls = loaders.get(os.path.splitext(file_name)[1].lower())
        if loader_cls is None:
            continue  # Skip files with other extensions

        documents = loader_cls(os.path.join(folder_path, file_name)).load()
        all_pages.extend(chunk_splitter.split_documents(documents))

    if show_pages:
        print(len(all_pages))
        for page in all_pages:
            print(page)

    if not all_pages:
        # Fail with a clear message instead of an obscure error from the index builder.
        raise ValueError(f"No supported documents with extractable text found in {folder_path!r}")

    # Vectorstores
    embeddings = OpenAIEmbeddings()

    db = FAISS.from_documents(all_pages, embeddings)
    # FAISS vectorstores can also be merged and saved to disk

    # Prompts
    if own_knowledge:
        prompt_template = """Use the following pieces of chat history and context to answer the question at the end. \
            If the answer does not become clear from the context, you can also use your own knowledge. \
            If you use your own knowledge, please indicate this clearly in your answer. \

        Context:
        {context}

        {question}
        Helpful answer:"""
    else:
        prompt_template = """Use the following pieces of chat history and context to answer the question at the end. \
            Do NOT use your own knowledge and give the best possible answer from the context.\
        
        Context:
        {context}

        {question}
        Helpful answer:"""

    PROMPT = PromptTemplate(
        template=prompt_template, input_variables=["context","question"]
    )

    chain_type_kwargs = {"prompt": PROMPT}

    # Chains
    # Define summary chain
    summary_splitter = CharacterTextSplitter()
    qa_condense = load_summarize_chain(llm=OpenAI(temperature=0), chain_type="stuff")

    extended_answers = []
    unique_sources = set()

    # k - i has to stay positive, so never run more rounds than k.
    for i in range(min(num_iterations, k)):

        # QA chain that is adaptable
        # Amount of returned documents k-i -> makes it adaptable. Otherwise, it would always return k documents and the output would be the same.
        retriever = db.as_retriever(
            search_type="similarity", search_kwargs={"k": k - i})

        # Define retrieval chain
        qa = RetrievalQA.from_chain_type(
            llm=OpenAI(temperature=0),
            chain_type=chain_type,
            retriever=retriever,
            return_source_documents=True,
            chain_type_kwargs=chain_type_kwargs
        )

        # Run Chain with parameters
        result = qa(query)

        # Append result to extended_answers
        extended_answers.append(result['result'])

        # Add sources to the unique_sources set
        unique_sources.update(source_of(doc) for doc in result['source_documents'])

    # Combine extended_answers
    combined_result = ' '.join(extended_answers)

    # Summarise the combined answers; only the first three pieces of the
    # split text are passed to the summarize chain.
    texts = summary_splitter.split_text(combined_result)
    docs = [Document(page_content=t) for t in texts[:3]]

    condensed_result = str(qa_condense.run(docs))

    # Combine unique_sources. Several chunks can share one page, and only one
    # of them is kept per page; iterating in sorted order makes that choice
    # reproducible between runs.
    combined_sources = {
        (name, page): content
        for content, name, page in sorted(unique_sources)
    }

    return combined_result, condensed_result, combined_sources

### Queries

In [13]:
# Maps each question asked so far to the condensed answer it received; filled in after every call below.
chat_history = {}

In [14]:
# Define the Query
query = "What is a LSTM model?"

# Update the query with the Chat History
# The previous questions and answers are placed in front of the new question.
query_with_context = (
    "This is the chat history: \n "
    f"{chat_history} \n"
    "And this is the current question: \n "
    f"{query}."
)
print(query_with_context)

This is the chat history: 
 {} 
And this is the current question: 
 What is a LSTM model?.


### Print Results
In case you do not want the chat history to be part of the prompt, change ```query=query```.

In [15]:
# Get Results
# The query passed in already contains the chat history.
combined_result, condensed_result, combined_sources = qa_all_at_once_iterated(
    folder_path=folder_path,
    chain_type="stuff",
    chunk_size=500,
    query=query_with_context,
    k=8,
    num_iterations=4,
    own_knowledge=True,
    show_pages=False,
)

# Append Queries and Answers to Chat History
# Only the condensed answer is remembered, stored under the plain question.
chat_history[query] = condensed_result



--------------------------------------------------------------------
[32mCombined Answer:[0m
--------------------------------------------------------------------

 A LSTM (Long Short Term Memory) model is a variation of a recurrent neural
network which can be trained using an optimization algorithm like gradient
descent on a set of training sequences. It was first introduced by Hochreiter
and Schmidhuber in 1997 as an updated version of RNN for addressing the problems
like vanishing gradient and later was simplified or refined. LSTM is capable of
learning long term dependencies and is capable of remembering for a long period
of time using a memory unit.  A LSTM (Long Short Term Memory) model is a
variation of a recurrent neural network which can be trained using an
optimization algorithm like gradient descent on a set of training sequences. It
was first introduced by Hochreiter and Schmidhuber in 1997 as an updated version
of RNN for addressing the problems like vanishing gradient a

In [21]:
# Print Answer and Sources
# Both answers share the same layout: a ruled green heading followed by the text wrapped to 80 columns.
for heading, answer_text in (
    ("Combined Answer:", combined_result),
    ("Condensed Answer:", condensed_result),
):
    print("\n--------------------------------------------------------------------")
    print(colored(heading, "green"))
    print("--------------------------------------------------------------------\n")
    print(textwrap.fill(answer_text, width=80))

print("\n--------------------------------------------------------------------")
print(colored("Combined Sources:", "green"))
print("--------------------------------------------------------------------\n")


# Print Sources and Sort them first
# Sort key: file name first, then the number inside the "page: <n>" label.
def _source_order(item):
    file_label, page_label = item[0][0], item[0][1]
    return (file_label, int(page_label.split(" ")[1]))


for source_key, source_element in sorted(combined_sources.items(), key=_source_order):
    print(f'{source_key}\n{source_element}\n')




--------------------------------------------------------------------
[32mCombined Answer:[0m
--------------------------------------------------------------------

 A LSTM (Long Short Term Memory) model is a variation of a recurrent neural
network which can be trained using an optimization algorithm like gradient
descent on a set of training sequences. It was first introduced by Hochreiter
and Schmidhuber in 1997 as an updated version of RNN for addressing the problems
like vanishing gradient and later was simplified or refined. LSTM is capable of
learning long term dependencies and is capable of remembering for a long period
of time using a memory unit.  A LSTM (Long Short Term Memory) model is a
variation of a recurrent neural network which can be trained using an
optimization algorithm like gradient descent on a set of training sequences. It
was first introduced by Hochreiter and Schmidhuber in 1997 as an updated version
of RNN for addressing the problems like vanishing gradient a