# Imports

In [None]:
%pip install langchain openai chromadb pypdf tiktoken faiss-cpu Flask unstructured Cython pdfminer.six termcolor prettytable tqdm


In [148]:
# Document loaders
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders import Docx2txtLoader
from langchain.document_loaders import UnstructuredMarkdownLoader
# Splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
# Embeddings and models
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.vectorstores import FAISS, Chroma
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
# Chains
from langchain.chains import RetrievalQA
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.chains import SimpleSequentialChain
from langchain.prompts.prompt import PromptTemplate
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains import ConversationalRetrievalChain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.chains.question_answering import load_qa_chain
# Utils
import os
from termcolor import colored
import textwrap
from prettytable import PrettyTable
import ast
from tqdm.auto import tqdm


# Import API Key
# The key lives in the local `apikey` module so that the secret itself never
# appears in the notebook. A key that was ever pasted here in clear text should
# be treated as leaked and revoked.
from apikey import API_KEY
os.environ["OPENAI_API_KEY"] = API_KEY

  from .autonotebook import tqdm as notebook_tqdm


# Folder Path Definition

In [5]:
# Directory whose documents are queried by the functions in this notebook.
# NOTE: this is an absolute path on one specific machine; adjust it before
# running the notebook elsewhere.
folder_path = r"C:\Users\cesar\OneDrive\Desktop\test2"

# ```qa_single_file```
This iterates over each file separately, asking each one the same question. Good for a literature overview.

In [103]:
def qa_single_file(folder_path, chain_type, chunk_size, query, k, own_knowledge = False, show_pages=False):
    """Ask every supported file in a folder the same multi-key question.

    Each file is loaded, split into overlapping chunks, embedded into its own
    FAISS index and queried through a ``RetrievalQA`` chain. The prompt asks
    the model to reply with a dictionary whose keys are the keys named in
    ``query``; that reply becomes the single row of the file's table.

    Args:
        folder_path: Directory to scan. Files ending in ``.pdf``, ``.csv``,
            ``.docx`` or ``.md`` (matched case-insensitively) are processed;
            every other entry is skipped.
        chain_type: Passed through to ``RetrievalQA.from_chain_type``.
        chunk_size: Chunk size handed to the text splitter. The chunk overlap
            is half of it, rounded down.
        query: Text substituted for ``{question}`` in the prompt.
        k: Number of chunks the retriever returns.
        own_knowledge: If true, the prompt allows the model to fall back on
            its own knowledge; otherwise the prompt forbids it.
        show_pages: If true, print the number of chunks and every chunk.

    Returns:
        A dict mapping ``(file_name, sources)`` to a ``PrettyTable`` with the
        columns ``Filename``, ``Sources`` and one column per answer key.
        ``sources`` is a tuple of ``(source file name, "page: N")`` pairs.
        If the model's reply cannot be parsed as a dict literal, the raw reply
        is stored under a single ``answer`` column instead.
    """
    # Output tables, keyed by (file name, retrieved sources).
    tables = {}

    # Wraptext function for prettytable
    def wrap_text(text, width=40):
        return "\n".join(textwrap.wrap(text, width=width))

    # The prompt text depends only on own_knowledge, so choose it once instead
    # of once per file.
    if own_knowledge:
        prompt_template = """Use the following pieces of context to find an answer to all the keys given in the question. \
                Give your answer in the form of a dictionary with the keys given in the question. \
                If the answer does not become clear from the context, you can also use your own knowledge. \
                If you use your own knowledge, please indicate this clearly in your answer. \

            Context:
            {context}

            {question}
            Helpful answer:"""
    else:
        prompt_template = """Use the following pieces of context to find an answer to all the keys given in the question. \
                Give your answer in the form of a dictionary with the keys given in the question. \
                Do NOT use your own knowledge and give the best possible answer from the context.\
            
            Context:
            {context}

            {question}
            Helpful answer:"""

    # Read in and split the documents: loop over all files in the folder.
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)

        # Compare extensions case-insensitively so "Paper.PDF" is not skipped.
        lowered_name = file_name.lower()
        if lowered_name.endswith('.pdf'):
            loader = PyPDFLoader(file_path)
        elif lowered_name.endswith('.csv'):
            loader = CSVLoader(file_path)
        elif lowered_name.endswith('.docx'):
            loader = Docx2txtLoader(file_path)
        elif lowered_name.endswith('.md'):
            loader = UnstructuredMarkdownLoader(file_path)
        else:
            continue  # Skip files with other extensions

        documents = loader.load()
        # Integer division: the overlap is a character count, not a float.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_size // 2)
        all_pages = text_splitter.split_documents(documents)

        if show_pages:
            print(len(all_pages))
            for page in all_pages:
                print(page)

        # Vectorstore: one FAISS index per file.
        embeddings = OpenAIEmbeddings()
        db = FAISS.from_documents(all_pages, embeddings)
        # FAISS vectorstores can also be merged and saved to disk

        # Retriever: k is the number of returned chunks.
        retriever = db.as_retriever(
            search_type="similarity", search_kwargs={"k": k})

        # Chain
        PROMPT = PromptTemplate(
            template=prompt_template, input_variables=["context","question"]
        )
        chain_type_kwargs = {"prompt": PROMPT}

        qa = RetrievalQA.from_chain_type(
            llm=OpenAI(temperature=0),
            chain_type=chain_type,
            retriever=retriever,
            return_source_documents=True,
            chain_type_kwargs=chain_type_kwargs
        )

        # Run Chain with parameters
        result = qa(query)

        # The prompt asks for a dictionary, but nothing forces the model to
        # produce a valid literal. Fall back to the raw reply rather than
        # aborting the whole folder because of one malformed answer.
        raw_answer = result['result']
        try:
            result_dict = ast.literal_eval(raw_answer.strip())
        except (ValueError, TypeError, SyntaxError):
            result_dict = None
        if not isinstance(result_dict, dict):
            result_dict = {"answer": raw_answer}

        # Get Sources. Not every loader is guaranteed to record a "page" in
        # the chunk metadata, so look it up defensively.
        sources = [
            (
                os.path.basename(doc.metadata.get("source", file_path)),
                f"page: {doc.metadata.get('page', 'n/a')}",
            )
            for doc in result['source_documents']
        ]

        # Append result to output tables
        table_key = (file_name, tuple(sources))
        if table_key not in tables:
            table_columns = ["Filename", "Sources"] + [str(key) for key in result_dict]
            tables[table_key] = PrettyTable(table_columns)
        sources_text = ', '.join(f'{name} {page}' for name, page in sources)
        table_row = [wrap_text(file_name), wrap_text(sources_text)]
        table_row += [wrap_text(str(value)) for value in result_dict.values()]
        tables[table_key].add_row(table_row)

    # Return output tables
    return tables

### Query

In [84]:
# Comma-separated keys; the qa_single_file prompt asks the model to answer
# each of them as one entry of a dictionary.
query = "model specification, model estimation, model evaluation, model deployment, benchmark models"

### Print Results
Use smaller chunk sizes to capture more distinct parts of the document in one prompt. Use at least as many chunks (k) as there are keys in the query.

In [106]:
# Build one table per file by asking each file the multi-key query.
output_table = qa_single_file(
    folder_path=folder_path,
    chain_type="stuff",
    chunk_size=500,
    query=query,
    k=5,
    own_knowledge=True,
    show_pages=False,
)

# Show each table under a green file-name header, followed by a blank line.
for table_key, table_value in output_table.items():
    print(colored(f"Filename: {table_key[0]}", "green"))
    print(table_value, end="\n\n")

[32mFilename: file1.pdf[0m
+-----------+----------------------------------------+---------------------------------------+------------------------------------------+------------------------------------------+------------------------------------------+-----------------------------------------+
|  Filename |                Sources                 |          model specification          |             model estimation             |             model evaluation             |             model deployment             |             benchmark models            |
+-----------+----------------------------------------+---------------------------------------+------------------------------------------+------------------------------------------+------------------------------------------+-----------------------------------------+
| file1.pdf | file1.pdf page: 8, file1.pdf page: 13, |    Support vector regression (SVR),   | The weights that were generated by these | Validation is an important step tha

# ```qa_single_file_iterated```
This iterates over each file **and key** separately to be even more accurate with the individual answers, asking each file the same questions. Good for a literature overview.

In [175]:
def qa_single_file_iterated(folder_path, chain_type, chunk_size, queries, k, own_knowledge = False, show_pages=False):
    """Ask every supported file in a folder each query separately.

    Each file is loaded, split into overlapping chunks, embedded into its own
    FAISS index and then queried once per entry of ``queries`` through a
    ``RetrievalQA`` chain. The answers for one file end up side by side in a
    single-row table with one column per query.

    Args:
        folder_path: Directory to scan. Files ending in ``.pdf``, ``.csv``,
            ``.docx`` or ``.md`` (matched case-insensitively) are processed;
            every other entry is skipped.
        chain_type: Passed through to ``RetrievalQA.from_chain_type``.
        chunk_size: Chunk size handed to the text splitter. The chunk overlap
            is half of it, rounded down.
        queries: Iterable of topics; each one is substituted for
            ``{question}`` in the prompt and used as a column name.
        k: Number of chunks the retriever returns per query.
        own_knowledge: If true, the prompt allows the model to fall back on
            its own knowledge; otherwise the prompt forbids it.
        show_pages: If true, print the number of chunks and every chunk.

    Returns:
        A dict mapping each processed file name to a ``PrettyTable`` with one
        column per query and one row. Each cell holds the answer followed by
        the text, file name and page of the retrieved chunks. A file gets no
        entry when ``queries`` is empty.
    """
    # Output tables, keyed by file name.
    tables = {}

    # Wraptext function for prettytable
    def wrap_text(text, width=80):
        return textwrap.fill(text, width=width)

    # The prompt text depends only on own_knowledge, so choose it once instead
    # of once per file.
    if own_knowledge:
        prompt_template = """Use the following pieces of context to find an answer to the given question. \
                If the answer does not become clear from the context, you can also use your own knowledge. \
                If you use your own knowledge, please indicate this clearly in your answer. \

            Context:
            {context}

            Question:
            Based on the context, how does the context define and apply: {question}?
            Helpful answer:"""
    else:
        prompt_template = """Use the following pieces of context to find an answer to the given question. \
                Do NOT use your own knowledge and give the best possible answer from the context.\
            
            Context:
            {context}

            Question:
            Based on the context, how does the context define and apply: {question}?
            Helpful answer:"""

    # Read in and split the documents: loop over all files in the folder.
    for file_name in tqdm(os.listdir(folder_path), desc="Processing files",  colour="green", leave=False):
        file_path = os.path.join(folder_path, file_name)

        # Compare extensions case-insensitively so "Paper.PDF" is not skipped.
        lowered_name = file_name.lower()
        if lowered_name.endswith('.pdf'):
            loader = PyPDFLoader(file_path)
        elif lowered_name.endswith('.csv'):
            loader = CSVLoader(file_path)
        elif lowered_name.endswith('.docx'):
            loader = Docx2txtLoader(file_path)
        elif lowered_name.endswith('.md'):
            loader = UnstructuredMarkdownLoader(file_path)
        else:
            continue  # Skip files with other extensions

        documents = loader.load()
        # Integer division: the overlap is a character count, not a float.
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_size // 2)
        all_pages = text_splitter.split_documents(documents)

        if show_pages:
            print(len(all_pages))
            for page in all_pages:
                print(page)

        # Vectorstore: one FAISS index per file.
        embeddings = OpenAIEmbeddings()
        db = FAISS.from_documents(all_pages, embeddings)
        # FAISS vectorstores can also be merged and saved to disk

        # Retriever: k is the number of returned chunks.
        retriever = db.as_retriever(
            search_type="similarity", search_kwargs={"k": k})

        # Chain
        PROMPT = PromptTemplate(
            template=prompt_template, input_variables=["context","question"]
        )
        chain_type_kwargs = {"prompt": PROMPT}

        qa = RetrievalQA.from_chain_type(
            llm=OpenAI(temperature=0),
            chain_type=chain_type,
            retriever=retriever,
            return_source_documents=True,
            chain_type_kwargs=chain_type_kwargs
        )

        # Collect one formatted cell per query. A dict keeps the first-seen
        # column order and lets a repeated query overwrite its earlier cell.
        cells = {}
        for query in queries:
            # Run Chain with parameters
            result = qa(query)

            # Get Sources. The chunk text is used as a plain string (wrapping
            # it in braces would build a set and print its repr), and "page"
            # is looked up defensively because not every loader is guaranteed
            # to record it.
            sources = [
                (
                    doc.page_content,
                    os.path.basename(doc.metadata.get("source", file_path)),
                    f"page: {doc.metadata.get('page', 'n/a')}",
                )
                for doc in result['source_documents']
            ]

            # Add sources to the result value
            sources_text = '\n'.join(f'{content} {name} {page}' for content, name, page in sources)
            result_with_sources = colored(wrap_text(str(result['result'])), "green") + "\n Sources:" + wrap_text(sources_text)
            cells[query] = wrap_text(result_with_sources)

        # Build the table through PrettyTable's public API once all answers
        # for this file are known.
        if cells:
            table = PrettyTable(list(cells))
            table.add_row(list(cells.values()))
            tables[file_name] = table

    # Return output tables
    return tables

### Query

In [111]:
# One entry per topic; qa_single_file_iterated runs a separate retrieval query
# for each entry and gives it its own table column.
queries = ["model specification", "model estimation", "model evaluation", "model deployment", "benchmark models"]

### Print Results
Use smaller chunk sizes to capture more distinct parts of the document in one prompt. Use at least as many chunks (k) as there are keys in the query.

In [176]:
# Build one table per file, with one column per entry of `queries`.
output_table = qa_single_file_iterated(
    folder_path=folder_path,
    chain_type="stuff",
    chunk_size=500,
    queries=queries,
    k=8,
    own_knowledge=False,
    show_pages=False,
)

# Show each table under a green file-name header, followed by a blank line.
for table_key, table_value in output_table.items():
    print(colored(f"Filename: {table_key}", "green"))
    print(table_value, end="\n\n")

                                                               3.58s/it]

[32mFilename: file1.pdf[0m
+----------------------------------------------------------------------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------------------+----------------------------------------------------------------------------------+
|                               model specification                                |                                 model estimation                                 |                                 model evaluation                                 |                                 model deployment                                 |                                 benchmark models                                 |
+----------------------------------------------------------------------------------+-----------------------------------------------------



# ```qa_all_at_once```
Searches for the answer through all documents. Can also take the chat history into consideration.

In [77]:
def qa_all_at_once(folder_path, chain_type, chunk_size, query, k, own_knowledge = False, show_pages=False):
    """Answer one question from a single index built over all files in a folder.

    All supported files are loaded, split into overlapping chunks and embedded
    into one shared FAISS index, which is then queried once through a
    ``RetrievalQA`` chain.

    Args:
        folder_path: Directory to scan. Files ending in ``.pdf``, ``.csv``,
            ``.docx`` or ``.md`` (matched case-insensitively) are loaded;
            every other entry is skipped.
        chain_type: Passed through to ``RetrievalQA.from_chain_type``.
        chunk_size: Chunk size handed to the text splitter. The chunk overlap
            is half of it, rounded down.
        query: Text substituted for ``{question}`` in the prompt. The caller
            may embed a chat history in it.
        k: Number of chunks the retriever returns.
        own_knowledge: If true, the prompt allows the model to fall back on
            its own knowledge; otherwise the prompt forbids it.
        show_pages: If true, print the number of chunks and every chunk.

    Returns:
        The value returned by calling the chain; the notebook reads its
        ``'result'`` and ``'source_documents'`` entries.

    Raises:
        ValueError: If the folder yields no documents or no text chunks, so
            there is nothing to index.
    """
    # Read in the documents from every supported file.
    documents = []
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)

        # Compare extensions case-insensitively so "Paper.PDF" is not skipped.
        lowered_name = file_name.lower()
        if lowered_name.endswith('.pdf'):
            loader = PyPDFLoader(file_path)
        elif lowered_name.endswith('.csv'):
            loader = CSVLoader(file_path)
        elif lowered_name.endswith('.docx'):
            loader = Docx2txtLoader(file_path)
        elif lowered_name.endswith('.md'):
            loader = UnstructuredMarkdownLoader(file_path)
        else:
            continue  # Skip files with other extensions

        documents.extend(loader.load())

    # Fail early with a clear message instead of letting the index build
    # stumble over an empty corpus.
    if not documents:
        raise ValueError(
            f"No supported documents (.pdf, .csv, .docx, .md) found in {folder_path!r}"
        )

    # Split everything with one splitter; each document is still split on its
    # own. Integer division: the overlap is a character count, not a float.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_size // 2)
    all_pages = text_splitter.split_documents(documents)

    if show_pages:
        print(len(all_pages))
        for page in all_pages:
            print(page)

    if not all_pages:
        raise ValueError(f"The documents in {folder_path!r} produced no text chunks to index")

    # Vectorstore: a single FAISS index over all chunks.
    embeddings = OpenAIEmbeddings()
    db = FAISS.from_documents(all_pages, embeddings)
    # FAISS vectorstores can also be merged and saved to disk

    # Retriever: k is the number of returned chunks.
    retriever = db.as_retriever(
        search_type="similarity", search_kwargs={"k": k})

    # Prompt
    if own_knowledge:
        prompt_template = """Use the following pieces of chat history and context to answer the question at the end. \
            If the answer does not become clear from the context, you can also use your own knowledge. \
            If you use your own knowledge, please indicate this clearly in your answer. \

        Context:
        {context}

        {question}
        Helpful answer:"""
    else:
        prompt_template = """Use the following pieces of chat history and context to answer the question at the end. \
            Do NOT use your own knowledge and give the best possible answer from the context.\
        
        Context:
        {context}

        {question}
        Helpful answer:"""

    PROMPT = PromptTemplate(
        template=prompt_template, input_variables=["context","question"]
    )
    chain_type_kwargs = {"prompt": PROMPT}

    # Chain
    qa = RetrievalQA.from_chain_type(
        llm=OpenAI(temperature=0),
        chain_type=chain_type,
        retriever=retriever,
        return_source_documents=True,
        chain_type_kwargs=chain_type_kwargs
    )

    # Run Chain with parameters
    return qa(query)

### Queries

In [79]:
# Maps each question asked so far to the answer it received; it is embedded
# into the next prompt and extended after each qa_all_at_once call.
chat_history = {}

In [81]:
# Question for this turn.
query = "What is a LSTM model?"

# Prefix the question with everything asked and answered so far, so the model
# sees the chat history as part of the prompt.
query_with_context = (
    f"This is the chat history: \n {chat_history!s} \n"
    f"And this is the current question: \n {query}."
)
print(query_with_context)

This is the chat history: 
 {} 
And this is the current question: 
 What is a LSTM model?.


### Print Results
If you do not want the chat history to be part of the prompt, pass ```query=query``` instead of ```query=query_with_context```.

In [82]:
# Get Results
result = qa_all_at_once(folder_path=folder_path, 
            chain_type="stuff",
            chunk_size=1000, 
            query=query_with_context, 
            k=4, 
            own_knowledge = True, 
            show_pages=False)

# Append Queries and Answers to Chat History
chat_history[query] = result['result']

# Define Sources
# qa_all_at_once also indexes CSV, DOCX and Markdown files, and their chunks
# are not guaranteed to carry a "page" entry, so look it up defensively
# instead of failing with a KeyError.
sources = [(os.path.basename(doc.metadata["source"]), f"page: {doc.metadata.get('page', 'n/a')}") for doc in result['source_documents']]

# Print Answer and Sources
print("\n--------------------------------------------------------------------")
print(colored("Answer:", "green"))
print("--------------------------------------------------------------------\n")
print(textwrap.fill(result['result'], width=80))
print("\n--------------------------------------------------------------------")
print(colored("Sources:", "green"))
print("--------------------------------------------------------------------\n")
for source in sources:
    print(source)



--------------------------------------------------------------------
[32mAnswer:[0m
--------------------------------------------------------------------

 A Long Short-Term Memory (LSTM) model is a type of Recurrent Neural Network
(RNN) that is capable of learning long-term dependencies. It is composed of four
layers: an input layer, a memory unit, a cell state, and an output layer. The
key component of the LSTM is the cell state, which runs straight down the entire
timesteps with only minor but important interactions. LSTM can add or remove
information from the cell state using several gates, each of which is made of a
sigmoid neural network layer. These sigmoid layers produce output numbers
between 0 and 1, which represent how much information is kept or removed from
the cell state. LSTM models can be trained using an optimization algorithm like
gradient descent on a set of training sequences.

--------------------------------------------------------------------
[32mSources:[0m
