In [81]:
from langchain.llms import CTransformers
from langchain.chains import QAGenerationChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.summarize import load_summarize_chain
from langchain.chains import RetrievalQA
import os 
import json
import time
import uvicorn
from PyPDF2 import PdfReader
import csv
import pandas as pd 
import openai
import os
from langchain.chat_models import ChatOpenAI
from langchain.chat_models import AzureChatOpenAI
from langchain.embeddings import OpenAIEmbeddings

In [82]:
# I first tried loading a local Mistral model, but it raised token-related errors, so the loader below is disabled.

#def load_llm():
    # Load the locally downloaded model here
 #   llm = CTransformers(
 #       model = "/home/user/Downloads/mistral-7b-instruct-v0.1.Q4_K_S.gguf",
 #       model_type="mistral",
 #       max_new_tokens = 1048,
 #       temperature = 0.3,
 #       context_length= 200
 #   )
 #   return llm


In [83]:
# Azure OpenAI connection settings.
# Real values must come from the process environment (shell export, .env
# loader, secrets manager) rather than being pasted into the notebook.
# setdefault() keeps any value that is already exported and only falls back to
# the empty placeholder, so running this cell no longer wipes out working
# credentials.
for _setting in (
    "OPENAI_API_TYPE",
    "OPENAI_API_BASE",
    "OPENAI_API_KEY",
    "OPENAI_API_VERSION",
):
    os.environ.setdefault(_setting, "")

# Mirror the environment values onto the openai module attributes.
openai.api_type     = os.environ["OPENAI_API_TYPE"]
openai.api_base     = os.environ["OPENAI_API_BASE"]
openai.api_key      = os.environ["OPENAI_API_KEY"]
openai.api_version  = os.environ["OPENAI_API_VERSION"]


In [84]:
# Chat model passed to both the question-generation chain and the RetrievalQA chain.
llm = AzureChatOpenAI(
    deployment_name='tnhgpt3',
    temperature=0,
    verbose=False,
)


In [93]:
###### The docs need cleaning first: they contain unnecessary punctuation, which increases execution time.
import string
def clean_document(doc: "Document") -> "Document":
    """Return a cleaned copy of ``doc``.

    Every character listed in ``string.punctuation`` and ``string.digits`` is
    deleted from ``doc.page_content`` and each run of whitespace is collapsed
    to a single space (leading/trailing whitespace is removed).

    Args:
        doc: Object exposing ``page_content`` (str) and, optionally, a
            ``metadata`` mapping.

    Returns:
        A new ``Document`` holding the cleaned text. The source metadata is
        carried over as a shallow copy so that information attached to the
        input document is not lost by cleaning.

    Note:
        Only ASCII punctuation is removed; non-ASCII marks such as the
        typographic apostrophe are left in place.
    """
    # Deletion-only table: the third argument lists characters mapped to None.
    translator = str.maketrans('', '', string.punctuation + string.digits)
    stripped = doc.page_content.translate(translator)

    # split() without arguments splits on any whitespace run and drops empties.
    normalised = ' '.join(stripped.split())

    # Copy so later edits to the cleaned document's metadata cannot leak back.
    metadata = dict(getattr(doc, 'metadata', None) or {})

    return Document(page_content=normalised, metadata=metadata)

In [94]:
######################################### Load the PDF file to process 

def file_processing(file_path, ques_chunk_size=1000, ques_chunk_overlap=50,
                    ans_chunk_size=500, ans_chunk_overlap=100):
    """Load a PDF and build the two cleaned chunk lists used by the notebook.

    The text of all pages is merged, split into large chunks for question
    generation, and those chunks are split again into smaller chunks for
    answer retrieval. Every chunk is passed through ``clean_document`` before
    being returned.

    Args:
        file_path: Path of the PDF handed to ``PyPDFLoader``.
        ques_chunk_size: ``chunk_size`` of the question-generation splitter.
        ques_chunk_overlap: ``chunk_overlap`` of the question-generation splitter.
        ans_chunk_size: ``chunk_size`` of the answer-generation splitter.
        ans_chunk_overlap: ``chunk_overlap`` of the answer-generation splitter.

    Returns:
        Tuple ``(question_documents, answer_documents)``, each a list of
        cleaned ``Document`` objects.
    """
    pages = PyPDFLoader(file_path).load()

    # Join with a newline so the last word of one page does not fuse with the
    # first word of the next (plain concatenation glued them together).
    full_text = '\n'.join(page.page_content for page in pages)

    ques_splitter = RecursiveCharacterTextSplitter(
        chunk_size=ques_chunk_size,
        chunk_overlap=ques_chunk_overlap,
    )
    ques_docs = [
        Document(page_content=chunk)
        for chunk in ques_splitter.split_text(full_text)
    ]

    # The answer chunks are derived from the question chunks, not from the raw
    # text, so each small chunk lies inside exactly one large chunk.
    ans_splitter = RecursiveCharacterTextSplitter(
        chunk_size=ans_chunk_size,
        chunk_overlap=ans_chunk_overlap,
    )
    ans_docs = ans_splitter.split_documents(ques_docs)

    # Cleaning happens after splitting, so the chunk sizes above are measured
    # on the uncleaned text.
    cleaned_ques_docs = [clean_document(doc) for doc in ques_docs]
    cleaned_ans_docs = [clean_document(doc) for doc in ans_docs]

    return cleaned_ques_docs, cleaned_ans_docs

In [95]:
############## Split the PDF into the two cleaned chunk lists: question chunks and answer chunks.
document_ques_gen, document_answer_gen = file_processing(
    file_path='/home/user/Downloads/docs-pdf/howto-regex.pdf',
)

In [96]:
# Preview the first cleaned question-generation chunk.
document_ques_gen[0]

Document(page_content='Regular Expression HOWTO Release Guido van Rossum and the Python development team April Python Software Foundation Email docspythonorg Contents Introduction Simple Patterns MatchingCharacters RepeatingThings Using Regular Expressions CompilingRegularExpressions TheBackslashPlague PerformingMatches ModuleLevelFunctions CompilationFlags')

In [97]:
# Prompt passed to the chain as `question_prompt`; {text} is the placeholder
# for the chunk the questions are generated from.
prompt_template = """
    You are an expert at creating questions based on  documentation.
    You do this by asking questions about the text below:

    ------------
    {text}
    ------------

    Create  questions that will prepare the end users.
    Make sure not to lose any important information.

    QUESTIONS:
    """

PROMPT_QUESTIONS = PromptTemplate(
    input_variables=["text"],
    template=prompt_template,
)

In [98]:
# Prompt passed to the chain as `refine_prompt`; {existing_answer} carries the
# questions produced so far and {text} the additional context.
refine_template = """
    You are an expert at creating practice questions based on  material and documentation.
    We have received some practice questions to a certain extent: {existing_answer}.
    We have the option to refine the existing questions or add new ones.
    (only if necessary) with some more context below.
    ------------
    {text}
    ------------

    Given the new context, refine the original questions in English.
    If the context is not helpful, please provide the original questions.
    QUESTIONS:
    """

REFINE_PROMPT_QUESTIONS = PromptTemplate(
    template=refine_template,
    input_variables=["existing_answer", "text"],
)

In [99]:
# Question-generation chain: a "refine" summarize chain driven by the two
# custom prompts. verbose=True prints every formatted prompt while it runs.
ques_gen_chain = load_summarize_chain(
    llm=llm,
    chain_type="refine",
    question_prompt=PROMPT_QUESTIONS,
    refine_prompt=REFINE_PROMPT_QUESTIONS,
    verbose=True,
)

In [100]:
######### Generate the questions by running the refine chain over the question chunks, last chunk first.
ques = ques_gen_chain.run(list(reversed(document_ques_gen)))



[1m> Entering new RefineDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
    You are an expert at creating questions based on  documentation.
    You do this by asking questions about the text below:

    ------------
    wereunclearorProblemsyouencounteredthatweren’tcoveredhere Ifsopleasesendsuggestionsforimprovements totheauthor The most complete book on regular expressions is almost certainly Jeffrey Friedl’s Mastering Regular Expressions published by O’Reilly Unfortunately it exclusively concentrates on Perl and Java’s flavours of regular expressions anddoesn’tcontainanyPythonmaterialatallsoitwon’tbeusefulasareferenceforprogramminginPython The firsteditioncoveredPython’snowremoved regexmodulewhichwon’thelpyoumuch Considercheckingitout fromyourlibrary
    ------------

    Create  questions that will prepare the end users.
    Make sure not to lose any important information.

    QUESTIONS:
    [0m

[1m> Finished chai

In [101]:
############# Convert the raw question text into a list of questions, one per line.
ques_list = ques.split("\n")

# Keep only lines that end like a sentence ('?' or '.'). Each line is stripped
# first so that trailing whitespace (e.g. "What is X? " or a stray "\r") does
# not cause a valid question to be silently dropped.
filtered_ques_list = [
    line.strip()
    for line in ques_list
    if line.strip().endswith(('?', '.'))
]


In [102]:
############ Embed the answer chunks with a Hugging Face model (any embedding model can be used) and index them in FAISS.

# all-mpnet-base-v2 is a general sentence-transformers model, not a BGE model.
# HuggingFaceBgeEmbeddings prepends a BGE-specific retrieval instruction to
# every query, so the generic HuggingFaceEmbeddings wrapper is used instead:
# queries and documents are then embedded the same way.
from langchain.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
vector_store = FAISS.from_documents(document_answer_gen[::-1], embeddings)

In [103]:

############ RetrievalQA chain: answers each question using chunks retrieved from the FAISS index.

answer_generation_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vector_store.as_retriever(),
    chain_type="stuff",
)

In [104]:
# Run the RetrievalQA chain once per filtered question, keeping the answers in question order.
all_anwers = list(map(answer_generation_chain.run, filtered_ques_list))

In [110]:
# Pair each answer with the question that produced it. The answers were
# generated from filtered_ques_list, so that list (not the unfiltered
# ques_list, which may also hold blank or non-question lines) must supply the
# "Questions" column; otherwise the columns differ in length or are misaligned.
Q_A_df = pd.DataFrame({"Questions": filtered_ques_list, "Answers": all_anwers})

In [111]:
# Display the question/answer table.
Q_A_df

Unnamed: 0,Questions,Answers
0,1. What is the title of the most comprehensive...,The title of the most comprehensive book on re...
1,"2. Who is the author of the book ""Mastering Re...","The author of the book ""Mastering Regular Expr..."
2,3. Which programming languages' flavors of reg...,The book exclusively concentrates on Perl and ...
3,4. Is there any Python material included in th...,"No, the book does not contain any Python mater..."
4,5. What is the name of the Python module that ...,The name of the Python module that was covered...
5,6. Where can you find a copy of the book if yo...,"You can try checking out a copy of the book ""M..."
6,7. What is the effect of the VERBOSE flag in r...,The VERBOSE flag in regular expressions allows...
7,8. How does the VERBOSE flag affect whitespace...,The given context does not provide information...
8,9. Can comments be included in regular express...,"Yes, comments can be included in regular expre..."
9,10. How can the VERBOSE flag be used to format...,The VERBOSE flag in regular expressions allows...
