In [1]:
from tika import parser  # using tika lib for parsing the documents
from langchain.text_splitter import RecursiveCharacterTextSplitter  # using langchain lib for text splitting

In [2]:
parsed_pdf = parser.from_file("C://Users//bpandhawale//Downloads//Test_document_usecase-1.pdf")
data = parsed_pdf['content'] 

# splitting the extracted text into paragraphs
splitter = RecursiveCharacterTextSplitter()
paragraphs = splitter.split_text(text=data)

# cleaning the extracted paragraphs by removing newlines and double periods
def clean_text(text):
    cleaned_string = text.replace("\n","").replace('..',"")
    return cleaned_string
cleaned_paragraphs = [clean_text(para) for para in paragraphs]

In [3]:
from sentence_transformers import SentenceTransformer,CrossEncoder
import faiss

model = SentenceTransformer('msmarco-distilbert-base-v4')  #load a pre-trained sentence embedding model
#embedding
para_embeddings = model.encode(cleaned_paragraphs)
print(f"para_embed: {para_embeddings.shape}")
d = 768                                                    #specify the dimension
k = 10                                                     #number of nearest neighbors to search
#indexing of the embeddings
index = faiss.IndexFlatIP(d)
index.add(para_embeddings)

  from .autonotebook import tqdm as notebook_tqdm


para_embed: (6, 768)


In [4]:
#counting tokens
import tiktoken
def num_tokens_from_string(string, encoding_name):
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [5]:
import numpy as np

import openai

openai.api_key = "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"
model_vector = SentenceTransformer('msmarco-distilbert-base-v4')

model_encoder = CrossEncoder("cross-encoder/stsb-roberta-large")

In [7]:
session_data = {}

while True:        
    layout = ""
    print("Layout 1: Heading:  Points: ")
    print("Layout 2: Heading:  Sub-Heading: Points: ")
    
    user_id = input("Enter a user ID: ")  # Prompt the user to enter a user ID
    if user_id not in session_data:
       session_data[user_id] = {}  # Create a new session data dictionary for the user

    x = int(input("Enter Layout 1 or 2 ."))
    if x == 1:
        layout = '''Generate HTML document with following using html tags - 

                    Heading: topic name 
                    Points: Explain action items in two three sentences as points.
                    Keep each point of different color.
                    Use bullets to mark the points.
                    Keep heading center-aligned.
                    Make a grid of four and then place points inside it.

                '''
        
    elif x == 2:
        layout = '''Generate HTML document with following using html tags - 

                    Heading: topic name 
                    Subheading : Action Item name
                    Points: Explaination of that action items as points.
                    Keep each point of black color. Place points in square left-aligned.
                    Keep heading center-aligned.
                    
                '''
    else:
        quit()

    query = f"""List 4 action items from context with explaination."""

    query_vector = model_vector.encode([query])
    D,I = index.search(query_vector, k)
    relevant_indexes=I.tolist()[0]

    relevant_paras = []
    for i in relevant_indexes:
        relevant_paras.append(cleaned_paragraphs[i])

    query_paras_combined = [[query, para] for para in relevant_paras]
    similarity_scores = model_encoder.predict(query_paras_combined)
    sim_scores_argsort = list(reversed(np.argsort(similarity_scores)))

    relevant_context = ""
    threshold = 1       
    for idx in sim_scores_argsort:
        if threshold > 0 and num_tokens_from_string(relevant_context, "p50k_base")+num_tokens_from_string(relevant_paras[idx], "p50k_base")<2700:
            relevant_context += relevant_paras[idx] + "\n\n"
            threshold = threshold - 1
        else:
            break

    #generate an input prompt
    refined_prompt = f"""{query} {layout}
    Contexts:{relevant_context}"""

    print(f"Refined Prompt: {refined_prompt}")

    #Feed input prompt to openai model
    response = openai.Completion.create(    
    engine="text-davinci-003",
    prompt=refined_prompt,
    temperature=0.9,
    max_tokens=1024,
    top_p=1
    )

    print(f"""Response - {response["choices"][0]["text"]}""")
    
     # Store session data
    session_data[user_id]['layout'] = layout
    session_data[user_id]['prompt'] = refined_prompt
    
    # Ask the user if they want to try again
    i = input("Try again? Yes or No: ")
    if i.lower() != "yes":
       
        break  # Exit the loop if the user's input is not "Yes"   
    

Layout 1: Heading:  Points: 
Layout 2: Heading:  Sub-Heading: Points: 
Enter a user ID: 1
Enter 1 or 2 .1
Refined Prompt: List 4 action items from context with explaination. Generate HTML document with following using html tags - 

                    Heading: topic name 
                    Points: Explain action items in two three sentences as points.
                    Keep each point of different color.
                    Use bullets to mark the points.
                    Keep heading center-aligned.
                    Make a grid of four and then place points inside it.

                
    Contexts:4BlockBlockBlock Header (Block Hash)Prev Hash NonceHash01Hash0 Hash1 Hash2 Hash3Hash23Root HashHash01Hash2Tx3Hash23Block Header (Block Hash)Root HashTransactions Hashed in a Merkle Tree After Pruning Tx0-2 from the BlockPrev Hash NonceHash3Tx0 Tx1 Tx2 Tx38. Simplified Payment VerificationIt is possible to verify payments without running a full network node.  A user only needs to k

Response - 
<html>
    
    <h1 style="text-align:center;"><u>4 Block Combining and Splitting Value </u></h1>
    
    <h2><u>Simplified Payment Verification</u></h2>
    <ul style="list-style-type:square;">
        <li style="color:black;">A user only needs to keep a copy of the block headers of the longest proof-of-work chain, which he can get by querying network nodes 
        until he's convinced he has the longest chain, and obtain the Merkle branch linking the transaction to the block it's timestamped in.</li>
        <li style="color:black;">Network nodes can verify transactions for themselves, but the simplified method can be fooled by an attacker's fabricated transactions for as 
        long as the attacker can continue to overpower the network.</li>
        <li style="color:black;">One strategy to protect against this would be to accept alerts from network nodes when they detect an invalid block, prompting the user's 
        software to download the full block and alerted t