In [1]:
import cohere
import numpy as np
import pandas as pd
import pdfplumber
import streamlit as st
from io import StringIO
from typing import Sequence
from dotenv import load_dotenv
import os
from numpy.linalg import norm

In [2]:
# Load variables from a local .env file into the process environment so that
# the API key can be read with os.getenv in the next cell.
load_dotenv()

True

In [3]:
# Read the API key from the environment; os.getenv returns None when
# API_KEY is not set, so a missing key is not detected here.
api=os.getenv('API_KEY')

In [4]:
# Module-level Cohere client; embed() and generate() both call through it.
co = cohere.Client(api)

In [5]:
def extractTextFromPdf(pdf_path: str):
    """Extract the text of every page of a PDF as a single string.

    Args:
        pdf_path: Value passed unchanged to ``pdfplumber.open``.

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages. Pages that yield no text contribute nothing.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may return None for a page without extractable text;
        # `or ""` keeps such a page from raising TypeError on concatenation.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # A single join avoids rebuilding the string once per page.
    return "".join(page_texts)

def processTextInput(text: str, run_id: str = None, chunk_size: int = 150):
    """Split a document into fixed-width character chunks.

    Args:
        text: Document text. ``None`` is treated as an empty document.
        run_id: Unused; kept so existing callers keep working.
        chunk_size: Maximum number of characters per chunk. Defaults to 150.

    Returns:
        A DataFrame with a single ``text`` column holding one chunk per row,
        in document order. The final chunk may be shorter than ``chunk_size``;
        an empty document yields an empty frame.

    Raises:
        TypeError: If ``text`` is neither a ``str`` nor ``None``.
        ValueError: If ``chunk_size`` is not positive.
    """
    if text is None:
        text = ""
    elif not isinstance(text, str):
        raise TypeError(f"text must be str or None, not {type(text).__name__}")
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Chunks are cut purely by character offset, so a boundary can fall in the
    # middle of a word.
    chunks = [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]
    return pd.DataFrame({'text': chunks})

def convertToList(df):
    """Return the ``text`` column of ``df`` as a list of strings.

    The input frame is left unmodified. Adding a helper column to it would
    change what ``df.values.tolist()`` returns for later consumers of the
    same frame.

    Args:
        df: DataFrame with a ``text`` column.

    Returns:
        One string per row, in row order. Missing values become ``""`` so the
        result stays aligned with the rows of ``df``.
    """
    seqOfStrings: Sequence[str] = [
        "" if pd.isna(value) else str(value) for value in df['text']
    ]
    return seqOfStrings

def embed(Texts: Sequence[str]):
    """Embed a batch of texts with the module-level Cohere client.

    Args:
        Texts: The strings to embed.

    Returns:
        The ``embeddings`` attribute of the response from ``co.embed``,
        requested with the ``"small"`` model.
    """
    response = co.embed(model="small", texts=Texts)
    return response.embeddings

In [6]:
# Disabled Streamlit front end: it would let the user either upload a PDF or
# paste raw text, then chunk the result with processTextInput. It is kept
# commented out here; the hard-coded sample PDF below is used instead.
# options=st.selectbox("Input type", ["PDF","TEXT"])

# if options=="PDF":
#     pdf_file=st.file_uploader("Upload file", type=["pdf"])
#     if pdf_file is not None:
#         text=extractTextFromPdf(pdf_file)
#     if text is not None:
#         df=processTextInput(text)
# elif options == "TEXT":  
#     text = st.text_area("Paste the Document")  
#     if text is not None:  
#         df = processTextInput(text)

# Load the document text from a PDF in the current working directory.
text=extractTextFromPdf('sample.pdf')

In [7]:
# Split the extracted text into chunks; df has one chunk per row in its 'text' column.
df=processTextInput(text)

In [8]:
# Inspect the chunked document.
print(df)

                                                 text
0   React is a powerful JavaScript library for bui...
1   components for web\napplications. Here are som...
2    architecture\nwhere UIs are divided into reus...
3   ctures.\n2. **Virtual DOM**: React uses a virt...
4   reates a virtual\nrepresentation of it in memo...
5    to faster rendering.\n3. **JSX**: JSX (JavaSc...
6   Script. This makes it easier to write and visu...
7   known as Flux or\nRedux. Data flows in one dir...
8    application state.\n5. **Declarative Programm...
9   nd React takes care of updating the DOM to\nma...
10   components have lifecycle methods that\nallow...
11  ng. This enables developers to perform tasks l...
12  ooks are functions that allow developers to\nu...
13   functional components as powerful as class co...
14  brary for routing in React applications. It\na...
15  nt URL, enabling single-page applications (SPA...
16  eusability. Developers can create reusable com...
17  uplication and improving

In [69]:
# Turn the chunk frame into a list of strings and embed every chunk;
# embeddings[i] is the vector for listOfText[i].
listOfText=convertToList(df)
embeddings=embed(listOfText)

In [70]:
# Dump the raw embedding vectors (prints every vector in full).
print(embeddings)

[[2.3359375, 0.44384766, 1.6230469, 1.8125, -0.68066406, 0.86376953, -0.17651367, 1.625, -0.4104004, 0.7207031, -2.3652344, -0.7128906, -0.1303711, -1.2548828, 0.32470703, -2.3007812, 2.6347656, -1.0908203, 0.5283203, -2.453125, -1.7695312, 1.2695312, -0.14782715, 2.9648438, -1.9912109, -1.0927734, 3.4082031, 0.19152832, 0.6801758, -0.63183594, 0.28955078, -0.7163086, 0.85546875, 0.3684082, -2.1699219, -2.2148438, -0.67529297, 1.9238281, -0.50341797, 0.33618164, -0.9614258, -2.296875, -3.7167969, -1.9316406, -3.4199219, -1.9648438, 1.3828125, -1.2363281, -3.0292969, 1.5664062, 1.3417969, -0.89941406, 4.1171875, -1.0458984, -1.9228516, 3.9023438, -2.6347656, 1.1894531, 2.6210938, 0.21118164, 0.89453125, -0.8208008, -0.6591797, -2.8789062, 2.2929688, -0.12524414, 0.45581055, -2.5839844, 0.3798828, 0.37329102, 1.8242188, -2.4023438, -0.49780273, -2.6835938, 0.6665039, -1.4677734, 1.375, 0.045776367, -1.9736328, -0.96435547, -0.8125, 1.2900391, -0.40527344, -1.2900391, -0.58203125, -0.1601

In [71]:
# The question to ask about the document.
# NOTE(review): prompt is only assigned inside this guard, so it stays
# undefined if df is None and the later `prompt != ""` check would raise NameError.
if df is not None:
    prompt="What is the main topic?"
    

In [148]:
def topNNeighbours(prompt_embedding: np.ndarray, storage_embeddings: np.ndarray, df, n: int = 5):
    """Return the rows of ``df`` whose embeddings are most cosine-similar to the prompt.

    Args:
        prompt_embedding: Prompt vector(s), as an array or nested list. A
            single 1-D vector is accepted. Only the first prompt row is used
            to select neighbours.
        storage_embeddings: One embedding per row of ``df``, as an array or
            nested list, in the same order as the rows of ``df``.
        df: DataFrame whose rows correspond to ``storage_embeddings``.
        n: Maximum number of neighbours to return.

    Returns:
        A list of at most ``n`` rows taken from ``df.values.tolist()`` (each
        row is itself a list of the row's column values), ordered from least
        to most similar, so the best match is last. Returns an empty list when
        ``n <= 0`` or there are no stored embeddings.
    """
    # Convert each argument into its own array. Accepting lists for both
    # inputs must never overwrite the stored embeddings with the prompt.
    prompt_matrix = np.atleast_2d(np.asarray(prompt_embedding, dtype=float))
    storage_matrix = np.atleast_2d(np.asarray(storage_embeddings, dtype=float))
    if storage_matrix.size == 0:
        return []

    # Cosine similarity: dot products divided by the product of vector norms.
    similarity_matrix = (prompt_matrix @ storage_matrix.T) / np.outer(
        norm(prompt_matrix, axis=-1), norm(storage_matrix, axis=-1)
    )

    num_neighbors = min(similarity_matrix.shape[1], n)
    if num_neighbors <= 0:
        # Guard needed because a slice of [-0:] would select every column.
        return []

    # argsort is ascending, so the trailing columns hold the best matches.
    indices = np.argsort(similarity_matrix, axis=-1)[:, -num_neighbors:]
    rows = df.values.tolist()
    return [rows[idx] for idx in indices[0]]

In [158]:
def generate(promptt):
    """Send a prompt to the module-level Cohere client's ``generate`` call.

    Args:
        promptt: The prompt text, forwarded as the ``prompt`` keyword.

    Returns:
        The response object from ``co.generate``, unmodified.
    """
    return co.generate(prompt=promptt)

In [159]:
# Build the retrieval-augmented prompt: embed the question, fetch the most
# similar chunks, and place them ahead of the instruction and the question.
if df is not None and prompt != "":
    base_prompt = "Based on the passage above, answer the following question:"
    # embed() takes a sequence of texts, so the single question is wrapped in a list.
    prompt_embedding = embed([prompt])
    aug_prompts = topNNeighbours(np.array(prompt_embedding), embeddings, df)
    # print(aug_prompts)
    # Each element of aug_prompts is a DataFrame row (a list), not a bare
    # string, so str() renders it as a list literal inside the prompt.
    new_prompt = '\n'.join(str(idx) for idx in aug_prompts) + '\n\n' + base_prompt + '\n' + prompt + '\n'
    # Debug output: shows the type of the assembled prompt.
    print(type(new_prompt))
    # Disabled retry loop: on any exception it would drop the last retrieved
    # chunk, rebuild the prompt, and call generate() again.
    # is_success = False
    # while not is_success:
    #     try:
    #         response = generate(new_prompt)
    #         print(response)
    #         is_success = True
    #     except Exception:
    #         aug_prompts = aug_prompts[:-1]
    #         new_prompt = '\n'.join(aug_prompts) + '\n' + base_prompt + '\n' + prompt  + '\n'

    # st.write(response.generations[0].text)

<class 'str'>


In [160]:
# Send the augmented prompt to the model.
res=generate(new_prompt)

In [162]:
# Show the text of the first generation in the response.
print(res.generations[0].text)

 The main topic of the passage is the React JavaScript library, its features, and its advantages.  React is a popular library for building interactive and scalable user interfaces, promoting declarative programming and component-based architecture.  It emphasizes the power of functional components and its benefits for maintainability and scalability.  React also has a rich ecosystem of supporting libraries and community support.  The passage mentions some key features of React including component reusability, declarative programming, rich ecosystem, and React Router for managing single page application state.  Overall, the passage is an overview of React and its strengths as a UI development framework. 
