This notebook loads every PDF file in a folder and answers questions using content retrieved from across those files.

In [None]:
!pip install -qq langchain_community
!pip install -qq tiktoken
!pip install -qq langchain-openai
!pip install -qq chromadb
!pip install -qq langchain
!pip install -qq pypdf

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain.load import dumps, loads
from operator import itemgetter
from getpass import getpass
import os

In [None]:
from IPython.display import display, Markdown
def md(t):
    """Render the string ``t`` as Markdown in the cell output."""
    rendered = Markdown(t)
    display(rendered)

In [None]:
# Mount Google Drive at /content/drive (uses the google.colab helper).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
# Folder on the mounted drive that is scanned for PDF files.
folder_path = '/content/drive/My Drive/RAG'

In [None]:
# Show the folder contents (os.listdir already returns a list).
os.listdir(folder_path)

In [None]:
# Keep only the PDF files. The extension check is case-insensitive so `.PDF`
# is also picked up, and the names are sorted because os.listdir returns
# entries in arbitrary order, which would make the load/chunk order vary
# between runs.
pdf_files = sorted(
    file_name
    for file_name in os.listdir(folder_path)
    if file_name.lower().endswith('.pdf')
)

In [None]:
# Load each PDF with PyPDFLoader and key the loaded documents by file name.
pdf_contents = {
    pdf: PyPDFLoader(os.path.join(folder_path, pdf)).load()
    for pdf in pdf_files
}

In [None]:
# Recursive splitter built via from_tiktoken_encoder, so chunk_size and
# chunk_overlap are measured with a tiktoken encoder (tokens, per the
# LangChain docs): chunks of up to 300 with 50 shared between neighbours.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300,
    chunk_overlap=50)

In [None]:
# Chunk every loaded PDF and flatten the chunks into one list.
all_splits = [
    chunk
    for doc_content in pdf_contents.values()
    for chunk in text_splitter.split_documents(doc_content)
]

In [None]:
# Prompt for the key only when it is not already set, so re-running this cell
# (or "Run All") does not ask again. getpass keeps the key out of the output.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass("Enter your OpenAI API Key...")

In [None]:
# Sanity check: reports whether OPENAI_API_KEY is present in the environment
# without printing the key itself.
print("API Key Set:", "OPENAI_API_KEY" in os.environ)  # This should print True if set


In [None]:
# Read the key back from the environment (None when it is unset).
api_key = os.getenv('OPENAI_API_KEY')

In [None]:
# Embed every chunk with OpenAIEmbeddings and index the vectors in Chroma.
# NOTE(review): no persist directory is passed, so presumably the index lives
# only for this session and all chunks are re-embedded on each run — confirm.
vectorstore = Chroma.from_documents(documents=all_splits,
                                    embedding=OpenAIEmbeddings())

In [None]:
# Retriever over the vector store; search_kwargs k=2 requests two chunks per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})

In [None]:
# Chat model that writes the answers; temperature=0 to keep sampling randomness minimal.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

In [None]:
# Prompt text with two placeholders, {context} and {question}. It tells the
# model to answer only from the supplied context and to cite the source and
# page number.
template = """Answer the question based only on the following context:
{context}.
Also provide the source of the answer with page number from the document.
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
# Bare expression so the notebook displays the constructed prompt object.
prompt

In [None]:
def get_query_response(question):
    """Print the chunks the retriever returns for ``question``.

    For each retrieved document this prints its position, renders its text as
    Markdown, and prints the ``page`` and ``source`` metadata entries. Nothing
    is returned and the LLM is not called.
    """
    for idx, doc in enumerate(retriever.invoke(question)):
        print(f'Relevant Chunk Number- {idx}')
        chunk_text = doc.page_content
        print("Page Content:")
        md(chunk_text)
        meta = doc.metadata
        page_number, source = meta['page'], meta['source']
        print('-------------')
        print("Page Number: ", page_number)
        print('-------------')
        print("Source: ", source)
        print('-----------------')

In [None]:
def _format_docs(docs):
    """Join retrieved chunks into plain text, each labelled with its source and page.

    Without this step the list of Document objects would be inserted into the
    prompt via ``str()``, i.e. as a Python repr with escaped newlines.
    """
    return "\n\n".join(
        f"Source: {doc.metadata.get('source', 'unknown')} | "
        f"Page: {doc.metadata.get('page', 'unknown')}\n"
        f"{doc.page_content}"
        for doc in docs
    )


# Answers are grounded in the context supplied by the retriever: the incoming
# question is sent to the retriever (whose chunks are formatted as text) and is
# also passed through unchanged, then both fill the prompt for the LLM, whose
# reply is parsed to a plain string.
rag_chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# True/false question about Bigtable row ordering and tablets.
question = """Bigtable maintains data in lexicographic order by row keys. The row range
(or tablet) for a table is dynamically partitioned, and the tablet is the unit
of distribution and load balancing. Is this true or false"""

In [None]:
# Run the RAG chain on the current question and render the answer as Markdown.
response = rag_chain.invoke(question)
md(response)

In [None]:
# Statement about cross-row transactions in Bigtable, sent to the chain as-is.
question = (
    "Bigtable supports cross-row transactions to perform atomic "
    "read-modify- write sequences on data stored under several row keys."
)

In [None]:
# Run the RAG chain on the current question and render the answer as Markdown.
response = rag_chain.invoke(question)
md(response)

In [None]:
# Statement about single-row atomicity in Bigtable.
question = (
    "In Bigtable, every read and write of data under a single row key is atomic "
    "(regardless of number of different columns being read or written in the\n"
    "row)."
)

# Ask the chain and render its answer as Markdown.
response = rag_chain.invoke(question)
md(response)

In [None]:
# True/false question about the number of column families in a Bigtable table.
question = """Bigtable prefer a bigger number of distinct column families in a table. True or False"""

# Ask the chain and render its answer as Markdown.
response = rag_chain.invoke(question)
md(response)

In [None]:
# True/false question about Bigtable's storage on GFS and column-family creation.
question = """Bigtable is a column-family NoSQL using GFS store and a column family
must be created before data can be stored. True or False"""

# Ask the chain and render its answer as Markdown.
response = rag_chain.invoke(question)
md(response)

In [None]:
# True/false question about timestamp-based garbage collection and version order.
question = """Bigtable uses garbage collection based on timestamps to either only the
last n versions of a cell be kept or only new-enough versions be kept, and different versions of a cell are stored in increasing timestamp order (so that the least recent versions can be read first). True or False"""

# Ask the chain and render its answer as Markdown.
response = rag_chain.invoke(question)
md(response)

In [None]:
# Three-part question about how GFS pushes data to its replicas. The first
# sentence is written with its word spacing intact so the embedding model and
# the LLM receive readable text.
question = """GFS duplicates data to n replicas.
a) How to fully utilize each machine’s network bandwidth? b) How to avoid network bottlenecks?
c) How to minimize the latency to push through all data? """

# Ask the chain and render its answer as Markdown.
response = rag_chain.invoke(question)
md(response)

In [None]:
# Two-part question about GFS master metadata capacity and internal fragmentation.
question = """The Google File System uses a master to store 64-bytes metadata about each file, each file is divided into 64-MB chunks. If the master node has 8GB main memory,
a) How many total chunks can be supported? What is the total size of the GFS
system?
b) If there are n files and the average size of files is m-MB, and assume there
is no aggregation of small files, how many total space is wasted due to internal fragmentation? """

# Run the RAG chain on the question and render the answer as Markdown.
response = rag_chain.invoke(question)
md(response)

In [None]:
# Three-part question about deriving Jim Gray's five minute and five byte rules
# from the Tandem cost figures quoted in the text.
question ='''Jim Gray’s five minute rule (1986) says “pages referenced every five minutes should be memory resident.” It is purely an economic issue: when it is cheaper to keep a record in main memory rather than access it on disk. Then any 1KB record accessed more frequently than every 300 seconds (or 5 minutes) should live in main memory. Jim Gray’s five byte rule says “Spend five bytes of main memory to save one instruction per second”: when does it make sense to use more memory to save CPU power, or conversely save some memory at the expense of CPU cycles. A Tandem disk cost $15K can deliver 15 accesses/sec, and extra CPU and channel cost for supporting a disk access are $0.5K/access/sec. A megabyte of Tandem memory cost $5K, and instruction cost is $25K/mips.
a) Please show how you derive this five minute rule?
b) Please show how you derive this five byte rule?
c) For smaller (or larger) records, are you expecting the time should be longer
or shorter than 5 minutes? Please show how you derive this five byte rule? For smaller (or larger) records, are you prefer to trade memory for CPU time or not?'''

# Run the RAG chain on the question and render the answer as Markdown.
response = rag_chain.invoke(question)
md(response)

In [None]:
# Asks for a Pig Latin program equivalent to the SQL query quoted in the text.
# NOTE(review): the threshold `106` may be a flattened `10^6` from copy/paste —
# confirm against the original question before relying on the answer.
question = """Suppose we have a table urls: (url, category, pagerank). The following is a simple SQL query that finds, for each sufficiently large category, the average pagerank of high-pagerank urls in that category. Please write a PigLatin program that do the same thing.
SELECT category, AVG(pagerank)
FROM urls WHERE pagerank > 0.2
GROUP BY category HAVING COUNT(*) > 106"""

# Run the RAG chain on the question and render the answer as Markdown.
response = rag_chain.invoke(question)
md(response)

In [None]:
# Three-part question about crash-recovery time in a RAMCloud-style system,
# parameterised by m, t, c and r as defined in the text.
question = """  A distributed system (e.g., RAMCloud) with n servers (each has m GB main memory and a disk with maximum transfer
rate of t MB/sec) connected by c Gbps links (i.e., each node has maximum c Gbps network speed). The system uses log-structured memory
system with one master data in main memory and r backup copies in disks at different servers. If a server crashed, a recovery server will load data from backup copies parallelly.
a) If m = 64, t = 100, c = 10, and r = 3, how long the recovery server can
restore the 64 MB master data?
b) If we change r = 1000, how long the recovery server can restore the 64 MB
master data?
c) Generally, less than 2 seconds recovery is fast enough to constitute
“continuous availability” for most applications.
If we partition the master data into r chunks and store them on different servers and use r recovery
servers to restore them in parallel, what is the minimum integer value of r can we reduce restore time less than 2 second?
(E.g., this is the fast recovery scheme used in RAMCloud)"""

# Run the RAG chain on the question and render the answer as Markdown.
response = rag_chain.invoke(question)
md(response)

In [None]:
# Asks for the console output of a PySpark streaming job that counts products
# per one-minute window over the socket input listed in the text. The embedded
# code is kept exactly as pasted (several statements share a line).
question = """  For the input via a socket as below:
       pencil
       book
       key
       pencil
       book
       key
       Cup
       book
       pencil
       paper
       cup
       book
       pencil
       paper
       book
What is the output of the following code?
from pyspark.sql import SparkSession from pyspark.sql.functions import * from pyspark.sql.types import *
# Streaming via socket
spark = SparkSession.builder.appName("streamCsv").getOrCreate() rawdata = spark.readStream.format("socket").option(
"host", "localhost").option("port",9999).option(
"includeTimestamp", True).load()
query = rawdata.select((rawdata.value).alias("product"),
(rawdata.timestamp).alias("time")).groupBy(window("time", "1
minutes"), "product").count().sort(desc("window")) result = query.writeStream.format("console").outputMode(
"complete").start().awaitTermination() result.stop()
Please give the output.result. Print the result"""

# Run the RAG chain on the question and render the answer as Markdown.
response = rag_chain.invoke(question)
md(response)

In [None]:
# Asks the model to fill in the `pass` placeholders of a partial PySpark
# streaming program that reads the CSV rows listed in the text.
# NOTE(review): the `\ ` sequences inside this non-raw string are not valid
# escapes; recent Python versions warn about them. The text is kept as pasted.
question = """Given the stream data via a file under a directory “test” as below:id,firstname,age,profession,city,salary 100,Babita,27,Lawyer,Riverside,1558.0
101,Zsa Zsa,39,Musician,Male,5667.0 102,Vonny,48,Plice Officer,Bahia Blanca,7612.0 103,Ermengarde,24,Teacher,Porto Alegre,2451.0 104,Karina,51,Software Developer,Amritsar,3522.0 105,Felice,36,Doctor,Montreal,9874.0 106,Elsie,55,Police Officer,City of San Marino,2231.0 107,Kaia,36,Police Officer,Gaza,2263.0 108,Glynnis,43,Designer,Hamburg,6983.0 109,Jany,28,Lawyer,Belize City,8769.0
Given the following partial pyspark code, please replace “pass” to working code to make it working, i.e., output the result as the table below:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import * # The schema
schema1 = StructType([
pass
])
# read files from test directory into a spark dataframe customer = spark.readStream.format("csv").schema(schema1)\ .option("header", True).option("maxFilesPerTrigger", 1)\ .load(r"test ")
# generate spark dataframe with sorted average salaries with counts
# for each profession
average_salaries = pass
# output stream to stdout and start streaming
query = pass"""

# Run the RAG chain on the question and render the answer as Markdown.
response = rag_chain.invoke(question)
md(response)

In [None]:
# Asks the model to fill in the `pass` placeholders of a partial GraphFrames
# program built from the vertices and edges listed in the text.
# NOTE(review): the pasted code is incomplete in places (for example the
# `.filter('undir ==` line breaks off) — confirm against the original question.
question = """   Given the pyspark partial code below (with output in comments), please replace “pass” with working code to make it working.
from pyspark import *
sc = SparkContext.getOrCreate()
sc.addPyFile("/Library/Frameworks/Python.framework/Versions/3.7/l ib/python3.7/site-packages/pyspark/jars/graphframes-0.8.2- spark3.1-s_2.12.jar")
from pyspark.sql import *
from graphframes import * vertices = spark.createDataFrame(
[
('1', 'Carter', 'Derrick', 50), ('2', 'May', 'Derrick', 26), ('3', 'Mills', 'Jeff', 80), ('4', 'Hood', 'Robert', 65), ('5', 'Banks', 'Mike', 93), ('98', 'Berg', 'Tim', 28), ('99', 'Page', 'Allan', 16)
], ['id', 'name', 'firstname', 'age'] )
edges = spark.createDataFrame( [
('1', '2', 'friend'), ('2', '1', 'friend'), ('3', '1', 'friend'), ('1', '3', 'friend'), ('2', '3', 'follows'), ('3', '4', 'friend'), ('4', '3', 'friend'), ('5', '3', 'friend'), ('3', '5', 'friend'), ('4', '5', 'follows'), ('98', '99', 'friend'), ('99', '98', 'friend')
], ['src', 'dst', 'type'] )
g = GraphFrame(vertices, edges)
g.edges.show()

copy = edges
from pyspark.sql.functions import udf
# merge directed relationships into undirected ones @udf("string")
def to_undir(src, dst):
pass
copy.withColumn('undir', to_undir(copy.src, copy.dst))\
.filter('undir ==

g.edges.filter('type == "friend"') sc.setCheckpointDir('graphframes_cps') g.find("(a)-[e]->(b); (b)-[e2]->(a)").show()
mutualFriends = pass
mutualFriends.filter('a.id == 2 and c.id == 3').show() """

# Run the RAG chain on the question and render the answer as Markdown.
response = rag_chain.invoke(question)
md(response)