# Data

In [1]:
!pip install -q langchain
!pip install -q wikipedia

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.2/806.2 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m252.4/252.4 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.5/64.5 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.5/138.5 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone


In [2]:
from langchain.document_loaders import WikipediaLoader

In [3]:
# Fetch one Wikipedia page per inventor and keep the text of the first hit.
# Both loaders use the same settings: a single document, with
# doc_content_chars_max=20000.
query1, query2 = "Nikola Tesla", "Thomas Edison"

doc1 = WikipediaLoader(
    query1,
    load_max_docs=1,
    doc_content_chars_max=20000,
).load()
doc1_content = doc1[0].page_content

doc2 = WikipediaLoader(
    query2,
    load_max_docs=1,
    doc_content_chars_max=20000,
).load()
doc2_content = doc2[0].page_content

In [4]:
# print(doc1_content)

In [5]:
# print(doc2_content)

In [6]:
docs = [doc1_content, doc2_content]
metadata = [{"document":query1}, {"document":query2}]

# Text Splitter

In [7]:
!pip install -q tiktoken

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [8]:
from langchain.text_splitter import NLTKTextSplitter
# from langchain.text_splitter import TokenTextSplitter

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [11]:
# Split both articles into chunks; metadatas is given in the same order as docs,
# so each chunk carries the {"document": <query>} entry of the article it came from.
# NOTE(review): chunk_size / chunk_overlap are presumably measured in characters
# (the splitter's default length function) — confirm.
splitter = NLTKTextSplitter(chunk_size=355, chunk_overlap=50)
chunks = splitter.create_documents(docs, metadatas=metadata)



In [12]:
len(chunks)

150

In [13]:
chunks[0]

Document(page_content='Nikola Tesla (; Serbian Cyrillic: Никола Тесла, [nǐkola têsla]; 10 July [O.S.\n\n28 June] 1856 – 7 January 1943) was a Serbian-American inventor, electrical engineer, mechanical engineer, and futurist.', metadata={'document': 'Nikola Tesla'})

In [14]:
chunks[0].page_content

'Nikola Tesla (; Serbian Cyrillic: Никола Тесла, [nǐkola têsla]; 10 July [O.S.\n\n28 June] 1856 – 7 January 1943) was a Serbian-American inventor, electrical engineer, mechanical engineer, and futurist.'

In [15]:
chunks[1].page_content

'He is best known for his contributions to the design of the modern alternating current (AC) electricity supply system.Born and raised in the Austrian Empire, Tesla first studied engineering and physics in the 1870s without receiving a degree.'

In [16]:
chunks[1].metadata

{'document': 'Nikola Tesla'}

# Embedding

In [198]:
!pip install -q cohere
# !pip install -q openai

In [199]:
import os
from getpass import getpass

# Never hard-code credentials in a notebook: anything saved or committed here is leaked.
# Use the key already exported as `tokenAPI`, otherwise prompt for it without echoing.
# NOTE: any key that was previously stored in plain text in this cell should be
# revoked and rotated with the provider.
if not os.environ.get('tokenAPI'):
    os.environ['tokenAPI'] = getpass('Cohere API key: ')

In [200]:
from langchain.embeddings import CohereEmbeddings
# from langchain.embeddings import OpenAIEmbeddings

embed = CohereEmbeddings(cohere_api_key = os.environ['tokenAPI'])

In [201]:
test_embeds = embed.embed_documents(['Hi there', 'i love LLM'])
print(len(test_embeds), len(test_embeds[0]))

2 4096


In [202]:
contents = [chunk.page_content for chunk in chunks]
print(len(contents), len(chunks)==len(contents))

150 True


In [203]:
contents[0]

'Nikola Tesla (; Serbian Cyrillic: Никола Тесла, [nǐkola têsla]; 10 July [O.S.\n\n28 June] 1856 – 7 January 1943) was a Serbian-American inventor, electrical engineer, mechanical engineer, and futurist.'

In [204]:
# Embed the text of every chunk.
# NOTE(review): `embeddings` is not referenced again in the visible notebook —
# the vector stores are built from `chunks` and `embed` directly.
embeddings = embed.embed_documents(contents)

In [205]:
qe_test = embed.embed_query('When Nikola was born?')
len(qe_test)

4096

# Vector Store (Indexing)

### FAISS

In [207]:
!pip install -q faiss-cpu

In [208]:
from langchain.vectorstores import FAISS

vector_db = FAISS.from_documents(chunks, embed)

In [209]:
similar_docs = vector_db.similarity_search('When did tesla die?', k=3)
len(similar_docs)

3

In [210]:
similar_docs[0]

Document(page_content="He died in New York City in January 1943.\n\nTesla's work fell into relative obscurity following his death, until 1960, when the General Conference on Weights and Measures named the International System of Units (SI) measurement of magnetic flux density the tesla in his honor.\n\nThere has been a resurgence in popular interest in Tesla since the 1990s.", metadata={'document': 'Nikola Tesla'})

### Chroma

In [211]:
!pip install -q chromadb

In [212]:
# Directory used as the Chroma persist directory.
db_dir = 'chroma_db'
# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of `if not exists: mkdir`.
os.makedirs(db_dir, exist_ok=True)

In [213]:
docs_ids = list(range(len(chunks)))
len(docs_ids)

150

In [214]:
# Render each integer position as text; these values are passed as `ids`
# when the Chroma store is built.
docs_ids = [str(doc_id) for doc_id in docs_ids]
docs_ids[:11]

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']

In [215]:
from langchain.vectorstores import Chroma

# Build a Chroma store from the chunks using the `embed` embedding object,
# with explicit per-chunk ids and `db_dir` as the persist directory.
# NOTE(review): this rebinds `vector_db`, which previously held the FAISS store.
vector_db = Chroma.from_documents(
    chunks, embed,
    ids = docs_ids,
    persist_directory = db_dir
)

In [216]:
similar_docs = vector_db.similarity_search('When did tesla die?', k=3)
len(similar_docs)

3

In [217]:
similar_docs[0]

Document(page_content="He died in New York City in January 1943.\n\nTesla's work fell into relative obscurity following his death, until 1960, when the General Conference on Weights and Measures named the International System of Units (SI) measurement of magnetic flux density the tesla in his honor.\n\nThere has been a resurgence in popular interest in Tesla since the 1990s.", metadata={'document': 'Nikola Tesla'})

In [218]:
similar_docs = vector_db.similarity_search('When did tesla die?', k=3, filter={'document':'Nikola Tesla'})
len(similar_docs)

3

In [219]:
vector_db.persist() # Save the vector_db to the persist_directory

In [220]:
# Re-open a Chroma store pointed at `db_dir`, with the same embedding object.
# NOTE(review): presumably this loads the data written by persist() — confirm.
vector_db = Chroma(
    persist_directory = db_dir,
    embedding_function = embed
)

# QnA Chains

### Stuff Chain

In [114]:
from langchain import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import Cohere

In [166]:
qna_template = '''
Answer the next question using the provided context.
If the answer is not contained in the context, say 'NO ANSWER IS AVAILABLE'
### Context:
{context}

### Question:
{question}

### Answer:
'''


print(qna_template)


Answer the next question using the provided context.
If the answer is not contained in the context, say 'NO ANSWER IS AVAILABLE'
### Context:
{context}

### Question:
{question}

### Answer:



In [141]:
# Prompt object wrapping qna_template; it fills the {context} and {question}
# placeholders and is passed to the stuff and map-reduce chains.
# NOTE(review): `verbose` looks like a chain/LLM option rather than a prompt
# option — confirm it has any effect on PromptTemplate.
qna_prompt = PromptTemplate(
    template=qna_template,
    input_variables=['context', 'question'],
    verbose=True
)

In [179]:
llm = Cohere(
    cohere_api_key = os.environ['tokenAPI'],
    temperature=0.5
)


# llm = OpenAI(
#     openai_api_key=openai_api_key,
#     model_name="text-davinci-003",
#     temperature=0.5
# )

In [169]:
stuff_chain = load_qa_chain(llm, prompt = qna_prompt) # chain_type = 'stuff' by default
type(stuff_chain)

In [167]:
query = 'What did Edison invent?'
similar_docs = vector_db.similarity_search(query, k=3)
len(similar_docs)

3

In [170]:
inp_dict = {
    'input_documents' : similar_docs,
    'question' : query
}

ans = stuff_chain(inp_dict, return_only_outputs = True)

ans

{'output_text': ' Edison invented the phonograph, one of the first devices for recording and playing back sound. This invention allowed for the first steps toward mass communication, which would eventually be developed further by Edison and others. '}

In [171]:
inp_dict = {
    'input_documents' : similar_docs,
    'question' : query
}

ans = stuff_chain(inp_dict)

ans.keys()

dict_keys(['input_documents', 'question', 'output_text'])

In [172]:
# A question the Tesla/Edison articles are not expected to answer; the prompt
# tells the model to reply 'NO ANSWER IS AVAILABLE' in that case.
query = 'Who is Abdullah Ayad?'

inp_dict = {
    'input_documents' : vector_db.similarity_search(query, k=3),
    'question' : query
}

# Pass the dict built above. The previous argument, `stuff`, was never defined
# and raises NameError on a fresh kernel.
ans = stuff_chain(inp_dict, return_only_outputs = True)

ans

{'output_text': ' NO ANSWER IS AVAILABLE'}

### MapReduce Chain

In [153]:
# Prompt text for `combine_prompt`, which is handed to the map-reduce chain as
# its combine prompt. It fills {summaries} and {question}.
# A stray unmatched double quote at the start of the instruction was removed so
# it is no longer sent to the model as part of the prompt.
compine_template = '''
Given intermediate contexts for a question, generate a final answer.
If the answer is not contained in the context, say 'NO ANSWER IS AVAILABLE'
### Summaries:
{summaries}

### Question:
{question}

### Answer:
'''

print(compine_template)

combine_prompt = PromptTemplate(
    template = compine_template,
    input_variables = ['summaries', 'question'],
)


"Given intermediate contexts for a question, generate a final answer.
If the answer is not contained in the context, say 'NO ANSWER IS AVAILABLE'
### Summaries:
{summaries}

### Question:
{question}

### Answer:



In [149]:
query = 'Where is Edison from?'

similar_docs = vector_db.similarity_search(query = query, k=3)

In [235]:
# Map-reduce QA chain wired with the two prompts defined earlier.
# NOTE(review): presumably question_prompt is applied to each input document and
# combine_prompt merges those per-document answers — confirm against the LangChain docs.
map_reduce_chain = load_qa_chain(
    llm, chain_type = "map_reduce",
    question_prompt = qna_prompt,
    combine_prompt = combine_prompt,
    return_intermediate_steps = True,
)

In [237]:
inp_dict = {
    'input_documents' : similar_docs,
    'question' : query
}

ans = map_reduce_chain(inp_dict, return_only_outputs = True)

ans

### Refine Chain

In [194]:
# Prompt passed as `question_prompt` to the refine chain.
# The document text is injected through the {context_str} placeholder.
initial_qna_template = '''
Answer the following question using the provided text only.
If the answer is not contained in the context, say 'NO ANSWER IS AVAILABLE'
### Context:
{context_str}

### Question:
{question}

### Answer:
'''

initial_qna_prompt = PromptTemplate(
    template=initial_qna_template,
    input_variables=['context_str', 'question']
)

In [229]:
# Prompt passed as `refine_prompt` to the refine chain. It receives the next
# document, the answer produced so far, and the question.
# The document placeholder must be {context_str}: load_qa_chain's refine chain
# supplies documents under that name (and the running answer as
# {existing_answer}) to both the initial and the refine prompt.
refine_qna_template = '''
Refine the existing answer, if required, with the following context.
If the answer is not contained in the context, say 'NO ANSWER IS AVAILABLE'
### Context:
{context_str}

### Existing Answer:
{existing_answer}

### Question:
{question}

### Refined Answer:
'''

# Build the prompt from the refine template. It was previously built from
# initial_qna_template, so the existing answer was never shown to the model.
refine_qna_prompt = PromptTemplate(
    template=refine_qna_template,
    input_variables=['context_str', 'existing_answer', 'question']
)

In [230]:
# Refine QA chain: initial_qna_prompt is the question prompt and
# refine_qna_prompt is the refine prompt.
# With return_intermediate_steps=True the result dict also contains an
# 'intermediate_steps' entry (see the keys printed further down).
refine_chain = load_qa_chain(
    llm, chain_type="refine",
    question_prompt=initial_qna_prompt,
    refine_prompt=refine_qna_prompt,
    return_intermediate_steps=True,
)

In [231]:
question = "What did Tesla invent?"
similar_docs = vector_db.similarity_search(question, k=2)
len(similar_docs)

2

In [232]:
inp_dict = {
    "input_documents": similar_docs,
    "question": question
}

final_refined_answer = refine_chain(inp_dict)

final_refined_answer.keys()

dict_keys(['input_documents', 'question', 'intermediate_steps', 'output_text'])

In [233]:
final_refined_answer = refine_chain(inp_dict, return_only_outputs=True)

final_refined_answer

{'intermediate_steps': [' Tesla invented a wirelessly controlled boat, mechanical oscillators/generators, electrical discharge tubes, and early X-ray imaging devices. \n\nWould you like me to provide more information on any of these inventions? \nNO ANSWER IS AVAILABLE UNLESS CHANGES ARE MADE TO THE ORIGINAL QUESTION OR CONTEXT. ',
  ' Tesla invented many devices that played an important part in the advancement of science and technology, including an alternating current (AC) electrical system, fluorescent and neon lighting, the Tesla coil, wireless communication, X-ray technology, remote control, robotics, and the rotating magnetic field principle (basis of the AC motor). '],
 'output_text': ' Tesla invented many devices that played an important part in the advancement of science and technology, including an alternating current (AC) electrical system, fluorescent and neon lighting, the Tesla coil, wireless communication, X-ray technology, remote control, robotics, and the rotating magn

In [234]:
len(final_refined_answer['intermediate_steps'])

2