In [1]:
from langchain_community.document_loaders import TextLoader
# Read the local plain-text file into LangChain Document objects and
# display the resulting list as the cell output.
loader = TextLoader(file_path='speech.txt')
text_documents = loader.load()
text_documents

[Document(page_content='This is a testing file', metadata={'source': 'speech.txt'})]

In [1]:
import os
from dotenv import load_dotenv
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# The key is stored under OPENAI_API, but it has to be exported as
# OPENAI_API_KEY. os.getenv returns None for a missing variable and os.environ
# only accepts str values, so validate first instead of failing with an opaque
# "TypeError: str expected, not NoneType". An already-exported OPENAI_API_KEY
# is accepted as a fallback.
_openai_api_key = os.getenv("OPENAI_API") or os.getenv("OPENAI_API_KEY")
if not _openai_api_key:
    raise RuntimeError(
        "No OpenAI API key found: set OPENAI_API (or OPENAI_API_KEY) in your "
        ".env file or shell environment before running this notebook."
    )
os.environ["OPENAI_API_KEY"] = _openai_api_key

In [3]:
from langchain_community.document_loaders import WebBaseLoader
import bs4

# Only parse the parts of the page that carry the article itself.
# The blog's CSS classes are hyphenated ('post-content', 'post-header');
# the previous underscored spellings matched nothing, so everything except
# the title was filtered out of the loaded document.
loader = WebBaseLoader(
    web_paths=('https://lilianweng.github.io/posts/2023-06-23-agent/',),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=('post-title', 'post-content', 'post-header')
        )
    ),
)

# NOTE: this rebinds `text_documents`, replacing the TextLoader result above.
text_documents = loader.load()

In [4]:
from langchain_community.document_loaders import PyPDFLoader

# Load the paper PDF from the working directory; `docs` is the input to the
# text-splitting cell that follows.
loader = PyPDFLoader(
    file_path='Data_Preprocessing_Combination_to_Improve_the_Perf.pdf'
)
docs = loader.load()

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the loaded PDF documents into overlapping chunks
# (chunk size 1000, overlap 200) so they can be embedded individually.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)

documents = text_splitter.split_documents(documents=docs)
# Preview only the first few chunks rather than dumping the whole list.
documents[:5]

[Document(page_content='/gid00030/gid00035/gid00032/gid00030/gid00038/gid00001/gid00033/gid00042/gid00045 /gid00001\n/gid00048/gid00043/gid00031/gid00028/gid00047/gid00032/gid00046Citation: Cho, E.; Chang, T.-W.;\nHwang, G. Data Preprocessing\nCombination to Improve the\nPerformance of Quality Classiﬁcation\nin the Manufacturing Process.\nElectronics 2022 ,11, 477. https://\ndoi.org/10.3390/electronics11030477\nAcademic Editor: George\nA. Tsihrintzis\nReceived: 14 January 2022\nAccepted: 5 February 2022\nPublished: 6 February 2022\nPublisher’s Note: MDPI stays neutral\nwith regard to jurisdictional claims in\npublished maps and institutional afﬁl-\niations.\nCopyright: © 2022 by the authors.\nLicensee MDPI, Basel, Switzerland.\nThis article is an open access article\ndistributed under the terms and\nconditions of the Creative Commons\nAttribution (CC BY) license (https://\ncreativecommons.org/licenses/by/\n4.0/).\nelectronics\nArticle\nData Preprocessing Combination to Improve the Perf

In [6]:
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# Embed and index only the first 20 chunks in a Chroma vector store.
db = Chroma.from_documents(
    documents=documents[:20],
    embedding=OpenAIEmbeddings(),
)

  warn_deprecated(


In [7]:
# Similarity search against the current `db` (the Chroma store at this point);
# show the text of the best-ranked chunk.
query = 'Who is the author'
result = db.similarity_search(query=query)
result[0].page_content

'Electronics 2022 ,11, 477 4 of 15\nto deal with the data imbalance problem. Table 2 summarized the related literatures and\ntheir proposed algorithm.\nTable 2. Summary of the literature review.\nAuthor Algorithm\nLamari et al. [17] Hybrid sampling method using SMOTE-ENN\nChawla et al. [18] Combination of methods\nBatista et al. [19] SMOTE-Tomek and SMOTE-ENN\nLiang [20] Hybrid sampling method using bagging\nBranco et al. [21] Research on the imbalanced data problem\n3. Theoretical Background\n3.1. Data Imputation Methodology\nLinear Interpolation: When the values of two points are given, this method performs\na linear calculation according to the straight-line distance to estimate the value located\nbetween the points, and it is the simplest method for replacing missing values.\nPoly Interpolation: As a generalization of linear interpolation, polynomial interpolation\nincreases the computational complexity of linear interpolation because the degree of the'

In [8]:
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OpenAIEmbeddings

# Rebuild the index over the same first 20 chunks using FAISS.
# NOTE: this rebinds `db`, so later cells query FAISS instead of Chroma.
db = FAISS.from_documents(
    documents=documents[:20],
    embedding=OpenAIEmbeddings(),
)

In [9]:
# Same question as before, now answered by the FAISS-backed `db`;
# display the whole top-ranked Document (content plus metadata).
query = 'Who is the author?'
result = db.similarity_search(query=query)
result[0]

Document(page_content='Electronics 2022 ,11, 477 4 of 15\nto deal with the data imbalance problem. Table 2 summarized the related literatures and\ntheir proposed algorithm.\nTable 2. Summary of the literature review.\nAuthor Algorithm\nLamari et al. [17] Hybrid sampling method using SMOTE-ENN\nChawla et al. [18] Combination of methods\nBatista et al. [19] SMOTE-Tomek and SMOTE-ENN\nLiang [20] Hybrid sampling method using bagging\nBranco et al. [21] Research on the imbalanced data problem\n3. Theoretical Background\n3.1. Data Imputation Methodology\nLinear Interpolation: When the values of two points are given, this method performs\na linear calculation according to the straight-line distance to estimate the value located\nbetween the points, and it is the simplest method for replacing missing values.\nPoly Interpolation: As a generalization of linear interpolation, polynomial interpolation\nincreases the computational complexity of linear interpolation because the degree of the', metad

In [10]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt with two placeholders: {context} receives the retrieved documents
# and {input} receives the user's question.
prompt = ChatPromptTemplate.from_template(
    template='''Answer the following question based only on the provided context.
Think step by step before providing a detailed answer.
I will tip you $1000 if the user finds the answer helpful.
<context>
{context}
</context>
Question: {input}'''
)


In [11]:
from langchain_community.llms import Ollama

# LLM used to generate answers: the 'llama2' model accessed through Ollama.
# The invoke cell further down notes that Ollama must be started first.
llm = Ollama(model='llama2')

In [12]:
from langchain.chains.combine_documents import create_stuff_documents_chain


# Chain that passes documents plus the question through `prompt` to `llm`.
document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

In [13]:
# Expose the vector store through the retriever interface. `db` is whichever
# store was assigned last (FAISS when the cells are run top to bottom).
retriever = db.as_retriever()
# Display the retriever's repr to confirm which store and embeddings back it.
retriever

VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001A230565A80>)

In [14]:
from langchain.chains import create_retrieval_chain 

# Combine retrieval and answering: the retriever supplies documents and
# `document_chain` turns them into an answer.
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=document_chain,
)

In [16]:
# Prerequisite: Ollama must be started before running this cell, because the
# chain's `llm` is an Ollama model.
# The 'input' key matches the {input} placeholder in the prompt template.
response = retrieval_chain.invoke({'input': 'Who is the author?'})

In [None]:
# Display the value stored under the 'answer' key of the chain's response.
response['answer']