In [19]:
%%time

! pip install -qq -U langchain tiktoken pypdf chromadb faiss-gpu unstructured openai
! pip install -qq -U transformers InstructorEmbedding sentence_transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
CPU times: user 302 ms, sys: 129 ms, total: 431 ms
Wall time: 26.8 s


# Imports

In [33]:
import openai
### Multi-document retriever
from langchain.vectorstores import Chroma, FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.chains import RetrievalQA, VectorDBQA
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings import SentenceTransformerEmbeddings 

from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

In [9]:
# Kaggle input directory that holds the rally transcript files read by this notebook.
directory = '/kaggle/input/donald-trumps-rallies'

In [10]:
def load_docs(directory):
    """Load the documents found under ``directory``.

    Args:
        directory: Path of the folder handed to ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader(directory).load()`` returns, i.e. the
        loaded documents.
    """
    return DirectoryLoader(directory).load()

# Load the files from `directory` and display how many documents were read.
documents = load_docs(directory)
len(documents)

35

In [11]:
def split_docs(documents, chunk_size=1000, chunk_overlap=20):
    """Split ``documents`` into smaller chunks.

    Args:
        documents: Documents to split (as returned by ``load_docs``).
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to
            ``RecursiveCharacterTextSplitter``.

    Returns:
        The chunks produced by the splitter's ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)

# Split the loaded documents with the default chunk settings and print the chunk count.
docs = split_docs(documents)
print(len(docs))

2786


# Embedding Text Using LangChain

In [12]:
# Embedding function backed by the "all-MiniLM-L6-v2" sentence-transformers model
# (the model files are downloaded when this runs, as the output below shows).
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

# Creating Vector Store with Chroma DB

In [13]:
# Embed every chunk in `docs` and index it in a Chroma store.
# No persist_directory is passed here — presumably this store is not saved to disk.
db = Chroma.from_documents(docs, embeddings)

Batches:   0%|          | 0/88 [00:00<?, ?it/s]

# Retrieving Semantically Similar Documents

In [15]:
# Ask the store for the chunks most similar to the query and show the top hit.
query = "Who are considered allies of  Donald Trump?"
matching_docs = db.similarity_search(query)

matching_docs[0]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Document(page_content='very smart guy. We get along. Getting along with foreign powers is not a bad thing. We\'re trying to teach that to the media and these idiots that ran our government for so many years. These fools. These fools. You know, you got nuclear weapons, you got big armies, big navies, "He gets along with Putin. That\'s a terrible thing." No, it\'s a good thing. If I get along, that\'s good. He likes me. I like him. Not so bad. "He gets along with Kim Jong-un." … Not so bad. He gets along with Kim Jong-Un.', metadata={'source': '/kaggle/input/donald-trumps-rallies/FreelandSep10_2020.txt'})

# Persistence in Chroma DB


In [16]:
import os  # imported here because `os` is not imported in any cell above this one

persist_directory = "chroma_db"

# Chroma.from_documents embeds and inserts every chunk each time it runs, so
# re-running this cell against an existing store would re-embed all chunks and
# add them to the persisted collection a second time. Reuse the store already
# on disk when there is one and only build it on the first run.
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    vectordb = Chroma(
        persist_directory=persist_directory, embedding_function=embeddings
    )
else:
    vectordb = Chroma.from_documents(
        documents=docs, embedding=embeddings, persist_directory=persist_directory
    )
    # Write the newly built index to persist_directory.
    vectordb.persist()

Batches:   0%|          | 0/88 [00:00<?, ?it/s]

# Using OpenAI Large Language Models (LLMs) with Chroma DB
import os

os.environ["OPENAI_API_KEY"] = "key"

from langchain.chat_models import ChatOpenAI

model_name = "gpt-3.5-turbo"

llm = ChatOpenAI(model_name=model_name)

In [21]:
import os
from getpass import getpass

from langchain.chat_models import ChatOpenAI

# Never store the API key in the notebook. Keep a key that is already present
# in the environment and prompt for one (without echoing it) only when it is
# missing; assigning "" here would overwrite a valid key from the environment
# and leave the client without credentials.
# NOTE(review): getpass needs an interactive session; for unattended runs set
# OPENAI_API_KEY in the environment before starting the kernel.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Chat model used by the question-answering chains below.
model_name = "gpt-3.5-turbo"
llm = ChatOpenAI(model_name=model_name)

# Extracting Answers from Documents

In [40]:
from langchain.chains.question_answering import load_qa_chain
# "stuff" chain type; verbose=True prints the formatted prompt (visible in the output below).
chain = load_qa_chain(llm, chain_type="stuff",verbose=True)

query = "What were the foreign policies of Donald Trump?"
# Retrieve the chunks most similar to the question and pass them to the chain as its input documents.
matching_docs = db.similarity_search(query)
answer =  chain.run(input_documents=matching_docs, question=query)
answer

Batches:   0%|          | 0/1 [00:00<?, ?it/s]



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the users question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
He never did anything about it. Biden's supported every globalist attack on the American worker let's make every country of the world rich but ourselves, including NAFTA, the disaster of NAFTA. He wanted to go and TPP would have ruined our automobile industry. Korea, I renegotiated the deal. I took a horrible deal. That was a Hillary Clinton special. That was a Hillary Clinton special. She said, "We have to make this deal with South Korea. It's going to mean 250,000 jobs." And she was right for South Korea, 250,000. And China's entry into the World Trade Organization was supported by sleepy Joe Biden. That was one of the worst deals economically in the history of our count

"Donald Trump's foreign policies focused on putting America first and prioritizing American workers. He criticized globalist policies such as NAFTA and the Trans-Pacific Partnership (TPP), which he believed were detrimental to the American workforce. He renegotiated trade deals, such as the one with South Korea, in an effort to benefit the United States. Trump also criticized China's entry into the World Trade Organization and accused China of stealing American jobs and intellectual property. He took a tough stance on China, implementing tariffs and engaging in a trade war. Trump also highlighted his opposition to the Iraq War and criticized Joe Biden's support for it. Overall, Trump's foreign policies were characterized by a focus on protecting American interests and challenging what he saw as unfair trade practices."

In [41]:
from langchain.chains import RetrievalQA
# The chain is given db.as_retriever() and only the question, so no separate
# similarity_search call is made here. Reuses `query` from the preceding cell.
retrieval_chain = RetrievalQA.from_chain_type(llm, chain_type="stuff", retriever=db.as_retriever())
retrieval_chain.run(query)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

"From the given context, it is clear that Donald Trump focused on putting America first in his foreign policies. He criticized globalist approaches like NAFTA and the Trans-Pacific Partnership (TPP), which he believed harmed American workers. He renegotiated deals like the one with South Korea to protect American jobs. Trump also criticized China's entry into the World Trade Organization and accused them of stealing American jobs and intellectual property. He implemented tariffs on Chinese goods and aimed to hold China accountable for its actions. Additionally, Trump opposed the war in Iraq and supported strong immigration enforcement, while criticizing Biden's stance on amnesty and sanctuary cities. These are some of the foreign policy positions and actions associated with Donald Trump based on the provided context."