In [107]:
# ! pip install -U unstructured
# ! pip install langchain-community
# ! pip install sentence-transformers
# ! pip install faiss-cpu
# ! pip install -U langchain-core langchain-mistralai
# ! pip install mistralai
# ! pip install pdfminer
# ! pip install pypdf

In [108]:
import os
import warnings
from urllib.request import urlretrieve

import numpy as np
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_community.document_loaders.pdf import PyPDFLoader, PyPDFDirectoryLoader
from langchain_core.messages import HumanMessage
from langchain_mistralai.chat_models import ChatMistralAI
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

from keys import MISTRAL_KEY, OPENAI_KEY
warnings.filterwarnings('ignore')

# Data Parsing

### PDF and URL Sources About the Propaganda Techniques (21 sources in total)

In [109]:
# PDF sources (3 files). These are downloaded to a local folder and parsed
# separately from the web pages.

pdf_documents = [
    "https://www.bxscience.edu/ourpages/auto/2010/5/10/56085562/propaganda.pdf",
    "https://todosobrelacorte.wordpress.com/wp-content/uploads/2012/11/teninbaum-reductio-ad-hitlerum.pdf",
    "https://files.eric.ed.gov/fulltext/EJ1046525.pdf"]

# Web-page sources (18 URLs), one per propaganda technique or definition page.
# These are loaded directly from the network with UnstructuredURLLoader.
url_documents = [
    "https://en.wikipedia.org/wiki/Straw_man",
    "https://en.wikipedia.org/wiki/Red_herring",
    "https://en.wikipedia.org/wiki/Whataboutism",
    "https://en.wikipedia.org/wiki/Fallacy_of_the_single_cause",
    "https://en.wikipedia.org/wiki/Obfuscation",
    "https://en.wikipedia.org/wiki/Argument_from_authority",
    "https://disinfo.detector.media/en/post/how-russian-propaganda-uses-flag-waving-tactics",
    "https://en.wikipedia.org/wiki/Fear,_uncertainty,_and_doubt",
    "https://en.wikipedia.org/wiki/Appeal_to_fear",
    "https://psychology.org.au/about-us/news-and-media/aps-in-the-media/2024/explaining-thought-terminating-cliches-and-why-we",
    "https://en.wikipedia.org/wiki/Thought-terminating_clich%C3%A9",
    "https://en.wikipedia.org/wiki/Bandwagon_effect",
    "https://en.wikipedia.org/wiki/Reductio_ad_Hitlerum",
    "https://propaganda.qcri.org/annotations/definitions.html",
    "https://en.wikipedia.org/wiki/Name_calling",
    "https://en.wikipedia.org/wiki/Loaded_language", 
    "https://en.wikipedia.org/wiki/Exaggeration",
    "https://en.wikipedia.org/wiki/Slogan"]

### Parsing

In [110]:
# Parse the web pages listed in url_documents with LangChain's UnstructuredURLLoader.
# The printed length is a quick check of how many documents came back.

loader = UnstructuredURLLoader(urls=url_documents)
docs = loader.load()
print('Type:', type(docs), 'Length:', len(docs))

Type: <class 'list'> Length: 18


In [111]:
# Parse the PDF sources via PyPDFDirectoryLoader from the LangChain framework.
# Unfortunately, UnstructuredURLLoader did not give good results for these, so
# each PDF is downloaded into a local folder first and parsed from disk.

os.makedirs("pdf_storage", exist_ok=True)
for url in pdf_documents:
    # The local file name is the last path segment of the URL.
    file_path = os.path.join("pdf_storage", url.rpartition("/")[2])
    # Skip files that are already on disk so re-running the notebook does not
    # download them again.
    if os.path.exists(file_path):
        continue
    # Download under a temporary name and rename on success, so an interrupted
    # transfer never leaves a truncated file under the final .pdf name.
    partial_path = file_path + ".part"
    urlretrieve(url, partial_path)
    os.replace(partial_path, file_path)

load = PyPDFDirectoryLoader("./pdf_storage/")
documents = load.load()
print('Type:', type(documents), 'Length:', len(documents))

Type: <class 'list'> Length: 52


In [112]:
# Merge the web-page documents and the PDF documents into a single knowledge source.

united_docs = [*docs, *documents]
len(united_docs)

70

# Data Splitting 

In [113]:
# Split the combined documents into chunks.
# The chunking parameters are named up front so they are easy to tune.

CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
SEPARATORS = ["\n\n", "\n", ".", "!", "?", ",", " ", ""]

text_splitter = RecursiveCharacterTextSplitter(
    separators=SEPARATORS,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = text_splitter.split_documents(united_docs)

print(f'We received {len(chunks)} in total.')

We received 1278 in total.


In [114]:
# Display one chunk (index 33) to inspect its text and metadata after the split.
chunks[33]

Document(page_content='what they attack, or pretend what they attack.)\n\nLuther\'s Latin text does not use the phrase "man of straw". This is used in a widespread early 20th century English translation of his work, the Philadelphia Edition[23]\n\nMy answer is, that this sort of argument is common to all those who write against Luther. They assert the very things they assail, or they set up a man of straw whom they may attack.[24][25]', metadata={'source': 'https://en.wikipedia.org/wiki/Straw_man'})

In [115]:
#Some statistics about the chunks.

def avg_doc_length(docu):
    """Return the average `page_content` length of `docu`, floored to an int.

    Args:
        docu: A sized collection of objects that expose a `page_content` string.

    Returns:
        The total number of characters across all items, integer-divided by
        the number of items; 0 when `docu` is empty.
    """
    if not docu:
        # An empty collection has no meaningful average; avoid dividing by zero.
        return 0
    return sum(len(d.page_content) for d in docu) // len(docu)
# Average document length before and after splitting, reported side by side.
avg_char_count_pre, avg_char_count_post = (
    avg_doc_length(collection) for collection in (united_docs, chunks)
)

print(f'Average length among {len(united_docs)} documents loaded is {avg_char_count_pre} characters.')
print(f'After the split we have {len(chunks)} chunks.')
print(f'Average length among {len(chunks)} documents (after split) is {avg_char_count_post} characters.')

Average length among 70 documents loaded is 6811 characters.
After the split we have 1278 chunks.
Average length among 1278 documents (after split) is 379 characters.


# Embeddings

In [116]:
# Embed the chunks with the all-MiniLM-L6-v2 sentence-transformers model.

modelPath = "sentence-transformers/all-MiniLM-l6-v2"
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=False)

# Build the HuggingFaceEmbeddings wrapper from the settings above.
embeddings = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,  # options applied when encoding text
    model_kwargs=model_kwargs,    # options applied when loading the model
    model_name=modelPath,         # which pre-trained model to load
)

# Index all chunks in FAISS, then persist the index locally so it can be reloaded later.
vectorstore_faiss = FAISS.from_documents(chunks, embeddings)
vectorstore_faiss.save_local(folder_path="faiss_db_v3")

In [117]:
# Example of using the vector store for a plain similarity search.
# NOTE(review): "hearing" looks like a typo for "herring" (the passages retrieved
# below are about "red herring"). The literal is left unchanged because the
# recorded outputs in this notebook were produced with it.

test_query = "What is red hearing?"
# NOTE(review): this result is reassigned in the next cell before it is displayed.
searchDocs = vectorstore_faiss.similarity_search(test_query)

In [118]:
# similarity_search_with_score returns each matching document together with the
# distance between that document and the query.
# The returned distance score is L2 distance, so a smaller value means a closer match.

searchDocs = vectorstore_faiss.similarity_search_with_score(test_query)
searchDocs

[(Document(page_content='expression “red herring.”84  Like a hound, those subjected to a red herring \nargument are led away from the issue th at had been the focus of the discus-\nsion and urged to follow an observation or claim that may be associated with the original claim, but is not highly relevant to the truth of the issue in \ndispute.\n85 \nOne example of a RAH red he rring argument appeared in Liebovitz v. \nNew York City Transit Authority ,86 where a “highly-regarded member of', metadata={'source': 'pdf_storage/teninbaum-reductio-ad-hitlerum.pdf', 'page': 15}),
  1.1416141),
 (Document(page_content='Until 2008,[13] the figurative sense of "red herring" was thought to originate from a supposed technique of training young scent hounds.[13] There are variations of the story, but according to one version, the pungent red herring would be dragged along a trail until a puppy learned to follow the scent', metadata={'source': 'https://en.wikipedia.org/wiki/Red_herring'}),
  1.200485)

In [119]:
# Same lookup as above, but the query is embedded explicitly and the search is done by vector.

query_embedding = vectorstore_faiss.embedding_function.embed_query(test_query)
relevant_documents = vectorstore_faiss.similarity_search_by_vector(query_embedding)

print(f'{len(relevant_documents)} documents are fetched which are relevant to the query.')
print('----')
# Number the hits from 1 for display.
for position, rel_doc in enumerate(relevant_documents, start=1):
    print(f'## Document {position}: {rel_doc.page_content}.......')
    print('---')

4 documents are fetched which are relevant to the query.
----
## Document 1: expression “red herring.”84  Like a hound, those subjected to a red herring 
argument are led away from the issue th at had been the focus of the discus-
sion and urged to follow an observation or claim that may be associated with the original claim, but is not highly relevant to the truth of the issue in 
dispute.
85 
One example of a RAH red he rring argument appeared in Liebovitz v. 
New York City Transit Authority ,86 where a “highly-regarded member of.......
---
## Document 2: Until 2008,[13] the figurative sense of "red herring" was thought to originate from a supposed technique of training young scent hounds.[13] There are variations of the story, but according to one version, the pungent red herring would be dragged along a trail until a puppy learned to follow the scent.......
---
## Document 3: ^ "Episode 148: Hair of the Dog". MythBusters Results. Archived from the original on 31 December 2021. Retr

# LLM

In [120]:
# Call the Mistral chat model through LangChain's ChatMistralAI wrapper
# and send one simple question as a smoke test.

MISTRAL_API_KEY = MISTRAL_KEY
model = "mistral-large-latest"
messages = [HumanMessage(content="What is the capital of Ukraine?")]

llm = ChatMistralAI(model=model, api_key=MISTRAL_API_KEY)
llm.invoke(messages)

AIMessage(content="The capital of Ukraine is Kyiv. It's a beautiful city known for its religious architecture, secular monuments, and history museums. Kyiv is located in the north central part of the country and is a significant industrial, scientific, educational, and cultural center of Eastern Europe.", response_metadata={'token_usage': {'prompt_tokens': 10, 'total_tokens': 69, 'completion_tokens': 59}, 'model': 'mistral-large-latest', 'finish_reason': 'stop'}, id='run-b11a3204-4bfe-406c-880b-2a47e181e28d-0')

# RAG Test

In [121]:
# Test the retrieval system end to end with a simple prompt and query.

# The retrieved chunks are substituted for {context} and the user query for {question}.
prompt_template = """

H: Use the following pieces of context to provide a concise answer to the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer. Provide examples if possible.
<context>
{context}
</context>

Question: {question}

A:"""

PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

# Return the 3 most relevant chunks (k=3) and keep the source documents in the
# result so the origin of the answer can be inspected afterwards.

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore_faiss.as_retriever(
        search_type="similarity", search_kwargs={"k": 3}),
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT}
)
# Use .invoke(), matching the llm.invoke(...) call used for the model itself;
# calling the chain object directly is the deprecated form.
answer = qa.invoke({"query": test_query})
print(answer['result'])

The term "red herring" is a figurative expression used to describe a logical fallacy or a diversionary tactic. It refers to an argument or piece of information that is introduced to mislead or distract from the main issue or point of discussion. This misdirection is similar to how a pungent red herring (a strongly smoked fish) was once used to train young scent hounds, leading them away from the original scent they were supposed to follow. In a debate or argument, a red herring leads the conversation away from the central topic, focusing instead on a less relevant or unrelated matter. An example of this can be found in the case of Liebovitz v. New York City Transit Authority, where such a tactic was employed.


In [122]:
# List the metadata of every source document that backed the answer above.

[source_doc.metadata for source_doc in answer["source_documents"]]

[{'source': 'pdf_storage/teninbaum-reductio-ad-hitlerum.pdf', 'page': 15},
 {'source': 'https://en.wikipedia.org/wiki/Red_herring'},
 {'source': 'https://en.wikipedia.org/wiki/Red_herring'}]

# Loading 

In [123]:
# Load the saved FAISS index so the parsing/splitting/embedding steps do not have to be repeated.
# NOTE: allow_dangerous_deserialization=True opts in to loading pickled data.
# That is acceptable here only because "faiss_db_v3" was written by this
# notebook's own save_local call; do not point this at an index from an
# untrusted source.

docsearch = FAISS.load_local("faiss_db_v3", embeddings, allow_dangerous_deserialization = True)

In [124]:
# Check the reloaded index by running the same test query against it.

searchDocs = docsearch.similarity_search(test_query)
searchDocs

[Document(page_content='expression “red herring.”84  Like a hound, those subjected to a red herring \nargument are led away from the issue th at had been the focus of the discus-\nsion and urged to follow an observation or claim that may be associated with the original claim, but is not highly relevant to the truth of the issue in \ndispute.\n85 \nOne example of a RAH red he rring argument appeared in Liebovitz v. \nNew York City Transit Authority ,86 where a “highly-regarded member of', metadata={'source': 'pdf_storage/teninbaum-reductio-ad-hitlerum.pdf', 'page': 15}),
 Document(page_content='Until 2008,[13] the figurative sense of "red herring" was thought to originate from a supposed technique of training young scent hounds.[13] There are variations of the story, but according to one version, the pungent red herring would be dragged along a trail until a puppy learned to follow the scent', metadata={'source': 'https://en.wikipedia.org/wiki/Red_herring'}),
 Document(page_content='^ "