In [3]:
# ! pip install langchain-community
# ! pip install sentence-transformers
# ! pip install faiss-cpu
# ! pip install -U langchain-core langchain-mistralai
# ! pip install mistralai
# ! pip install omegaconf
# ! pip install torch==2.1.0
# ! pip install gradio
# ! pip install ragas
# ! pip install datasets

In [11]:
import os
import re
import json
import time
import pandas as pd
from datasets import Dataset 
import gradio as gr
import openai
import omegaconf
import torch
from ragas.metrics import (context_precision,
    answer_relevancy,
    faithfulness,
    context_recall,
    context_relevancy,
    context_entity_recall,
    answer_correctness
)
from ragas import evaluate
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_mistralai.chat_models import ChatMistralAI
from langchain_openai.chat_models import ChatOpenAI
from code_base.inference import PropInferenceWrapper
from keys import MISTRAL_KEY, OPENAI_KEY
import warnings
warnings.filterwarnings('ignore')

# Creating Dataset with Techniques and their Explanations

In [5]:
#Let's create dataset with the labels and their explanations.

indices = ['Slogans', 'Black-and-White_Fallacy', 'Loaded_Language', 'Flag-Waving',
 'Name_Calling,Labeling','Whataboutism', 'Causal_Oversimplification', 'Exaggeration,Minimisation',
 'Doubt', 'Appeal_to_Authority', 'Repetition', 'Appeal_to_fear-prejudice', 'Red_Herring',
 'Thought-terminating_Cliches', 'Bandwagon', 'Reductio_ad_hitlerum', 'Obfuscation',
 'Intentional_Vagueness,Confusion', 'Straw_Men']

labels_dataframe = pd.DataFrame(columns = ['ground_truth'], index = indices)

In [6]:
labels_dataframe.loc['Slogans', 'ground_truth'] = "A brief and striking phrase that may include labeling and stereotyping. Slogans tend to act as emotional appeals."
labels_dataframe.loc['Black-and-White_Fallacy', 'ground_truth'] = "Presenting two alternative options as the only possibilities, when in fact more possibilities exist. As an the extreme case, tell the audience exactly what actions to take, eliminating any other possible choices (Dictatorship)."
labels_dataframe.loc['Loaded_Language', 'ground_truth'] = "Using words/phrases with strong emotional implications (positive or negative) to influence an audience"
labels_dataframe.loc['Flag-Waving', 'ground_truth'] = "Playing on strong national feeling (or to any group; e.g., race, gender, political preference) to justify or promote an action or idea"
labels_dataframe.loc['Name_Calling,Labeling', 'ground_truth'] = "Labeling the object of the propaganda campaign as either something the target audience fears, hates, finds undesirable or loves, praises."
labels_dataframe.loc['Whataboutism', 'ground_truth'] = "A technique that attempts to discredit an opponent's position by charging them with hypocrisy without directly disproving their argument."
labels_dataframe.loc['Causal_Oversimplification', 'ground_truth'] = "Assuming a single cause or reason when there are actually multiple causes for an issue. It includes transferring blame to one person or group of people without investigating the complexities of the issue"
labels_dataframe.loc['Exaggeration,Minimisation', 'ground_truth'] = "Either representing something in an excessive manner: making things larger, better, worse (e.g., 'the best of the best', 'quality guaranteed') or making something seem less important or smaller than it really is (e.g., saying that an insult was just a joke)."
labels_dataframe.loc['Doubt', 'ground_truth'] = "Questioning the credibility of someone or something."
labels_dataframe.loc['Appeal_to_Authority', 'ground_truth'] = "Stating that a claim is true simply because a valid authority or expert on the issue said it was true, without any other supporting evidence offered. We consider the special case in which the reference is not an authority or an expert in this technique, altough it is referred to as Testimonial in literature."
labels_dataframe.loc['Repetition', 'ground_truth'] = "Repeating the same message over and over again so that the audience will eventually accept it."
labels_dataframe.loc['Appeal_to_fear-prejudice', 'ground_truth'] = "Seeking to build support for an idea by instilling anxiety and/or panic in the population towards an alternative. In some cases the support is built based on preconceived judgements."
labels_dataframe.loc['Red_Herring', 'ground_truth'] = "Introducing irrelevant material to the issue being discussed, so that everyone's attention is diverted away from the points made."
labels_dataframe.loc['Thought-terminating_Cliches', 'ground_truth'] = "Words or phrases that discourage critical thought and meaningful discussion about a given topic. They are typically short, generic sentences that offer seemingly simple answers to complex questions or that distract attention away from other lines of thought."
labels_dataframe.loc['Bandwagon', 'ground_truth'] = "Attempting to persuade the target audience to join in and take the course of action because 'everyone else is taking the same action'."
labels_dataframe.loc['Reductio_ad_hitlerum', 'ground_truth'] = "Persuading an audience to disapprove an action or idea by suggesting that the idea is popular with groups hated in contempt by the target audience. It can refer to any person or concept with a negative connotation."
labels_dataframe.loc['Obfuscation,Intentional_Vagueness,Confusion', 'ground_truth'] = "Using words which are deliberately not clear so that the audience may have its own interpretations.For example when an unclear phrase with multiple definitions is used within the argument and, therefore, it does not support the conclusion."
labels_dataframe.loc['Straw_Men', 'ground_truth'] = "When an opponent's proposition is substituted with a similar one which is then refuted in place of the original proposition."

In [7]:
#Test

labels_dataframe.loc['Slogans', 'ground_truth']

'A brief and striking phrase that may include labeling and stereotyping. Slogans tend to act as emotional appeals.'

# Test Dataset

In [42]:
#Download test dataset.

test_data = pd.read_csv('test_dataset.csv')
test_data

Unnamed: 0,news,label,location_markers,extracted_manupulation
0,Swedish PM does not rule out use of army to en...,Loaded_Language,"[1243, 1254]",stamped out
1,Swedish PM does not rule out use of army to en...,Flag-Waving,"[1874, 2026]","This is the new Sweden; the new, exciting dyna..."
2,Israel Takes On the Shia Crescent\n\nDespite I...,Loaded_Language,"[86, 106]",reckless appeasement
3,Israel Takes On the Shia Crescent\n\nDespite I...,Loaded_Language,"[820, 830]",disastrous
4,Israel Takes On the Shia Crescent\n\nDespite I...,Loaded_Language,"[2377, 2400]",\n“It’s a new monster.”\n
...,...,...,...,...
922,"SNL Indian Comedian Silenced for ""Offensive Jo...",Slogans,"[191, 213]",the Left killed comedy
923,"SNL Indian Comedian Silenced for ""Offensive Jo...","Exaggeration,Minimisation","[1043, 1148]","no one looks in the mirror and thinks, ‘this b..."
924,"SNL Indian Comedian Silenced for ""Offensive Jo...","Name_Calling,Labeling","[1164, 1184]",Columbia snowflakes
925,"SNL Indian Comedian Silenced for ""Offensive Jo...","Exaggeration,Minimisation","[1607, 1674]","Comrades, these jokes you have been listening ..."


# Creating Dataset for RAG Evaluation

In [43]:
#Create a Mistral client, load the transformer, and specify some parameters

MISTRAL_API_KEY = MISTRAL_KEY
client = MistralClient(api_key=MISTRAL_API_KEY)
model = "mistral-large-latest"

t_inf = PropInferenceWrapper(
config_path="config.yaml",
chkp_path="best.pth",
tresh=0.1)

prompt_template = """Human: You are a brilliant media expert skilled at explaining manipulation techniques in news articles. 
Your colleagues have identified several such manipulations but did not provide explanations for their classifications. 
We trust their judgment as correct. Your task is to logically explain why each one fits its assigned label using using provided context. If you're unsure, simply state that you don't know — avoid making up an answer. 
Do not doubt labelling of your colleagues. 
Instructions:
Please limit your explanation to up to 20 words for each example.
Never repeat query in your answer!
Format your output as bullet points where each line should look like this: 
label - detected example - explanation (up to 15 words). 
Always (!) begin line with label not with detected example!!! For example:
- Exaggeration, Minimisation - "done next to nothing" - makes something seem less important or smaller than it really is
<context>
{context}
</context
Question: {question}
Assistant:"""

modelPath = "sentence-transformers/all-MiniLM-l6-v2"
model_kwargs = {'device':'cpu'}
encode_kwargs = {'normalize_embeddings': False}


Loading config ...
Config loaded
Loading model ...
Treshold value = 0.1. Recommended treshold = 0.5
Model loaded
Loading tokenizer ...
Tokenizer loaded


In [44]:
def eval_dataset(news_article):
    """
    Prepares the data needed for evaluating manipulative phrases in a news article by processing 
    the text to detect significant phrases, querying a question-answering system for explanations, 
    and organizing the resulting data into lists for further analysis.

    Parameters:
    news_article (str): A string containing the news article text to be analyzed.
    
    """
    # Predict tags for each word in the article
    result = t_inf.predict(news_article)
    words, tags = result[0], result[1]
    
    # Create DataFrame from words and tags, filter out tags 'O'
    df = pd.DataFrame(list(zip(words, tags)), columns=['Word', 'Tag'])
    filtered_df = df[df['Tag'] != 'O']
    filtered_df['Word'] = filtered_df['Word'].str.strip()
    
    # Group words by their tags, create a dictionary of tags and their corresponding phrases
    filtered_df['group'] = (filtered_df['Tag'] != filtered_df['Tag'].shift(1)).cumsum()
    grouped = filtered_df.groupby(['Tag', 'group'])['Word'].apply(' '.join).reset_index()
    manipulation_dict = grouped.groupby('Tag')['Word'].apply(list).to_dict()

    llm = ChatMistralAI(api_key=MISTRAL_API_KEY, model=model, temperature=0.1)
    embeddings = HuggingFaceEmbeddings(model_name=modelPath, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)
    
    # Load a FAISS vector store for efficient similarity search
    vectorstore_faiss = FAISS.load_local("faiss_db_v3", embeddings, allow_dangerous_deserialization=True)

    # Template for generating prompts for the QA model
    PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    
    # Setup a retrieval-based QA system
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore_faiss.as_retriever(
            search_type="similarity", search_kwargs={"k": 3}),
        return_source_documents=True,
        chain_type_kwargs={"prompt": PROMPT}
    )
    
    # Generate questions and get answers from the QA system
    for k, v in manipulation_dict.items():
        question = f"Why do the detected manipulations from this list {v} belong to the label {k}?"
        answer = qa({"query": question})
        list_questions.append(question)
        
        # Clean the answer text
        answer['result'] = answer['result'].strip()
        answer['result'] = re.sub(r'^[ \t]+', '', answer['result'], flags=re.MULTILINE)
        list_answers.append(answer['result'])
        
        # Extract and clean page content from source documents
        content_start = str(answer['source_documents']).find("page_content='") + len("page_content='")
        content_end = str(answer['source_documents']).find("metadata", content_start)
        page_content = str(answer['source_documents'])[content_start:content_end]
        page_content = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', page_content)
        list_context.append([page_content])
        
        # Retrieve ground truth for the tag from a predefined DataFrame
        list_ground.append(labels_dataframe.loc[k, 'ground_truth'])

# Creating Evaluation Dataset

In [47]:
#Let's find out how many unique articles contains test dataset.

all_articles = list(test_data['news'].unique())
print(f'There are {len(all_articles)} unique articles in the test dataset.')

There are 57 unique articles in the test dataset.


In [48]:
#Create lists for the future evaluation dataset.
list_questions = []
list_answers = []
list_context = []
list_ground = []

for article in all_articles:
    eval_dataset(article)

100%|██████████| 1/1 [00:00<00:00,  2.00it/s]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:01<00:00,  1.49s/it]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:01<00:00,  1.57s/it]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:01<00:00,  1.18s/it]
100%|██████████| 1/1 [00:00<00:00,  2.57it/s]


Slicing is used to handle long samples


100%|██████████| 2/2 [00:03<00:00,  1.73s/it]
100%|██████████| 1/1 [00:00<00:00,  3.32it/s]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:01<00:00,  1.18s/it]


Slicing is used to handle long samples


100%|██████████| 2/2 [00:02<00:00,  1.15s/it]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:00<00:00,  1.06it/s]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:01<00:00,  1.14s/it]
100%|██████████| 1/1 [00:00<00:00,  2.38it/s]


Slicing is used to handle long samples


100%|██████████| 2/2 [00:02<00:00,  1.42s/it]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:01<00:00,  1.41s/it]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:00<00:00,  1.06it/s]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:01<00:00,  1.04s/it]
100%|██████████| 1/1 [00:00<00:00,  2.17it/s]
100%|██████████| 1/1 [00:00<00:00,  3.70it/s]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:00<00:00,  1.23it/s]
100%|██████████| 1/1 [00:00<00:00,  3.87it/s]
100%|██████████| 1/1 [00:00<00:00,  2.03it/s]
100%|██████████| 1/1 [00:00<00:00,  1.97it/s]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:00<00:00,  1.22it/s]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:01<00:00,  1.39s/it]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:00<00:00,  1.19it/s]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:00<00:00,  1.17it/s]


Slicing is used to handle long samples


100%|██████████| 2/2 [00:03<00:00,  1.69s/it]
100%|██████████| 1/1 [00:00<00:00,  2.86it/s]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:01<00:00,  1.25s/it]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:01<00:00,  1.57s/it]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:02<00:00,  2.01s/it]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:00<00:00,  1.19it/s]
100%|██████████| 1/1 [00:00<00:00,  2.44it/s]


Slicing is used to handle long samples


100%|██████████| 2/2 [00:04<00:00,  2.11s/it]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:01<00:00,  1.47s/it]


Slicing is used to handle long samples


100%|██████████| 3/3 [00:06<00:00,  2.01s/it]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:01<00:00,  1.45s/it]


Slicing is used to handle long samples


100%|██████████| 2/2 [00:02<00:00,  1.22s/it]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:01<00:00,  1.61s/it]


Slicing is used to handle long samples


100%|██████████| 2/2 [00:01<00:00,  1.32it/s]
100%|██████████| 1/1 [00:00<00:00,  2.64it/s]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:00<00:00,  1.40it/s]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:00<00:00,  1.39it/s]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:00<00:00,  1.13it/s]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:01<00:00,  1.11s/it]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:00<00:00,  1.18it/s]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:00<00:00,  1.10it/s]
100%|██████████| 1/1 [00:00<00:00,  1.77it/s]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:00<00:00,  1.11it/s]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:01<00:00,  1.45s/it]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:01<00:00,  1.06s/it]
100%|██████████| 1/1 [00:00<00:00,  4.34it/s]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:01<00:00,  1.19s/it]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:00<00:00,  1.07it/s]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:01<00:00,  1.50s/it]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:01<00:00,  1.07s/it]


Slicing is used to handle long samples


100%|██████████| 1/1 [00:00<00:00,  1.29it/s]


In [50]:
#Save evaluation dataset.

eval_dict = {'question': list_questions, 'answer': list_answers, 'contexts': list_context, 'ground_truth': list_ground}
eval_data = pd.DataFrame(eval_dict)
eval_data.to_csv('Eval_dataset_v3.csv')
print(f'Evaluation dataset with the shape {eval_data.shape} is saved.')

Evaluation dataset with the shape (313, 4) is saved.


# Metrics

In [13]:
#Download saved dataset and prepare it for Ragas evaluation.

dataset_eval = pd.read_csv('Eval_dataset_v3.csv')
dataset_eval['contexts'] = dataset_eval['contexts'].apply(lambda x: [x])
dataset_eval = Dataset.from_pandas(dataset_eval)

In [15]:
#Evaluate 250 samples because of OPENAI tokens limitations.

os.environ["OPENAI_API_KEY"] = OPENAI_KEY
gpt_turbo = ChatOpenAI(model_name="gpt-3.5-turbo-0125")

metrics = [faithfulness,
        answer_relevancy,
        context_recall,
        context_precision,
        context_relevancy,
        answer_correctness]

scores = []
for m in metrics:
    score = evaluate(dataset_eval.select(range(250)), metrics=[m], llm = gpt_turbo)
    scores.append(score)
    time.sleep(60)

Evaluating:   0%|          | 0/250 [00:00<?, ?it/s]

Failed to parse output. Returning None.
Failed to parse output. Returning None.


Evaluating:   0%|          | 0/250 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/250 [00:00<?, ?it/s]

Failed to parse output. Returning None.
Failed to parse output. Returning None.
Failed to parse output. Returning None.
Failed to parse output. Returning None.
Failed to parse output. Returning None.


Evaluating:   0%|          | 0/250 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/250 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/250 [00:00<?, ?it/s]

Failed to parse output. Returning None.
Failed to parse output. Returning None.
Failed to parse output. Returning None.
Failed to parse output. Returning None.
Failed to parse output. Returning None.
Failed to parse output. Returning None.
Failed to parse output. Returning None.


In [21]:
#Make pandas dataset for presentation purposes.

data_transformed = {list(d.keys())[0]: list(d.values())[0] for d in scores}
scores_df = pd.DataFrame(list(data_transformed.items()), columns=['Index', 'Scores'])
scores_df.set_index('Index', inplace=True)
scores_df

Unnamed: 0_level_0,Scores
Index,Unnamed: 1_level_1
faithfulness,0.363135
answer_relevancy,0.711509
context_recall,0.786108
context_precision,0.676
context_relevancy,0.564538
answer_correctness,0.730049


The retrieval metrics are generally high. However, the faithfulness score, which measures the factual consistency of the generated answer against the given context, is low. I should work on refining the prompt to better integrate the context.

Additionally, the context relevancy score is low. This metric assesses the relevance of the retrieved context based on both the question and the contexts. To improve this, I might consider not only retrieving the four closest documents but also establishing a threshold. For instance, setting up the retriever with a similarity score threshold might be effective (retriever = db.as_retriever(search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.7}))."