In [3]:
# ! pip install langchain-community
# ! pip install sentence-transformers
# ! pip install faiss-cpu
# ! pip install -U langchain-core langchain-mistralai
# ! pip install mistralai
# ! pip install omegaconf
# ! pip install torch==2.1.0
# ! pip install gradio

In [5]:
import json
import omegaconf
import torch
import seaborn as sns
import gradio as gr
import pandas as pd
import re
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_mistralai.chat_models import ChatMistralAI
from code_base.inference import PropInferenceWrapper
from keys import MISTRAL_KEY, OPENAI_KEY
import warnings
warnings.filterwarnings('ignore')

In [6]:
# Set up the Mistral API client and load the tagging model wrapper.

MISTRAL_API_KEY = MISTRAL_KEY
client = MistralClient(api_key=MISTRAL_API_KEY)
model = "mistral-large-latest"

# NOTE: the keyword really is spelled `tresh` in this call; keep it in sync
# with PropInferenceWrapper's signature rather than "correcting" it here.
t_inf = PropInferenceWrapper(
    config_path="config.yaml",
    chkp_path="best.pth",
    tresh=0.5,
)

# Prompt for the retrieval QA chain. It has exactly two placeholders:
# {context} (wrapped in <context>...</context> tags) and {question}; both are
# declared as input variables where the PromptTemplate is built.
prompt_template = """Human: You are a brilliant media expert skilled at explaining manipulation techniques in news articles. Your colleagues have identified several such manipulations but did not provide explanations for their classifications. We trust their judgment as correct. Your task is to logically explain why each one fits its assigned label using the provided context. If you're unsure, simply state that you don't know — avoid making up an answer. Do not doubt labelling of your colleagues. 
Instructions:
Please limit your explanation to up to 20 words for each example.
Never repeat query in your answer!
If extracted fragments are in Russian do not translate them in English. Only give explanations in English.
Format your output as bullet points where each line should look like this: 
label - detected example - explanation (up to 20 words). 
Always (!) begin line with label not with detected example!!! For example:
- Exaggeration, Minimisation - "done next to nothing" - makes something seem less important or smaller than it really is
<context>
{context}
</context>
Question: {question}
Assistant:"""

# Settings forwarded to HuggingFaceEmbeddings inside analyze_news.
modelPath = "sentence-transformers/all-MiniLM-l6-v2"
# Options for model construction: run on CPU.
model_kwargs = dict(device='cpu')
# Options for encoding: embeddings are not normalised.
encode_kwargs = dict(normalize_embeddings=False)

Loading config ...
Config loaded
Loading model ...
Treshold value = 0.5. Recommended treshold = 0.5
Model loaded
Loading tokenizer ...
Tokenizer loaded


# Gradio Interface

In [7]:
def _group_manipulations(words, tags):
    """Collapse per-word tags into a ``{tag: [phrase, ...]}`` dictionary.

    Words tagged ``'O'`` are dropped, the remaining words are stripped of
    surrounding whitespace, and consecutive words sharing a tag are joined
    with single spaces into one phrase.

    Args:
        words: Iterable of word strings.
        tags: Iterable of tag strings, paired positionally with ``words``.

    Returns:
        dict: Tag -> list of phrases; empty when nothing is tagged.

    NOTE(review): runs are detected *after* the ``'O'`` words are removed, so
    two same-tag spans separated only by untagged words end up in a single
    phrase. This mirrors the previous behaviour — confirm it is intended.
    """
    tagged = pd.DataFrame(list(zip(words, tags)), columns=['Word', 'Tag'])
    # .copy() yields an independent frame, so the column assignments below do
    # not write into a slice of `tagged` (pandas SettingWithCopyWarning).
    flagged = tagged.loc[tagged['Tag'] != 'O'].copy()
    if flagged.empty:
        return {}
    flagged['Word'] = flagged['Word'].str.strip()
    # A new group id starts every time the tag differs from the previous row.
    flagged['group'] = (flagged['Tag'] != flagged['Tag'].shift(1)).cumsum()
    phrases = (
        flagged.groupby(['Tag', 'group'])['Word']
        .apply(' '.join)
        .reset_index()
    )
    return phrases.groupby('Tag')['Word'].apply(list).to_dict()


def _build_qa_chain():
    """Build the RetrievalQA chain: Mistral chat model + local FAISS retriever.

    NOTE(review): the embedding model and the FAISS index are re-created on
    every call; consider building the chain once if latency matters.
    """
    llm = ChatMistralAI(api_key=MISTRAL_API_KEY, model=model, temperature=0.1)
    embeddings = HuggingFaceEmbeddings(
        model_name=modelPath,        # Pre-trained embedding model
        model_kwargs=model_kwargs,   # Model configuration options
        encode_kwargs=encode_kwargs  # Encoding options
    )
    # SECURITY: allow_dangerous_deserialization=True loads a pickle-backed
    # index; "faiss_db_v3" must only ever come from a trusted source.
    vectorstore_faiss = FAISS.load_local(
        "faiss_db_v3", embeddings, allow_dangerous_deserialization=True
    )
    prompt = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"]
    )
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore_faiss.as_retriever(
            search_type="similarity", search_kwargs={"k": 3}
        ),
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt}
    )


def analyze_news(news_article):
    """
    Analyze a news article to identify, categorize and explain manipulative content.

    The text is tagged word by word with ``t_inf``; tagged words are grouped
    into phrases per label; for every label the retrieval QA chain is asked to
    explain why the phrases belong to it. The sources of the retrieved
    documents are listed once each, in order of first appearance.

    Args:
        news_article (str): The news article text to be analyzed.

    Returns:
        str: Markdown with an "Extracted Manipulations" section followed by a
        "Sources" section. When the text is empty or nothing is tagged, only
        the first section is returned, with a short notice; no LLM call is made.

    Relies on module-level configuration defined earlier in the notebook
    (``t_inf``, ``MISTRAL_API_KEY``, ``model``, ``prompt_template``,
    ``modelPath``, ``model_kwargs``, ``encode_kwargs``) and on the local
    ``faiss_db_v3`` index.
    """
    header = "## Extracted Manipulations\n"
    nothing_found = header + "No manipulations were detected in this text.\n"

    # Empty submission (e.g. blank textbox): nothing to tag.
    if not news_article or not news_article.strip():
        return nothing_found

    # Predict tags for each word in the news article
    result = t_inf.predict(news_article)
    manipulation_dict = _group_manipulations(result[0], result[1])
    if not manipulation_dict:
        # Previously this path fell through to code that read `answer`
        # although the query loop had never run (NameError).
        return nothing_found

    qa = _build_qa_chain()

    explanations = []
    source_docs = []
    # One question per label, listing every phrase detected for that label
    for label, fragments in manipulation_dict.items():
        query = f"Why do the detected manipulations from this list {fragments} belong to the label {label}?"
        answer = qa({"query": query})
        # Trim the answer and remove indentation at the start of each line
        cleaned = re.sub(r'^[ \t]+', '', answer['result'].strip(), flags=re.MULTILINE)
        explanations.append(cleaned)
        # Read the source from the document metadata (no string parsing, no
        # fixed index) and keep each source only once.
        for doc in answer.get('source_documents') or []:
            source = (getattr(doc, 'metadata', None) or {}).get('source')
            if source and source not in source_docs:
                source_docs.append(source)

    # Format the markdown output for the results
    markdown_output = header
    for explanation in explanations:
        markdown_output += f"{explanation}\n"
    markdown_output += "## Sources\n"
    for index, source in enumerate(source_docs, start=1):
        markdown_output += f"{index}. {source}\n"

    return markdown_output

In [9]:
import gradio as gr  # also imported in the imports cell near the top

# Web UI: the pasted article is passed to analyze_news and the markdown it
# returns is rendered directly.
interface = gr.Interface(
    fn=analyze_news,
    inputs=gr.Textbox(
        lines=10,
        label="Enter the news article for analysis",
        placeholder="Type or paste the news article here...",
    ),
    outputs=gr.Markdown(),
    title="The Manipulation Expert",
    allow_flagging='never',  # flagging is switched off for this app
)

# Start serving the app
interface.launch()

Running on local URL:  http://127.0.0.1:7861
Sagemaker notebooks may require sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Running on public URL: https://cc37fe3ed4d8ebcd79.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


