### Let's begin!

In [1]:
%load_ext autoreload

In [2]:
%autoreload 2
# Used to import data from local.
import pandas as pd

# Used to create the dense document vectors.
import torch
from sentence_transformers import SentenceTransformer

# Used to create and store the Faiss index.
import faiss
import numpy as np

import pickle
from pathlib import Path

# Used to do vector searches and display the results.
from vector_engine.utils import vector_search, id2details

The processed data is stored in S3. The cell below reads it from a local CSV file (`data/misinformation_papers.csv`).

In [3]:
# Load the papers table from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='data/misinformation_papers.csv')

In [4]:
# Preview the first three rows of the table
df.head(n=3)

Unnamed: 0,original_title,abstract,year,citations,id,is_EN
0,When Corrections Fail: The Persistence of Poli...,An extensive literature addresses citizen igno...,2010,901,2132553681,1
1,A postmodern Pandora's box: anti-vaccination m...,The Internet plays a large role in disseminati...,2010,440,2117485795,1
2,Spread of (Mis)Information in Social Networks,We provide a model to investigate the tension ...,2010,278,2120015072,1


In [5]:
# Report how many distinct paper IDs the dataset contains.
print(f"Misinformation, disinformation and fake news papers: {df.id.unique().shape[0]}")

Misinformation, disinformation and fake news papers: 8430


In [6]:
# Load the pretrained sentence-level DistilBERT encoder
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
# Move the encoder to the GPU when CUDA is available; otherwise leave it untouched
model = model.to(torch.device("cuda")) if torch.cuda.is_available() else model
# Show which device the encoder ended up on
print(model.device)

cpu


In [7]:
# Encode every abstract into a dense vector (with a progress bar)
embeddings = model.encode(df["abstract"].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/264 [00:00<?, ?it/s]

In [8]:
# Inspect the shape of a single abstract embedding.
print(f'Shape of the vectorised abstract: {embeddings[0].shape}')

Shape of the vectorised abstract: (768,)


In [9]:
# Step 1: Stack the embeddings into one float32 matrix
embeddings = np.asarray(embeddings, dtype="float32")

# Steps 2-3: Build a flat L2 index sized to the embedding dimension and
# wrap it in an IndexIDMap so vectors can be stored under custom IDs
index = faiss.IndexIDMap(faiss.IndexFlatL2(embeddings.shape[1]))

# Step 4: Add the vectors, keyed by the values of the `id` column
index.add_with_ids(embeddings, df.id.values)

print(f"Number of vectors in the Faiss index: {index.ntotal}")

Number of vectors in the Faiss index: 8430


In [10]:
# Abstract of the paper at row position 5415 (column position 1)
df.iat[5415, 1]

"We address the diffusion of information about the COVID-19 with a massive data analysis on Twitter, Instagram, YouTube, Reddit and Gab. We analyze engagement and interest in the COVID-19 topic and provide a differential assessment on the evolution of the discourse on a global scale for each platform and their users. We fit information spreading with epidemic models characterizing the basic reproduction number [Formula: see text] for each social media platform. Moreover, we identify information spreading from questionable sources, finding different volumes of misinformation in each platform. However, information from both reliable and questionable sources do not present different spreading patterns. Finally, we provide platform-dependent numerical estimates of rumors' amplification."

In [11]:
# Retrieve the 10 nearest neighbours of the paper at row 5415.
# Indexing with a one-element list keeps the query two-dimensional (one row).
D, I = index.search(embeddings[[5415]], k=10)
print(f'L2 distance: {D.flatten().tolist()}\n\nMAG paper IDs: {I.flatten().tolist()}')

L2 distance: [0.0, 1.374990701675415, 55.5548210144043, 65.87553405761719, 67.96589660644531, 69.05731964111328, 69.70642852783203, 70.40653991699219, 70.57283020019531, 71.03385925292969]

MAG paper IDs: [3092618151, 3011345566, 3012936764, 3011186656, 3092128270, 3048848247, 3044429417, 3055557295, 3024620668, 3044097955]


In [12]:
# Fetch the titles of the retrieved papers from the IDs held in I
# (id2details is imported from vector_engine.utils).
id2details(df, I, 'original_title')

[['The COVID-19 social media infodemic.'],
 ['The COVID-19 Social Media Infodemic'],
 ['Understanding the perception of COVID-19 policies by mining a multilanguage Twitter dataset'],
 ['Coronavirus Goes Viral: Quantifying the COVID-19 Misinformation Epidemic on Twitter'],
 ['Analysis of online misinformation during the peak of the COVID-19 pandemics in Italy'],
 ['COVID-19-Related Infodemic and Its Impact on Public Health: A Global Social Media Analysis.'],
 ['Effects of misinformation on COVID-19 individual responses and recommendations for resilience of disastrous consequences of misinformation'],
 ['Covid-19 infodemic reveals new tipping point epidemiology and a revised R formula.'],
 ['Quantifying COVID-19 Content in the Online Health Opinion War Using Machine Learning'],
 ['Coronavirus-related online web search desire amidst the rising novel coronavirus incidence in Ethiopia: Google Trends-based infodemiology']]

In [13]:
# Fetch the abstracts of the retrieved papers from the IDs held in I
id2details(df, I, 'abstract')

[["We address the diffusion of information about the COVID-19 with a massive data analysis on Twitter, Instagram, YouTube, Reddit and Gab. We analyze engagement and interest in the COVID-19 topic and provide a differential assessment on the evolution of the discourse on a global scale for each platform and their users. We fit information spreading with epidemic models characterizing the basic reproduction number [Formula: see text] for each social media platform. Moreover, we identify information spreading from questionable sources, finding different volumes of misinformation in each platform. However, information from both reliable and questionable sources do not present different spreading patterns. Finally, we provide platform-dependent numerical estimates of rumors' amplification."],
 ["We address the diffusion of information about the COVID-19 with a massive data analysis on Twitter, Instagram, YouTube, Reddit and Gab. We analyze engagement and interest in the COVID-19 topic and p


## Putting it all together

So far, we've built a Faiss index using the misinformation abstract vectors we encoded with a sentence-DistilBERT model. That's helpful, but in a real-world scenario we would have to work with unseen data. To query the index with an unseen query and retrieve its most relevant documents, we would have to do the following:

1. Encode the query with the same sentence-DistilBERT model we used for the rest of the abstract vectors.
2. Change its data type to float32.
3. Search the index with the encoded query.

Here, we will use the introduction of an article published in the [HKS Misinformation Review](https://misinforeview.hks.harvard.edu/article/can-whatsapp-benefit-from-debunked-fact-checked-stories-to-reduce-misinformation/) as the query.


In [14]:
# Free-text query used to search the index; the text is kept verbatim,
# including its leading and trailing newlines.
user_query = """
WhatsApp was alleged to have been widely used to spread misinformation and propaganda
during the 2018 elections in Brazil and the 2019 elections in India. Due to the
private encrypted nature of the messages on WhatsApp, it is hard to track the dissemination
of misinformation at scale. In this work, using public WhatsApp data from Brazil and India, we
observe that misinformation has been largely shared on WhatsApp public groups even after they
were already fact-checked by popular fact-checking agencies. This represents a significant portion
of misinformation spread in both Brazil and India in the groups analyzed. We posit that such
misinformation content could be prevented if WhatsApp had a means to flag already fact-checked
content. To this end, we propose an architecture that could be implemented by WhatsApp to counter
such misinformation. Our proposal respects the current end-to-end encryption architecture on WhatsApp,
thus protecting users’ privacy while providing an approach to detect the misinformation that benefits
from fact-checking efforts.
"""

In [15]:
# For convenience, all the steps are wrapped in the vector_search function
# (imported from vector_engine.utils).
# It is called with four arguments:
# a list of queries, the sentence-level transformer, the Faiss index and the
# number of requested results.
# The two returned values are printed below as L2 distances and MAG paper IDs.
D, I = vector_search([user_query], model, index, num_results=10)
print(f'L2 distance: {D.flatten().tolist()}\n\nMAG paper IDs: {I.flatten().tolist()}')

L2 distance: [7.636600017547607, 58.32740783691406, 58.32740783691406, 70.91803741455078, 73.32894897460938, 81.48760986328125, 85.36543273925781, 85.85227966308594, 87.20014190673828, 92.07547760009766]

MAG paper IDs: [3047438096, 3037966274, 3021927925, 2889959140, 2791045616, 2943077655, 2990343632, 2974128076, 3014380170, 3028584171]


In [16]:
# Fetch the titles of the papers whose IDs were returned in I
id2details(df, I, 'original_title')

[['Can WhatsApp Benefit from Debunked Fact-Checked Stories to Reduce Misinformation?'],
 ['A Dataset of Fact-Checked Images Shared on WhatsApp During the Brazilian and Indian Elections'],
 ['A Dataset of Fact-Checked Images Shared on WhatsApp During the Brazilian and Indian Elections'],
 ['A System for Monitoring Public Political Groups in WhatsApp'],
 ['Politics of Fake News: How WhatsApp Became a Potent Propaganda Tool in India'],
 ['Characterizing Attention Cascades in WhatsApp Groups'],
 ['Can WhatsApp Counter Misinformation by Limiting Message Forwarding'],
 ['Can WhatsApp Counter Misinformation by Limiting Message Forwarding'],
 ['OS IMPACTOS JURÍDICOS E SOCIAIS DAS FAKE NEWS EM TERRITÓRIO BRASILEIRO'],
 ['Images and Misinformation in Political Groups: Evidence from WhatsApp in India']]

In [23]:



# Persist the Faiss index to disk as a pickle.
# The path is relative to the notebook's working directory (like the CSV that
# is read from `data/`), not an absolute path that exists on one machine only.
file_path = Path("models") / "faiss_p.pickle"

# Make sure the target directory exists before writing.
file_path.parent.mkdir(parents=True, exist_ok=True)

# Serialize BEFORE opening the file: opening with "wb" truncates it, so a
# serialization failure inside the `with` block would leave an empty pickle.
serialized_index = faiss.serialize_index(index)

# Convert the serialized index to bytes
# NOTE(review): the pickle therefore holds raw bytes; the loading code
# presumably has to rebuild a uint8 array (e.g. np.frombuffer) before calling
# faiss.deserialize_index -- confirm against the loader.
data_to_pickle = bytes(serialized_index)

# Save the serialized index to the pickle file
with open(file_path, "wb") as h:
    pickle.dump(data_to_pickle, h)



## Model evaluation
