## Module setup

In [None]:
!pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 4.1 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 35.3 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 56.1 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 70.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 64.1 MB/s 
Building wheels for collected 

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer

## Preparing the text

In [None]:
# Load the claim-review dataset from a CSV file in the working directory.
# NOTE(review): the preview below shows an `Unnamed: 0` column, which looks
# like a row index written out when the file was saved; `index_col=0` would
# absorb it -- confirm the values are a plain 0..n-1 range before changing.
data = pd.read_csv("claim_reviews_en.csv")

In [None]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,claim_text,label,review_url,fact_checker,appearances,reviews
0,"Ratan Tata Says, If The Death Of 65 Soldiers I...",not_credible,https://www.boomlive.in/photoshopped-tweet-cla...,"{'name': 'BOOM', 'country': 'India', 'language...",[],"[{'label': 'not_credible', 'original_label': '..."
1,The document that circulated did not come from...,not_credible,https://www.rappler.com/newsbreak/fact-check/2...,"{'name': 'Rappler', 'country': 'Philippines', ...",[],"[{'label': 'not_credible', 'original_label': '..."
2,News Outlets Did Not Report On Coronavirus Cas...,not_credible,https://leadstories.com/hoax-alert/2020/04/fac...,"{'name': 'Lead Stories', 'country': 'United St...",[],"[{'label': 'not_credible', 'original_label': '..."
3,RSS built the 6000-bed COVID centre in Indore,not_credible,https://www.altnews.in/rss-hasnt-build-second-...,"{'name': 'Pravda Media Foundation', 'country':...",[],"[{'label': 'not_credible', 'original_label': '..."
4,"La justice belge a suspendu Covid Safe Ticket,...",not_verifiable,https://www.20minutes.fr/societe/3202587-20211...,"{'name': '20 Minutes Fake off', 'country': 'Fr...",[],"[{'label': 'not_verifiable', 'original_label':..."


In [None]:
# Pull the raw claim texts out of the frame as a plain Python list. Row order
# is kept, so a list position corresponds to the same DataFrame row position.
claim_column = data["claim_text"]
sentences = claim_column.tolist()

In [None]:
# Spot-check the first three extracted claim texts.
sentences[:3]

['Ratan Tata Says, If The Death Of 65 Soldiers Increases The Popularity Of A Prime Minister, Then Nobody Can Save This Country',
 'The document that circulated did not come from the Civil Service Commission',
 'News Outlets Did Not Report On Coronavirus Cases At Walmart, Amazon And Other Retailers']

In [None]:
# Name of the pretrained sentence-embedding checkpoint to load; kept in its
# own variable so it is easy to spot and swap.
model_name = 'all-mpnet-base-v2'
model = SentenceTransformer(model_name)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# Embed every claim text with the loaded model. This is the costly step of the
# notebook: the neighbour positions printed further down reach into the tens
# of thousands, so the corpus is at least that large.
sentence_embeddings = model.encode(sentences)

## Finding similar sentences

In [None]:
from sentence_transformers import util

In [None]:
# Compare the first sentence (the query) against every sentence embedding,
# the query itself included.
query_embedding = sentence_embeddings[0]
cosine_scores = util.cos_sim(query_embedding, sentence_embeddings)

In [None]:
# Pair each sentence position with its cosine score as a plain Python float.
# Only row 0 of `cosine_scores` is read: it holds the scores for the query.
query_scores = cosine_scores[0].tolist()
cosines_with_sent_id = list(enumerate(query_scores))

In [None]:
# Peek at the first three (sentence position, cosine score) pairs.
cosines_with_sent_id[:3]

[(0, 1.0), (1, 0.15425990521907806), (2, 0.04794611781835556)]

In [None]:
# Rank all sentences from most to least similar to the query, then show the
# first eleven entries: the query itself followed by its ten best matches.
ranked_by_similarity = sorted(
    cosines_with_sent_id,
    key=lambda pair: pair[1],
    reverse=True,
)
ranked_by_similarity[:11]

[(0, 1.0),
 (3178, 0.53443443775177),
 (31020, 0.5286775827407837),
 (46979, 0.5169151425361633),
 (13911, 0.5046957731246948),
 (15586, 0.49588480591773987),
 (45001, 0.49483901262283325),
 (3083, 0.49281278252601624),
 (8794, 0.4900658428668976),
 (23207, 0.48910218477249146),
 (30045, 0.48526516556739807)]

In [None]:
# Show the query claim (row label 0) so it can be read next to its neighbours.
data.at[0, "claim_text"]

'Ratan Tata Says, If The Death Of 65 Soldiers Increases The Popularity Of A Prime Minister, Then Nobody Can Save This Country'

In [None]:
# Look up the three most similar claims from the computed scores instead of
# hard-coding row positions copied from an earlier output, so the cell stays
# correct when the data or the model changes. Position 0 is the query itself
# and is excluded. The ids come from enumerate(), i.e. they are positions,
# hence the positional `.iloc` lookup.
neighbour_ids = [
    idx
    for idx, _score in sorted(
        cosines_with_sent_id, key=lambda pair: pair[1], reverse=True
    )
    if idx != 0
][:3]
data.iloc[neighbour_ids]["claim_text"].tolist()

['WHO said India will see 50,000 deaths by April 15',
 '"India does not need Army, Navy, Air force," - Rahul Gandhi',
 '&quot;If our govt is voted out of power then I will set the whole country on fire&quot;, said Yogi Adityanath']