### Imports and data preparation

In [1]:
import spacy
import pandas as pd

In [2]:
df = pd.read_csv("claim_reviews_en.csv")

In [3]:
df.head()

Unnamed: 0,claim_text,label,review_url,fact_checker,appearances,reviews
0,"Ratan Tata Says, If The Death Of 65 Soldiers I...",not_credible,https://www.boomlive.in/photoshopped-tweet-cla...,"{'name': 'BOOM', 'country': 'India', 'language...",[],"[{'label': 'not_credible', 'original_label': '..."
1,The document that circulated did not come from...,not_credible,https://www.rappler.com/newsbreak/fact-check/2...,"{'name': 'Rappler', 'country': 'Philippines', ...",[],"[{'label': 'not_credible', 'original_label': '..."
2,News Outlets Did Not Report On Coronavirus Cas...,not_credible,https://leadstories.com/hoax-alert/2020/04/fac...,"{'name': 'Lead Stories', 'country': 'United St...",[],"[{'label': 'not_credible', 'original_label': '..."
3,RSS built the 6000-bed COVID centre in Indore,not_credible,https://www.altnews.in/rss-hasnt-build-second-...,"{'name': 'Pravda Media Foundation', 'country':...",[],"[{'label': 'not_credible', 'original_label': '..."
4,"La justice belge a suspendu Covid Safe Ticket,...",not_verifiable,https://www.20minutes.fr/societe/3202587-20211...,"{'name': '20 Minutes Fake off', 'country': 'Fr...",[],"[{'label': 'not_verifiable', 'original_label':..."


In [4]:
sentences = df["claim_text"].tolist()

In [5]:
sentences[:3]

['Ratan Tata Says, If The Death Of 65 Soldiers Increases The Popularity Of A Prime Minister, Then Nobody Can Save This Country',
 'The document that circulated did not come from the Civil Service Commission',
 'News Outlets Did Not Report On Coronavirus Cases At Walmart, Amazon And Other Retailers']

In [6]:
# !python -m spacy download en_core_web_sm

In [7]:
nlp = spacy.load("en_core_web_sm")

In [8]:
docs = []

for sentence in sentences:
    docs.append(nlp(sentence))

In [9]:
docs[0][0], docs[0][0].pos_

(Ratan, 'PROPN')

In [10]:
postags = []

for doc in docs:
    pos = []
    for el in doc:
        pos.append(el.pos_)
    postags.append(pos)

### N-grams base

In [11]:
from nltk.util import bigrams, trigrams



In [12]:
bigrams_ = []
trigrams_ = []

for pos in postags:
    bigrams_.append(list(bigrams(pos)))
    trigrams_.append(list(trigrams(pos)))

In [13]:
bigram_similarities = []
sample_sentence = bigrams_[0]

for i, sent in enumerate(bigrams_):
    same_bigrams = 0
    for bigram in sample_sentence:
        if bigram in sent:
            same_bigrams += 1
    bigram_similarities.append((same_bigrams, i))

In [14]:
bigram_similarities[:3]

[(23, 0), (7, 1), (4, 2)]

In [15]:
sorted(bigram_similarities, key=lambda x: x[0], reverse=True)[:5]

[(23, 0), (19, 12833), (19, 22958), (18, 2645), (18, 9840)]

In [16]:
trigram_similarities = []
sample_sentence = trigrams_[0]

for i, sent in enumerate(trigrams_):
    same_trigrams = 0
    for trigram in sample_sentence:
        if trigram in sent:
            same_trigrams += 1
    trigram_similarities.append((same_trigrams, i))

In [17]:
sorted(trigram_similarities, key=lambda x: x[0], reverse=True)[:5]

[(22, 0), (11, 19034), (10, 2645), (9, 704), (9, 5717)]

In [18]:
print(postags[0])

['PROPN', 'PROPN', 'VERB', 'PUNCT', 'SCONJ', 'DET', 'NOUN', 'ADP', 'NUM', 'PROPN', 'VERB', 'DET', 'PROPN', 'ADP', 'DET', 'PROPN', 'PROPN', 'PUNCT', 'ADV', 'PRON', 'AUX', 'VERB', 'DET', 'NOUN']


In [19]:
print(postags[19034])

['DET', 'PROPN', 'ADP', 'DET', 'PROPN', 'PROPN', 'ADP', 'PROPN', 'PUNCT', 'PROPN', 'PROPN', 'PUNCT', 'AUX', 'VERB', 'DET', 'PROPN', 'PROPN', 'VERB', 'DET', 'NOUN', 'ADP', 'ADJ', 'NOUN', 'ADJ', 'NOUN', 'PUNCT', 'NOUN', 'NOUN', 'NOUN', 'PUNCT', 'CCONJ', 'NOUN', 'VERB', 'ADP', 'ADJ', 'PROPN', 'PUNCT']


### Dynamic time warping

In [20]:
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean, cosine

In [21]:
postags_set = set()

for sent in postags:
    postags_set.update(sent)

In [22]:
postag_mapping = {}

for i, pos in enumerate(postags_set):
    postag_mapping[pos] = i

In [23]:
mapped_postags = [[postag_mapping[pos] for pos in sent] for sent in postags]

#### Euclidean distance

In [24]:
distances_with_sample = []
distances_with_sample_path = []
mapped_sample = mapped_postags[0]

for i, sent in enumerate(mapped_postags):
    distance, path = fastdtw(mapped_sample, sent, dist=euclidean)
    distances_with_sample.append((i, distance))
    distances_with_sample_path.append((i, distance, path))

In [25]:
sorted(distances_with_sample, key=lambda x: x[1])[:5]

[(0, 0.0), (13860, 33.0), (10614, 34.0), (4704, 35.0), (43053, 35.0)]

In [26]:
print(postags[0])

['PROPN', 'PROPN', 'VERB', 'PUNCT', 'SCONJ', 'DET', 'NOUN', 'ADP', 'NUM', 'PROPN', 'VERB', 'DET', 'PROPN', 'ADP', 'DET', 'PROPN', 'PROPN', 'PUNCT', 'ADV', 'PRON', 'AUX', 'VERB', 'DET', 'NOUN']


In [30]:
print(postags[13860])

['PROPN', 'PROPN', 'ADP', 'PROPN', 'VERB', 'ADP', 'DET', 'PROPN', 'ADP', 'PROPN', 'AUX', 'VERB']


In [28]:
sentences[0]

'Ratan Tata Says, If The Death Of 65 Soldiers Increases The Popularity Of A Prime Minister, Then Nobody Can Save This Country'

In [31]:
sentences[13860]

'Viral Video Of Woman Kidnapped By A Group Of Men Is Scripted'