## Imports and data processing

In [1]:
import spacy
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("claim_reviews_en.csv")

In [3]:
df.head()

Unnamed: 0,claim_text,label,review_url,fact_checker,appearances,reviews
0,"Ratan Tata Says, If The Death Of 65 Soldiers I...",not_credible,https://www.boomlive.in/photoshopped-tweet-cla...,"{'name': 'BOOM', 'country': 'India', 'language...",[],"[{'label': 'not_credible', 'original_label': '..."
1,The document that circulated did not come from...,not_credible,https://www.rappler.com/newsbreak/fact-check/2...,"{'name': 'Rappler', 'country': 'Philippines', ...",[],"[{'label': 'not_credible', 'original_label': '..."
2,News Outlets Did Not Report On Coronavirus Cas...,not_credible,https://leadstories.com/hoax-alert/2020/04/fac...,"{'name': 'Lead Stories', 'country': 'United St...",[],"[{'label': 'not_credible', 'original_label': '..."
3,RSS built the 6000-bed COVID centre in Indore,not_credible,https://www.altnews.in/rss-hasnt-build-second-...,"{'name': 'Pravda Media Foundation', 'country':...",[],"[{'label': 'not_credible', 'original_label': '..."
4,"La justice belge a suspendu Covid Safe Ticket,...",not_verifiable,https://www.20minutes.fr/societe/3202587-20211...,"{'name': '20 Minutes Fake off', 'country': 'Fr...",[],"[{'label': 'not_verifiable', 'original_label':..."


In [135]:
df.shape

(47108, 6)

In [4]:
sentences = df["claim_text"].tolist()

In [6]:
nlp = spacy.load("en_core_web_sm")

In [7]:
docs = []

for sentence in sentences:
    docs.append(nlp(sentence))

In [8]:
postags = []

for doc in docs:
    pos = []
    for el in doc:
        pos.append(el.pos_)
    postags.append(pos)

In [9]:
postags_set = set()

for sent in postags:
    postags_set.update(sent)

In [10]:
postag_mapping = {}

for i, pos in enumerate(postags_set):
    postag_mapping[pos] = i

In [11]:
mapped_postags = [[postag_mapping[pos] for pos in sent] for sent in postags]

## Dynamic time warping 

### Initial dtw where the cost=0 if the postags are the same and cost=1 otherwise

In [12]:
def dtw(s, t, costs):
    n, m = len(s), len(t)
    dtw_matrix = np.zeros((n+1, m+1))
    for i in range(n+1):
        for j in range(m+1):
            dtw_matrix[i, j] = np.inf
    dtw_matrix[0, 0] = 0
    
    for i in range(1, n+1):
        for j in range(1, m+1):
            val1, val2 = s[i-1], t[j-1]
            cost = costs[val1][val2]
            last_min = np.min([dtw_matrix[i-1, j], dtw_matrix[i, j-1], dtw_matrix[i-1, j-1]])
            dtw_matrix[i, j] = cost + last_min
            
    return dtw_matrix

In [13]:
costs = {}

for val in postag_mapping.values():
    costs[val] = {}
    for other_val in postag_mapping.values():
        if other_val == val:
            costs[val][other_val] = 0
        else:
            costs[val][other_val] = 1

In [14]:
dtw(mapped_postags[0], mapped_postags[1], costs)[-1, -1]

18.0

In [15]:
print(mapped_postags[0])

[2, 2, 3, 17, 0, 14, 1, 10, 7, 2, 3, 14, 2, 10, 14, 2, 2, 17, 4, 9, 5, 3, 14, 1]


In [16]:
distances_with_sample = []
mapped_sample = mapped_postags[0]

for i, sent in enumerate(mapped_postags):
    distance = dtw(mapped_sample, sent, costs)[-1, -1]
    distances_with_sample.append((i, distance))

In [17]:
sorted(distances_with_sample, key=lambda x: x[1])[:5]

[(0, 0.0), (4704, 9.0), (9278, 10.0), (17346, 10.0), (18402, 10.0)]

In [18]:
print(postags[0])

['PROPN', 'PROPN', 'VERB', 'PUNCT', 'SCONJ', 'DET', 'NOUN', 'ADP', 'NUM', 'PROPN', 'VERB', 'DET', 'PROPN', 'ADP', 'DET', 'PROPN', 'PROPN', 'PUNCT', 'ADV', 'PRON', 'AUX', 'VERB', 'DET', 'NOUN']


In [19]:
sentences[0]

'Ratan Tata Says, If The Death Of 65 Soldiers Increases The Popularity Of A Prime Minister, Then Nobody Can Save This Country'

In [20]:
print(postags[4704])

['PROPN', 'PROPN', 'PROPN', 'PUNCT', 'DET', 'NOUN', 'ADP', 'DET', 'PROPN', 'PROPN', 'ADP', 'PROPN', 'PUNCT', 'AUX', 'AUX', 'VERB', 'ADP', 'NOUN']


In [21]:
sentences[4704]

'Gen. Mark Milley, the chairman of the Joint Chiefs of Staff, has been arrested for treason'

In [70]:
costs_base = {}

for val in postag_mapping.values():
    costs_base[val] = {}
    for other_val in postag_mapping.values():
        if other_val == val:
            costs_base[val][other_val] = 0
        else:
            costs_base[val][other_val] = 1

Example 1

In [71]:
start = time.time()

distances_with_sample_updated_base = []
mapped_sample_base = mapped_postags[785]

for i, sent in enumerate(mapped_postags):
    distance = dtw(mapped_sample_base, sent, costs_base)[-1, -1]
    distances_with_sample_updated_base.append((i, distance))
    
end = time.time()
print(end - start)

177.9184534549713


In [72]:
sorted(distances_with_sample_updated_base, key=lambda x: x[1])[:5]

[(785, 0.0), (33161, 0.0), (14249, 8.0), (6172, 10.0), (31890, 10.0)]

In [73]:
sentences[785]

'"In the first two years and a couple months (as California treasurer), I have saved taxpayers and ratepayers over $5.2 billion."'

In [74]:
print(postags[785])

['PUNCT', 'ADP', 'DET', 'ADJ', 'NUM', 'NOUN', 'CCONJ', 'DET', 'NOUN', 'NOUN', 'PUNCT', 'ADP', 'PROPN', 'NOUN', 'PUNCT', 'PUNCT', 'PRON', 'AUX', 'VERB', 'NOUN', 'CCONJ', 'NOUN', 'ADP', 'SYM', 'NUM', 'NUM', 'PUNCT', 'PUNCT']


In [78]:
sentences[14249]

'“Within the first two years [of taking office], we increased internally generated revenue by close to N15 billion.”'

In [77]:
print(postags[14249])

['PUNCT', 'ADP', 'DET', 'ADJ', 'NUM', 'NOUN', 'PUNCT', 'ADP', 'VERB', 'NOUN', 'PUNCT', 'PUNCT', 'PRON', 'VERB', 'ADV', 'VERB', 'NOUN', 'ADP', 'ADJ', 'ADP', 'PROPN', 'NUM', 'PUNCT', 'PUNCT']


In [101]:
sentences[31890]

'"For every extra year a girl goes to school, her income goes up 12 percent."'

In [102]:
print(postags[31890])

['PUNCT', 'ADP', 'DET', 'ADJ', 'NOUN', 'DET', 'NOUN', 'VERB', 'ADP', 'NOUN', 'PUNCT', 'PRON', 'NOUN', 'VERB', 'ADV', 'NUM', 'NOUN', 'PUNCT', 'PUNCT']


Example 2

In [79]:
start = time.time()

distances_with_sample_2_base = []
mapped_sample_2_base = mapped_postags[1220]

for i, sent in enumerate(mapped_postags):
    distance = dtw(mapped_sample_2_base, sent, costs_base)[-1, -1]
    distances_with_sample_2_base.append((i, distance))
    
end = time.time()
print(end - start)

127.90170860290527


In [80]:
sorted(distances_with_sample_2_base, key=lambda x: x[1])[:5]

[(1220, 0.0), (30095, 4.0), (4704, 5.0), (27058, 5.0), (29073, 5.0)]

In [81]:
sentences[1220]

'Jemima Goldsmith, the former wife of Pakistani Prime Minister Imran Khan, has given birth to a baby girl'

In [82]:
print(postags[1220])

['PROPN', 'PROPN', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'ADP', 'ADJ', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PUNCT', 'AUX', 'VERB', 'NOUN', 'ADP', 'DET', 'NOUN', 'NOUN']


In [85]:
sentences[30095]

'Susan Rice, the director of the White House Domestic Policy Council, has been convicted of treason'

In [86]:
print(postags[30095])

['PROPN', 'PROPN', 'PUNCT', 'DET', 'NOUN', 'ADP', 'DET', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PUNCT', 'AUX', 'AUX', 'VERB', 'ADP', 'NOUN']


In [105]:
sentences[4704]

'Gen. Mark Milley, the chairman of the Joint Chiefs of Staff, has been arrested for treason'

In [106]:
print(postags[4704])

['PROPN', 'PROPN', 'PROPN', 'PUNCT', 'DET', 'NOUN', 'ADP', 'DET', 'PROPN', 'PROPN', 'ADP', 'PROPN', 'PUNCT', 'AUX', 'AUX', 'VERB', 'ADP', 'NOUN']


In [107]:
sentences[27058]

'Nigeria is the highest producer of HIV-infected babies'

In [108]:
print(postags[27058])

['PROPN', 'AUX', 'DET', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'PUNCT', 'VERB', 'NOUN']


Example 3.

In [87]:
start = time.time()

distances_with_sample_3_base = []
mapped_sample_3_base = mapped_postags[23170]

for i, sent in enumerate(mapped_postags):
    distance = dtw(mapped_sample_3_base, sent, costs_base)[-1, -1]
    distances_with_sample_3_base.append((i, distance))
    
end = time.time()
print(end - start)

56.74076056480408


In [88]:
sorted(distances_with_sample_3_base, key=lambda x: x[1])[:5]

[(23170, 0.0), (2489, 2.0), (3096, 2.0), (13708, 2.0), (13947, 2.0)]

In [89]:
sentences[23170]

'People experiencing coronavirus symptoms in Ontario should call 811'

In [90]:
print(postags[23170])

['NOUN', 'VERB', 'NOUN', 'NOUN', 'ADP', 'PROPN', 'AUX', 'VERB', 'NUM']


In [93]:
sentences[2489]

'Video shows crackdown in Uganda following 2021 election'

In [94]:
print(postags[2489])

['NOUN', 'VERB', 'NOUN', 'ADP', 'PROPN', 'VERB', 'NUM', 'NOUN']


In [113]:
sentences[13708]

'Photo shows caged dogs in China in June 2020'

In [114]:
print(postags[13708])

['NOUN', 'NOUN', 'VERB', 'NOUN', 'ADP', 'PROPN', 'ADP', 'PROPN', 'NUM']


In [115]:
sentences[13947]

'Video shows fire in Shanghai in July 2020'

In [116]:
print(postags[13947])

['NOUN', 'VERB', 'NOUN', 'ADP', 'PROPN', 'ADP', 'PROPN', 'NUM']


In [124]:
for i, val in enumerate(sorted(distances_with_sample_3_base, key=lambda x: x[1])):
    if val[0] == 4459:
        print(i)

302


Example 4.

In [95]:
start = time.time()

distances_with_sample_4_base = []
mapped_sample_4_base = mapped_postags[39429]

for i, sent in enumerate(mapped_postags):
    distance = dtw(mapped_sample_4_base, sent, costs_base)[-1, -1]
    distances_with_sample_4_base.append((i, distance))
    
end = time.time()
print(end - start)

101.95079731941223


In [96]:
sorted(distances_with_sample_4_base, key=lambda x: x[1])[:5]

[(39429, 0.0), (42305, 4.0), (46292, 4.0), (1479, 5.0), (6954, 5.0)]

In [97]:
sentences[39429]

'Denzel Washington has left the Democrat Party and has turned his attention to help team Trump'

In [98]:
print(postags[39429])

['PROPN', 'PROPN', 'AUX', 'VERB', 'DET', 'PROPN', 'PROPN', 'CCONJ', 'AUX', 'VERB', 'PRON', 'NOUN', 'PART', 'VERB', 'NOUN', 'PROPN']


In [99]:
sentences[42305]

'Mouthwash can kill Coronavirus and should be used to treat COVID'

In [100]:
print(postags[42305])

['PROPN', 'AUX', 'VERB', 'PROPN', 'CCONJ', 'AUX', 'AUX', 'VERB', 'PART', 'VERB', 'NOUN']


In [125]:
for i, val in enumerate(sorted(distances_with_sample_4_base, key=lambda x: x[1])):
    if val[0] == 31531:
        print(i)

6


### Dtw with more specific costs determined based on similarity

#### Setting the costs

NOUN - PROPN - PRON (0.25) <br>
AUX - VERB (0.4) <br>
DET - ADP (0.5) <br>
PART - ADP, PART - ADV (0.5) <br>
SCONJ - CCONJ (0.25) <br>
SCONJ - CCONJ - PART - ADP - ADV (0.8)

In [22]:
costs[postag_mapping["NOUN"]][postag_mapping["PROPN"]] = 0.25
costs[postag_mapping["PROPN"]][postag_mapping["NOUN"]] = 0.25

costs[postag_mapping["PRON"]][postag_mapping["PROPN"]] = 0.25
costs[postag_mapping["PROPN"]][postag_mapping["PRON"]] = 0.25

costs[postag_mapping["NOUN"]][postag_mapping["PRON"]] = 0.25
costs[postag_mapping["PRON"]][postag_mapping["NOUN"]] = 0.25

In [23]:
costs[postag_mapping["AUX"]][postag_mapping["VERB"]] = 0.4
costs[postag_mapping["VERB"]][postag_mapping["AUX"]] = 0.4

In [24]:
costs[postag_mapping["DET"]][postag_mapping["ADP"]] = 0.5
costs[postag_mapping["ADP"]][postag_mapping["DET"]] = 0.5

In [25]:
costs[postag_mapping["PART"]][postag_mapping["ADP"]] = 0.5
costs[postag_mapping["ADP"]][postag_mapping["PART"]] = 0.5

costs[postag_mapping["PART"]][postag_mapping["ADV"]] = 0.5
costs[postag_mapping["ADV"]][postag_mapping["PART"]] = 0.5

In [26]:
costs[postag_mapping["SCONJ"]][postag_mapping["CCONJ"]] = 0.25
costs[postag_mapping["CCONJ"]][postag_mapping["SCONJ"]] = 0.25

In [27]:
costs[postag_mapping["SCONJ"]][postag_mapping["PART"]] = 0.8
costs[postag_mapping["PART"]][postag_mapping["SCONJ"]] = 0.8

costs[postag_mapping["SCONJ"]][postag_mapping["ADP"]] = 0.8
costs[postag_mapping["ADP"]][postag_mapping["SCONJ"]] = 0.8

costs[postag_mapping["SCONJ"]][postag_mapping["ADV"]] = 0.8
costs[postag_mapping["ADV"]][postag_mapping["SCONJ"]] = 0.8

In [28]:
costs[postag_mapping["CCONJ"]][postag_mapping["PART"]] = 0.8
costs[postag_mapping["PART"]][postag_mapping["CCONJ"]] = 0.8

costs[postag_mapping["CCONJ"]][postag_mapping["ADP"]] = 0.8
costs[postag_mapping["ADP"]][postag_mapping["CCONJ"]] = 0.8

costs[postag_mapping["CCONJ"]][postag_mapping["ADV"]] = 0.8
costs[postag_mapping["ADV"]][postag_mapping["CCONJ"]] = 0.8

In [29]:
costs[postag_mapping["ADP"]][postag_mapping["ADV"]] = 0.8
costs[postag_mapping["ADV"]][postag_mapping["ADP"]] = 0.8

#### Finding similar sentences

Example 1.

In [30]:
import time

In [31]:
start = time.time()

distances_with_sample_updated = []
mapped_sample = mapped_postags[785]

for i, sent in enumerate(mapped_postags):
    distance = dtw(mapped_sample, sent, costs)[-1, -1]
    distances_with_sample_updated.append((i, distance))
    
end = time.time()
print(end - start)

184.64067101478577


In [32]:
sorted(distances_with_sample_updated, key=lambda x: x[1])[:5]

[(785, 0.0), (33161, 0.0), (14249, 7.2), (45615, 9.1), (6172, 9.25)]

In [33]:
print(postags[785])

['PUNCT', 'ADP', 'DET', 'ADJ', 'NUM', 'NOUN', 'CCONJ', 'DET', 'NOUN', 'NOUN', 'PUNCT', 'ADP', 'PROPN', 'NOUN', 'PUNCT', 'PUNCT', 'PRON', 'AUX', 'VERB', 'NOUN', 'CCONJ', 'NOUN', 'ADP', 'SYM', 'NUM', 'NUM', 'PUNCT', 'PUNCT']


In [34]:
sentences[785]

'"In the first two years and a couple months (as California treasurer), I have saved taxpayers and ratepayers over $5.2 billion."'

In [35]:
print(postags[14249])

['PUNCT', 'ADP', 'DET', 'ADJ', 'NUM', 'NOUN', 'PUNCT', 'ADP', 'VERB', 'NOUN', 'PUNCT', 'PUNCT', 'PRON', 'VERB', 'ADV', 'VERB', 'NOUN', 'ADP', 'ADJ', 'ADP', 'PROPN', 'NUM', 'PUNCT', 'PUNCT']


In [36]:
sentences[14249]

'“Within the first two years [of taking office], we increased internally generated revenue by close to N15 billion.”'

In [103]:
print(postags[45615])

['ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'PROPN', 'PUNCT', 'PROPN', 'AUX', 'VERB', 'ADP', 'ADP', 'NUM', 'PUNCT']


In [104]:
sentences[45615]

'In the past year, employment in the constituency of Aldridge-Brownhills has gone up by 88,000.'

Example 2.

In [38]:
start = time.time()

distances_with_sample_2 = []
mapped_sample_2 = mapped_postags[1220]

for i, sent in enumerate(mapped_postags):
    distance = dtw(mapped_sample_2, sent, costs)[-1, -1]
    distances_with_sample_2.append((i, distance))
    
end = time.time()
print(end - start)

129.71403694152832


In [39]:
sorted(distances_with_sample_2, key=lambda x: x[1])[:5]

[(1220, 0.0), (30095, 3.5), (2063, 4.0), (20031, 4.15), (29073, 4.15)]

In [40]:
sentences[1220]

'Jemima Goldsmith, the former wife of Pakistani Prime Minister Imran Khan, has given birth to a baby girl'

In [41]:
print(postags[1220])

['PROPN', 'PROPN', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'ADP', 'ADJ', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PUNCT', 'AUX', 'VERB', 'NOUN', 'ADP', 'DET', 'NOUN', 'NOUN']


In [42]:
sentences[30095]

'Susan Rice, the director of the White House Domestic Policy Council, has been convicted of treason'

In [43]:
print(postags[30095])

['PROPN', 'PROPN', 'PUNCT', 'DET', 'NOUN', 'ADP', 'DET', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PUNCT', 'AUX', 'AUX', 'VERB', 'ADP', 'NOUN']


In [109]:
sentences[2063]

'Sobha Singh, the same person who testified against Bhagat Singh, was awarded Padma Shri by the Congress govt.'

In [110]:
print(postags[2063])

['PROPN', 'PROPN', 'PUNCT', 'DET', 'ADJ', 'NOUN', 'PRON', 'VERB', 'ADP', 'PROPN', 'PROPN', 'PUNCT', 'AUX', 'VERB', 'PROPN', 'PROPN', 'ADP', 'DET', 'PROPN', 'NOUN', 'PUNCT']


In [111]:
sentences[20031]

'Margaret Keenan, the first U.K. recipient of the Pfizer/BioNTech COVID-19 vaccine, is a crisis actor'

In [112]:
print(postags[20031])

['PROPN', 'PROPN', 'PUNCT', 'DET', 'ADJ', 'PROPN', 'NOUN', 'ADP', 'DET', 'PROPN', 'SYM', 'PROPN', 'PROPN', 'PROPN', 'PUNCT', 'AUX', 'DET', 'NOUN', 'NOUN']


Example 3.

In [44]:
import random

In [45]:
random.choice(range(len(sentences)))

39682

In [46]:
start = time.time()

distances_with_sample_3 = []
mapped_sample_3 = mapped_postags[23170]

for i, sent in enumerate(mapped_postags):
    distance = dtw(mapped_sample_3, sent, costs)[-1, -1]
    distances_with_sample_3.append((i, distance))
    
end = time.time()
print(end - start)

59.88342547416687


In [47]:
sorted(distances_with_sample_3, key=lambda x: x[1])[:5]

[(23170, 0.0), (2489, 1.4), (3096, 1.4), (12706, 1.65), (4459, 1.75)]

In [48]:
sentences[23170]

'People experiencing coronavirus symptoms in Ontario should call 811'

In [49]:
print(postags[23170])

['NOUN', 'VERB', 'NOUN', 'NOUN', 'ADP', 'PROPN', 'AUX', 'VERB', 'NUM']


In [50]:
sentences[2489]

'Video shows crackdown in Uganda following 2021 election'

In [51]:
print(postags[2489])

['NOUN', 'VERB', 'NOUN', 'ADP', 'PROPN', 'VERB', 'NUM', 'NOUN']


In [117]:
sentences[12706]

'Video shows Hamas attack in Israel killed hundreds'

In [118]:
print(postags[12706])

['NOUN', 'VERB', 'PROPN', 'NOUN', 'ADP', 'PROPN', 'VERB', 'NOUN']


In [119]:
sentences[4459]

'Ghana Education Service confirms date schools in Ghana will reopen'

In [120]:
print(postags[4459])

['PROPN', 'PROPN', 'PROPN', 'VERB', 'NOUN', 'NOUN', 'ADP', 'PROPN', 'AUX', 'VERB']


Example 4.

In [93]:
random.choice(range(len(sentences)))

39429

In [52]:
start = time.time()

distances_with_sample_4 = []
mapped_sample_4 = mapped_postags[39429]

for i, sent in enumerate(mapped_postags):
    distance = dtw(mapped_sample_4, sent, costs)[-1, -1]
    distances_with_sample_4.append((i, distance))
    
end = time.time()
print(end - start)

105.72882032394409


In [53]:
sorted(distances_with_sample_4, key=lambda x: x[1])[:5]

[(39429, 0.0), (31531, 2.1), (2786, 2.4), (15614, 3.05), (34778, 3.05)]

In [54]:
sentences[39429]

'Denzel Washington has left the Democrat Party and has turned his attention to help team Trump'

In [55]:
print(postags[39429])

['PROPN', 'PROPN', 'AUX', 'VERB', 'DET', 'PROPN', 'PROPN', 'CCONJ', 'AUX', 'VERB', 'PRON', 'NOUN', 'PART', 'VERB', 'NOUN', 'PROPN']


In [56]:
sentences[31531]

'Greta Thunberg asked the China to stop using chopsticks to save trees'

In [57]:
print(postags[31531])

['PROPN', 'PROPN', 'VERB', 'DET', 'PROPN', 'PART', 'VERB', 'VERB', 'NOUN', 'PART', 'VERB', 'NOUN']


#### Finding similar sentences for 100 random samples

In [60]:
random_idx = random.sample(range(len(mapped_postags)), 100)

In [61]:
with open('random_indices_dtw.txt', 'w') as file:
    file.write('\n'.join(str(idx) for idx in random_idx))

In [62]:
start = time.time()

similarities = {}

for idx in random_idx:
    mapped_sent = mapped_postags[idx]
    dists = []

    for i, sent in enumerate(mapped_postags):
        distance = dtw(mapped_sent, sent, costs)[-1, -1]
        dists.append((i, distance))
    
    similarities[idx] = dists

end = time.time()
print(end - start)

11512.729236364365


In [63]:
import pickle

In [64]:
with open('random_100_dtw_results.pickle', 'wb') as f:
    pickle.dump(similarities, f)
        
#with open('saved_dictionary.pkl', 'rb') as f:
#    loaded_dict = pickle.load(f)

In [69]:
for key, val in similarities.items():
    sorted_sim = sorted(val, key=lambda x: x[1])
    print("ORIGINAL SENTENCE:")
    print("\t", sentences[key])
    print("\t", postags[key])
    for i in range(1, 4):
        print("SIMILAR SENTENCE NR", i)
        print("\t", sentences[sorted_sim[i][0]])
        print("\t", postags[sorted_sim[i][0]])
    print("--------------------------")

ORIGINAL SENTENCE:
	 Aspiring astronaut Alyssa Carson is selected for a one way mission to Mars
	 ['VERB', 'NOUN', 'PROPN', 'PROPN', 'AUX', 'VERB', 'ADP', 'DET', 'NUM', 'NOUN', 'NOUN', 'ADP', 'PROPN']
SIMILAR SENTENCE NR 1
	 Rahul Gandhi Is Married With Two Children In London
	 ['PROPN', 'PROPN', 'AUX', 'VERB', 'ADP', 'NUM', 'NOUN', 'ADP', 'PROPN']
SIMILAR SENTENCE NR 2
	 Barack Obama is running for president in Kenya
	 ['VERB', 'PROPN', 'AUX', 'VERB', 'ADP', 'NOUN', 'ADP', 'PROPN']
SIMILAR SENTENCE NR 3
	 Congress is contesting for 230 seats in Lok Sabha elections
	 ['PROPN', 'AUX', 'VERB', 'ADP', 'NUM', 'NOUN', 'ADP', 'PROPN', 'PROPN', 'NOUN']
--------------------------
ORIGINAL SENTENCE:
	 Trump says he has "recognized Russian Meddling MANY TIMES."
	 ['PROPN', 'VERB', 'PRON', 'AUX', 'PUNCT', 'VERB', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PUNCT', 'PUNCT']
SIMILAR SENTENCE NR 1
	 Gov. DeSantis Tells Biden: "Go Fuck Yourself"
	 ['PROPN', 'PROPN', 'VERB', 'PROPN', 'PUNCT', 'PUNCT', 'VERB'

ORIGINAL SENTENCE:
	 Viral video shows Bharuch police arresting man accused in Delhi riots case
	 ['ADJ', 'NOUN', 'VERB', 'ADJ', 'NOUN', 'VERB', 'NOUN', 'VERB', 'ADP', 'PROPN', 'NOUN', 'NOUN']
SIMILAR SENTENCE NR 1
	 Genuine news report states former Philippine senator Mar Roxas arrested for drug possession
	 ['ADJ', 'NOUN', 'NOUN', 'VERB', 'ADJ', 'ADJ', 'NOUN', 'PROPN', 'PROPN', 'VERB', 'ADP', 'NOUN', 'NOUN']
SIMILAR SENTENCE NR 2
	 Chinese soldiers helping Indian soldiers injured in avalanche
	 ['ADJ', 'NOUN', 'VERB', 'ADJ', 'NOUN', 'VERB', 'ADP', 'PROPN']
SIMILAR SENTENCE NR 3
	 Genuine news report says Australian scientists found bananas prevent COVID-19
	 ['ADJ', 'NOUN', 'NOUN', 'VERB', 'ADJ', 'NOUN', 'VERB', 'NOUN', 'VERB', 'PROPN']
--------------------------
ORIGINAL SENTENCE:
	 “Queen Elizabeth removes Obamas from royal wedding guest list.”
	 ['PUNCT', 'PROPN', 'PROPN', 'VERB', 'PROPN', 'ADP', 'ADJ', 'NOUN', 'NOUN', 'NOUN', 'PUNCT', 'PUNCT']
SIMILAR SENTENCE NR 1
	 “Queen Eliza

ORIGINAL SENTENCE:
	 Forest fires are caused by poor management, not by climate change
	 ['NOUN', 'NOUN', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'PUNCT', 'PART', 'ADP', 'NOUN', 'NOUN']
SIMILAR SENTENCE NR 1
	 Herpes is caused by parasitic worms and not virus
	 ['NOUN', 'AUX', 'VERB', 'ADP', 'ADJ', 'NOUN', 'CCONJ', 'PART', 'NOUN']
SIMILAR SENTENCE NR 2
	 Train Was Marked With "COVID-19" On Its Side
	 ['NOUN', 'AUX', 'VERB', 'ADP', 'PUNCT', 'NOUN', 'PUNCT', 'ADP', 'PRON', 'NOUN']
SIMILAR SENTENCE NR 3
	 Train Was Marked With "COVID-19" On Its Side
	 ['NOUN', 'AUX', 'VERB', 'ADP', 'PUNCT', 'NOUN', 'PUNCT', 'ADP', 'PRON', 'NOUN']
--------------------------
ORIGINAL SENTENCE:
	 "Syria and Nicaragua are the only nations that didn't sign the Paris Agreement. Nicaragua said it wasn't tough enough."
	 ['PUNCT', 'PROPN', 'CCONJ', 'PROPN', 'AUX', 'DET', 'ADJ', 'NOUN', 'PRON', 'AUX', 'PART', 'VERB', 'DET', 'PROPN', 'PROPN', 'PUNCT', 'PROPN', 'VERB', 'PRON', 'AUX', 'PART', 'ADJ', 'ADV', 'PUNCT', 'PUN

ORIGINAL SENTENCE:
	 On whether NATO is "obsolete."
	 ['ADP', 'SCONJ', 'PROPN', 'AUX', 'PUNCT', 'ADJ', 'PUNCT', 'PUNCT']
SIMILAR SENTENCE NR 1
	 On whether NATO is "obsolete."
	 ['ADP', 'SCONJ', 'PROPN', 'AUX', 'PUNCT', 'ADJ', 'PUNCT', 'PUNCT']
SIMILAR SENTENCE NR 2
	 On whether NATO is "obsolete."
	 ['ADP', 'SCONJ', 'PROPN', 'AUX', 'PUNCT', 'ADJ', 'PUNCT', 'PUNCT']
SIMILAR SENTENCE NR 3
	 The mRNA Covid-19 vaccines are experimental.
	 ['DET', 'PROPN', 'PROPN', 'NOUN', 'AUX', 'ADJ', 'PUNCT']
--------------------------
ORIGINAL SENTENCE:
	 The high-dose Fluzone vaccine caused the deaths of 23 senior citizens during a clinical trial.
	 ['DET', 'ADV', 'PUNCT', 'NOUN', 'PROPN', 'NOUN', 'VERB', 'DET', 'NOUN', 'ADP', 'NUM', 'ADJ', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT']
SIMILAR SENTENCE NR 1
	 The BBC altered the headline on the front page of a Scottish newspaper.
	 ['DET', 'PROPN', 'VERB', 'DET', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT']
SIMILAR SENTEN

ORIGINAL SENTENCE:
	 Bill and Hillary Clinton were kidnapped and murdered by ISIS
	 ['PROPN', 'CCONJ', 'PROPN', 'PROPN', 'AUX', 'VERB', 'CCONJ', 'VERB', 'ADP', 'PROPN']
SIMILAR SENTENCE NR 1
	 Bill and Hillary Clinton were kidnapped and murdered by ISIS
	 ['PROPN', 'CCONJ', 'PROPN', 'PROPN', 'AUX', 'VERB', 'CCONJ', 'VERB', 'ADP', 'PROPN']
SIMILAR SENTENCE NR 2
	 Ramayana and Mahabharata will be taught in Saudi Arabia
	 ['PROPN', 'CCONJ', 'PROPN', 'AUX', 'AUX', 'VERB', 'ADP', 'PROPN', 'PROPN']
SIMILAR SENTENCE NR 3
	 Cow Dung and Cow Urine can cure and prevent COVID-19
	 ['PROPN', 'PROPN', 'CCONJ', 'PROPN', 'PROPN', 'AUX', 'VERB', 'CCONJ', 'VERB', 'NOUN']
--------------------------
ORIGINAL SENTENCE:
	 Says "immigrants start businesses at a faster rate; they seem to grow those businesses more successfully."
	 ['VERB', 'PUNCT', 'NOUN', 'VERB', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT', 'PRON', 'VERB', 'PART', 'VERB', 'DET', 'NOUN', 'ADV', 'ADV', 'PUNCT', 'PUNCT']
SIMILAR SENTENCE NR 1

ORIGINAL SENTENCE:
	 Fit people are immune to novel coronavirus
	 ['ADJ', 'NOUN', 'AUX', 'ADJ', 'ADP', 'ADJ', 'PROPN']
SIMILAR SENTENCE NR 1
	 Muslim women being harassed in Indian Kashmir
	 ['ADJ', 'NOUN', 'AUX', 'VERB', 'ADP', 'ADJ', 'PROPN']
SIMILAR SENTENCE NR 2
	 Black people are more resistant to novel coronavirus
	 ['ADJ', 'NOUN', 'AUX', 'ADV', 'ADJ', 'ADP', 'ADJ', 'PROPN']
SIMILAR SENTENCE NR 3
	 Cannabis possession still illegal for Canadian minors
	 ['ADJ', 'NOUN', 'ADV', 'ADJ', 'ADP', 'ADJ', 'NOUN']
--------------------------
ORIGINAL SENTENCE:
	 Philippine youth leaders wearing communist badges
	 ['ADJ', 'NOUN', 'NOUN', 'VERB', 'ADJ', 'NOUN']
SIMILAR SENTENCE NR 1
	 Indonesian president ate roasted rats
	 ['ADJ', 'NOUN', 'VERB', 'ADJ', 'NOUN']
SIMILAR SENTENCE NR 2
	 Philippine youth leaders wearing communist badges
	 ['ADJ', 'NOUN', 'NOUN', 'VERB', 'ADJ', 'NOUN']
SIMILAR SENTENCE NR 3
	 Hindu nationalists beat Muslim woman
	 ['ADJ', 'NOUN', 'VERB', 'ADJ', 'NOUN']
---------

ORIGINAL SENTENCE:
	 John Hagee calls for "prosecuting women who say God’s name during intercourse."
	 ['PROPN', 'PROPN', 'VERB', 'ADP', 'PUNCT', 'VERB', 'NOUN', 'PRON', 'VERB', 'PROPN', 'PART', 'NOUN', 'ADP', 'NOUN', 'PUNCT', 'PUNCT']
SIMILAR SENTENCE NR 1
	 John Hagee calls for "prosecuting women who say God’s name during intercourse."
	 ['PROPN', 'PROPN', 'VERB', 'ADP', 'PUNCT', 'VERB', 'NOUN', 'PRON', 'VERB', 'PROPN', 'PART', 'NOUN', 'ADP', 'NOUN', 'PUNCT', 'PUNCT']
SIMILAR SENTENCE NR 2
	 U.S. President Joe Biden kneels down to beg George Floyd's son for forgiveness.
	 ['PROPN', 'PROPN', 'PROPN', 'PROPN', 'VERB', 'ADP', 'PART', 'VERB', 'PROPN', 'PROPN', 'PART', 'NOUN', 'ADP', 'NOUN', 'PUNCT']
SIMILAR SENTENCE NR 3
	 US honours student made comments in support of school dress codes?
	 ['PROPN', 'VERB', 'NOUN', 'VERB', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN', 'NOUN', 'NOUN', 'PUNCT']
--------------------------
ORIGINAL SENTENCE:
	 A photograph shows a pin in the collar of a Chinese off

ORIGINAL SENTENCE:
	 "There is no reason for anybody to be losing any of their current benefits under Medicaid."
	 ['PUNCT', 'PRON', 'VERB', 'DET', 'NOUN', 'SCONJ', 'PRON', 'PART', 'AUX', 'VERB', 'PRON', 'ADP', 'PRON', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'PUNCT', 'PUNCT']
SIMILAR SENTENCE NR 1
	 "There is no reason for anybody to be losing any of their current benefits under Medicaid."
	 ['PUNCT', 'PRON', 'VERB', 'DET', 'NOUN', 'SCONJ', 'PRON', 'PART', 'AUX', 'VERB', 'PRON', 'ADP', 'PRON', 'ADJ', 'NOUN', 'ADP', 'PROPN', 'PUNCT', 'PUNCT']
SIMILAR SENTENCE NR 2
	 ‘It makes no sense for oil to be cheaper in Nigeria than in Saudi Arabia.’
	 ['PUNCT', 'PRON', 'VERB', 'DET', 'NOUN', 'SCONJ', 'NOUN', 'PART', 'AUX', 'ADJ', 'ADP', 'PROPN', 'ADP', 'ADP', 'PROPN', 'PROPN', 'PUNCT', 'PUNCT']
SIMILAR SENTENCE NR 3
	 "Canada legalizes euthanasia for parents to kill their disabled kids."
	 ['PUNCT', 'PROPN', 'VERB', 'NOUN', 'SCONJ', 'NOUN', 'PART', 'VERB', 'PRON', 'ADJ', 'NOUN', 'PUNCT', 'PUNCT']
--------

In [126]:
len(similarities)

100

In [127]:
list(similarities.keys())[:3]

[46304, 26590, 34527]

In [128]:
len(similarities[46304])

47108

In [129]:
import pickle

In [130]:
with open("pos_similarities.pickle", "wb") as f:
    pickle.dump(similarities, f)

In [131]:
samples = list(similarities.keys())

In [132]:
samples[:3]

[46304, 26590, 34527]

In [133]:
with open("pos_similarities_examples.pickle", "wb") as f:
    pickle.dump(samples, f)

In [140]:
len(postags[46304])

13

In [145]:
len(postags[21062])

9

In [144]:
sorted(similarities[46304], key=lambda x: x[1])[:5]

[(46304, 0.0), (21062, 1.75), (32987, 1.75), (23101, 2.0), (41616, 2.15)]

In [146]:
similarities[46304][:10]

[(0, 13.45),
 (1, 5.4),
 (2, 7.25),
 (3, 3.15),
 (4, 7.75),
 (5, 7.15),
 (6, 8.25),
 (7, 33.4),
 (8, 17.15),
 (9, 8.65)]

In [151]:
pd.DataFrame(similarities)[46304].str[1]

0        13.45
1         5.40
2         7.25
3         3.15
4         7.75
         ...  
47103     8.40
47104     9.80
47105    20.40
47106    10.75
47107     7.40
Name: 46304, Length: 47108, dtype: float64