### Without tweets

In [1]:
import json
import pickle
import spacy
import pandas as pd
import networkx as nx
import spacy_dbpedia_spotlight

In [2]:
# Load the first batch of pre-processed documents (objects exposing `.text`
# and `.ents`, as used in the cells below) from a local pickle file.
# NOTE: pickle.load can execute arbitrary code - only open trusted files.
with open("sentences_with_entities_7548.pickle", "rb") as f:
    docs1 = pickle.load(f)

In [3]:
docs1[0]

Ratan Tata Says, If The Death Of 65 Soldiers Increases The Popularity Of A Prime Minister, Then Nobody Can Save This Country

In [4]:
# Load the second batch; unlike docs1, each entry is an (index, doc) pair
# (see the output of the next cell).
# NOTE: pickle.load can execute arbitrary code - only open trusted files.
with open("sentences_with_entities_7548_to_end.pickle", "rb") as f:
    docs2 = pickle.load(f)

In [5]:
docs2[0]

(7549,
 A Facebook post purportedly shows a graphic from the anti-gun violence group Moms Demand Action.)

In [6]:
# Drop the leading index from every (index, doc) pair and keep only the doc.
docs2_docs = [entry[1] for entry in docs2]

In [7]:
docs2_docs[0]

A Facebook post purportedly shows a graphic from the anti-gun violence group Moms Demand Action.

In [8]:
# Concatenate both batches into a single sequence, docs1 entries first.
docs = docs1 + docs2_docs

In [9]:
docs[0].ents

(Ratan Tata, Prime Minister, This Country)

In [10]:
docs[0].text

'Ratan Tata Says, If The Death Of 65 Soldiers Increases The Popularity Of A Prime Minister, Then Nobody Can Save This Country'

In [11]:
len(docs)

45875

In [12]:
# Plain-text form of every document; these strings are later used as the
# sentence nodes of the graph.
sentences = [document.text for document in docs]

In [13]:
# Collect, for every entity mention in every document:
#   * the entity text (duplicates are kept in this list), and
#   * a (sentence text, entity text) pair to be used as a graph edge.
entities = []
edges = []

# The positional index is not needed, so iterate over the documents directly.
for sent in docs:
    for ent in sent.ents:
        entities.append(ent.text)
        edges.append((sent.text, ent.text))

In [14]:
entities[:3]

['Ratan Tata', 'Prime Minister', 'This Country']

In [15]:
len(sentences)

45875

In [16]:
len(docs[0].ents)

3

In [17]:
edges[:5]

[('Ratan Tata Says, If The Death Of 65 Soldiers Increases The Popularity Of A Prime Minister, Then Nobody Can Save This Country',
  'Ratan Tata'),
 ('Ratan Tata Says, If The Death Of 65 Soldiers Increases The Popularity Of A Prime Minister, Then Nobody Can Save This Country',
  'Prime Minister'),
 ('Ratan Tata Says, If The Death Of 65 Soldiers Increases The Popularity Of A Prime Minister, Then Nobody Can Save This Country',
  'This Country'),
 ('The document that circulated did not come from the Civil Service Commission',
  'Civil Service Commission'),
 ('News Outlets Did Not Report On Coronavirus Cases At Walmart, Amazon And Other Retailers',
  'Coronavirus')]

In [18]:
# Empty undirected graph that will hold sentence nodes and entity nodes.
G = nx.Graph()

In [19]:
# Register sentence texts first, then entity texts, as nodes. Both kinds share
# one namespace: identical strings end up as the same node.
for node_group in (sentences, entities):
    G.add_nodes_from(node_group)

In [20]:
# Connect each sentence node to the entity nodes mentioned in it.
G.add_edges_from(edges)

In [80]:
# Node-attribute table: node text -> {"type": ...}.
# Sentences are written first and entities second, so a string that occurs as
# both a sentence and an entity ends up tagged "entity".
mapping = {sent: {"type": "sentence"} for sent in sentences}
mapping.update((ent, {"type": "entity"}) for ent in entities)

In [28]:
# Attach the "type" attribute from `mapping` to the matching nodes of G.
nx.set_node_attributes(G, mapping)

In [21]:
from node2vec import Node2Vec

In [22]:
# Precompute transition probabilities and generate random walks over G:
# 20 walks per node, 10 steps each, for 64-dimensional embeddings.
# NOTE(review): no seed is passed, so the walks (and the similarity scores
# below) presumably change between runs - confirm whether the installed
# node2vec version accepts a `seed` argument.
node2vec = Node2Vec(G, dimensions=64, walk_length=10, num_walks=20, workers=1)  # Use temp_folder for big graphs

Computing transition probabilities:   0%|          | 0/59809 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 20/20 [02:21<00:00,  7.07s/it]


In [23]:
# Train the embedding model on the generated walks. The keyword arguments are
# presumably forwarded to gensim's Word2Vec (the result exposes `.wv`) - confirm.
# min_count=1 keeps nodes that appear only once in the walks.
model = node2vec.fit(window=10, min_count=1, batch_words=4)

In [24]:
model.wv.most_similar('Ratan Tata Says, If The Death Of 65 Soldiers Increases The Popularity Of A Prime Minister, Then Nobody Can Save This Country')

[('This Country', 0.9196077585220337),
 ('Ratan Tata said that 2020 is a year to survive for businesses and they must not care about profit or loss',
  0.8237733840942383),
 ('Ratan Tata', 0.820371150970459),
 ('Viral message attributed to Ratan Tata', 0.815273642539978),
 ('"Singer Celine Dion Says: ‘I Can’t Even Look At An American Flag Any More, I Hate This Country.’"',
  0.8122323155403137),
 ('A post claiming that industrialist Ratan Tata had made a statement that he doesn’t ‘believe in taking the right decisions but he takes decisions and then make them right’',
  0.8116405606269836),
 ('Ratan Tata has called for the selling of alcohol through Aadhaar cards',
  0.8080866932868958),
 ('Ratan Tata has claimed experts have predicted the economic downfall, but we will overcome this',
  0.804827094078064),
 ('Jane Fonda said "I Swear That American Flag Is My House Carpet, I Hate Everything About This Country"',
  0.7920010089874268),
 ('Ratan Tata said, &quot;Government food subsidies

In [25]:
# Rank every node of G by embedding similarity to one query sentence.
r = model.wv.most_similar(
    'Photo of Prime Minister Narendra Modi holding a massive roadshow',
    topn=G.number_of_nodes(),
)

In [29]:
# Keep only the hits whose node is tagged as a sentence, preserving the
# similarity ranking of `r`.
r_sent = [hit for hit in r if G.nodes[hit[0]]["type"] == "sentence"]

In [30]:
r_sent[:10]

[('Image Of A Poster Being Held By Two Men At A Protest Site Calling Prime Minister Narendra Modi A Thief',
  0.932871401309967),
 ('Prime Minister Narendra Modi met four different leaders in different clothes on the same day',
  0.9237555265426636),
 ('A photograph shows Prime Minister Narendra Modi waving an Islamic flag.',
  0.9234981536865234),
 ('Clip shows Prime Minister Narendra Modi evading a question on petrol prices',
  0.9232084155082703),
 ('O. Panneerselvam bowing before Prime Minister Narendra Modi',
  0.9231763482093811),
 ('Image Of Prime Minister Narendra Modi Eating Multiple Dishes',
  0.9138364791870117),
 ('Photo shoot of Narendra Modi with ducks.', 0.9137466549873352),
 ('Image shows Narendra Modis statue has been installed at Narendra Modi Chowk which is usually done after a persons death',
  0.9086052775382996),
 ('Indian Prime Minister Narendra Modi has passed away due to reasons yet undisclosed.',
  0.9073715806007385),
 ('Prime Minister Narendra Modi Wearing A

### With tweets

In [31]:
# Load the preprocessed tweet texts (a list of strings, per the preview below).
# NOTE: pickle.load can execute arbitrary code - only open trusted files.
with open("preprocessed_tweets.pickle", "rb") as f:
    tweets = pickle.load(f)

In [55]:
# Load the linked entities per tweet: one list per tweet, each item being a
# [label, wikidata_url] pair (see the preview below).
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
with open("tweets_entities_spacy.json", "r", encoding="utf-8") as f:
    tweet_entities = json.load(f)

In [35]:
tweets[:3]

['CDC does not currently recommend the use of facemasks to help prevent novel Take everyday preventive actions, like staying home when you are sick and washing hands with soap and water, to help slow the spread of respiratory illness.',
 'Election Fraud was inevitable. Democrats control media, big tech, corruption and manipulation.',
 'Donald Trump is the first president in modern history did not start a new war.']

In [36]:
tweet_entities[:3]

[[['Centers for Disease Control and Prevention',
   'https://www.wikidata.org/wiki/Q583725'],
  ['Use', 'https://www.wikidata.org/wiki/Q7901733'],
  ['Coronavirus', 'https://www.wikidata.org/wiki/Q290805'],
  ['novel', 'https://www.wikidata.org/wiki/Q8261'],
  ['hashtag', 'https://www.wikidata.org/wiki/Q278485'],
  ['Preventive action', 'https://www.wikidata.org/wiki/Q2823701'],
  ['washing', 'https://www.wikidata.org/wiki/Q23841'],
  ['hand', 'https://www.wikidata.org/wiki/Q33767'],
  ['soap opera', 'https://www.wikidata.org/wiki/Q23739'],
  ['water', 'https://www.wikidata.org/wiki/Q283'],
  ['spread', 'https://www.wikidata.org/wiki/Q1472481'],
  ['disease', 'https://www.wikidata.org/wiki/Q12136']],
 [['electoral fraud', 'https://www.wikidata.org/wiki/Q692209'],
  ['Democratic Party', 'https://www.wikidata.org/wiki/Q29552'],
  ['tunica media', 'https://www.wikidata.org/wiki/Q1740954'],
  ['Big Five', 'https://www.wikidata.org/wiki/Q65040888'],
  ['corruption', 'https://www.wikidata.or

In [40]:
# Load the fact-checked claims table. The CSV's first column is an unnamed
# saved index (it shows up as "Unnamed: 0" in the preview below).
claims = pd.read_csv("claim_reviews_en.csv")

In [38]:
# Load the linked entities per claim: one list per claim, each item being a
# [label, wikidata_url] pair (see the preview below).
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
with open("claims_entities_spacy.json", "r", encoding="utf-8") as f:
    claims_entities = json.load(f)

In [39]:
claims_entities[:3]

[[['Ratan Tata', 'https://www.wikidata.org/wiki/Q333460'],
  ['Death', 'https://www.wikidata.org/wiki/Q161936'],
  ['Soldier', 'https://www.wikidata.org/wiki/Q1138936'],
  ['Popularity', 'https://www.wikidata.org/wiki/Q7229765'],
  ['Grand Chancellor', 'https://www.wikidata.org/wiki/Q1275294'],
  ['Dennis Rodman', 'https://www.wikidata.org/wiki/Q201608']],
 [['document', 'https://www.wikidata.org/wiki/Q49848'],
  ['Civil Service Commission', 'https://www.wikidata.org/wiki/Q13565023']],
 [['the media', 'https://www.wikidata.org/wiki/Q17502905'],
  ['Case Western Reserve University',
   'https://www.wikidata.org/wiki/Q1047060'],
  ['Walmart', 'https://www.wikidata.org/wiki/Q483551'],
  ['Amazon', 'https://www.wikidata.org/wiki/Q3884']]]

In [41]:
claims.head(3)

Unnamed: 0,claim_text,label,review_url,fact_checker,appearances,reviews
0,"Ratan Tata Says, If The Death Of 65 Soldiers I...",not_credible,https://www.boomlive.in/photoshopped-tweet-cla...,"{'name': 'BOOM', 'country': 'India', 'language...",[],"[{'label': 'not_credible', 'original_label': '..."
1,The document that circulated did not come from...,not_credible,https://www.rappler.com/newsbreak/fact-check/2...,"{'name': 'Rappler', 'country': 'Philippines', ...",[],"[{'label': 'not_credible', 'original_label': '..."
2,News Outlets Did Not Report On Coronavirus Cas...,not_credible,https://leadstories.com/hoax-alert/2020/04/fac...,"{'name': 'Lead Stories', 'country': 'United St...",[],"[{'label': 'not_credible', 'original_label': '..."


In [42]:
# Claim texts as a plain Python list, in the row order of the table.
claims_text = claims.loc[:, 'claim_text'].tolist()

In [43]:
claims_text[:3]

['Ratan Tata Says, If The Death Of 65 Soldiers Increases The Popularity Of A Prime Minister, Then Nobody Can Save This Country',
 'The document that circulated did not come from the Civil Service Commission',
 'News Outlets Did Not Report On Coronavirus Cases At Walmart, Amazon And Other Retailers']

In [48]:
type(tweet_entities[0][0][0])

str

In [49]:
tweets[0], tweet_entities[0]

('CDC does not currently recommend the use of facemasks to help prevent novel Take everyday preventive actions, like staying home when you are sick and washing hands with soap and water, to help slow the spread of respiratory illness.',
 [['Centers for Disease Control and Prevention',
   'https://www.wikidata.org/wiki/Q583725'],
  ['Use', 'https://www.wikidata.org/wiki/Q7901733'],
  ['Coronavirus', 'https://www.wikidata.org/wiki/Q290805'],
  ['novel', 'https://www.wikidata.org/wiki/Q8261'],
  ['hashtag', 'https://www.wikidata.org/wiki/Q278485'],
  ['Preventive action', 'https://www.wikidata.org/wiki/Q2823701'],
  ['washing', 'https://www.wikidata.org/wiki/Q23841'],
  ['hand', 'https://www.wikidata.org/wiki/Q33767'],
  ['soap opera', 'https://www.wikidata.org/wiki/Q23739'],
  ['water', 'https://www.wikidata.org/wiki/Q283'],
  ['spread', 'https://www.wikidata.org/wiki/Q1472481'],
  ['disease', 'https://www.wikidata.org/wiki/Q12136']])

In [58]:
# tweet_entities[i] holds the [label, url] pairs found in tweets[i].
# Build one (tweet text, entity label) edge per pair, plus the flat list of
# labels (duplicates kept).
tweet_edges = [
    (tweets[idx], pair[0])
    for idx, pairs in enumerate(tweet_entities)
    for pair in pairs
]
tweet_ents = [pair[0] for pairs in tweet_entities for pair in pairs]

In [59]:
tweet_edges[:3]

[('CDC does not currently recommend the use of facemasks to help prevent novel Take everyday preventive actions, like staying home when you are sick and washing hands with soap and water, to help slow the spread of respiratory illness.',
  'Centers for Disease Control and Prevention'),
 ('CDC does not currently recommend the use of facemasks to help prevent novel Take everyday preventive actions, like staying home when you are sick and washing hands with soap and water, to help slow the spread of respiratory illness.',
  'Use'),
 ('CDC does not currently recommend the use of facemasks to help prevent novel Take everyday preventive actions, like staying home when you are sick and washing hands with soap and water, to help slow the spread of respiratory illness.',
  'Coronavirus')]

In [60]:
tweet_ents[:3]

['Centers for Disease Control and Prevention', 'Use', 'Coronavirus']

In [62]:
# claims_entities[i] holds the [label, url] pairs found in claims_text[i].
# Build one (claim text, entity label) edge per pair, plus the flat list of
# labels (duplicates kept).
claim_edges = [
    (claims_text[idx], pair[0])
    for idx, pairs in enumerate(claims_entities)
    for pair in pairs
]
claim_ents = [pair[0] for pairs in claims_entities for pair in pairs]

In [63]:
claim_edges[:3]

[('Ratan Tata Says, If The Death Of 65 Soldiers Increases The Popularity Of A Prime Minister, Then Nobody Can Save This Country',
  'Ratan Tata'),
 ('Ratan Tata Says, If The Death Of 65 Soldiers Increases The Popularity Of A Prime Minister, Then Nobody Can Save This Country',
  'Death'),
 ('Ratan Tata Says, If The Death Of 65 Soldiers Increases The Popularity Of A Prime Minister, Then Nobody Can Save This Country',
  'Soldier')]

In [64]:
claim_ents[:3]

['Ratan Tata', 'Death', 'Soldier']

In [66]:
# Unique entity labels across claims and tweets.
# dict.fromkeys de-duplicates while keeping first-seen order. A set would give
# an order that changes between interpreter sessions (string hashing is
# randomised), making node order in the graph non-reproducible.
combined_ents = list(dict.fromkeys(claim_ents + tweet_ents))

In [67]:
len(combined_ents)

23806

In [70]:
# Discard the missing (None) label so it is never added as a graph node.
combined_ents = list(filter(lambda label: label is not None, combined_ents))

In [71]:
len(combined_ents)

23805

In [72]:
# Empty undirected graph for the combined claim + tweet + entity network.
G2 = nx.Graph()

In [73]:
# Register claims, then tweets, then entity labels as nodes. All three share
# one namespace: identical strings end up as the same node.
for node_group in (claims_text, tweets, combined_ents):
    G2.add_nodes_from(node_group)

In [75]:
# Remove edges with a missing endpoint (None text or None entity label).
claim_edges = [
    (text, label)
    for text, label in claim_edges
    if text is not None and label is not None
]
tweet_edges = [
    (text, label)
    for text, label in tweet_edges
    if text is not None and label is not None
]

In [76]:
# Link every claim and every tweet to the entities found in it.
for edge_group in (claim_edges, tweet_edges):
    G2.add_edges_from(edge_group)

In [78]:
# Node-attribute table: node text -> {"type": ...}.
# Written in the order claim, tweet, entity; when the same string occurs in
# more than one group, the later group's tag wins.
mapping2 = {claim: {"type": "claim"} for claim in claims_text}
mapping2.update((tweet, {"type": "tweet"}) for tweet in tweets)
mapping2.update((ent, {"type": "entity"}) for ent in combined_ents)

In [79]:
# Attach the "type" attribute from `mapping2` to the matching nodes of G2.
nx.set_node_attributes(G2, mapping2)

In [81]:
# Same walk configuration as for the claims-only graph, now over G2:
# 20 walks per node, 10 steps each, for 64-dimensional embeddings.
# NOTE(review): no seed is passed, so results presumably vary between runs -
# confirm whether the installed node2vec version accepts a `seed` argument.
node2vec2 = Node2Vec(G2, dimensions=64, walk_length=10, num_walks=20, workers=1)  # Use temp_folder for big graphs

Computing transition probabilities:   0%|          | 0/72948 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 20/20 [04:31<00:00, 13.60s/it]


In [82]:
# Train the embedding model for the combined graph.
# NOTE(review): window=100 is larger than the walk_length=10 used to build
# node2vec2, so presumably every node of a walk is in every other node's
# context - confirm this differs from the first model (window=10) on purpose.
model2 = node2vec2.fit(window=100, min_count=1, batch_words=16)

In [83]:
model2.wv.most_similar('Ratan Tata Says, If The Death Of 65 Soldiers Increases The Popularity Of A Prime Minister, Then Nobody Can Save This Country')

[('Popularity', 0.9513153433799744),
 ('Ratan Tata', 0.711036205291748),
 ('Soldier', 0.7054051160812378),
 ('Dennis Rodman', 0.6594432592391968),
 ('Punjabi Artist Impersonating As Indian Soldier', 0.6507119536399841),
 ('Death', 0.640994668006897),
 ('Grand Chancellor', 0.6386310458183289),
 ('Soldiers are paid N65,000 monthly', 0.6264833807945251),
 ('Stairs', 0.614574134349823),
 ('Deaths rose by 650 above average during the UK heatwave',
  0.6121979355812073)]

In [93]:
# Rank every node of G2 by embedding similarity to one query text.
# Earlier query, kept for reference:
#   'Photo of Prime Minister Narendra Modi holding a massive roadshow'
r = model2.wv.most_similar(
    'Donald Trump is the first president in modern history did not start a new war.',
    topn=G2.number_of_nodes(),
)

In [94]:
# Keep only claim and tweet hits (dropping entity nodes) and record each hit
# as (text, node type, similarity), preserving the ranking of `r`.
r_sent = []

for node, score in r:
    node_type = G2.nodes[node]["type"]
    if node_type in ("claim", "tweet"):
        r_sent.append((node, node_type, score))

In [95]:
r_sent[:10]

[('Says Barack Obama "is the first president in modern history not to have a single year of 3 percent growth."',
  'claim',
  0.7230109572410583),
 ('Calus has falsified histories written by his Psions to make him happy',
  'tweet',
  0.7198453545570374),
 ('Says Donald Trump "has a long history of exporting jobs overseas."',
  'claim',
  0.7024901509284973),
 ('The Jamkaran mosque in Iran raised a red flag over its dome for the first time in history.',
  'claim',
  0.6885107159614563),
 ('"There’s more Hoosiers going to work than ever before."',
  'claim',
  0.6795364618301392),
 ('Says there’s "never (been) so many apprehensions ever in our history."',
  'claim',
  0.6733711957931519),
 ('Trump\'s "budget reduced the deficit by $3 trillion, which is one of the largest in history."',
  'claim',
  0.6515135169029236),
 ('"Today, there are more Hoosiers going to work than ever before in the 200-year history of the great state of Indiana."',
  'claim',
  0.6508753299713135),
 ('Jim Jorda