In [7]:
import sklearn.preprocessing as preprocess
import numpy as np
import pandas as pd
from keras import backend as K
import spacy
from spacy.lang.en import English
import pickle
pd.set_option('display.max_colwidth', 0)
pd.set_option('display.max_rows', 1000)
from tqdm import tqdm_notebook

In [2]:
def load_spacy():
    sentencizer = English()
    sentencizer.add_pipe(sentencizer.create_pipe('sentencizer'))
    nlp = spacy.load("en_core_web_md")
    return sentencizer, nlp
sentencizer, nlp = load_spacy()

In [8]:
df_hd_tp = pd.read_json('evaluation_set/deepnofakes/Evaluation_Final_50_V4.json')
with open('evaluation_set/deepnofakes/dnf_300/cleaned/cleaned_dnf300_sent_array_id.p', 'rb') as fp:
    articles = pickle.load(fp)
with open('evaluation_set/deepnofakes/dnf_300/cleaned/cleaned_dnf300_sent_vector_array_id.p', 'rb') as fp:
    article_vectors = pickle.load(fp)
    
with open('evaluation_set/deepnofakes/dnf_700/dnf700_sent_array_id.p', 'rb') as fp:
    articles700 = pickle.load(fp)
with open('evaluation_set/deepnofakes/dnf_700/dnf700_sent_vector_array_id.p', 'rb') as fp:
    article_vectors700 = pickle.load(fp)
with open('evaluation_set/word_mapping/id_word_mapping.p', 'rb') as fp:
    id_word_mapping = pickle.load(fp)
df_hd_tp.keys()

Index(['authors', 'claim_ids', 'evidence', 'headline', 'id', 'reason',
       'sentences', 'type', 'urls'],
      dtype='object')

In [4]:
df_hd_tp[df_hd_tp.id==33].sentences.values[0]

['In a message to Donald Trump, Russian President Vladimir Putin has expressed confidence that the dialogue between Moscow and Washington, in keeping with each other’s views, meets the interests of both Russia and the US.',
 'The Russian leader noted in the message that he hopes to address some burning issues that are currently on the international agenda, and search for effective responses to the challenges of the global security, RIA Novosti reported.',
 'Putin has repeatedly noted that the worsening of Russia’s relations with the US was not our choice, however.',
 'For things to improve between Moscow and Washington, the US should first and foremost start acting like an equal partner and respect Russia’s interests rather than try to dictate terms, Putin said last month.']

# All unique headlines

In [5]:
display(df_hd_tp[['id','headline']].sort_values(by='id'))

Unnamed: 0,id,headline
0,0,WikiLeaks CONFIRMS Hillary Sold Weapons to ISIS... Then Drops Another BOMBSHELL! Breaking News
1,1,Hillary Clinton Wore Secret Earpiece During First Presidential Debate?
2,2,President Obama Confirms He Will Refuse to Leave Office If Trump Is Elected
3,3,BREAKING: Fraudulent Clinton Votes Discovered By The Tens Of Thousands
4,4,"FBI director received millions from Clinton Foundation, his brother’s law firm does Clinton’s taxes"
5,5,Hillary Clinton Wore 'Secret Earpiece' During Commander-in-Chief Forum
6,6,Clinton Received Debate Questions Week Before Debate
7,7,Hillary Clinton Used Hand Signals to Rig Debate?
8,8,Hillary Clinton Cut Her Tax Bill by 'Donating' $1 Million to Herself via the Clinton Foundation?
9,9,Obama Declares His Family Will Move to Canada If Trump Is Elected


# Evaluation code DNF 300

In [18]:

best_N = 5
threshold = 0.95

ps, rs = [],[]
for title in tqdm_notebook(df_hd_tp.headline.unique()):
    tp,fp,fn = 0,0,0
    hd = df_hd_tp[df_hd_tp.headline==title]['headline'].values[0]
    ar_id = df_hd_tp[df_hd_tp.headline==title]['id'].values[0]
    claims = df_hd_tp[df_hd_tp.headline==title]['claim_ids'].values[0]
    if ar_id != 6:
        continue
    if len(claims)<1: # since there are 3 to 5 claims
        continue
#     print('ar_id:',ar_id)
#     print('hd:',hd)
#     print('cl_1:',cl[0])
#     print(hd)
    sentences = articles[ar_id]
    tokens_hd = nlp(hd)
    sims = []
    for i in range(len(sentences)):
        tokens_s = nlp(sentences[i])
        sims.append(tokens_hd.similarity(tokens_s))
    
    pred = np.array(sims).argsort()[-best_N:][::-1]
    print(pred)
    print(claims)
    for idx in pred:
        print(sentences[idx],'sim:',np.around(sims[idx],4))
        print('----')
        
    
#     print(cl)
    
    
#     print('claims:',claims)
#     print('pred:',pred)
    for p in pred:
        if p in claims:
            tp+=1
        else:
            fp+=1
    for c in claims:
        if c not in pred:
            fn+=1
    p = tp/(tp+fp)
    r = tp/(tp+fn)
#     print()
#     tn = sentences - list(set(list(pred)+list(claims)))
#     print(test_idx,', article id:',x['article_id'][test_idx], ',# sentences:',len(articles[x['article_id'][test_idx]]),":",p,r)
    ps.append(p)
    rs.append(r)
#     break
    
#     pred_counter = 0
#     pred_similarity = []
#     ground_truth_matching_sent = []
#     pred_claim_sent = []
    
# #     print('number of claims in ground truth:',len(cl))
#     for i in range(len(top_N)):
#     #     print('===========***********',i,'***********============')
#         tN = nlp(top_N[i])
#         pred_claim_sent.append(top_N[i])
#     #     print(t5.vector)
#         for j in range(len(cl)):
#             _c = nlp(cl[j])
#     #         print(_c.vector)
#     #         print('top_5:',t5.text)
#     #         print('-------------------')
#     #         print('ground_truth:',_c.text)
#     #         print('t5:{0}, cl:{1}, sim: {2}'.format(i,j,np.around(t5.similarity(_c),4)))
#     #         print('===================================================================')
#             if np.around(tN.similarity(_c),4) > threshold:
#                 ground_truth_matching_sent.append(cl[j])
#                 pred_similarity.append(np.around(tN.similarity(_c),4))
#                 pred_counter+=1
#                 break
#         if pred_counter>=len(cl):
#             break
#         if len(ground_truth_matching_sent)!=len(pred_claim_sent):
#             ground_truth_matching_sent.append('None')
#             pred_similarity.append(0.0)
        

#     df = pd.DataFrame()
#     df['Headline'] = np.repeat(hd,len(top_N))
#     df['pred_claim_sent'] = pred_claim_sent
#     df['ground_truth_matching_sent'] = ground_truth_matching_sent
#     df['pred_similarity'] = pred_similarity
#     print(pred_counter)
#     display(df)
#     p = pred_counter/best_N
#     r = pred_counter/len(cl)
#     f = 2*(pred_counter/best_N)*(pred_counter/len(cl))/(0.0001+(pred_counter/best_N)+(pred_counter/len(cl)))
    
#     precisions.append(p)
#     recalls.append(r)
#     f1s.append(f)

    




    
#     counter+=1
#     if counter==5:
#         break
#     print("----------------------------")
#     for s in t:
#         if s>=len(x['sentences'][test_idx]):continue
#         x['sentences'][test_idx][s]


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

[ 9  2  0 12  3]
[2, 9, 10, 12, 13]
Clinton seemed to have scripted responses ready for every question she was asked at the first debate. sim: 0.843
----
Multiple reports and leaked information from inside the Clinton camp claim that the Clinton campaign was given the entire set of debate questions an entire week before the actual debate. sim: 0.8363
----
The first presidential debate was held and Hillary Clinton was proclaimed the winner by the media. sim: 0.8291
----
By furnishing Clinton with the debate questions NBC certainly hoped to make Clinton appear much more knowledgeable and competent than Trump. sim: 0.7977
----
Earlier last week an NBC intern was seen hand delivering a package to Clinton’s campaign headquarters, according to sources. sim: 0.777
----



In [16]:
np.average(ps), np.average(rs), 2*np.average(ps)*np.average(rs)/(np.average(ps)+ np.average(rs))

(0.521, 0.7343333333333334, 0.6095395645246947)

# Results

In [7]:
df_results = pd.DataFrame()
df_results['precision @{0}'.format(best_N)] = precisions
df_results['recall @{0}'.format(best_N)] = recalls
df_results['f1 @{0}'.format(best_N)] = f1s
# print('Recall:{0}, Precision:{1}, F_1:{2}'.format(np.around(recall,4),np.around(precision,4),np.around(f1,4)))
df_results.describe()

Unnamed: 0,precision @10,recall @10,f1 @10
count,50.0,50.0,50.0
mean,0.346,0.964952,0.501766
std,0.105386,0.114503,0.103388
min,0.2,0.4,0.266622
25%,0.3,1.0,0.461503
50%,0.3,1.0,0.461503
75%,0.4,1.0,0.571388
max,0.9,1.0,0.947319
