In [2]:
import sklearn.preprocessing as preprocess
import numpy as np
import pandas as pd
from keras import backend as K
import spacy
from spacy.lang.en import English
pd.set_option('display.max_colwidth', 0)
pd.set_option('display.max_rows', 1000)
from tqdm import tqdm_notebook

Using TensorFlow backend.


In [3]:
def load_spacy():
    sentencizer = English()
    sentencizer.add_pipe(sentencizer.create_pipe('sentencizer'))
    nlp = spacy.load("en_core_web_md")
    return sentencizer, nlp
sentencizer, nlp = load_spacy()

In [4]:
df_hd_tp = pd.read_csv('evaluation_set/cdc_ibm/headline_topic_mapping.csv')
df_ar_cl = pd.read_csv('evaluation_set/cdc_ibm/article_claim_mapping.csv')
df_hd_tp.keys(),df_ar_cl.keys()

(Index(['Topic', 'Title', 'article Id', 'Headline'], dtype='object'),
 Index(['Unnamed: 0', 'Topic', 'Article', 'Claim'], dtype='object'))

# Article ID 2203 and 1819 are exactly the same

In [5]:
display(df_hd_tp[['article Id','Title']].sort_values(by='Title'))

Unnamed: 0,article Id,Title
295,2662,2007 Burmese anti-government protests
405,961,ARTICLE 19
517,3129,ASEAN Free Trade Area
180,2183,ATF gunwalking scandal
154,2147,Abortion and mental health
153,2145,Abortion debate
152,2150,Abortion in the United States
341,3134,"Abstinence, be faithful, use a condom"
340,3132,Abstinence-only sex education
410,1110,Access to Information Act


In [6]:
df_hd_tp[df_hd_tp.Title=='Treaty on the Non-Proliferation of Nuclear Weapons']

Unnamed: 0,Topic,Title,article Id,Headline
186,This house believes the US is justified in using force to prevent states from acquiring nuclear weapons,Treaty on the Non-Proliferation of Nuclear Weapons,2203,US is justified in using force to prevent states from acquiring nuclear weapons
474,This house believes all nations have a right to nuclear weapons,Treaty on the Non-Proliferation of Nuclear Weapons,1819,All nations have a right to nuclear weapons


In [7]:
df_ar_cl[df_ar_cl.Article=='Treaty on the Non-Proliferation of Nuclear Weapons']

Unnamed: 0.1,Unnamed: 0,Topic,Article,Claim
575,575,all nations have a right to nuclear weapons,Treaty on the Non-Proliferation of Nuclear Weapons,Having more nuclear nuclear-weapon states would reduce security for all
576,576,all nations have a right to nuclear weapons,Treaty on the Non-Proliferation of Nuclear Weapons,nuclear forces continue to play an essential role in war prevention
815,815,the US is justified in using force to prevent states from acquiring nuclear weapons,Treaty on the Non-Proliferation of Nuclear Weapons,the NPT cannot stop the proliferation of nuclear weapons or the motivation to acquire them
816,816,the US is justified in using force to prevent states from acquiring nuclear weapons,Treaty on the Non-Proliferation of Nuclear Weapons,Having more nuclear nuclear-weapon states would reduce security for all


# Evaluation code

In [8]:
precisions=[]
recalls=[]
f1s=[]
best_N = 10
threshold = 0.95
for title in tqdm_notebook(df_hd_tp.Title.unique()):
    hd = df_hd_tp[df_hd_tp.Title==title]['Headline'].values[0]
    ar_id = df_hd_tp[df_hd_tp.Title==title]['article Id'].values[0]
    cl = df_ar_cl[df_ar_cl.Article==title]['Claim'].values
    if len(cl)<1:
        continue
#     print('ar_id:',ar_id)
#     print('hd:',hd)
#     print('cl_1:',cl[0])
    with open('evaluation_set/cdc_ibm/articles/clean_{0}.txt'.format(ar_id)) as f:
        article = f.readlines()
    # sentences = article.split('\n')
    article=''.join(article)
    # print(article)
    article = article.replace('[REF]','')
    doc = sentencizer(article)
    sentences = [sent.string.strip() for sent in doc.sents]
    tokens_hd = nlp(hd)
    sims = []
    for i in range(len(sentences)):
        tokens_s = nlp(sentences[i])
        sims.append(tokens_hd.similarity(tokens_s))
    
    top_N = []
    for idx in np.array(sims).argsort()[-best_N:][::-1]:
        top_N.append(sentences[idx])
    pred_counter = 0
    pred_similarity = []
    ground_truth_matching_sent = []
    pred_claim_sent = []
    for i in range(len(top_N)):
    #     print('===========***********',i,'***********============')
        t5 = nlp(top_N[i])
        pred_claim_sent.append(top_N[i])
    #     print(t5.vector)
        for j in range(len(cl)):
            _c = nlp(cl[j])
    #         print(_c.vector)
    #         print('top_5:',t5.text)
    #         print('-------------------')
    #         print('ground_truth:',_c.text)
    #         print('t5:{0}, cl:{1}, sim: {2}'.format(i,j,np.around(t5.similarity(_c),4)))
    #         print('===================================================================')
            if np.around(t5.similarity(_c),4) > threshold:
                ground_truth_matching_sent.append(cl[j])
                pred_similarity.append(np.around(t5.similarity(_c),4))
                pred_counter+=1
                break
        if len(ground_truth_matching_sent)!=len(pred_claim_sent):
            ground_truth_matching_sent.append('None')
            pred_similarity.append(0.0)
        if pred_counter>=len(cl):
            break
    p = pred_counter/best_N
    r = pred_counter/len(cl)
    f = 2*(pred_counter/best_N)*(pred_counter/len(cl))/(0.0001+(pred_counter/best_N)+(pred_counter/len(cl)))
    
    precisions.append(p)
    recalls.append(r)
    f1s.append(f)




HBox(children=(IntProgress(value=0, max=522), HTML(value='')))




# Results

In [9]:
df_results = pd.DataFrame()
df_results['precision @{0}'.format(best_N)] = precisions
df_results['recall @{0}'.format(best_N)] = recalls
df_results['f1 @{0}'.format(best_N)] = f1s
# print('Recall:{0}, Precision:{1}, F_1:{2}'.format(np.around(recall,4),np.around(precision,4),np.around(f1,4)))
df_results.describe()

Unnamed: 0,precision @10,recall @10,f1 @10
count,299.0,299.0,299.0
mean,0.068562,0.196003,0.085721
std,0.10998,0.319543,0.124921
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.1,0.263889,0.166639
max,0.7,1.0,0.705834


In [9]:
# counter = 0
# for title in tqdm_notebook(df_hd_tp.Title.unique()):
#     hd = df_hd_tp[df_hd_tp.Title==title]['Headline'].values[0]
#     ar_id = df_hd_tp[df_hd_tp.Title==title]['article Id'].values[0]
#     cl = df_ar_cl[df_ar_cl.Article==title]['Claim'].values
#     if len(cl)<1:
#         counter+=1
#         print(ar_id)
        
# counter