In [1]:
# We have now obtained probability distributions over topics, which let us cluster news articles into per-topic groups.
# This notebook evaluates how well that clustering works, i.e., how many of a query's relevant documents fall in its cluster.

In [2]:
import pandas as pd
import json
import xml.etree.ElementTree as ET
import numpy as np

In [3]:
# Relevance judgments (qrels): space-separated file without a header row;
# the four columns are named right after loading.
qrel = pd.read_csv(
    "../../query_topics/2018BL_answer.txt",
    sep=" ",
    header=None,
)
qrel.columns = ["topicid", "q0", "newsid", "rel"]

# Build the topic -> query-news mapping from the topic XML file.
tree = ET.parse("../../query_topics/2018BL_topic.xml")
root = tree.getroot()
t2n = {}
for topic_node in root:
    # First child holds the topic number (only its last three characters are
    # kept as the id); second child holds the id of the query news article.
    topicid, newsid = topic_node[0].text[-3:], topic_node[1].text
    t2n[topicid] = newsid

# Clustering results produced earlier, stored as JSON.
with open("../../topic_collection.json", "r") as f:
    cluster = json.load(f)

In [91]:
def count_rel_cluster(topic_news=None, qrels=None, clusters=None, n_topics=70):

    '''
    Count how many relevant documents are in the cluster of each query news
    article and report the mean recall over the evaluated topics.

    A document counts as relevant when its ``rel`` score is 16, 8 or 4.
    For every topic, each cluster ``"0"`` .. ``str(n_topics - 1)`` that contains
    the topic's query news is inspected; the per-topic recall is the number of
    relevant documents found in that cluster divided by the number of relevant
    documents of the topic. Topics without any relevant document print a recall
    of 0 and are excluded from the mean. Topics whose query news is in none of
    the inspected clusters contribute nothing to the mean.

    Parameters
    ----------
    topic_news : dict, optional
        Maps a topic id (string convertible with ``int``) to the id of its
        query news. Defaults to the notebook-level ``t2n``.
    qrels : pandas.DataFrame, optional
        Relevance judgments with columns ``topicid``, ``newsid`` and ``rel``.
        Defaults to the notebook-level ``qrel``.
    clusters : dict, optional
        Maps a cluster number (as a string) to the collection of news ids in
        that cluster. Defaults to the notebook-level ``cluster``.
    n_topics : int, optional
        Number of clusters to inspect (keys ``"0"`` .. ``str(n_topics - 1)``).
        Keys missing from ``clusters`` are skipped instead of raising.

    Returns
    -------
    float
        Mean recall over the evaluated topics, or NaN when no topic could be
        evaluated (the original code raised ZeroDivisionError in that case).
    '''

    # Fall back to the notebook globals so that a bare ``count_rel_cluster()``
    # call keeps working exactly as before.
    if topic_news is None:
        topic_news = t2n
    if qrels is None:
        qrels = qrel
    if clusters is None:
        clusters = cluster

    # Keep only the rows judged relevant; done once because it does not depend
    # on the topic being processed.
    rel_levels = (16, 8, 4)
    relevant = qrels.loc[qrels["rel"].isin(rel_levels)]

    recall = 0
    no_zero_rel = 0

    for k, v in topic_news.items():
        print(k)
        rel_news = relevant.loc[relevant["topicid"] == int(k), "newsid"]
        n_rel = len(rel_news)
        print("Total number of news with rel score of 16&8&4 is: {}".format(n_rel))
        rel_news_set = set(rel_news)

        for topic_no in range(n_topics):
            news_in_cluster = clusters.get(str(topic_no))
            # only the cluster(s) holding the query news are of interest
            if news_in_cluster is None or v not in news_in_cluster:
                continue

            rel_news_in_cluster = len(rel_news_set & set(news_in_cluster))
            print("Total number of news with rel score of 16&8&4 in cluster is: {}".format(rel_news_in_cluster))

            # calculate recall
            if n_rel == 0:
                recall_topic = 0
            else:
                recall_topic = rel_news_in_cluster / n_rel
                recall += recall_topic
                no_zero_rel += 1
            print(recall_topic)

    if no_zero_rel == 0:
        # Nothing to average: avoid dividing by zero and make the reason visible.
        print("No topic with relevant news could be matched to a cluster; mean recall is undefined.")
        return float("nan")

    mean_recall = recall / no_zero_rel
    print(mean_recall)
    return mean_recall

In [92]:
# Run the evaluation: for each topic this prints the topic id, the number of
# relevant news (rel 16/8/4), how many of them are in the query news' cluster,
# and the resulting recall; the last printed line is the mean recall.
count_rel_cluster()

826
Total number of news with rel score of 16&8&4 is: 13
Total number of news with rel score of 16&8&4 in cluster is: 3
0.23076923076923078
827
Total number of news with rel score of 16&8&4 is: 14
Total number of news with rel score of 16&8&4 in cluster is: 0
0.0
828
Total number of news with rel score of 16&8&4 is: 1
Total number of news with rel score of 16&8&4 in cluster is: 1
1.0
829
Total number of news with rel score of 16&8&4 is: 7
Total number of news with rel score of 16&8&4 in cluster is: 7
1.0
830
Total number of news with rel score of 16&8&4 is: 1
Total number of news with rel score of 16&8&4 in cluster is: 0
0.0
831
Total number of news with rel score of 16&8&4 is: 12
Total number of news with rel score of 16&8&4 in cluster is: 4
0.3333333333333333
832
Total number of news with rel score of 16&8&4 is: 4
Total number of news with rel score of 16&8&4 in cluster is: 0
0.0
833
Total number of news with rel score of 16&8&4 is: 26
Total number of news with rel score of 16&8&4 in