In [1]:
from testdataextractor.testdataextractor.extractor import Extractor
from summpy.summpy import lexrank
import pandas as pd

## Get some data

In [16]:

testarticles = [1957284403,1965754064,233465322,283147769,362778020,37793736,389321649,540607195,60134403,887344770, ]
all_articles = []
all_sets_sentences = []
for art in testarticles:
    ext = Extractor("../test_data/{0}.ofs.gold.xml".format(art))
    article = ext.extract()
    all_articles.append(article)
    df_article = pd.DataFrame.from_dict(article['sentences'], orient='index')
    sorted_indexes = [ "s{0}".format(x) for x in range(len(article['sentences'].values()))]
    sentences = list(df_article.ix[sorted_indexes, 'text'])
    print len(sentences), "sentences extracted."
    if df_article.ix['s2', 'text'] == sentences[2]:
        print "Extracted list of sentences is in a proper order."
        all_sets_sentences.append(sentences)
    else:
        print "Extracted list of sentences is unordered."

190 sentences extracted.
Extracted list of sentences is in a proper order.
203 sentences extracted.
Extracted list of sentences is in a proper order.
290 sentences extracted.
Extracted list of sentences is in a proper order.
178 sentences extracted.
Extracted list of sentences is in a proper order.
170 sentences extracted.
Extracted list of sentences is in a proper order.
184 sentences extracted.
Extracted list of sentences is in a proper order.
278 sentences extracted.
Extracted list of sentences is in a proper order.
156 sentences extracted.
Extracted list of sentences is in a proper order.
196 sentences extracted.
Extracted list of sentences is in a proper order.
178 sentences extracted.
Extracted list of sentences is in a proper order.


## Feed data into lexrank
The summpy MIT licensed repo used for this task, returns a tuple with a dictionary with sentences index + score, and the similarity_matrix.

In [22]:
all_ranked_sentences = []
all_matrxs = []
for sentences in all_sets_sentences:
    ranked_sentences, similarity_mtrx = lexrank.lexrank(sentences)
    all_ranked_sentences.append(ranked_sentences)
    all_matrxs.append(similarity_mtrx)

## See the data

In [23]:
all_ranked_dfs = []
all_similarity_dfs = []
for ranked_sentences, similarity_mtrx in zip(all_ranked_sentences, all_matrxs):
    df_ranked_sentences = pd.DataFrame.from_dict(ranked_sentences,
                                                 orient='index')
    df_similarity_mtrx = pd.DataFrame(similarity_mtrx)
    all_ranked_dfs.append(df_ranked_sentences)
    all_similarity_dfs.append(df_similarity_mtrx)



In [40]:
all_top_sents = []
all_bot_sents = []
for df_ranked_sentences in all_ranked_dfs:
    mean_score = df_ranked_sentences.mean(axis=0)
    min_score = df_ranked_sentences.min(axis=0)
    tresh = mean_score*.90 + min_score*.1
    top_sent = list(df_ranked_sentences[df_ranked_sentences > tresh].dropna().index)
    bottom_sent = list(df_ranked_sentences[df_ranked_sentences < tresh].dropna().index)
    all_top_sents.append(top_sent)
    all_bot_sents.append(bottom_sent)
    print "{0} top sentences and {1} bottom_sentences".format(len(top_sent), len(bottom_sent))

132 top sentences and 58 bottom_sentences
146 top sentences and 57 bottom_sentences
205 top sentences and 85 bottom_sentences
124 top sentences and 54 bottom_sentences
115 top sentences and 55 bottom_sentences
125 top sentences and 59 bottom_sentences
189 top sentences and 89 bottom_sentences
107 top sentences and 49 bottom_sentences
135 top sentences and 61 bottom_sentences
117 top sentences and 61 bottom_sentences


## Compare with link information
* How many of the links are made entirely of top sentences?
* How many are made of top and bottom sentences? 
* How many of just bottom sentences?

Answering these questions will probably allow me to know if the saliency of these sentences given by lexrank is a good feature for finding links, or if it is just useful for finding the most important ones and that's it.

In [41]:
for top_sent, bottom_sent, article in zip(all_top_sents, all_bot_sents, all_articles):
    top_sent_set = { "s{0}".format(s) for s in top_sent }
    bot_sent_set = { "s{0}".format(s) for s in bottom_sent }

    both_top = 0
    one_top = 0
    both_bottom = 0
    other = 0
    link_dicts = article['links'].values()
    total = len(link_dicts)*1.0
    for l in link_dicts:
        s_art = l['art_sentence']
        s_com = l['com_sentence']
        if s_art in top_sent_set and s_com in top_sent_set:
            both_top += 1
            one_top += 1
        elif s_art in bot_sent_set and s_com in bot_sent_set:
            both_bottom += 1
        else:
            other += 1
            one_top += 1

    print "TOP: {0}, BOTTOM: {1}\nMIXED: {2}, AT LEAST ONE TOP: {3}\n\n"\
    .format(both_top/total,both_bottom/total,other/total, one_top/total)

TOP: 0.435714285714, BOTTOM: 0.1
MIXED: 0.464285714286, AT LEAST ONE TOP: 0.9


TOP: 0.5, BOTTOM: 0.0882352941176
MIXED: 0.411764705882, AT LEAST ONE TOP: 0.911764705882


TOP: 0.48743718593, BOTTOM: 0.100502512563
MIXED: 0.412060301508, AT LEAST ONE TOP: 0.899497487437


TOP: 0.489130434783, BOTTOM: 0.0869565217391
MIXED: 0.423913043478, AT LEAST ONE TOP: 0.913043478261


TOP: 0.556962025316, BOTTOM: 0.0886075949367
MIXED: 0.354430379747, AT LEAST ONE TOP: 0.911392405063


TOP: 0.0, BOTTOM: 0.0
MIXED: 1.0, AT LEAST ONE TOP: 1.0


TOP: 0.6, BOTTOM: 0.04
MIXED: 0.36, AT LEAST ONE TOP: 0.96


TOP: 0.538461538462, BOTTOM: 0.115384615385
MIXED: 0.346153846154, AT LEAST ONE TOP: 0.884615384615


TOP: 0.357142857143, BOTTOM: 0.214285714286
MIXED: 0.428571428571, AT LEAST ONE TOP: 0.785714285714


TOP: 0.5, BOTTOM: 0.0
MIXED: 0.5, AT LEAST ONE TOP: 1.0




### It can be concluded, that most of the times, links contain at least one top ranked sentence.
This means that this can be used to just classify pairs that contain at least one top ranked sentence. Top ranked sentences are the ones that are above a certain treshold which depends on the min value and the mean value. 

## So now I calculate the pairs

In [52]:
from itertools import product
all_pairs = []
for top_sent, bottom_sent in zip(all_top_sents, all_bot_sents):
    top_sent_set = { "s{0}".format(s) for s in top_sent }
    bot_sent_set = { "s{0}".format(s) for s in bottom_sent }
    
    pairs = list(product(top_sent_set, bot_sent_set))
    all_pairs.append(pairs)
    print len(pairs)

7656
8322
17425
6696
6325
7375
16821
5243
8235
7137


### I can further prune this list
Remove all pairs that have the same comment, or are both from the article.

In [55]:
all_pruned_pairs = []
for index, pairs in enumerate(all_pairs):
    art = all_articles[index]
    sents = art['sentences']
    pruned = [p for p in pairs 
              if sents[p[0]].get('comment', 'none') != 
                 sents[p[1]].get('comment', 'none')]
    all_pruned_pairs.append(pruned)
    print len(pruned)

7019
7632
16735
6399
6185
7164
15950
4960
7856
6808
