In [6]:
from twnews.utils.memoize import load

dataset = load('dataset')

In [8]:
from twnews.utils.text_processors import Lemmatizer

lemmatizer = Lemmatizer()
index = 0
for _news in dataset.news.get_documents():
    _news.index = index
    index += 1

for tweet in dataset.tweets.get_documents():
    tweet.words = lemmatizer.split_text_to_lemmas(tweet.text)
    tweet.index = index
    index += 1

In [84]:
import heapq
from collections import defaultdict
from datetime import timedelta
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

from twnews.utils.memoize import load
from twnews.utils.text_processors import Lemmatizer, extract_entities
from twnews.utils.extra import timeit


@timeit
def get_tweet_to_tweet_hashtags_relation(tweets, k):
    def num_of_common_hashtags(tweet1, tweet2):
        hashtags1 = set(tweet1.hastags.keys())
        hashtags2 = set(tweet2.hastags.keys())
        return len(hashtags1 & hashtags2)
        
    def get_all_hashtags(tweets):
        lemmatizer = Lemmatizer()
        hashtags = {}
        for tweet in tweets:
            tweet_hashtags = map(lemmatizer.lemmatize, tweet.hastags.keys())
            for hashtag in tweet_hashtags:
                hashtags[hashtag] = hashtags[hashtag] + 1 if hashtag in hashtags else 1
        
        for h, v in hashtags.items():#sorted(hashtags.items(), key=lambda x: x[1]):
            if v > 10:
                del hashtags[h]
            
        return set(hashtags.keys())

    hashtags = get_all_hashtags(tweets)
    print 'hashtags:', len(hashtags)

    hashtag_to_tweets = defaultdict(list)
    for tweet in tweets:
        for word in tweet.words:
            if word in hashtags:
                tweet.hastags[word] = None
                hashtag_to_tweets[word].append(tweet)

    result = set()
    for tweet in tweets:
        for hashtag in tweet.hastags.keys():
            linked_tweets = hashtag_to_tweets[hashtag]
            linked_tweets = filter(lambda x: num_of_common_hashtags(x, tweet) >=2, linked_tweets)
            
            top_k = get_top_k_by_time(tweet, linked_tweets, k)
            if top_k:
                for elem in top_k:
                    result.add((tweet.index, elem.index))
    print 'tweet_to_tweet_hashtags', len(result)
    return result


@timeit
def get_tweet_to_tweet_NER_relation(tweets, NE_set, k):
    NE_to_tweets = defaultdict(list)

    for tweet in tweets:
        tweet.named_entities = []
        for word in tweet.words:
            if word in NE_set:
                tweet.named_entities.append(word)
                NE_to_tweets[word].append(tweet)

    result = set()
    for tweet in tweets:
        for entity in tweet.named_entities:
            linked_tweets = NE_to_tweets[entity]
            top_k = get_top_k_by_time(tweet, linked_tweets, k)
            if top_k:
                for elem in top_k:
                    result.add((tweet.index, elem.index))
    print 'tweet_to_tweet_NER', len(result)
    return result


@timeit
def get_document_to_documet_time_relation(documents, k):
    similarity = get_similarity_matrix(documents, documents)

    result = set()
    for d1_index, d1 in enumerate(documents):
        related_documents = []
        for d2_index, d2 in enumerate(documents):
            if document_date_distanse(d1,d2) < timedelta(hours=24):
                related_documents.append((d2, similarity[d1_index][d2_index]))

        top_k = heapq.nlargest(k, related_documents, key=lambda x: x[1])
        if top_k:
            for elem, sim in top_k:
                result.add((d1.index, elem.index))
    print 'document_to_documet_time_relation', len(result)
    return result


def get_NE_from_news(news_documents):
    lemmatizer = Lemmatizer()
    result = set()
    errors = 0
    for i, news in enumerate(news_documents):
        try:
            ents = extract_entities(lemmatizer, news.summary)
        except Exception as e:
            errors += 1
            ents = set()
        result.update(ents)
    print 'NE:', len(result), 'errors:', errors
    return result


def get_similarity_matrix(documents_1, documents_2):
    corpus, tf_idf_matrix = load('tf_idf_corpus')

    def convert_to_compare_matrix(documents):
        dim = len(corpus)

        data, row_idxs, column_idxs = [], [], []
        for column_idx, document in enumerate(documents):
            rows, _, values = sparse.find(tf_idf_matrix[:,document.index])
            for i, value in enumerate(values):
                data.append(values[i])
                row_idxs.append(rows[i])
                column_idxs.append(column_idx)

        compare_matrix = sparse.csr_matrix((data, (row_idxs, column_idxs)), shape=(dim, len(documents)))
        return compare_matrix

    matrix_1 = convert_to_compare_matrix(documents_1)
    print 'matrix 1 builded'

    matrix_2 = convert_to_compare_matrix(documents_2)
    print 'matrix 2 builded'

    mat = cosine_similarity(matrix_1.T, matrix_2.T)
    print 'similarity matrix builded'
    return mat


def document_date_distanse(d1, d2):
    return max(d1.date, d2.date) - min(d1.date, d2.date)



from datetime import timedelta

def get_top_k_by_time(base_document, documents, k):
    documents = filter(lambda x: x.index != base_document.index, documents)
    documents = filter(lambda x: document_date_distanse(x, base_document) < timedelta(hours=12), documents)
    documents = sorted(documents, key=lambda x: document_date_distanse(x, base_document))
    return documents[:k]

In [94]:
def get_text_to_text_relation(news, tweets, k=3):
    """result is a list of index pairs in array news+tweets"""
    # tweets = tweets[:500]
    # news = news[:500]
    
    #tweet_to_tweet_hashtags = get_tweet_to_tweet_hashtags_relation(tweets, k)
    NE_set = get_NE_from_news(news)
    tweet_to_tweet_NER = get_tweet_to_tweet_NER_relation(tweets, NE_set, k)
#     tweet_to_tweet_time = get_document_to_documet_time_relation(tweets, k)
#     news_to_news_time = get_document_to_documet_time_relation(news, k)

    #total_relations = tweet_to_tweet_hashtags | tweet_to_tweet_NER | tweet_to_tweet_time | news_to_news_time
    total_relations = tweet_to_tweet_NER
    print total_relations
    return filter(lambda x: x[0] != x[1], total_relations)

text_to_text_links = get_text_to_text_relation(dataset.news.get_documents()[:100], dataset.tweets.get_documents())

NE: 186 errors: 9
tweet_to_tweet_NER 2341
set([(14320, 14896), (15222, 14189), (14238, 14414), (14275, 13812), (14469, 15245), (14445, 13783), (15187, 15054), (14490, 14759), (14455, 14233), (15262, 15051), (14429, 15133), (14659, 14445), (15012, 15023), (14896, 13732), (13930, 14867), (14308, 15031), (14269, 13952), (14695, 15307), (15158, 14824), (13895, 13902), (14031, 14261), (14673, 14012), (14895, 14556), (14530, 15254), (15113, 15186), (14744, 15192), (14031, 14704), (14990, 13864), (13888, 14679), (15153, 14045), (13776, 14467), (15222, 14490), (14623, 15031), (14467, 13776), (14892, 13879), (15307, 15245), (14279, 13781), (14986, 14913), (14910, 14049), (14508, 13938), (14374, 15031), (13851, 14859), (13937, 14896), (14146, 14675), (15031, 14623), (14680, 14405), (14522, 14245), (14073, 15067), (14182, 14646), (14791, 14909), (14586, 14892), (14978, 13863), (14723, 14400), (14331, 13807), (14774, 13838), (14407, 14277), (14056, 13970), (14530, 14370), (14741, 14439), (14320, 1

In [95]:
documents = dataset.get_documents()
for x,y in text_to_text_links[:10]:
    print documents[x].date, documents[x]
    print ' '.join(documents[x].hastags.keys())
    print documents[y].date, documents[y]
    print ' '.join(documents[y].hastags.keys())
    print '--------------'
    

2016-04-12 10:48:14+00:00 Боливия отменяет визы для туристов из РФ -  
рф
2016-04-12 13:05:56+00:00 Мораторий Украины на выплату $3 млрд долга перед РФ стал бессрочным: Верховная Рада голосами...  #политика
политика
--------------
2016-04-06 08:29:26+00:00 #Киров #news #Вятка Россия ответила на угрозы США наложить вето на поставки оружия Ирану #вмире
Вятка вмире вмиро сша вятка news Киров киров
2016-04-06 07:12:42+00:00 #Футбол Мутко не исключил введения видеоповторов в чемпионате России 
россия Футбол футбол
--------------
2016-04-13 18:27:48+00:00 #runews Эрнест: Белый дом озабочен «опасным сближением» Су-24 с эсминцем США: Вашингтон озабочен сообщениями об "опасном сбли...  #yanews
runews дом сша yanews
2016-04-13 19:45:50+00:00 В Белом доме обеспокоены пролетом Су-24 над эсминцем США: Власти США заявили, что инцидент с российскими самол... 
власть дом сша
--------------
2016-04-15 14:18:08+00:00 Порошенко: рост контрактников в армии позволит отсрочить мобилизацию
армия
2016-04-15 1