In [1]:
# -*- coding: utf-8 -*-
%matplotlib inline


import matplotlib
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from twnews.utils.memoize import load
from twnews.utils.text_processors import Lemmatizer
from nltk.tokenize import word_tokenize
from collections import defaultdict

dataset = load('dataset')
stopwords = stopwords.words('russian') + stopwords.words('english')
lemmatizer = Lemmatizer()

tweets = dataset.tweets.get_dataset_texts()
news = dataset.news.get_dataset_texts()

index = 0
for _news in news:
    _news.index = index
    index += 1

for tweet in tweets:
    tweet.words = lemmatizer.split_text_to_lemmas(tweet.text)
    tweet.index = index
    index += 1

print len(tweets), len(news)

4324 13711


In [2]:
def get_all_hashtags(tweets):
    """Return the set of lemmatized hashtag keys found on the given tweets.

    Each tweet is expected to expose a `hastags` mapping; its keys are passed
    through the module-level `lemmatizer` before being collected.
    """
    return {
        lemmatizer.lemmatize(tag)
        for tweet in tweets
        for tag in tweet.hastags.keys()
    }

def document_date_distanse(d1, d2):
    """Return the non-negative difference between the `date` attributes of two documents."""
    later, earlier = d1.date, d2.date
    if later < earlier:
        # Swap so the subtraction always goes "later minus earlier".
        later, earlier = earlier, later
    return later - earlier

def get_top_k_by_time(base_document, documents, k):
    """Return the `k` documents closest in time to `base_document`.

    Documents sharing `base_document.index` are excluded, so the base
    document never appears in its own neighbour list.
    """
    others = [doc for doc in documents if doc.index != base_document.index]
    others.sort(key=lambda doc: document_date_distanse(doc, base_document))
    return others[:k]

def get_tweet_to_tweet_hashtags_relation(tweets, k):
    hashtags = get_all_hashtags(tweets)
    print 'hashtags:', len(hashtags)

    hashtag_to_tweets = defaultdict(list)      
    for tweet in tweets:
        for word in tweet.words:
            if word in hashtags:
                tweet.hastags[word] = None
                hashtag_to_tweets[word].append(tweet)
    
    result = set()
    for tweet in tweets:
        for hashtag in tweet.hastags.keys():
            linked_tweets = hashtag_to_tweets[hashtag]
            top_k = get_top_k_by_time(tweet, linked_tweets, k)
            if top_k:
                for elem in top_k:
                    result.add((tweet.index, elem.index))
    print 'tweet_to_tweet_hashtags', len(result)
    return result

def get_tweet_to_tweet_NER_relation(tweets, NE_set, k):
    NE_to_tweets = defaultdict(list)
    
    for tweet in tweets:
        tweet.named_entities = []
        for word in tweet.words:
            if word in NE_set:
                tweet.named_entities.append(word)
                NE_to_tweets[word].append(tweet)
    
    result = set()
    for tweet in tweets:
        for entity in tweet.named_entities:
            linked_tweets = NE_to_tweets[entity]
            top_k = get_top_k_by_time(tweet, linked_tweets, k)
            if top_k:
                for elem in top_k:
                    result.add((tweet.index, elem.index))
    print 'tweet_to_tweet_NER', len(result)
    return result

In [3]:
from polyglot.text import Text

def extract_entities(text):
    """Return the set of lemmatized words that belong to entities found in `text`.

    The text is wrapped in a polyglot `Text` whose language is set to 'ru';
    every element of every item in its `entities` is lemmatized with the
    module-level `lemmatizer`.
    """
    parsed = Text(text)
    # presumably pins the language to Russian instead of relying on detection — confirm
    parsed.language = 'ru'

    return {
        lemmatizer.lemmatize(token)
        for chunk in parsed.entities
        for token in chunk
    }

def get_NE_from_news(news_documents):
    result = set()
    errors = 0
    for i, news in enumerate(news_documents):
        try:
            ents = extract_entities(news.summary)
        except Exception as e:
            errors += 1
            ents = set()
        result.update(ents)
    print 'NE:', len(result), 'errors:', errors
    return result

In [4]:
corpus, tf_idf_matrix = load('tf_idf_corpus')

def get_similarity_matrix(documents_1, documents_2):
    from sklearn.metrics.pairwise import cosine_similarity
    from scipy import sparse
    def convert_to_compare_matrix(documents):
        dim = len(corpus)

        data, row_idxs, column_idxs = [], [], []
        for column_idx, document in enumerate(documents):
            rows, _, values = sparse.find(tf_idf_matrix[:document.index])
            for i, value in enumerate(values):
                data.append(values[i])
                row_idxs.append(rows[i])
                column_idxs.append(column_idx)

        compare_matrix = sparse.csr_matrix((data, (row_idxs, column_idxs)), shape=(dim, len(documents)))
        return compare_matrix

    matrix_1 = convert_to_compare_matrix(documents_1)
    print 'matrix 1 builded'
    
    matrix_2 = convert_to_compare_matrix(documents_2)
    print 'matrix 2 builded'

    mat = cosine_similarity(matrix_1.T, matrix_2.T)
    print 'similarity matrix builded'
    return mat

In [8]:
from datetime import timedelta
import heapq

def get_document_to_documet_time_relation(documents, k):
    similarity = get_similarity_matrix(documents, documents)
    result = set()
    for d1_index, d1 in enumerate(documents):
        related_documents = []
        for d2_index, d2 in enumerate(documents):
            if document_date_distanse(d1,d2) < timedelta(hours=24):
                #print d1_index, d2_index
                #print similarity[d1_index][d2_index]
                related_documents.append((d2, similarity[d1_index][d2_index]))
        
        top_k = heapq.nlargest(k, related_documents, key=lambda x: x[1])        
        if top_k:
            for elem, sim in top_k:
                result.add((d1.index, elem.index))
    print 'document_to_documet_time_relation', len(result)
    return result
    

def get_text_to_text_relation(news, tweets, k=3, limit=100):
    """Build the text-to-text relations between documents.

    Combines four relation sets: tweets sharing a hashtag, tweets sharing a
    named entity taken from the news summaries, and time-bounded similarity
    links among tweets and among news.

    Args:
        news: news documents.
        tweets: tweet documents.
        k: maximum number of neighbours each individual relation builder takes.
        limit: only the first `limit` news and the first `limit` tweets are
            processed; pass None to process everything.

    Returns:
        A list of index pairs in array news+tweets, with self-pairs removed.
    """
    if limit is not None:
        tweets = tweets[:limit]
        news = news[:limit]

    tweet_to_tweet_hashtags = get_tweet_to_tweet_hashtags_relation(tweets, k)
    NE_set = get_NE_from_news(news)
    tweet_to_tweet_NER = get_tweet_to_tweet_NER_relation(tweets, NE_set, k)
    tweet_to_tweet_time = get_document_to_documet_time_relation(tweets, k)
    news_to_news_time = get_document_to_documet_time_relation(news, k)

    total_relations = tweet_to_tweet_hashtags | tweet_to_tweet_NER | tweet_to_tweet_time | news_to_news_time

    # A list comprehension returns a list on every Python version, as the docstring promises.
    return [pair for pair in total_relations if pair[0] != pair[1]]

res = get_text_to_text_relation(news, tweets)

hashtags: 21
tweet_to_tweet_hashtags 95
NE: 186 errors: 9
tweet_to_tweet_NER 128
matrix 1 builded
matrix 2 builded
similarity matrix builded
document_to_documet_time_relation 300
matrix 1 builded
matrix 2 builded
similarity matrix builded
document_to_documet_time_relation 300


In [7]:
print res


set([(13738, 13739), (13771, 13771), (13766, 13766), (13751, 13750), (13804, 13804), (13722, 13724), (75, 73), (13783, 13801), (13736, 13737), (13806, 13807), (13792, 13791), (13770, 13771), (13791, 13792), (79, 78), (13786, 13785), (13787, 13789), (13747, 13746), (38, 40), (13748, 13749), (13775, 13771), (59, 60), (13758, 13758), (55, 54), (13789, 13746), (3, 6), (46, 46), (13726, 13725), (73, 73), (57, 59), (84, 84), (13791, 13763), (55, 56), (13738, 13756), (59, 61), (13809, 13746), (13768, 13769), (13789, 13789), (13750, 13750), (13776, 13801), (13788, 13788), (9, 9), (13722, 13739), (13746, 13759), (34, 35), (13775, 13776), (13760, 13762), (15, 13), (13803, 13732), (13, 13), (13753, 13753), (80, 79), (4, 12), (0, 4), (13729, 13728), (91, 91), (1, 33), (13772, 13774), (86, 84), (58, 59), (13724, 13726), (20, 21), (13798, 13737), (98, 97), (13802, 13804), (41, 43), (13738, 13761), (18, 18), (13716, 13714), (13745, 13745), (10, 9), (13782, 13763), (77, 76), (13756, 13755), (13714, 13