In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
%%time
from depechesAFP import *
from scipy import sparse
import numpy as np

CPU times: user 260 ms, sys: 12 ms, total: 272 ms
Wall time: 271 ms


In [4]:
def vocabulary(filename="/home/bmazoyer/Documents/TwitterSea/vocabulary.json"):
    """Load the vocabulary stored as a JSON document on disk.

    :param filename: path of the UTF-8 encoded JSON file to read. Defaults to
        the location that was originally hard-coded in this notebook.
    :return: the object decoded by ``json.load`` from that file.
    """
    # Read-only mode is sufficient ("r+" needlessly required write access),
    # and the context manager closes the file even when decoding fails.
    with open(filename, "r", encoding='utf-8') as stream:
        return json.load(stream)

In [5]:
def tfIdfMatrix(words, events):
    """Compute a tf-idf weighted matrix for the tweets attached to each event.

    :param words: "text" or "hashtags" --> weight every word or only hashtags.
        Any other value raises ``KeyError``.
    :param events: a list of ids. Ex: ['afp.com-20160922T065447Z-TX-PAR-HKZ50']
    :return: ``(voc, tf_idf)`` -- the vocabulary produced by the selected
        count-matrix builder, and the tf-idf transform of its count matrix.
    """
    # Dispatch on the requested kind of token; both builders return (voc, counts).
    builders = {'hashtags': hashtagsMatrix, 'text': textMatrix}
    build_counts = builders[words]
    voc, counts = build_counts(events)

    tf_idf = feature_extraction.text.TfidfTransformer().fit_transform(counts)
    return voc, tf_idf

In [6]:
%%time
def hashtagsMatrix(events):
    """Build a scipy.sparse count matrix of the hashtags referring to each event.

    :param events: a list of ids. Ex: ['afp.com-20160922T065447Z-TX-PAR-HKZ50']
    :return: ``(voc, X)`` where ``voc`` has the form ``{"word": index_in_matrix}``
        and ``X`` is a ``scipy.sparse.lil_matrix`` of shape
        ``(len(events), len(voc))`` whose entry ``[i, j]`` is the ``doc_count``
        reported for hashtag ``j`` on event ``i``.
    """
    # One {hashtag: count} dict per event, e.g.
    # [{"Sarkozy":2, "Buisson":3}, {"Hollande":4, "Sarkozy":1, "HollandeDehors":5}]
    hashtags = [
        {bucket['key']: bucket['doc_count'] for bucket in getFields(event, "hashtags")}
        for event in events
    ]

    # Columns are numbered in order of first appearance, e.g.
    # {"Sarkozy":0, "Buisson":1, "Hollande":2, "HollandeDehors":3}
    voc = {}
    for doc in hashtags:
        for tag in doc:
            voc.setdefault(tag, len(voc))

    X = sparse.lil_matrix((len(hashtags), len(voc)), dtype=int)
    for i, doc in enumerate(hashtags):
        # The matrix starts at zero and a dict holds each hashtag once, so a
        # plain scalar assignment is equivalent to accumulating. It also makes
        # an event without hashtags a no-op instead of an empty fancy-index
        # in-place addition.
        for tag, count in doc.items():
            X[i, voc[tag]] = count

    return voc, X
    

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 4.53 µs


In [7]:
def textMatrix(events, ngram_range=(1, 1)):
    """Build a scipy.sparse count matrix out of the words referring to each event.

    Every tweet returned by ``getText`` is vectorized as binary term presence,
    multiplied by its ``doc_count`` (total frequency of words including
    retweets), and the tweet rows are then summed per event.

    :param events: a list of ids. Ex: ['afp.com-20160922T065447Z-TX-PAR-HKZ50']
    :param ngram_range: ``(min_n, max_n)`` forwarded to ``CountVectorizer``.
        Defaults to ``(1, 1)`` (single words), which is what this function has
        always used; pass e.g. ``(1, 3)`` for 1 to 3-grams.
    :return: ``(feature_names, M)`` where ``feature_names`` is the list of
        terms (list position = column index) and ``M`` is a sparse matrix with
        one row per event and one column per term.
    """
    texts = [getText(event) for event in events]
    # Number of tweets attached to each event, in event order.
    events_size = np.asarray([len(tweets) for tweets in texts], dtype=int)
    all_tweets = [tweet for tweets in texts for tweet in tweets]

    # Strip URLs and @mentions before tokenizing.
    collection = (
        re.sub(
            r'https?\S+|@\S+', '', tweet['first-hit']['hits']['hits'][0]['_source']['text']
        ) for tweet in all_tweets
    )
    doc_counts = sparse.diags([tweet['doc_count'] for tweet in all_tweets], dtype=int)
    vectorizer = feature_extraction.text.CountVectorizer(
        stop_words = stopwords,
        min_df = 2,
        ngram_range = ngram_range,
        binary = True
    )
    # doc_counts * CountVectorizer = total frequency of words including retweets
    X = doc_counts.dot(vectorizer.fit_transform(collection))

    # Build the (events x tweets) membership matrix: S[e, t] == 1 when tweet t
    # belongs to event e, so that S.dot(X) sums the tweet rows of each event.
    n_tweets = X.shape[0]
    row = np.repeat(np.arange(len(events)), events_size)
    col = np.arange(n_tweets)
    dat = np.ones(n_tweets, dtype=int)
    S = sparse.csr_matrix((dat, (row, col)), shape=(len(events), n_tweets))

    # get_feature_names() was removed from recent scikit-learn releases in
    # favour of get_feature_names_out(); support both and keep returning a list.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()

    return feature_names, S.dot(X)

In [133]:
# Select the events to analyse.
# NOTE(review): presumably "20161002" is a date (YYYYMMDD) and 0 a minimum
# threshold -- confirm against the definition of filterEvents.
events = filterEvents("20161002", 0)

In [134]:
# Build the per-event term matrix: `vocab` is the list of feature names in
# column order, `X` is the sparse matrix with one row per event.
vocab, X = textMatrix(events)

In [135]:
# Sanity check: (number of events, number of terms kept by the vectorizer).
print(X.shape)

(115, 4447)


In [136]:
from __future__ import division, print_function

import numpy as np
import lda

# Fit an LDA model on the event/term matrix, with as many topics as X has rows
# (i.e. one topic per event); random_state is fixed so reruns are repeatable.
# NOTE(review): n_iter=15 is a small number of iterations -- confirm it is
# enough for the topics to stabilise.
model = lda.LDA(n_topics=X.shape[0], n_iter=15, random_state=1)
model.fit(X)

<lda.lda.LDA at 0x7f41eb8b0470>

In [137]:
# Keep handles on the fitted distributions used by the cells below.
# NOTE(review): presumably topic_word is (n_topics, n_terms) and doc_topic is
# (n_documents, n_topics) -- confirm against the lda package documentation.
topic_word = model.topic_word_
doc_topic = model.doc_topic_

In [138]:
# Number of highest-weighted words to display for each topic.
n = 5
# Convert the vocabulary once instead of rebuilding the array for every topic.
vocab_array = np.array(vocab)
for i, topic_dist in enumerate(topic_word):
    # argsort is ascending, so the reversed slice walks the n largest weights,
    # biggest first.
    top_indices = np.argsort(topic_dist)[:-(n+1):-1]
    topic_words = vocab_array[top_indices]
    print('*Topic {}\n- {}'.format(i, ' '.join(topic_words)))

*Topic 0
- fin mars londres brexit procédure
*Topic 1
- hongrie anti défenseurs europe dit
*Topic 2
- 30 procès temps droit via
*Topic 3
- santos soutien mettre suite salue
*Topic 4
- chasser gt maison selon oublié
*Topic 5
- rien in hui novembre bataclan
*Topic 6
- hongrie orban réfugiés référendum pays
*Topic 7
- beaucoup hongrie après mobilisation bien
*Topic 8
- refuse cc initiez 50 vendre
*Topic 9
- valls belfort alstom site manuel
*Topic 10
- matthew haïti jamaïque ouragan cuba
*Topic 11
- père hamel béatification pape procédure
*Topic 12
- devenu toussa 300km vents pense
*Topic 13
- neville marriner sir mort chef
*Topic 14
- polémique bosnie srebrenica périodes clés
*Topic 15
- jordanie accord gaz entre importation
*Topic 16
- orban hongrie référendum viktor migrants
*Topic 17
- autophagie japonais ohsumi yoshinori recherches
*Topic 18
- accorde syrien amp régime 80
*Topic 19
- cour matin cassation affaire contrôle
*Topic 20
- française contre morts après rassemblement
*Topic 21

In [158]:
# For every event, print its most probable topic, the event text, the top-n
# words of that topic (n comes from the cell that sets the top-word count)
# and the event's tags.
vocab_array = np.array(vocab)  # converted once, not once per event
for i, event in enumerate(events):
    topic_most_pr = doc_topic[i].argmax()
    # Ascending argsort + reversed slice = the n heaviest words, biggest first.
    topic_words = vocab_array[np.argsort(topic_word[topic_most_pr])[:-(n+1):-1]]
    print("doc: {} topic: {}\n{}\n{}".format(i,
                                            topic_most_pr,
                                            getEventsDetails(event)["events.text"][0],
                                            ' '.join(topic_words)))
    # Loop variable is `tag`, not `n`, so the comprehension never shadows the
    # global top-word count used in the slice above.
    print([tag['key'] for tag in getFields(event, "tags")])
    print("")

doc: 0 topic: 78
Hongrie urnes referendum anti refugies Premier ministre Orban
hongrie référendum orban viktor premier échec sommes fiers quotas
['Hongrie Orban', 'Hongrie urnes referendum anti refugies Premier ministre Orban', 'Hongrie revers electoral referendum Orban invalide abstention', 'Hongrie urnes referendum antirefugies Premier ministre Orban', 'Hongrie revers referendum antimigrants Orban invalide abstention']

doc: 1 topic: 78
Hongrie urnes referendum anti refugies Premier ministre Orban
hongrie référendum orban viktor premier échec sommes fiers quotas
['Hongrie Orban', 'Hongrie urnes referendum anti refugies Premier ministre Orban', 'Hongrie revers electoral referendum Orban invalide abstention', 'Hongrie urnes referendum antirefugies Premier ministre Orban', 'Hongrie revers referendum antimigrants Orban invalide abstention']

doc: 2 topic: 78
Hongrie urnes referendum anti refugies Premier ministre Orban
hongrie référendum orban viktor premier échec sommes fiers quotas
['Hongrie Orban', 'Hongrie urnes referendum anti refugies Premier ministre Orban', 'Hongrie revers electoral referendum Orban invalide abstention', 'Hongrie urnes referendum antirefugies Premier ministre Orban', 'Hongrie revers referendum antimigrants Orban invalide abstention']
