## Import packages

In [1]:
import sys

In [2]:
# Make the project's helper packages importable from this notebook.
# The paths are relative, so they resolve against the kernel's working
# directory (the folder the notebook is started from).
# The membership guard keeps the cell idempotent: re-running it does not
# pile duplicate entries onto sys.path.
for module_dir in ('../data_helpers/', '../preprocess', '../cluster/'):
    if module_dir not in sys.path:
        sys.path.append(module_dir)

In [3]:
from twitter_data_helper import TwitterDataHelper
from text_cleaner import TextCleaner
from lda_cluster import LDACluster

Using TensorFlow backend.


## Get data

In [16]:
# Days (YYYY-MM-DD) whose tweets are requested from the data helper.
tweet_dates = [
    '2017-07-05', '2017-07-06', '2017-07-07',
    '2017-07-08', '2017-07-09', '2017-07-10',
]

# Fetch the stored tweets for those days; the result is kept in `df`
# and used by every later step of the notebook.
data_helper = TwitterDataHelper()
df = data_helper.get_data(tweet_dates)

In [17]:
# Preview only the first rows instead of rendering the whole frame;
# the full row count is checked separately with len().
df.head(10)

Unnamed: 0,source,created_at,author,text,raw_data
0,twitter,2017-07-07 21:17:08,jakewarrentx,Wow. This is the second time in two weeks they...,"Status(in_reply_to_screen_name=None, place=Non..."
1,twitter,2017-07-10 16:08:09,stanfordnlp,This is a lovely use of our new CoreNLP server...,"Status(in_reply_to_screen_name=None, place=Non..."
2,twitter,2017-07-06 20:58:52,stanfordnlp,SNLI—https://t.co/Ybc3Z2RXLM—good for general ...,"Status(in_reply_to_screen_name=None, place=Non..."
3,twitter,2017-07-06 20:50:16,stanfordnlp,Our news summary: #NLProc DyNet sneaking into ...,"Status(in_reply_to_screen_name=None, place=Non..."
4,twitter,2017-07-05 15:41:02,ncsc,Did you hear Chris Ensor on #R4Today discussin...,"Status(in_reply_to_screen_name=None, place=Non..."
5,twitter,2017-07-05 14:26:03,ncsc,"Our shiny, new Chrome OS guidance. Tap into ou...","Status(in_reply_to_screen_name=None, place=Non..."
6,twitter,2017-07-05 14:00:46,ncsc,Meet one girl who impressed our team so much s...,"Status(in_reply_to_screen_name=None, place=Non..."
7,twitter,2017-07-05 12:30:38,ncsc,How's everyone getting on? If you've cracked o...,"Status(in_reply_to_screen_name=None, place=Non..."
8,twitter,2017-07-05 11:07:02,ncsc,See how implementing our #passwordguidance hel...,"Status(in_reply_to_screen_name=None, place=Non..."
9,twitter,2017-07-05 10:01:26,ncsc,"John Humphrys called it ‘baffling’, but can yo...","Status(in_reply_to_screen_name=None, place=Non..."


## Get text from data

In [18]:
# Keep only the 'text' column of the loaded data; this is the input that
# gets passed to the text cleaner.
texts = df['text']

In [19]:
# Sanity check: how many texts were loaded for the requested dates.
len(texts)

878

## Clean text

In [20]:
# Build the project's TextCleaner with sentiment-word filtering enabled.
# According to its log output, construction loads a SpaCy corpus, a stopword
# list and a sentiment word list.
# NOTE(review): presumably filter_sentiment_words=True drops sentiment words
# from the cleaned output — confirm in text_cleaner.
text_cleaner = TextCleaner(filter_sentiment_words=True)

* [TextCleaner] Initializing...
* [TextCleaner] Loading SpaCy "en_core_web_md" corpus...
* [TextCleaner] Loading stopwords...
* [TextCleaner] Loading sentinent words...
--------------------------------------------------------------------------------------------------------------------


In [21]:
# Run the cleaner over every text; `docs` is what the LDA model is fitted on.
docs = text_cleaner.clean(texts)

0it [00:00, ?it/s]

* [TextCleaner] Cleaning text...


878it [00:00, 1337.14it/s]


## Let's cluster text

### First, train the LDA model. Once the model is trained, the feature vectors and cluster labels of the training data are stored on the cluster object.

In [26]:
# Number of topics the LDA model is asked to learn.
NUM_TOPICS = 100

# NOTE(review): no random seed is passed here, so topics may differ between
# runs — check whether LDACluster exposes a seed/random_state option.
cluster = LDACluster(num_topics=NUM_TOPICS)

In [27]:
# Train the LDA model on the cleaned documents.
# fit() hands back the cluster object itself, so without the trailing
# semicolon the cell echoes a repr containing a memory address that changes
# on every run; the semicolon suppresses that noisy output.
cluster.fit(docs);

* [LDA] Training model...


<lda_cluster.LDACluster at 0x7f5e9d657278>

## Get feature vectors and cluster labels

In [28]:
# Store one feature vector per row. Materialising the vectors as a plain
# Python list makes each vector a single cell value of the new column.
# Assumes cluster.feature_vectors has one entry per row of df, in the same
# order — TODO confirm against LDACluster.
df['feature_vector'] = list(cluster.feature_vectors)

In [29]:
# Attach the cluster label exposed by the fitted LDACluster to each row.
# Assumes cluster.labels is aligned with the rows of df (same length and
# order as the texts that were cleaned and fitted) — TODO confirm.
df['cluster'] = cluster.labels

## Observe topics

In [30]:
# Dump every learned topic: a "Topic <n>" header, then the model's own
# one-line rendering of the topic, then a blank separator line.
for topic_id in range(cluster.model.num_topics):
    header = 'Topic {}'.format(topic_id)
    print(header)
    print(cluster.model.print_topic(topic_id), end='\n\n')

Topic 0
0.063*"neural" + 0.063*"machine" + 0.063*"translation" + 0.063*"machine_translation" + 0.063*"single" + 0.063*"queue" + 0.063*"decoding_neural" + 0.063*"single_queue" + 0.063*"decoding" + 0.013*"content"

Topic 1
0.032*"author" + 0.030*"make" + 0.015*"amp" + 0.015*"learn" + 0.015*"datum" + 0.015*"really" + 0.015*"update" + 0.015*"could" + 0.015*"brain" + 0.015*"today"

Topic 2
0.040*"networks" + 0.040*"test" + 0.040*"slide" + 0.021*"amp" + 0.021*"neural" + 0.020*"ransomware" + 0.020*"end" + 0.020*"public" + 0.020*"small" + 0.020*"study"

Topic 3
0.026*"get" + 0.014*"new" + 0.013*"security" + 0.013*"make" + 0.013*"word" + 0.013*"approach" + 0.013*"show" + 0.013*"really" + 0.013*"detail" + 0.013*"find"

Topic 4
0.035*"symantec" + 0.035*"pxe" + 0.019*"use" + 0.018*"amp" + 0.018*"learning" + 0.018*"know" + 0.018*"petya" + 0.018*"still" + 0.018*"enable" + 0.018*"variant"

Topic 5
0.050*"sequence" + 0.046*"author" + 0.037*"lesson" + 0.025*"language" + 0.025*"copy" + 0.025*"models" + 

## Show frequent topic words

In [31]:
import operator
from collections import defaultdict

# For every word, count in how many topics it shows up among the words
# listed by print_topic(). Words shared by many topics float to the top.
word_freq = defaultdict(int)
for topic_id in range(cluster.model.num_topics):
    # print_topic() renders a topic as:
    #   0.063*"neural" + 0.063*"machine" + ...
    for term in cluster.model.print_topic(topic_id).split(' + '):
        # Split on the first '*' only, so a token that itself contains '*'
        # cannot break the two-way unpacking. The weight is not needed here.
        _weight, quoted_word = term.split('*', 1)
        word_freq[quoted_word.replace('"', '')] += 1

# Most widely shared words first; ties keep first-seen order (stable sort).
sorted(word_freq.items(), key=operator.itemgetter(1), reverse=True)

[('author', 41),
 ('use', 28),
 ('amp', 21),
 ('new', 20),
 ('ransomware', 14),
 ('learn', 14),
 ('make', 13),
 ('malware', 13),
 ('neural', 12),
 ('security', 12),
 ('datum', 12),
 ('learning', 11),
 ('code', 11),
 ('get', 11),
 ('look', 11),
 ('people', 10),
 ('year', 10),
 ('time', 9),
 ('ai', 9),
 ('base', 9),
 ('day', 8),
 ('deep', 8),
 ('write', 8),
 ('next', 7),
 ('know', 7),
 ('networks', 7),
 ('network', 7),
 ('language', 7),
 ('software', 7),
 ('tool', 7),
 ('open', 7),
 ('attention', 6),
 ('text', 6),
 ('generation', 6),
 ('machine', 6),
 ('thank', 6),
 ('really', 6),
 ('target', 5),
 ('approach', 5),
 ('update', 5),
 ('word', 5),
 ('detail', 5),
 ('service', 5),
 ('show', 5),
 ('paper', 5),
 ('research', 5),
 ('impact', 5),
 ('file', 5),
 ('another', 4),
 ('mobile', 4),
 ('deepmind', 4),
 ('translation', 4),
 ('first', 4),
 ('wannacry', 4),
 ('go', 4),
 ('find', 4),
 ('also', 4),
 ('week', 4),
 ('report', 4),
 ('cyber', 4),
 ('memory', 3),
 ('world', 3),
 ('copy', 3),
 ('sl