In [3]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [104]:
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import LdaModel
from gensim.models import Phrases
from itertools import chain
from gensim.corpora import Dictionary

In [137]:
train = pd.read_csv('../data/external/kaggle/train.csv')
test = pd.read_csv('../data/external/kaggle/test.csv')
tokenizer = RegexpTokenizer(r'\w+')

In [100]:
def preview_corpus(corpus):
    for i,doc in enumerate(corpus[:5],1):
        print(f'Document {i}: {doc}')
        print()

In [112]:
corpus = train['text'].values
corpus = [doc.lower() for doc in corpus]
corpus = [tokenizer.tokenize(doc) for doc in corpus]
corpus = [[token for token in doc if (not token.isnumeric() and len(token) > 1)] for doc in corpus]
preview_corpus(corpus)

Document 1: ['our', 'deeds', 'are', 'the', 'reason', 'of', 'this', 'earthquake', 'may', 'allah', 'forgive', 'us', 'all']

Document 2: ['forest', 'fire', 'near', 'la', 'ronge', 'sask', 'canada']

Document 3: ['all', 'residents', 'asked', 'to', 'shelter', 'in', 'place', 'are', 'being', 'notified', 'by', 'officers', 'no', 'other', 'evacuation', 'or', 'shelter', 'in', 'place', 'orders', 'are', 'expected']

Document 4: ['people', 'receive', 'wildfires', 'evacuation', 'orders', 'in', 'california']

Document 5: ['just', 'got', 'sent', 'this', 'photo', 'from', 'ruby', 'alaska', 'as', 'smoke', 'from', 'wildfires', 'pours', 'into', 'school']



In [113]:
%%time

bigram = Phrases(corpus, min_count=20)
trigram = Phrases(bigram[corpus], min_count=10)
fourgram = Phrases(trigram[corpus], min_count=10)
for doc in corpus:
    bigrams = [b for b in bigram[doc] if b.count('_') == 1]
    trigrams = [t for t in trigram[bigram[doc]] if t.count('_') == 2]
    fourgrams = [f for f in fourgram[trigram[bigram[doc]]] if f.count('_') == 3]
    doc.extend(list(chain(*[bigrams, trigrams, fourgrams])))
preview_corpus(corpus)

Document 1: ['our', 'deeds', 'are', 'the', 'reason', 'of', 'this', 'earthquake', 'may', 'allah', 'forgive', 'us', 'all']

Document 2: ['forest', 'fire', 'near', 'la', 'ronge', 'sask', 'canada']

Document 3: ['all', 'residents', 'asked', 'to', 'shelter', 'in', 'place', 'are', 'being', 'notified', 'by', 'officers', 'no', 'other', 'evacuation', 'or', 'shelter', 'in', 'place', 'orders', 'are', 'expected']

Document 4: ['people', 'receive', 'wildfires', 'evacuation', 'orders', 'in', 'california']

Document 5: ['just', 'got', 'sent', 'this', 'photo', 'from', 'ruby', 'alaska', 'as', 'smoke', 'from', 'wildfires', 'pours', 'into', 'school']

CPU times: user 4.34 s, sys: 2.96 ms, total: 4.35 s
Wall time: 4.35 s


In [114]:
dictionary = Dictionary(corpus)
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [115]:
corpus = [dictionary.doc2bow(doc) for doc in corpus]

In [116]:
preview_corpus(corpus)

Document 1: [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]

Document 2: [(10, 1), (11, 1), (12, 1), (13, 1)]

Document 3: [(0, 1), (1, 2), (14, 1), (15, 1), (16, 1), (17, 2), (18, 1), (19, 1), (20, 1), (21, 2), (22, 1)]

Document 4: [(16, 1), (17, 1), (23, 1), (24, 1)]

Document 5: [(8, 1), (25, 1), (26, 2), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1)]



In [108]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 3677
Number of documents: 7613


In [119]:
%%time
num_topics = 5
chunksize = 1000
passes = 1
iterations = 400
eval_every = None

temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

CPU times: user 2.81 s, sys: 0 ns, total: 2.81 s
Wall time: 2.81 s


In [120]:
top_topics = model.top_topics(corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

Average topic coherence: -4.5080.
[([(0.053781196, 'the'),
   (0.039594475, 'you'),
   (0.033763558, 'to'),
   (0.0305492, 'and'),
   (0.02576075, 'that'),
   (0.023221346, 'my'),
   (0.02148747, 'of'),
   (0.01904881, 'it'),
   (0.0189762, 'be'),
   (0.01846861, 'have'),
   (0.018052092, 'is'),
   (0.01219985, 'if'),
   (0.011694165, 'me'),
   (0.011619976, 'but'),
   (0.011431983, 'will'),
   (0.0114151845, 'so'),
   (0.011187858, 'with'),
   (0.010726236, 'for'),
   (0.010488652, 'not'),
   (0.0103229275, 'your')],
  -2.4843459018751357),
 ([(0.05674868, 'the'),
   (0.03135121, 'in'),
   (0.03087665, 'to'),
   (0.023665862, 'of'),
   (0.022444809, 'and'),
   (0.017965, 'it'),
   (0.01456986, 'wreck'),
   (0.013613525, 'for'),
   (0.012887438, 'this'),
   (0.012196167, 'was'),
   (0.011660811, 'like'),
   (0.011472275, 'wounded'),
   (0.010971312, 'http'),
   (0.010756469, 'my'),
   (0.009790454, 'http_co'),
   (0.0095378095, 'fires'),
   (0.009392059, 'disaster'),
   (0.009186263, '

In [136]:
%%time
topics = [model.get_document_topics(doc) for doc in corpus]
topics = [dict(topic_dist) for topic_dist in topics]
topics = pd.DataFrame(topics).fillna(0)
topics

CPU times: user 2.44 s, sys: 60.3 ms, total: 2.5 s
Wall time: 2.42 s


Unnamed: 0,0,1,2,3,4
0,0.013006,0.010942,0.012808,0.942703,0.020542
1,0.029287,0.024716,0.028954,0.350792,0.566252
2,0.000000,0.000000,0.000000,0.010233,0.962967
3,0.029348,0.024729,0.317562,0.031643,0.596717
4,0.197301,0.292873,0.012839,0.475931,0.021056
...,...,...,...,...,...
7608,0.015923,0.013436,0.780345,0.017090,0.173208
7609,0.000000,0.000000,0.000000,0.633804,0.345121
7610,0.029283,0.024716,0.029074,0.031476,0.885451
7611,0.012984,0.938424,0.012857,0.013970,0.021765
