In [None]:
import re
import pickle
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import spacy
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Only POS tags and lemmas are used downstream, so the parser and NER are disabled.
# The 'en' shortcut link was removed in spaCy 3; the model is loaded by its package name,
# which also works on spaCy 2 when the small English model is installed.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

[Tutorial I followed](https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#7removeemailsandnewlinecharacters)

# Load

In [None]:
# Run date, used to stamp the saved model file name.
# pd.datetime was deprecated in pandas 1.0 and removed in 2.0; pd.Timestamp is the
# supported way to get the current date.
today = pd.Timestamp.today().date()
today_str = str(today)

In [None]:
# Event-level records; the free-text columns listed in `text_columns` are read from this frame.
# NOTE(review): hard-coded absolute path — consider a configurable data directory.
events = pd.read_csv('/projects/2016-01-911analytics/write_data/lda_test_2020_01_22/cfd_attributed_events.csv')

In [None]:
# Descriptor flags; later compared with `events` on `unique_id` and correlated with topic probabilities.
# NOTE(review): hard-coded absolute path — consider a configurable data directory.
flags = pd.read_csv('/projects/2016-01-911analytics/write_data/lda_test_2020_01_22/attributed_events_detail_descriptor_flags.csv')

In [None]:
# Columns of `events` that are concatenated into one document per event.
text_columns = [
    'complaint',
    'medications',
    'symptom',
    'hxpresent_comments',
    'impression',
    'preexisting',
    'result_comments',
    'incident_type',
]

In [None]:
# Preview the raw text fields before any cleaning.
events[text_columns].head()

# Clean

Concatenate text fields

In [None]:
# One document per event: blank out missing values, force every field to str,
# then join the selected columns row-wise with single spaces.
# For quick experiments, sample the frame first, e.g. .sample(10000, random_state=1).
event_text_fields = events[text_columns].replace(np.nan, '').astype(str)
text = event_text_fields.apply(' '.join, axis=1)

Clean and tokenize

In [None]:
def clean_text_series(s):
    """Tokenize every document in a Series of raw strings.

    Each string is passed through gensim's ``simple_preprocess`` with
    ``deacc=True`` and the resulting token list is wrapped in a NumPy array.
    No separate strip/lower/regex step is applied here; that is left to
    ``simple_preprocess``.

    Parameters
    ----------
    s : pandas.Series of str

    Returns
    -------
    pandas.Series
        One NumPy array of tokens per input row.
    """
    tokenized = s.apply(simple_preprocess, deacc=True)
    return tokenized.apply(np.array)

In [None]:
# Replace the raw strings with token arrays (overwrites `text`).
text = clean_text_series(text)

In [None]:
# Spot-check the tokens of the first document.
text.iat[0]

## remove stopwords

In [None]:
# NLTK's English stopword list (requires the NLTK 'stopwords' corpus to be downloaded).
stopwords_en = stopwords.words('english')

### what's the fastest way?

In [None]:
def remove_stopwords_np(t, stopwords=stopwords_en):
    """Drop stopwords from a NumPy token array using a vectorised membership test.

    Parameters
    ----------
    t : numpy.ndarray
        Tokens of a single document.
    stopwords : array-like or set of str, optional
        Words to remove. Defaults to the module-level English stopword list.

    Returns
    -------
    numpy.ndarray
        The tokens of ``t`` that are not in ``stopwords``, in original order.
    """
    # np.isin treats a set as a single object rather than a collection,
    # so convert it to a list before testing membership.
    if isinstance(stopwords, (set, frozenset)):
        stopwords = list(stopwords)
    # Use the argument that was passed in; the previous body always fell back
    # to the global list, so a custom stopword collection was silently ignored.
    return t[~np.isin(t, stopwords)]

In [None]:
def remove_stopwords_lc(t, stopwords=stopwords_en):
    """Drop stopwords from a token sequence with a plain Python membership test.

    Parameters
    ----------
    t : iterable of str
        Tokens of a single document.
    stopwords : container of str, optional
        Words to remove; a ``set`` gives the fastest lookups.

    Returns
    -------
    list of str
        Tokens not found in ``stopwords``, in original order.
    """
    is_stopword = stopwords.__contains__
    return [token for token in t if not is_stopword(token)]

In [None]:
%%timeit
# Time the NumPy variant with the stopwords pre-converted to an array.
text.apply(remove_stopwords_np, stopwords=np.array(stopwords_en))

In [None]:
%%timeit
# Time the list-comprehension variant with the stopwords as a set.
text.apply(remove_stopwords_lc, stopwords=set(stopwords_en))

In [None]:
# NOTE(review): this alias points at the NumPy variant, but it is not used anywhere in the
# visible notebook and the next step calls remove_stopwords_lc directly — confirm which is intended.
remove_stopwords = remove_stopwords_np

### now back to reality

In [None]:
# Strip stopwords from every document; the set is built once and shared across rows.
text = text.apply(remove_stopwords_lc, stopwords=set(stopwords_en))

# Lemmatize

In [None]:
def lemmatize(t, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize one document and keep only tokens with an allowed part of speech.

    The tokens are re-joined with spaces and run through the module-level
    spaCy pipeline ``nlp``.

    Parameters
    ----------
    t : iterable of str
        Tokens of a single document.
    allowed_postags : collection of str, optional
        Coarse POS tags to keep. An immutable tuple is used as the default so
        the default can never be mutated between calls; lists still work.

    Returns
    -------
    list of str
        Lemmas of the tokens whose ``pos_`` is in ``allowed_postags``.
    """
    doc = nlp(' '.join(t))
    return [token.lemma_ for token in doc if token.pos_ in allowed_postags]

In [None]:
# Lemmatize every document; the spaCy pipeline is invoked once per row.
text = text.apply(lemmatize)

# Dictionary

In [None]:
# Build the token <-> integer id mapping from the lemmatized documents.
id2word = corpora.Dictionary(text)

In [None]:
# Prune the vocabulary in place: keep tokens that occur in at least 100 documents
# and in no more than 50% of all documents.
id2word.filter_extremes(no_below=100, no_above=0.5)

In [None]:
# Bag-of-words representation of each document, kept as a Series aligned with `text`.
corpus = text.apply(id2word.doc2bow)

# LDA

In [None]:
# Number of LDA topics; reused when fitting the model and when listing topics.
n_topics = 20

In [None]:
# Training settings are collected in one place so they are easy to scan and tweak.
lda_params = dict(
    num_topics=n_topics,
    random_state=100,
    update_every=1,
    chunksize=1000,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, **lda_params)

In [None]:
# Persist the fitted model under a date-stamped file name in the working directory.
model_path = f'lda-{today_str}.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(lda_model, model_file)

In [None]:
# Display the fitted topics as formatted word-weight strings.
lda_model.print_topics()

In [None]:
# Optional interactive topic visualisation, currently disabled.
# Uncomment the three lines below to render it inline.
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# vis

In [None]:
# Long-format table of (topic, word, value): one row per top word of each topic.
# show_topics yields (topic_id, [(word, weight), ...]); explode puts each pair on its own row.
topic_terms = lda_model.show_topics(formatted=False, num_topics=n_topics)
topics = (
    pd.DataFrame(topic_terms)
    .explode(1)
    .rename(columns={0: 'topic', 1: 'word'})
    .assign(
        # `value` is derived first, while `word` still holds the (word, weight) pair
        value=lambda df: df['word'].apply(lambda pair: pair[1]),
        word=lambda df: df['word'].apply(lambda pair: pair[0]),
    )
    .sort_values(['topic', 'value'], ascending=[True, False])
)

In [None]:
# Save the long-format topic table (the index is written as the first column).
topics.to_csv('topics.csv')

In [None]:
# Find the lowest-weighted word in the topic table and count the documents containing it.
topics_flat = topics.reset_index()
least_word = topics_flat.word.at[topics_flat.value.idxmin()]
text.apply(lambda x: least_word in x).sum(), least_word

In [None]:
# Rank words within each topic by weight, 1 = highest weight.
# Ranking in descending order removes the hard-coded 11, which was only correct with
# exactly 10 words per topic. method='first' keeps ranks unique under ties, so the
# wide pivot on (topic, rank) cannot hit duplicate entries.
topics['rank'] = (
    topics.groupby('topic')['value']
    .rank(method='first', ascending=False)
    .astype(int)
)

In [None]:
# Wide layout: one row per topic, one column per rank, cells hold the word.
# DataFrame.pivot takes keyword arguments only from pandas 2.0 on, so name them explicitly.
topics_wide = topics.pivot(index='topic', columns='rank', values='word').reset_index()
topics_wide.to_csv('topics_wide.csv', index=False)

- Histograms of each topic's probability across all documents
- Correlation matrix between the existing flags and the topics
- Set a threshold to convert each topic probability into a 1/0 indicator
    - Also try correlating the flags with the raw topic probability
- Flags: projects/2016-01-911analytics/write_data/2016_05_01_to_2017_04_30_dob_ref_2016_05_01/CFD/cfd_attributed_events_detail_descriptor_flags.RData
- How many events are assigned to each topic?
- How many individuals are assigned to each topic?

# more

In [None]:
# Display all n_topics topics in formatted form.
lda_model.show_topics(num_topics=n_topics)

In [None]:
# Per-document topic probabilities, reshaped to one row per document and one column per topic.
# NOTE(review): gensim may still omit near-zero topics even with minimum_probability=0,
# which would leave NaN cells after the pivot — confirm.
doc_topic_pairs = corpus.apply(lda_model.get_document_topics, minimum_probability=0).explode()
doc_topic_long = pd.DataFrame({
    'topic': doc_topic_pairs.apply(lambda x: x[0]),
    'probability': doc_topic_pairs.apply(lambda x: x[1]),
})
document_topics = doc_topic_long.pivot(columns='topic', values='probability')

In [None]:
# Preview the document-by-topic probability matrix.
document_topics.head()

In [None]:
# this means we can just concat
# Row-for-row agreement of unique_id is what makes the column-wise concat of topics and
# flags valid, so stop the run here if it does not hold instead of only displaying the result.
ids_aligned = (events.unique_id == flags.unique_id).all()
assert ids_aligned, 'events and flags are not aligned on unique_id; merge on unique_id instead of concatenating'
ids_aligned

In [None]:
# Topic ids, used as the rows of the topic-by-flag correlation matrix.
topic_cols = document_topics.columns

In [None]:
# Every flags column except the three excluded below (presumably identifiers/metadata — confirm).
flag_cols = flags.columns.difference(['unique_id', 'key_case', 'month_yr'])

In [None]:
# Correlate every topic probability with every flag: compute the full correlation
# matrix of the combined frame, then keep only the topic rows and flag columns.
topics_and_flags = pd.concat([document_topics, flags[flag_cols]], axis=1)
full = topics_and_flags.corr().reindex(index=topic_cols, columns=flag_cols)

In [None]:
# Heatmap of the topic-by-flag correlations on a fixed [-1, 1] diverging scale, saved to disk.
cmap = sns.diverging_palette(220, 10, as_cmap=True)
fig, ax = plt.subplots(figsize=(21, 7))
sns.heatmap(full, vmin=-1, vmax=1, cmap=cmap, ax=ax)
ax.set(
    title='Correlation plot of topic probability with flag indicator',
    xlabel='flag',
    ylabel='topic',
)
fig.savefig('heatmap.png', dpi=200)