This notebook trains LDA models to classify events and covers the whole pipeline: text preprocessing, training, comparing the different LDA models, and choosing the best one.

In [1]:
from pymystem3 import Mystem
my_stem = Mystem()

import nltk
import logging

# Download the NLTK stopword lists only when they are not available locally.
try:
    nltk.corpus.stopwords.words('english')
except LookupError:
    # NLTK signals a missing corpus resource with LookupError; any other
    # error (e.g. a broken installation) should surface instead of being hidden.
    nltk.download('stopwords')

import pandas as pd
from collections import defaultdict

import gensim.models

# Name of the model family; used as a sub-directory for models and temp files.
MODEL_NAME = 'lda'

# Every path below ends with '/', so file names can be appended directly.
DATA_PATH = '/data/'
EVENTS_PATH = f'{DATA_PATH}events/'
# EVENTS_PATH is already absolute and slash-terminated: no extra '/' around it.
RAW_PATH = f'{EVENTS_PATH}raw/'
DOCS_PATH = f'{EVENTS_PATH}lda/'
MODEL_PATH = f'{DATA_PATH}models/{MODEL_NAME}/'
TMP_PATH = f'{DATA_PATH}tmp/{MODEL_NAME}/'

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Number of topics for the lda models: one model per value, 5 through 101.
samples = range(5, 102)

text preprocessing:
<ol>
    <li>lowercasing </li>
    <li>removing all non alphanum symbols </li>
    <li>removing all numbers </li>
    <li>lemmatizing the text </li>
    <li>removing useless space symbols </li>
</ol>

In [None]:
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_non_alphanum


# Extra characters that preprocessor replaces with spaces.
useless_symbols = {'_', 'ー'}

# Stopwords shipped with nltk (English and Russian).
filter_words = set(stopwords.words('english')) | set(stopwords.words('russian'))

# Own stopwords: whitespace-separated words, any number of them per line.
with open('stopwords_full.txt') as f:
    for line in f:
        filter_words.update(line.split())


def preprocessor(s: str):
    """Normalize one raw document into a space-separated string of tokens.

    Steps: lowercase, replace non-alphanumeric characters and the characters
    in ``useless_symbols`` with spaces, lemmatize with ``my_stem``, collapse
    whitespace, then drop empty tokens, tokens found in ``filter_words`` and
    tokens made only of digits.

    :param s: raw document text
    :return: cleaned document text
    """
    text = strip_non_alphanum(s.lower())
    # strip_non_alphanum keeps '_', so the extra symbols are blanked out here
    text = ''.join(' ' if char in useless_symbols else char for char in text)

    # lemmatize yields string pieces that are concatenated back into one string
    text = ''.join(my_stem.lemmatize(text))
    # turn every run of whitespace (space, tab, \n) into a single space
    text = strip_multiple_whitespaces(text)

    kept = []
    for word in text.split(' '):
        if not word:
            continue
        if word in filter_words or word.isdigit():
            continue
        kept.append(word)
    return ' '.join(kept)


# load file and drop documents with empty text
df = pd.concat([pd.read_csv(RAW_PATH + 'spb_events.csv'), pd.read_csv(RAW_PATH + 'moscow_events.csv')])
#df = pd.read_csv(RAW_PATH + 'spb_events.csv')

# preprocess of corpus
df['description'] = df['captions'].apply(preprocessor)

# save results
!mkdir {DOCS_PATH}/events_lda
df.to_csv(r'' + DOCS_PATH + 'events_norm.csv', index=False)

<ul>
    <li>filtering stopwords</li>
    <li>generation bigrams and trigrams </li>
    <li>generation dictionary and corpus for training</li>
</ul>
Note: the stopwords combine various particles, prepositions, conjunctions and other words that carry little meaning in this task with contextual stopwords - words with a high level of overlap between different topics ('спб', 'love', 'likes4likes'). Contextual stopwords were obtained after several training cycles by filtering out useless words. 
<br><br>
Note: this code must be executed after restarting the notebook 

In [17]:
from gensim.models import Phrases
from gensim.corpora import Dictionary

# Contextual stopwords: whitespace-separated words, any number per line.
with open('../stopwords/stopwords_context.txt') as f:
    filter_words = {word for line in f for word in line.split()}


# Reload the normalized events, dropping every row that has a missing value.
df = pd.read_csv(DOCS_PATH + 'events_norm.csv').dropna()
df['description'] = df['description'].apply(
    lambda text: ' '.join(word for word in text.split(' ') if word not in filter_words)
)
# One token list per document.
docs = [text.split() for text in df['description'].tolist()]

bigram = Phrases(docs, min_count=5)
trigram = Phrases(bigram[docs])

for doc in docs:
    # A token containing '_' is a detected phrase (bigram or trigram);
    # it is appended to the document next to the original unigrams.
    phrases = [token for token in trigram[bigram[doc]] if '_' in token]
    doc.extend(phrases)

dictionary = Dictionary(docs)

# Drop words that occur in fewer than 20 documents or in more than 10% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.1)
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]
# Needed for lda training, which is given dictionary.id2token.
# NOTE(review): presumably this lookup is what populates id2token - confirm against gensim.
temp = dictionary[0]

2020-07-16 14:31:22,875 : INFO : collecting all words and their counts
2020-07-16 14:31:22,877 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2020-07-16 14:31:29,524 : INFO : PROGRESS: at sentence #10000, processed 3713347 words and 2697503 word types
2020-07-16 14:31:30,364 : INFO : collected 2938062 word types from a corpus of 4105782 words (unigram + bigrams) and 10214 sentences
2020-07-16 14:31:30,365 : INFO : using 2938062 counts as vocab in Phrases<0 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>
2020-07-16 14:31:30,366 : INFO : collecting all words and their counts
2020-07-16 14:31:30,367 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2020-07-16 14:31:49,296 : INFO : PROGRESS: at sentence #10000, processed 3173219 words and 2857562 word types
2020-07-16 14:31:51,341 : INFO : collected 3114433 word types from a corpus of 3510642 words (unigram + bigrams) and 10214 sentences
2020-07-16 14:31:51,342 : INFO : using 3114433 

Training a list of models. The samples are the different numbers of topics; one LDA model is trained for each sample.

In [None]:
from gensim.models import LdaMulticore

model_list = []
samples = range(5, 102)
!mkdir {MODEL_PATH}

for n_topics in samples:
    path = f'{MODEL_PATH}{n_topics}/'
    !mkdir {path}
    
    chunksize = len(df)
    passes = 10
    iterations = 100

    model = LdaMulticore(
        workers=35,
        corpus=corpus,
        id2word=dictionary.id2token,
        chunksize=chunksize,
        eta='auto',
        iterations=iterations,
        num_topics=n_topics,
        passes=passes,
        eval_every=None
    )
    model.save(path + "mdl")
    model_list.append(model)

if models exist and are trained, we can load them

In [2]:
from gensim.models import LdaMulticore

# Load the saved model of every topic count in `samples`, in that order.
model_list = [
    LdaMulticore.load(f'{MODEL_PATH}{n_topics}/mdl')
    for n_topics in samples
]


2020-07-17 09:51:19,333 : INFO : loading LdaMulticore object from /data/models/lda/5/mdl
2020-07-17 09:51:19,335 : INFO : loading expElogbeta from /data/models/lda/5/mdl.expElogbeta.npy with mmap=None
2020-07-17 09:51:19,337 : INFO : setting ignored attribute dispatcher to None
2020-07-17 09:51:19,337 : INFO : setting ignored attribute id2word to None
2020-07-17 09:51:19,338 : INFO : setting ignored attribute state to None
2020-07-17 09:51:19,338 : INFO : loaded /data/models/lda/5/mdl
2020-07-17 09:51:19,338 : INFO : loading LdaState object from /data/models/lda/5/mdl.state
2020-07-17 09:51:19,342 : INFO : loaded /data/models/lda/5/mdl.state
2020-07-17 09:51:19,349 : INFO : loading LdaMulticore object from /data/models/lda/6/mdl
2020-07-17 09:51:19,351 : INFO : loading expElogbeta from /data/models/lda/6/mdl.expElogbeta.npy with mmap=None
2020-07-17 09:51:19,352 : INFO : setting ignored attribute dispatcher to None
2020-07-17 09:51:19,352 : INFO : setting ignored attribute id2word to N

2020-07-17 09:51:19,541 : INFO : loaded /data/models/lda/17/mdl
2020-07-17 09:51:19,542 : INFO : loading LdaState object from /data/models/lda/17/mdl.state
2020-07-17 09:51:19,551 : INFO : loaded /data/models/lda/17/mdl.state
2020-07-17 09:51:19,557 : INFO : loading LdaMulticore object from /data/models/lda/18/mdl
2020-07-17 09:51:19,559 : INFO : loading expElogbeta from /data/models/lda/18/mdl.expElogbeta.npy with mmap=None
2020-07-17 09:51:19,560 : INFO : setting ignored attribute dispatcher to None
2020-07-17 09:51:19,561 : INFO : setting ignored attribute id2word to None
2020-07-17 09:51:19,561 : INFO : setting ignored attribute state to None
2020-07-17 09:51:19,561 : INFO : loaded /data/models/lda/18/mdl
2020-07-17 09:51:19,562 : INFO : loading LdaState object from /data/models/lda/18/mdl.state
2020-07-17 09:51:19,571 : INFO : loaded /data/models/lda/18/mdl.state
2020-07-17 09:51:19,578 : INFO : loading LdaMulticore object from /data/models/lda/19/mdl
2020-07-17 09:51:19,579 : INF

2020-07-17 09:51:19,849 : INFO : setting ignored attribute dispatcher to None
2020-07-17 09:51:19,849 : INFO : setting ignored attribute id2word to None
2020-07-17 09:51:19,849 : INFO : setting ignored attribute state to None
2020-07-17 09:51:19,850 : INFO : loaded /data/models/lda/30/mdl
2020-07-17 09:51:19,850 : INFO : loading LdaState object from /data/models/lda/30/mdl.state
2020-07-17 09:51:19,867 : INFO : loaded /data/models/lda/30/mdl.state
2020-07-17 09:51:19,873 : INFO : loading LdaMulticore object from /data/models/lda/31/mdl
2020-07-17 09:51:19,874 : INFO : loading expElogbeta from /data/models/lda/31/mdl.expElogbeta.npy with mmap=None
2020-07-17 09:51:19,877 : INFO : setting ignored attribute dispatcher to None
2020-07-17 09:51:19,877 : INFO : setting ignored attribute id2word to None
2020-07-17 09:51:19,877 : INFO : setting ignored attribute state to None
2020-07-17 09:51:19,878 : INFO : loaded /data/models/lda/31/mdl
2020-07-17 09:51:19,878 : INFO : loading LdaState objec

2020-07-17 09:51:20,237 : INFO : loaded /data/models/lda/42/mdl.state
2020-07-17 09:51:20,243 : INFO : loading LdaMulticore object from /data/models/lda/43/mdl
2020-07-17 09:51:20,245 : INFO : loading expElogbeta from /data/models/lda/43/mdl.expElogbeta.npy with mmap=None
2020-07-17 09:51:20,247 : INFO : setting ignored attribute dispatcher to None
2020-07-17 09:51:20,247 : INFO : setting ignored attribute id2word to None
2020-07-17 09:51:20,248 : INFO : setting ignored attribute state to None
2020-07-17 09:51:20,248 : INFO : loaded /data/models/lda/43/mdl
2020-07-17 09:51:20,248 : INFO : loading LdaState object from /data/models/lda/43/mdl.state
2020-07-17 09:51:20,271 : INFO : loaded /data/models/lda/43/mdl.state
2020-07-17 09:51:20,278 : INFO : loading LdaMulticore object from /data/models/lda/44/mdl
2020-07-17 09:51:20,279 : INFO : loading expElogbeta from /data/models/lda/44/mdl.expElogbeta.npy with mmap=None
2020-07-17 09:51:20,281 : INFO : setting ignored attribute dispatcher to

2020-07-17 09:51:20,691 : INFO : setting ignored attribute id2word to None
2020-07-17 09:51:20,691 : INFO : setting ignored attribute state to None
2020-07-17 09:51:20,692 : INFO : loaded /data/models/lda/55/mdl
2020-07-17 09:51:20,692 : INFO : loading LdaState object from /data/models/lda/55/mdl.state
2020-07-17 09:51:20,721 : INFO : loaded /data/models/lda/55/mdl.state
2020-07-17 09:51:20,727 : INFO : loading LdaMulticore object from /data/models/lda/56/mdl
2020-07-17 09:51:20,729 : INFO : loading expElogbeta from /data/models/lda/56/mdl.expElogbeta.npy with mmap=None
2020-07-17 09:51:20,731 : INFO : setting ignored attribute dispatcher to None
2020-07-17 09:51:20,732 : INFO : setting ignored attribute id2word to None
2020-07-17 09:51:20,732 : INFO : setting ignored attribute state to None
2020-07-17 09:51:20,732 : INFO : loaded /data/models/lda/56/mdl
2020-07-17 09:51:20,733 : INFO : loading LdaState object from /data/models/lda/56/mdl.state
2020-07-17 09:51:20,761 : INFO : loaded /

2020-07-17 09:51:21,250 : INFO : loading LdaMulticore object from /data/models/lda/68/mdl
2020-07-17 09:51:21,252 : INFO : loading expElogbeta from /data/models/lda/68/mdl.expElogbeta.npy with mmap=None
2020-07-17 09:51:21,255 : INFO : setting ignored attribute dispatcher to None
2020-07-17 09:51:21,256 : INFO : setting ignored attribute id2word to None
2020-07-17 09:51:21,256 : INFO : setting ignored attribute state to None
2020-07-17 09:51:21,256 : INFO : loaded /data/models/lda/68/mdl
2020-07-17 09:51:21,257 : INFO : loading LdaState object from /data/models/lda/68/mdl.state
2020-07-17 09:51:21,292 : INFO : loaded /data/models/lda/68/mdl.state
2020-07-17 09:51:21,299 : INFO : loading LdaMulticore object from /data/models/lda/69/mdl
2020-07-17 09:51:21,300 : INFO : loading expElogbeta from /data/models/lda/69/mdl.expElogbeta.npy with mmap=None
2020-07-17 09:51:21,303 : INFO : setting ignored attribute dispatcher to None
2020-07-17 09:51:21,303 : INFO : setting ignored attribute id2wo

2020-07-17 09:51:21,851 : INFO : setting ignored attribute state to None
2020-07-17 09:51:21,852 : INFO : loaded /data/models/lda/80/mdl
2020-07-17 09:51:21,852 : INFO : loading LdaState object from /data/models/lda/80/mdl.state
2020-07-17 09:51:21,892 : INFO : loaded /data/models/lda/80/mdl.state
2020-07-17 09:51:21,899 : INFO : loading LdaMulticore object from /data/models/lda/81/mdl
2020-07-17 09:51:21,900 : INFO : loading expElogbeta from /data/models/lda/81/mdl.expElogbeta.npy with mmap=None
2020-07-17 09:51:21,903 : INFO : setting ignored attribute dispatcher to None
2020-07-17 09:51:21,904 : INFO : setting ignored attribute id2word to None
2020-07-17 09:51:21,904 : INFO : setting ignored attribute state to None
2020-07-17 09:51:21,904 : INFO : loaded /data/models/lda/81/mdl
2020-07-17 09:51:21,905 : INFO : loading LdaState object from /data/models/lda/81/mdl.state
2020-07-17 09:51:21,945 : INFO : loaded /data/models/lda/81/mdl.state
2020-07-17 09:51:21,952 : INFO : loading LdaMu

2020-07-17 09:51:22,571 : INFO : loading expElogbeta from /data/models/lda/93/mdl.expElogbeta.npy with mmap=None
2020-07-17 09:51:22,574 : INFO : setting ignored attribute dispatcher to None
2020-07-17 09:51:22,575 : INFO : setting ignored attribute id2word to None
2020-07-17 09:51:22,575 : INFO : setting ignored attribute state to None
2020-07-17 09:51:22,575 : INFO : loaded /data/models/lda/93/mdl
2020-07-17 09:51:22,576 : INFO : loading LdaState object from /data/models/lda/93/mdl.state
2020-07-17 09:51:22,621 : INFO : loaded /data/models/lda/93/mdl.state
2020-07-17 09:51:22,627 : INFO : loading LdaMulticore object from /data/models/lda/94/mdl
2020-07-17 09:51:22,629 : INFO : loading expElogbeta from /data/models/lda/94/mdl.expElogbeta.npy with mmap=None
2020-07-17 09:51:22,632 : INFO : setting ignored attribute dispatcher to None
2020-07-17 09:51:22,633 : INFO : setting ignored attribute id2word to None
2020-07-17 09:51:22,633 : INFO : setting ignored attribute state to None
2020-0

calculating Coherence score for evaluating quality of lda models and choosing the best model

In [33]:
from gensim.models import CoherenceModel

def compute_coherence_values(dictionary, texts, models):
    """Return the 'c_v' coherence of every model, in input order.

    :param dictionary: dictionary handed to CoherenceModel
    :param texts: tokenized documents handed to CoherenceModel
    :param models: iterable of trained topic models
    :return: list with one coherence value per model
    """
    return [
        CoherenceModel(
            model=candidate, texts=texts, dictionary=dictionary, coherence='c_v'
        ).get_coherence()
        for candidate in models
    ]

# One c_v coherence value per entry of model_list, in the same order.
coherence_values = compute_coherence_values(dictionary, docs, model_list)

In [31]:
import plotly.graph_objects as go

# Topic counts on the x axis; the hover text is each model's position in model_list.
x = [model.num_topics for model in model_list]
ind = list(range(len(model_list)))

fig = go.Figure()
# Plot coherence_values directly: df_scores is only created in a later cell,
# so reading it here fails on a fresh top-to-bottom run.
fig.add_trace(go.Scatter(x=x, y=coherence_values, name='coherence', text=ind))

fig.show()

defining functions and variables for evaluating the pair-based f-score of lda models

In [38]:
# Labelled event pairs for the pair-based evaluation; evaluate_score reads the
# 'id_a', 'id_b' and 'label' columns of this frame.
df_cross = pd.read_csv('../cross_valid_union.csv')
# Numeric codes that evaluate_score compares against the 'label' column.
lname = {'positive': 2, 'negative': 1}

def prepare_labels(model, df, corpus):
    """Label every document with the id of its most probable topic.

    :param model: object whose ``get_document_topics(bow)`` returns a list of
        ``(topic_id, probability)`` pairs
    :param df: frame with an 'id' column; row i is paired with corpus[i]
    :param corpus: list of bag-of-words documents
    :return: list of ``(event_id, topic_id)`` tuples; topic_id is -1 when the
        model returns no topics for a document
    """
    labelled = []
    for position, bow in enumerate(corpus):
        topics = model.get_document_topics(bow)
        if topics:
            # max() keeps the first of several equally probable topics
            best = max(topics, key=lambda pair: pair[1])
            topic_id = best[0]
        else:
            topic_id = -1
        labelled.append((df.iloc[position]['id'], topic_id))
    return labelled

def evaluate_score(events, pairs=None, labels=None):
    """Compute pair-based clustering metrics for a topic assignment.

    A pair counts as a "same cluster" prediction when both of its events were
    assigned the same topic.

    :param events: iterable of ``(event_id, topic_id)`` tuples
    :param pairs: frame with 'id_a', 'id_b' and 'label' columns; defaults to
        the module-level ``df_cross``
    :param labels: dict with the 'positive' and 'negative' label codes;
        defaults to the module-level ``lname``
    :return: ``[precision, recall, f1, rand, tp, tn, fp, fn]``; a ratio whose
        denominator is zero is reported as 0.0 instead of raising
    :raises KeyError: when a pair refers to an event id missing from ``events``
    """
    if pairs is None:
        pairs = df_cross
    if labels is None:
        labels = lname
    positive, negative = labels['positive'], labels['negative']

    topic_of = dict(events)

    tp = tn = fp = fn = 0
    for id_a, id_b, label in zip(pairs['id_a'], pairs['id_b'], pairs['label']):
        same_topic = topic_of[id_a] == topic_of[id_b]
        if label == positive:
            if same_topic:
                tp += 1
            else:
                fn += 1
        elif label == negative:
            if same_topic:
                fp += 1
            else:
                tn += 1
        # pairs with any other label are ignored

    # Guard every ratio: a model that never groups a positive pair has tp == 0,
    # which makes the denominators below zero.
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    total = tp + tn + fp + fn
    rand = (tp + tn) / total if total else 0.0
    return [precision, recall, f1, rand, tp, tn, fp, fn]

evaluating f-score

In [39]:
import pandas as pd
# One row per model: its topic count followed by the pair-based metrics.
scores = [
    [model.num_topics] + evaluate_score(prepare_labels(model, df, corpus))
    for model in model_list
]

score_columns = ['n_topics', 'precision', 'recall', 'f1', 'rand', 'tp', 'tn', 'fp', 'fn']
df_scores = pd.DataFrame(scores, columns=score_columns)

In [40]:
# Attach the coherence values; both they and the rows of df_scores follow model_list order.
df_scores['coherence'] = coherence_values

In [49]:
!mkdir {TMP_PATH}
df_scores.to_csv(r'' + TMP_PATH + 'scores.csv')

mkdir: cannot create directory ‘/data/tmp/lda/’: File exists


In [3]:
# Reload the metrics saved to scores.csv, so the cells below can use them without re-scoring the models.
df_scores = pd.read_csv(TMP_PATH + 'scores.csv')

finding the best model

In [4]:
# idxmax returns an index label, so the row is selected with label-based .loc;
# position-based .iloc only agrees with it for a default 0..n-1 index.
best_score = df_scores.loc[df_scores['f1'].idxmax()]
# (best f1, number of topics of the model that achieved it)
best = (best_score['f1'], best_score['n_topics'])
best

(0.6187961985216472, 14.0)

In [6]:
def find_best_by(param_name: str, scores=None):
    """Return the best value of a metric and the topic count that achieved it.

    :param param_name: name of the metric column to maximize
    :param scores: frame with the metric column and an 'n_topics' column;
        defaults to the module-level ``df_scores``
    :return: tuple ``(max metric value, n_topics of that row)``
    """
    if scores is None:
        scores = df_scores
    # idxmax yields an index label, so select the row by label (.loc), not by
    # position (.iloc); the two differ for any non-default index.
    best_row = scores.loc[scores[param_name].idxmax()]
    return (best_row[param_name], best_row['n_topics'])

In [7]:
find_best_by('f1')

(0.6187961985216472, 14.0)

In [8]:
find_best_by('coherence')

(0.5844902736327479, 101.0)

plotting metrics for models

In [15]:
import plotly.graph_objects as go

x = df_scores['n_topics']
# Both figures use the same fixed size.
figure_size = dict(autosize=False, width=800, height=500)

# Pair-based precision, recall and f1 share one figure.
fig = go.Figure()
for metric in ('precision', 'recall', 'f1'):
    fig.add_trace(go.Scatter(x=x, y=df_scores[metric], name=metric))
fig.update_layout(**figure_size)
fig.show()

# Coherence is drawn in a figure of its own.
fig2 = go.Figure()
fig2.add_trace(go.Scatter(x=x, y=df_scores['coherence'], name='coherence'))
fig2.update_layout(**figure_size)
fig2.show()


function for drawing an lda model

In [15]:
from pyLDAvis import gensim as gensim_lda
import pyLDAvis

def plot_lda(num_topics):
    """Prepare the pyLDAvis view of the model that has ``num_topics`` topics.

    :param num_topics: topic count of the model to look up in ``model_list``
    :return: the result of ``gensim_lda.prepare`` for that model
    :raises ValueError: when no model in ``model_list`` has that topic count
    """
    topic_counts = [model.num_topics for model in model_list]
    position = topic_counts.index(num_topics)
    pyLDAvis.enable_notebook()
    chosen_model = model_list[position]
    return gensim_lda.prepare(chosen_model, corpus, dictionary)

In [19]:
plot_lda(66)

2020-07-16 14:33:09,749 : INFO : Note: NumExpr detected 40 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2020-07-16 14:33:09,751 : INFO : NumExpr defaulting to 8 threads.

Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





In [32]:
plot_lda(24)


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





In [21]:
plot_lda(14)


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





In [36]:
# display and HTML are part of the public IPython.display module; importing
# them from IPython.core.display is deprecated.
from IPython.display import display, HTML
# Inject CSS that sets the width of .container elements to 50%.
display(HTML("<style>.container { width:50% !important; }</style>"))
