In [1]:
import gensim
import pandas as pd
import os
from matplotlib import pyplot as plt
import plotly.graph_objects as go
%matplotlib inline

In [2]:
# Load the raw text corpus: one string per file found in `dir_in`.
# Files are visited in sorted-name order so document positions are stable
# from run to run.
dir_in = 'txt_cut'
book_paths = [os.path.join(dir_in, name) for name in sorted(os.listdir(dir_in))]

train = []
for book_path in book_paths:
    with open(book_path, 'r') as book:
        text = book.read()
    train.append(text)

In [3]:
# Whitespace-token frequencies over the whole corpus, most frequent first.
# Joining the documents with a space and splitting once yields exactly the
# tokens that splitting each document separately would, but avoids building
# the dense (documents x longest-document) table that
# `.str.split(expand=True).unstack()` materialises before counting.
all_words = pd.Series(' '.join(train).split()).value_counts()

In [2]:
# Bar chart of the most frequent raw tokens in `all_words`.
top_n = 50
top_words = all_words.index.values[:top_n]
top_counts = all_words.values[:top_n]

data = [go.Bar(
    x=top_words,
    y=top_counts,
    # The colour array needs exactly one entry per plotted bar; slicing it
    # differently from x/y would feed the colour scale with counts of words
    # that are not shown in the chart.
    marker=dict(colorscale='Jet', color=top_counts),
)]

layout = go.Layout(
    title='Top {} (Uncleaned) Word frequencies in the training dataset'.format(top_n),
)

fig = go.Figure(data=data, layout=layout)

# Leave the figure as the last expression so the notebook renders it.
fig

NameError: name 'go' is not defined

In [22]:
import json
from tqdm import tqdm
# Build the lemmatised corpus: for every file in `dir_in`, collect the `lex`
# value of the first `analysis` entry of each JSON record and join them with
# spaces into one document string.
# NOTE(review): each line is assumed to hold one JSON object (a per-token
# analysis record) — confirm against the tool that produced these files.
dir_in = 'json_cut'
train = []
for filename in tqdm(sorted(os.listdir(dir_in))):
    # JSON text is UTF-8 by specification, so do not rely on the locale default.
    with open(os.path.join(dir_in, filename), 'r', encoding='utf-8') as book:
        txt = []
        # Iterate the file object instead of calling str.splitlines():
        # splitlines() also breaks on characters such as U+2028 that may
        # appear unescaped inside a JSON string, which would cut a record in two.
        for raw_line in book:
            if not raw_line.strip():
                # Tolerate blank lines rather than failing inside json.loads.
                continue
            line = json.loads(raw_line)
            if 'analysis' not in line:
                continue
            if not line['analysis']:
                continue
            if 'lex' not in line['analysis'][0]:
                continue
            txt.append(line['analysis'][0]['lex'])
    train.append(' '.join(txt))

100%|██████████| 621/621 [00:58<00:00, 12.04it/s]


In [14]:
from collections import Counter
from scipy.misc import imread
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from matplotlib import pyplot as plt

In [23]:
# Bag-of-words vectoriser with document-frequency cut-offs on both ends
# (min_df=2, max_df=0.95).
cv = CountVectorizer(
    max_df=0.95,
    min_df=2,
)

In [24]:
# Fit `cv` on the corpus and transform it in a single call.
# NOTE(review): this rebinds `train` from the list of document strings to the
# vectoriser's output, so the cell is not idempotent — re-running it without
# first re-running the loading cell feeds the already-transformed data back
# into fit_transform.
train = cv.fit_transform(train)

In [25]:
# Inspect the first row of the vectorised corpus.
train[0]

<1x49201 sparse matrix of type '<class 'numpy.int64'>'
	with 2890 stored elements in Compressed Sparse Row format>

In [123]:
# Base LDA estimator handed to the grid search: batch learning, fixed seed,
# and only a single iteration (max_iter=1).
lda = LatentDirichletAllocation(
    learning_method='batch',
    max_iter=1,
    random_state=17,
)

In [124]:
# Display the estimator to review its current parameter values.
lda

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=1, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=17, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [125]:
# Imported here because the notebook otherwise only imports GridSearchCV in a
# cell further down, which makes this cell fail with NameError on a fresh
# top-to-bottom run.
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: both Dirichlet priors and the number of topics.
params = {
    'topic_word_prior': [1, 0.1, 0.01, 0.001],
    'n_components': [20, 40, 60],
    'doc_topic_prior': [1, 0.1, 0.01, 0.001],
}
# Wrap the base `lda` estimator; the search is only configured here, not fitted.
model = GridSearchCV(lda, param_grid=params, n_jobs=4, verbose=10)

In [130]:
%%time
# Manual sweep over topic-word prior (t), doc-topic prior (d) and number of
# topics (n). The order is t outermost, then d, then n, and every fitted model
# is evaluated on the same matrix it was fitted on.
# `lda` is rebound on every pass, so after the loop it refers to the last
# configuration that was fitted, not to the best one.
prior_grid = [1, 0.1, 0.01, 0.001]
topic_counts = [20, 40, 60]
sweep = ((t, d, n) for t in prior_grid for d in prior_grid for n in topic_counts)

for t, d, n in sweep:
    lda = LatentDirichletAllocation(
        n_components=n,
        doc_topic_prior=d,
        topic_word_prior=t,
        learning_method='batch',
        max_iter=50,
        random_state=17,
    )
    lda.fit(train)
    perplexity = lda.perplexity(train)
    # Printed columns: n_components, topic_word_prior, doc_topic_prior.
    print(n, t, d, 'perplexity=', perplexity)


20 1 1 perplexity= 9470.213832620531
40 1 1 perplexity= 9730.58910622569
60 1 1 perplexity= 9959.24769963195
20 1 0.1 perplexity= 9292.719680753984
40 1 0.1 perplexity= 9389.887995599804
60 1 0.1 perplexity= 9437.91470417996
20 1 0.01 perplexity= 9272.881091502251
40 1 0.01 perplexity= 9344.664575070105
60 1 0.01 perplexity= 9364.79597346844
20 1 0.001 perplexity= 9276.462398416073
40 1 0.001 perplexity= 9345.049419366956
60 1 0.001 perplexity= 9360.945048031941
20 0.1 1 perplexity= 8794.812755317493
40 0.1 1 perplexity= 8973.74558968059
60 0.1 1 perplexity= 9081.8101420611
20 0.1 0.1 perplexity= 8696.108903145452
40 0.1 0.1 perplexity= 8714.187116619183
60 0.1 0.1 perplexity= 8651.237034164422
20 0.1 0.01 perplexity= 8693.158066188922
40 0.1 0.01 perplexity= 8693.345812661584
60 0.1 0.01 perplexity= 8608.38390812501
20 0.1 0.001 perplexity= 8708.452392310017
40 0.1 0.001 perplexity= 8707.425304464405
60 0.1 0.001 perplexity= 8621.806067006513
20 0.01 1 perplexity= 9200.537397459328
40

In [29]:
# Helper that prints the highest-weighted terms of every topic
def print_top_words(model, feature_names, n_top_words):
    """Print the `n_top_words` highest-weighted terms of each topic.

    Parameters
    ----------
    model : object exposing `components_`, an iterable of per-topic weight
        arrays (each must support `.argsort()`).
    feature_names : sequence mapping a column index to its term.
    n_top_words : int, number of terms to print per topic.

    For every topic a line "Topic #<index>:" followed by the space-separated
    terms (heaviest first) is printed, then a 70-character "=" rule.
    """
    separator = "=" * 70
    for topic_id, weights in enumerate(model.components_):
        # argsort is ascending, so walk backwards from the end to get the
        # n_top_words largest weights in descending order.
        top_indices = weights.argsort()[:-n_top_words - 1:-1]
        top_terms = [feature_names[j] for j in top_indices]
        print("\nTopic #{}:".format(topic_id) + " ".join(top_terms))
        print(separator)

In [38]:
# Display the vectorised corpus (its repr reports shape and storage format).
train

<621x49201 sparse matrix of type '<class 'numpy.int64'>'
	with 1382939 stored elements in Compressed Sparse Row format>

In [95]:
# Print the ten highest-weighted terms of every topic in the fitted `lda`.
n_top_words = 10
print("\nTopics in LDA model: ")
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out(); prefer the new accessor when it exists
# and fall back to the old one on older installations.
if hasattr(cv, 'get_feature_names_out'):
    tf_feature_names = cv.get_feature_names_out()
else:
    tf_feature_names = cv.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model: 

Topic #0:художник захар лось григорий павлович гек чук мать абрамович станция

Topic #1:шубин елена назарыч степанович завод доктор берсенев площадь лев толстяк

Topic #2:любить друг ваш сердце потому отец молодой казаться мать несколько

Topic #3:василий кузнецов зоя юлия вагон снег бессонов товарищ лейтенант нечаев

Topic #4:андрей петька доктор филиппов люба лазик захарка виктор левша мальчишка

Topic #5:король митя принц ну принцесса министр колпак невидимка вильгельм ваш

Topic #6:егор артем лева кролик иванович ну ленька удав монах ваня

Topic #7:земля черный белый ночь вода лес дорога небо солнце гора

Topic #8:сережа мишка львовна яйцо ну инкубатор цыпленок ребята буба ведь

Topic #9:глеб фомич машина живой город рубчик дед братик генерал кудрявцев

Topic #10:алеша маша кот васька волшебник дмитриевич иван валька дядя капитан

Topic #11:степан настасья танюшка барин шкатулка камень ко вовсе женщина конечно

Topic #12:незнайка знайка гошка зорин ну короты

In [72]:
import numpy as np

In [96]:
# Normalise every row of `lda.components_` by its own total so that each row
# of `distr` sums to 1.
topic_totals = lda.components_.sum(axis=1, keepdims=True)
distr = lda.components_ / topic_totals

In [101]:
# Work on a sorted copy. Sorting `distr[0]` in place would reorder the first
# row inside `distr` itself, so its positions would no longer correspond to
# the column indices used to look up feature names.
topic0_sorted = np.sort(distr[0])

In [104]:
# Count how many entries of the first row of `distr` exceed 1e-3.
(distr[0] > 1e-3).sum()

119

In [79]:
# Perplexity of the current `lda` model evaluated on `train`.
lda.perplexity(train)

8861.886568944414

In [80]:
# Score of the current `lda` model on `train`.
# NOTE(review): scikit-learn documents this as an approximate log-likelihood — confirm for the installed version.
lda.score(train)

-25446587.941832397

In [34]:
# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn