# LDA for topic modelling

Сегодня попробуем целых три реализации латентного размещения Дирихле (LDA) для тематического моделирования новостных статей газеты The New York Times.

Импортируем нужные модули

In [1]:
# Standard library ("pickle" was misspelled as "pickl", which made this cell
# fail with ModuleNotFoundError and broke every later pickle.dump/load call).
import pickle
import re

# Third-party
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

Датасет вычищен от урлов. В файле около 8800 новостных статей, каждая статья начинается с новой строчки в файле

In [2]:
DATAFILE = "data/nytimes_news.txt"

In [3]:
!wc -l data/nytimes_news.txt

    8883 data/nytimes_news.txt


Сложим все тексты статей в массив

In [4]:
# Load the corpus: one article per line, with surrounding whitespace removed.
# The encoding is passed explicitly because the articles contain non-ASCII
# punctuation (em dashes, curly quotes) and the platform default encoding is
# not guaranteed to be UTF-8.
# NOTE(review): assumes the data file is UTF-8 encoded — confirm.
with open(DATAFILE, encoding="utf-8") as f_in:
    data = [line.strip() for line in f_in]

In [5]:
data[0][:300]

'WASHINGTON — Stellar pitching kept the Mets afloat in the first half of last season despite their offensive woes. But they cannot produce an encore of their pennant-winning season if their lineup keeps floundering while their pitching is nicked, bruised and stretched thin.“We were going to ride our '

## Запустим TfidfVectorizer, чтобы очистить датасет от слишком частых и слишком редких слов.

Попутно удалим стоп-слова

In [6]:
%%time
# TF-IDF is fitted here only to obtain a filtered vocabulary: English stop
# words are removed and the min_df / max_df thresholds drop terms that are
# too rare or too frequent across documents.
tf = TfidfVectorizer(analyzer='word', min_df=3, max_df=0.5, stop_words='english')

tfidf_matrix = tf.fit_transform(data)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back to the old
# method so the notebook still runs on older installations.
if hasattr(tf, "get_feature_names_out"):
    feature_names = list(tf.get_feature_names_out())
else:
    feature_names = tf.get_feature_names()

print(len(feature_names))

44790
CPU times: user 5.53 s, sys: 127 ms, total: 5.65 s
Wall time: 5.74 s


feature_names - список, состоящий из не слишком редких и не слишком частых слов. Он нам еще пригодится

## Зададим функцию для предобработки текста.

1) Будем разбивать предложение на слова с помощью библиотеки nltk методом word_tokenize.

2) Вычистим датасет с помощью регулярного выражения от различных чисел и пунктуации. 

3) Также будем выбрасывать слова, которые не находятся в полученном словаре.

4) Будем возвращать список слов текста

In [7]:
# Compiled once instead of being looked up for every token. The raw string
# avoids the invalid "\-" escape sequence the plain literal contained
# (a DeprecationWarning / SyntaxWarning on recent Python versions).
_WORD_RE = re.compile(r'[a-zA-Z\-][a-zA-Z\-]{2,}')


def clean_text(text, vocab):
    """Tokenize ``text`` and keep only word-like tokens present in ``vocab``.

    The text is lower-cased and split with ``nltk.word_tokenize``. A token is
    kept when it starts with at least three ASCII letters/hyphens and is a
    member of ``vocab``.

    Note that the regular expression is applied with ``match``, i.e. it is
    anchored only at the start of the token; characters after the first three
    are not checked by the pattern.

    Args:
        text: Raw document text.
        vocab: Container of allowed tokens; pass a ``set`` so that membership
            tests are fast.

    Returns:
        List of kept tokens in their original order (duplicates preserved).
    """
    return [
        token
        for token in word_tokenize(text.lower())
        if _WORD_RE.match(token) and token in vocab
    ]

### Токенизируем и предобработаем датасет с помощью описанной функции

In [8]:
%%time
# Turn the vocabulary list into a set so that every membership test inside
# clean_text is a hash lookup. Try skipping this conversion to see how much
# slower the preprocessing becomes.
vocab = set(feature_names)
tokenized_data = [clean_text(text, vocab) for text in data]

CPU times: user 56.2 s, sys: 578 ms, total: 56.8 s
Wall time: 58.7 s


## Попробуем поработать с gensim

In [9]:
from gensim import models, corpora, similarities

### Построим словарь корпуса
в частности dictionary хранит пару словарей token2id и id2token: отображения слова в индекс и наоборот

In [10]:
%%time
dictionary = corpora.Dictionary(tokenized_data)

CPU times: user 2.9 s, sys: 8.98 ms, total: 2.91 s
Wall time: 2.91 s


### Преобразуем корпус в формат, ожидаемый генсимовским lda. 

Каждый документ будет представлен в виде списка пар (word_id, word_count)

In [11]:
%%time
corpus = [dictionary.doc2bow(text) for text in tokenized_data]

CPU times: user 2.92 s, sys: 90.2 ms, total: 3.01 s
Wall time: 3.12 s


Посмотрим на представление документа

In [12]:
print(corpus[20][:10])

[(12, 1), (33, 16), (37, 1), (60, 2), (61, 1), (75, 1), (99, 1), (106, 1), (127, 1), (151, 3)]


### Попробуем обучить LDA

Количество тем корпуса является гиперпараметром.

Зададим его равным 100, навскидку

In [13]:
NUM_TOPICS = 100

## Построим LDA-модель

выполним 10 проходов по корпусу

In [28]:
%%time
# Build the LDA model: NUM_TOPICS topics, 10 passes over the bag-of-words
# corpus, with `dictionary` supplying the id -> word mapping.
# NOTE(review): no random_state is passed, so the learned topics are presumably
# not reproducible between runs — confirm and pin a seed if that matters.
lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary, passes=10)

  diff = np.log(self.expElogbeta)


CPU times: user 8min 50s, sys: 45.2 s, total: 9min 36s
Wall time: 5min 14s


### Модель обучается около 5 мин.

Сохраним объект модели для дальнейшей подгрузки

In [32]:
with open("data/gensim_lda.model", "wb") as f_out:
    pickle.dump(lda_model, f_out)

Подгрузим объект модели

In [14]:
# Load the pickled gensim LDA model from disk.
# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# model files that you produced yourself or otherwise trust.
with open("data/gensim_lda.model", "rb") as f_in:
    lda_model = pickle.load(f_in)

##  Посмотрим на получившиеся темы

Выведем топ значимых слов для каждой темы и оценим их интерпретируемость.

In [15]:
# Print a 10-word summary of every topic of the gensim model.
print("LDA Model:")

for idx in range(NUM_TOPICS):
    topic_repr = lda_model.print_topic(idx, 10)
    print("Topic #{}:".format(idx), topic_repr)

LDA Model:
Topic #0: 0.042*"water" + 0.015*"wine" + 0.013*"food" + 0.010*"plant" + 0.007*"flint" + 0.006*"green" + 0.006*"farmers" + 0.006*"wines" + 0.006*"products" + 0.006*"farm"
Topic #1: 0.022*"street" + 0.016*"city" + 0.014*"restaurant" + 0.014*"building" + 0.013*"hotel" + 0.013*"york" + 0.010*"east" + 0.008*"space" + 0.008*"avenue" + 0.008*"bar"
Topic #2: 0.026*"museum" + 0.012*"art" + 0.009*"music" + 0.009*"director" + 0.009*"work" + 0.009*"artists" + 0.008*"design" + 0.008*"collection" + 0.007*"center" + 0.007*"exhibition"
Topic #3: 0.053*"china" + 0.031*"chinese" + 0.020*"united" + 0.017*"states" + 0.014*"government" + 0.013*"trade" + 0.013*"american" + 0.012*"beijing" + 0.011*"foreign" + 0.011*"world"
Topic #4: 0.008*"night" + 0.008*"day" + 0.007*"morning" + 0.006*"took" + 0.006*"hours" + 0.005*"days" + 0.005*"just" + 0.005*"away" + 0.005*"later" + 0.005*"left"
Topic #5: 0.043*"sea" + 0.033*"boat" + 0.029*"island" + 0.028*"australia" + 0.027*"islands" + 0.019*"australian" + 0

Topic #53: 0.076*"japan" + 0.053*"japanese" + 0.027*"skin" + 0.022*"navy" + 0.021*"tokyo" + 0.017*"marine" + 0.014*"abe" + 0.014*"beauty" + 0.011*"rivers" + 0.011*"flower"
Topic #54: 0.138*"fashion" + 0.069*"wear" + 0.045*"men" + 0.045*"dress" + 0.029*"clothes" + 0.028*"hair" + 0.027*"wore" + 0.023*"wearing" + 0.019*"schneiderman" + 0.018*"suit"
Topic #55: 0.022*"open" + 0.019*"french" + 0.018*"tennis" + 0.018*"williams" + 0.016*"murray" + 0.015*"match" + 0.012*"grand" + 0.012*"djokovic" + 0.011*"clay" + 0.010*"play"
Topic #56: 0.049*"gay" + 0.047*"rights" + 0.031*"transgender" + 0.015*"carolina" + 0.015*"north" + 0.014*"gender" + 0.013*"civil" + 0.012*"law" + 0.011*"human" + 0.010*"discrimination"
Topic #57: 0.102*"zika" + 0.075*"virus" + 0.032*"pregnant" + 0.025*"infection" + 0.021*"mosquito" + 0.021*"infections" + 0.019*"infected" + 0.019*"cases" + 0.019*"centers" + 0.017*"disease"
Topic #58: 0.068*"family" + 0.060*"father" + 0.052*"mother" + 0.042*"son" + 0.022*"children" + 0.021*"

Видно, что лексическое ядро большинства тем вполне интерпретируемо

### Можем взять свеженькую статью и, например, найти ближайшие документы из корпуса

In [16]:
text = """Democrats harnessed voter fury toward President Trump to win control of the House and capture pivotal governorships Tuesday night as liberals and moderates banded together to deliver a forceful rebuke of Mr. Trump, even as Republicans added to their Senate majority by claiming a handful of conservative-leaning seats. 
The two parties each had some big successes in the states. 
Republican governors were elected in Ohio and Florida, two important battlegrounds in Mr. Trump’s 2020 campaign calculations. 
Democrats beat Gov. Scott Walker, the Wisconsin Republican and a top target, and captured the governor’s office in Michigan — two states that Mr. Trump carried in 2016 and where the left was looking to rebound."""

Получим представление текста в виде мешка слов

In [17]:
bow = dictionary.doc2bow(clean_text(text, vocab))

### Построим индекс для поиска похожих документов с помощью обученной на корпусе модели

In [18]:
%%time 
lda_index = similarities.MatrixSimilarity(lda_model[corpus])

  if np.issubdtype(vec.dtype, np.int):


CPU times: user 53.3 s, sys: 5.05 s, total: 58.3 s
Wall time: 31.3 s


### Найдем похожие на запрос документы 

In [19]:
# Project the query onto the topic space, then score it against the index.
query_topics = lda_model[bow]
similar = lda_index[query_topics]

# Pair every score with its document position and order the pairs from the
# highest score to the lowest.
similar = sorted(enumerate(similar), key=lambda pair: -pair[1])

In [20]:
print(similar[:5])  # top-5 (document_id, similarity) pairs

[(675, 0.9728249), (268, 0.9702194), (7746, 0.96715236), (8018, 0.96648335), (2099, 0.96617776)]


### Посмотрим глазами на топ-5 самых похожих статей

In [21]:
for document_id, similarity in similar[:5]:
    print(data[document_id][:1000])
    print('---------------')

Senator Bernie Sanders said on Friday that he will vote for Hillary Clinton in the November presidential election, offering one more indication that he has given up on his own candidacy.Asked if he would cast his vote for Mrs. Clinton, the presumptive Democratic nominee, in an interview on MSNBC’s “Morning Joe” program, Mr. Sanders said, “Yes.”He added that the alternative, Donald J. Trump, would be dire for the country: “The issue right here is I’m going to do everything I can do to defeat Donald Trump. I think Trump, in so many ways, would be a disaster for this country if he were elected.”Mr. Sanders has previously signaled that he would work with Mrs. Clinton to defeat Mr. Trump, and on Wednesday he acknowledged that “it doesn’t appear” that he’ll be the Democratic nominee when the party gathers for its convention next month. But he has thus far refused to suspend his presidential bid or stop campaigning. He had a campaign rally in New York City on Thursday night, and similar event

## Попробуем теперь запустить реализацию lda в sklearn 

In [22]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

#### Выполним векторизацию выборки с помощью CountVectorizer

In [23]:
%%time
# Bag-of-words counts for the sklearn LDA. The token pattern is written as a
# raw string: the plain literal contained the invalid escape sequence "\-",
# which triggers a DeprecationWarning / SyntaxWarning on recent Python
# versions. The regular expression itself is unchanged.
vectorizer = CountVectorizer(
    min_df=5,
    max_df=0.5,
    stop_words='english',
    lowercase=True,
    token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}',
)

data_vectorized = vectorizer.fit_transform(data)

CPU times: user 4.15 s, sys: 126 ms, total: 4.28 s
Wall time: 4.36 s


#### Запустим обучение модели

In [24]:
lda_model_sklearn = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online')

In [63]:
%%time
lda_model_sklearn.fit(data_vectorized)

CPU times: user 6min 40s, sys: 1min 14s, total: 7min 54s
Wall time: 4min 44s


LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=100, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

Сохраним модель

In [64]:
with open("data/sklearn_lda.model", "wb") as f_out:
    pickle.dump(lda_model_sklearn, f_out)

Подгрузим модель

In [25]:
# Load the pickled scikit-learn LDA model from disk.
# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# model files that you produced yourself or otherwise trust.
with open("data/sklearn_lda.model", "rb") as f_in:
    lda_model_sklearn = pickle.load(f_in)

#### Трансформируем векторизованную выборку

In [26]:
%%time
lda_decomposition = lda_model_sklearn.transform(data_vectorized)
print(lda_decomposition.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(8883, 100)
CPU times: user 14.1 s, sys: 1.73 s, total: 15.8 s
Wall time: 8.31 s


### Выведем наиболее важные слова для тем и оценим интерпретируемость

In [27]:
# Resolve the vocabulary once, outside the loops: the original called
# get_feature_names() for every printed word, rebuilding the full feature list
# each time. get_feature_names() was also removed in scikit-learn 1.2, so
# prefer get_feature_names_out() and fall back for older versions.
if hasattr(vectorizer, "get_feature_names_out"):
    sklearn_vocab = vectorizer.get_feature_names_out()
else:
    sklearn_vocab = vectorizer.get_feature_names()

for idx, word_topic_distr in enumerate(lda_model_sklearn.components_):
    print("Topic %d:" % (idx))
    # argsort is ascending: take the last 10 positions and reverse them to get
    # the 10 largest weights, largest first.
    top_words_idx = word_topic_distr.argsort()[-10:][::-1]
    for word_idx in top_words_idx:
        print(sklearn_vocab[word_idx], end=', ')
    print('\n' + '-'*100)

Topic 0:
gulbis, tivo, santander, galloway, juggled, fed, galore, banks, bank, wednesday, 
----------------------------------------------------------------------------------------------------
Topic 1:
couture, deboer, brouwer, karlsson, understudy, melker, no-look, up-close, face-offs, first-period, 
----------------------------------------------------------------------------------------------------
Topic 2:
temer, lia, fold, souza, juc, colton, renan, implicating, henrique, devouring, 
----------------------------------------------------------------------------------------------------
Topic 3:
espaillat, city, political, black, rangel, dominican, district, bronx, harlem, trump, 
----------------------------------------------------------------------------------------------------
Topic 4:
com, nytimes, briefing, mexico, weekend, posted, eastern, want, evening, morning, 
----------------------------------------------------------------------------------------------------
Topic 5:
broadway

company, bank, financial, billion, investors, money, business, million, banks, investment, 
----------------------------------------------------------------------------------------------------
Topic 43:
china, chinese, beijing, taiwan, islands, trade, mainland, asia, tsai, economic, 
----------------------------------------------------------------------------------------------------
Topic 44:
game, team, players, games, season, league, win, second, points, play, 
----------------------------------------------------------------------------------------------------
Topic 45:
seaport, creeping, fragrances, kettering, genders, indigo, noodle, solstice, skylights, enamel, 
----------------------------------------------------------------------------------------------------
Topic 46:
university, students, college, york, school, father, student, graduated, mother, met, 
----------------------------------------------------------------------------------------------------
Topic 47:
australia, coal

vaughn, leguizamo, outcast, bloodline, womb, mendelsohn, pepe, ocala, cardoso, racists, 
----------------------------------------------------------------------------------------------------
Topic 85:
north, south, nuclear, korea, kim, korean, missile, weapons, party, congress, 
----------------------------------------------------------------------------------------------------
Topic 86:
malfunction, lilies, florrick, daisies, party, pomegranates, prime, election, parliament, corbyn, 
----------------------------------------------------------------------------------------------------
Topic 87:
oil, water, energy, climate, gas, power, wine, change, plants, canada, 
----------------------------------------------------------------------------------------------------
Topic 88:
norton, nordic, frost, halibut, weigh-in, cmdr, mamet, wentworth, norse, bibi, 
----------------------------------------------------------------------------------------------------
Topic 89:
williams, open, tennis, mu

### Найдем точно так же похожие документы

Наша заметка:

In [28]:
text

'Democrats harnessed voter fury toward President Trump to win control of the House and capture pivotal governorships Tuesday night as liberals and moderates banded together to deliver a forceful rebuke of Mr. Trump, even as Republicans added to their Senate majority by claiming a handful of conservative-leaning seats. \nThe two parties each had some big successes in the states. \nRepublican governors were elected in Ohio and Florida, two important battlegrounds in Mr. Trump’s 2020 campaign calculations. \nDemocrats beat Gov. Scott Walker, the Wisconsin Republican and a top target, and captured the governor’s office in Michigan — two states that Mr. Trump carried in 2016 and where the left was looking to rebound.'

In [29]:
from sklearn.metrics.pairwise import euclidean_distances
 
def most_similar(x, Z, top_n=5):
    """Return the ``top_n`` rows of ``Z`` closest to ``x`` by Euclidean distance.

    ``x`` is reshaped into a single row before the comparison. The result is a
    list of ``(row_index, distance)`` pairs ordered from the smallest distance
    to the largest.
    """
    query = x.reshape(1, -1)
    distances = euclidean_distances(query, Z)[0]
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1])
    return ranked[:top_n]

In [30]:
x = lda_model_sklearn.transform(vectorizer.transform([text]))

In [31]:
# Find the documents nearest to the query in topic space and show the
# beginning of each one.
similar = most_similar(x, lda_decomposition)
# NOTE: despite its name, `similarity` holds a Euclidean distance here
# (most_similar sorts ascending, so a smaller value means a closer document).
for document_id, similarity in similar[:5]:
    print(data[document_id][:1000])  # first 1000 characters of the article
    print('---------------')

Republican elected officials, donors and strategists grappled uncomfortably on Wednesday with the inevitability of Donald J. Trump as their presidential nominee, an unexpectedly sudden denouement that left many in a state of political paralysis and others vowing to oppose the party’s new standard-bearer.While some called for unity, many Republican leaders refrained from falling in line behind Mr. Trump, with dozens avoiding inquiries about where they stood or saying they wanted Mr. Trump to detail his policies or tone down his language first. Others tied themselves in knots as they praised and criticized Mr. Trump in a single breath, or suggested that they could abide Mr. Trump but loathed his agenda.Senator Kelly Ayotte of New Hampshire, who is in a tough re-election race, signaled that she would “support” Mr. Trump but not “endorse” him, as a spokeswoman put it, a rhetorical contortion that other Republicans repeated privately. Representative Raúl R. Labrador of Idaho, a staunch cons

# И наконец попробуем имплементацию LDA в любимом Vowpal Wabbit

#### Сначала надо подготовить данные для vw
он ожидает данные без меток и без namespace'ов в формате

```| word_id:word_cnt word_id:word_cnt word_id:word_cnt word_id:word_cnt ... word_id:word_cnt ```

```| word_id:word_cnt word_id:word_cnt word_id:word_cnt word_id:word_cnt ... word_id:word_cnt ```

```| word_id:word_cnt word_id:word_cnt word_id:word_cnt word_id:word_cnt ... word_id:word_cnt ```

Воспользуемся уже подготовленным корпусом corpus

In [96]:
# Dump the bag-of-words corpus in VW input format, one document per line:
# "| id:count id:count ... " (each pair is followed by a single space).
with open("data/nytimes.vw_input", "w") as f_out:
    for doc in corpus:
        features = "".join("{}:{} ".format(idx, cnt) for idx, cnt in doc)
        f_out.write("| " + features + "\n")

In [107]:
!wc -l data/nytimes.vw_input

    8883 data/nytimes.vw_input


## Запустим обучение

Параметры:
- -d data/nytimes.vw_input - путь к подготовленным данным
- --lda 50 - количество тем
- --lda_alpha 0.01 - параметр распределения Дирихле для матрицы "документы-темы"
- --lda_rho 0.01 - параметр распределения Дирихле для матрицы "темы-слова"
- -c - кэширование, так как требуется выполнить несколько проходов по выборке
- -k - не использовать старый кэш-файл, а создать его заново
- --passes 10 - количество проходов по выборке (эпох)
- --initial_t 1 - начальное значение счётчика t в расписании убывания шага обучения
- --readable_model data/topics.dat - сохранение модели в человекочитаемом текстовом формате

In [112]:
!vw -d data/nytimes.vw_input --lda 50 --lda_alpha 0.01 --lda_rho 0.01 -c -k --passes 10 --initial_t 1 --readable_model data/topics.dat

Num weight bits = 18
learning rate = 0.5
initial_t = 1
power_t = 0.5
decay_learning_rate = 1
creating cache_file = data/nytimes.vw_input.cache
Reading datafile = data/nytimes.vw_input
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
14.477282 14.477282            1            1.0     none        0      226
14.798454 15.119626            2            2.0     none        0      180
15.233010 15.667566            4            4.0     none        0      205
15.854410 16.475811            8            8.0     none        0      304
16.693473 17.532535           16           16.0     none        0      404
16.359606 16.025740           32           32.0     none        0       99
15.482548 14.605489           64           64.0     none        0      180
14.343494 13.204440          128          128.0     none        0      215
13.413153 12.482813          256          256.0     no

### Посмотрим на интерпретируемость тем

In [113]:
def top_indices(x, n):
    """Return the indices of the ``n`` largest values in ``x``, largest first.

    Args:
        x: Indexable sequence of mutually comparable values (e.g. a list of
            floats).
        n: Number of indices to return (expected to be non-negative).

    Returns:
        A list of distinct indices into ``x`` ordered by descending value,
        containing ``n`` entries or fewer if ``x`` is shorter. Equal values
        are ordered by ascending index.
    """
    # Rank positions rather than values: looking each value up again with
    # ``x.index`` returned the same (first) position for every duplicate
    # value and cost an extra linear scan per result.
    order = sorted(range(len(x)), key=x.__getitem__, reverse=True)
    return order[:n]

In [132]:
num_topics = 50
num_top_words = 20

# word_topic_distr[t] accumulates, line by line, the value found in column t
# of the readable model file.
word_topic_distr = [[] for _ in range(num_topics)]
with open("data/topics.dat") as f:
    for i, line in enumerate(f):
        # The first 10 lines are not data rows.
        # NOTE(review): presumably the readable-model header — verify the
        # header length against the VW version in use.
        if i >= 10:
            # Drop the leading field (presumably the word id) and keep the
            # per-topic values.
            w = line.strip().split(' ')[1:]
            for t, topic_column in enumerate(word_topic_distr):
                topic_column.append(float(w[t]))

In [133]:
# NOTE(review): topics_words is initialised but never filled in this cell —
# confirm whether the top words were meant to be collected here.
topics_words = []
for t in range(num_topics):
    print('Topic-'+str(t+1))
    # Look words up with dictionary[i] rather than dictionary.id2token[i]:
    # gensim builds id2token lazily, so the raw mapping may still be empty
    # (KeyError) unless something has already indexed the dictionary.
    top_words = [dictionary[i] for i in top_indices(word_topic_distr[t], num_top_words)]
    for word in top_words:
        print(word,  end=', ')
    print('\n' + '-'*100 + '\n')

Topic-1
companies, billion, financial, million, firm, industry, revenue, investors, capital, fund, tax, costs, investment, shares, average, management, debt, venture, earnings, income, 
----------------------------------------------------------------------------------------------------

Topic-2
shot, age, movie, shooting, shots, open, broad, song, completed, champion, connection, closing, items, receiving, harvard, managers, alexander, label, jordan, achieved, 
----------------------------------------------------------------------------------------------------

Topic-3
race, runs, restaurant, innings, alex, sixth, expectations, veteran, birth, stuff, pitched, dining, zika, plate, menu, veterans, falls, truly, sweep, virus, 
----------------------------------------------------------------------------------------------------

Topic-4
percent, oil, brooklyn, quarter, energy, prices, nation, housing, results, homes, cents, data, markets, sale, survey, heavily, low, points, compared, buyers