In [275]:
import re
import numpy as np
import pandas as pd
import os
import gensim
import gensim.corpora as corpora
import spacy
import pyLDAvis
import pyLDAvis.gensim
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint
from nltk.corpus import stopwords

In [262]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [265]:
df_raw = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
df_raw.head()

Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


In [266]:
# Convert to list
data = df_raw.content.values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

pprint(data[:1])

['From: (wheres my thing) Subject: WHAT car is this!? Nntp-Posting-Host: '
 'rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: '
 '15 I was wondering if anyone out there could enlighten me on this car I saw '
 'the other day. It was a 2-door sports car, looked to be from the late 60s/ '
 'early 70s. It was called a Bricklin. The doors were really small. In '
 'addition, the front bumper was separate from the rest of the body. This is '
 'all I know. If anyone can tellme a model name, engine specs, years of '
 'production, where this car is made, history, or whatever info you have on '
 'this funky looking car, please e-mail. Thanks, - IL ---- brought to you by '
 'your neighborhood Lerxst ---- ']


In [5]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])

[['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp', 'posting', 'host', 'rac', 'wam', 'umd', 'edu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']]


In [6]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp_posting_host', 'rac_wam_umd_edu', 'organization', 'university', 'of', 'maryland_college_park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front_bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']


In [7]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [8]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['where', 'thing', 'car', 'nntp_poste', 'host', 'park', 'line', 'wonder', 'could', 'enlighten', 'car', 'see', 'day', 'door', 'sport', 'car', 'look', 'late', 'early', 'call', 'bricklin', 'door', 'really', 'small', 'addition', 'separate', 'rest', 'body', 'know', 'tellme', 'model', 'name', 'engine', 'year', 'production', 'car', 'make', 'history', 'info', 'funky', 'look', 'car', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst']]


In [9]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 5), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 2), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1)]]


In [10]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('addition', 1),
  ('body', 1),
  ('bricklin', 1),
  ('bring', 1),
  ('call', 1),
  ('car', 5),
  ('could', 1),
  ('day', 1),
  ('door', 2),
  ('early', 1),
  ('engine', 1),
  ('enlighten', 1),
  ('funky', 1),
  ('history', 1),
  ('host', 1),
  ('info', 1),
  ('know', 1),
  ('late', 1),
  ('lerxst', 1),
  ('line', 1),
  ('look', 2),
  ('mail', 1),
  ('make', 1),
  ('model', 1),
  ('name', 1),
  ('neighborhood', 1),
  ('nntp_poste', 1),
  ('park', 1),
  ('production', 1),
  ('really', 1),
  ('rest', 1),
  ('see', 1),
  ('separate', 1),
  ('small', 1),
  ('sport', 1),
  ('tellme', 1),
  ('thank', 1),
  ('thing', 1),
  ('where', 1),
  ('wonder', 1),
  ('year', 1)]]

### Прогон на базовом дженсиме:

In [11]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [12]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.097*"black" + 0.073*"wing" + 0.072*"white" + 0.048*"vote" + '
  '0.041*"flight" + 0.028*"devil" + 0.027*"capacity" + 0.024*"trace" + '
  '0.023*"assist" + 0.022*"penalty"'),
 (1,
  '0.064*"man" + 0.040*"god" + 0.039*"accept" + 0.034*"explain" + '
  '0.030*"member" + 0.029*"age" + 0.027*"israeli" + 0.027*"season" + '
  '0.024*"publish" + 0.021*"serious"'),
 (2,
  '0.130*"board" + 0.054*"expansion" + 0.032*"looking" + 0.016*"stuff_delete" '
  '+ 0.011*"rod" + 0.000*"reluctant" + 0.000*"wire" + 0.000*"connect" + '
  '0.000*"worked" + 0.000*"blanking"'),
 (3,
  '0.060*"reality" + 0.053*"picture" + 0.050*"object" + 0.042*"greek" + '
  '0.038*"contain" + 0.036*"generate" + 0.034*"interface" + 0.030*"font" + '
  '0.029*"concept" + 0.026*"workstation"'),
 (4,
  '0.215*"ax" + 0.182*"max" + 0.040*"orbit" + 0.035*"satellite" + '
  '0.034*"mission" + 0.026*"launch" + 0.022*"moon" + 0.020*"shuttle" + '
  '0.018*"spacecraft" + 0.017*"lunar"'),
 (5,
  '0.032*"kill" + 0.024*"child" + 0.023*"

### Прогон через маллет:

In [212]:
os.environ.update({'MALLET_HOME':r'C:/Users/User/Desktop/универ/автобрея/mallet-2.0.8/'})
mallet_path = r'C:\Users\User\Desktop\универ\автобрея\mallet-2.0.8\bin\mallet'

In [213]:
def do_mallet(mallet_path, corpus, num_topics, id2word, data_lemmatized, output):
    ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
    coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    coherence_ldamallet = coherence_model_ldamallet.get_coherence()
    if output == True:
        pprint(ldamallet.show_topics(formatted=True, num_topics=20))
        print('\nCoherence Score: ', coherence_ldamallet)
    return coherence_ldamallet

#### Пробный прогон на 20-ти топиках:

In [23]:
do_mallet(mallet_path, corpus, 20, id2word, data_lemmatized, output=True)

[(0,
  '0.021*"make" + 0.017*"work" + 0.016*"money" + 0.016*"year" + 0.013*"pay" + '
  '0.012*"people" + 0.012*"job" + 0.010*"government" + 0.008*"cost" + '
  '0.008*"time"'),
 (1,
  '0.014*"people" + 0.011*"man" + 0.011*"word" + 0.011*"christian" + '
  '0.011*"love" + 0.010*"church" + 0.010*"religion" + 0.010*"time" + '
  '0.009*"book" + 0.009*"make"'),
 (2,
  '0.014*"bike" + 0.010*"turn" + 0.010*"ride" + 0.010*"light" + 0.009*"ground" '
  '+ 0.008*"back" + 0.008*"wire" + 0.008*"power" + 0.008*"side" + '
  '0.008*"water"'),
 (3,
  '0.172*"ax" + 0.138*"max" + 0.057*"car" + 0.053*"line" + 0.031*"buy" + '
  '0.029*"price" + 0.027*"sell" + 0.024*"sale" + 0.015*"distribution_usa" + '
  '0.011*"interested"'),
 (4,
  '0.019*"people" + 0.018*"happen" + 0.017*"time" + 0.014*"leave" + '
  '0.013*"back" + 0.013*"start" + 0.012*"day" + 0.009*"live" + 0.009*"woman" + '
  '0.009*"home"'),
 (5,
  '0.034*"game" + 0.029*"team" + 0.029*"year" + 0.026*"play" + 0.020*"player" '
  '+ 0.018*"win" + 0.014*"

0.5449010381050419

### Вычисление оптимального числа топиков  
Функция создаёт модели в пределах задаваемого диапазона топиков, выбирает лучший Coherence score и возращает словарь всех результатов, а также отдельно оптимальное количество топиков и скор этого количества

In [214]:
def optimal_number(min_topics, max_topics):
    results = {}
    for number in range(min_topics, max_topics+1):
        score = do_mallet(mallet_path, corpus, number, id2word, data_lemmatized, output=False)
        results[number] = score
    best_number = max(results, key=results.get)
    best_score = max(results.values())
    return results, best_number, best_score    

In [215]:
# для диапазона 10-25 включительно:
output = optimal_number(10,25)

In [216]:
print('Все результаты:')
for o in output[0]:
    print(o, 'topics -- score', output[0][o])
print('Лучшее количество топиков:', output[1])
print('Скор этого количества:', output[2])

Все результаты:
10 topics -- score 0.5148061073978827
11 topics -- score 0.4992618599471466
12 topics -- score 0.5156948040184587
13 topics -- score 0.5282312848308736
14 topics -- score 0.5115733675113443
15 topics -- score 0.5316952226416727
16 topics -- score 0.5377685072500002
17 topics -- score 0.5414166468586689
18 topics -- score 0.5396985998753948
19 topics -- score 0.5418557983901511
20 topics -- score 0.5406957559477352
21 topics -- score 0.5472890517871054
22 topics -- score 0.5397009196797569
23 topics -- score 0.5297457794647642
24 topics -- score 0.5280045323061477
25 topics -- score 0.53705585712109
Лучшее количество топиков: 21
Скор этого количества: 0.5472890517871054


### Определение главного топика в текстах:

In [217]:
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=21, id2word=id2word)

In [218]:
# запуск на оптимальном числе топиков
all_topics = ldamallet.show_topics(formatted=False, num_topics=21)
pprint(all_topics)

[(0,
  [('power', 0.022116357055410655),
   ('ground', 0.012029594932666206),
   ('line', 0.011923138340341725),
   ('current', 0.010911800713259168),
   ('high', 0.01048597434396125),
   ('wire', 0.009581093309203172),
   ('water', 0.009128652791824133),
   ('light', 0.008889125459094054),
   ('unit', 0.008782668866769575),
   ('low', 0.008223771757066057)]),
 (1,
  [('bit', 0.029886417954612707),
   ('card', 0.023877267657569482),
   ('driver', 0.018482689549996587),
   ('line', 0.016843830378075707),
   ('file', 0.016547925249812214),
   ('program', 0.012906015978876926),
   ('memory', 0.012405253454123324),
   ('mode', 0.011312680672842738),
   ('speed', 0.011153347142239319),
   ('build', 0.0109940136116359)]),
 (2,
  [('file', 0.03245635148395697),
   ('image', 0.01964414656273722),
   ('include', 0.01834811427301846),
   ('information', 0.017255744200255503),
   ('list', 0.01457110588583807),
   ('software', 0.013812001259002796),
   ('mail', 0.013793486512006813),
   ('send', 0

In [219]:
def main_topic(all_topics, text_number):
    weights = {}
    freqs = {}
    for id, freq in corpus[text_number]: # беру уже готовые частотности
        freqs[id2word[id]] = freq
    for topic in all_topics: # для каждого из топиков
        weight = 0 # начинаем считать веса встретившихся слов конкретного топика
        for pair in topic[1]: # для каждой пары слово-коэф
            if pair[0] in freqs: # если слово есть в тексте
                weight += pair[1] * freqs[pair[0]]# коэф * кол-во этого слова в тексте
        weights[topic[0]] = weight # айди топика: все веса всех его слов
        main = [max(weights, key=weights.get), max(weights.values())]
    return main

In [220]:
pprint(main_topic(all_topics, 0)) # какой топик самый топовый в первом тексте

[19, 0.2555772146415421]


#### Создание первого датафрейма

In [339]:
def first_df():
    df = pd.DataFrame()
    for i in range(len(corpus)):
        main = main_topic(all_topics, i)
        words = ', '.join(list(map(lambda x: (x[0]), all_topics[main[0]][1])))
        items = {'№ текста': i,
                'Главный топик': main[0],
                'Слова топика': words,
                'Вес топика': main[1]}
        df = df.append(items, ignore_index=True)
    df = df.reindex(columns=['№ текста', 'Главный топик', 'Слова топика', 'Вес топика'])
    return df

In [400]:
df = first_df()
pd.set_option('display.max_colwidth', -1) 
df.head(10)

Unnamed: 0,№ текста,Главный топик,Слова топика,Вес топика
0,0.0,19.0,"car, good, buy, pay, make, write, cost, line, money, article",0.255577
1,1.0,17.0,"write, article, line, host, organization, nntp_poste, reply, hear, opinion, news",0.218068
2,2.0,17.0,"write, article, line, host, organization, nntp_poste, reply, hear, opinion, news",0.374354
3,3.0,17.0,"write, article, line, host, organization, nntp_poste, reply, hear, opinion, news",0.848174
4,4.0,17.0,"write, article, line, host, organization, nntp_poste, reply, hear, opinion, news",0.3971
5,5.0,17.0,"write, article, line, host, organization, nntp_poste, reply, hear, opinion, news",2.045345
6,6.0,17.0,"write, article, line, host, organization, nntp_poste, reply, hear, opinion, news",0.153616
7,7.0,17.0,"write, article, line, host, organization, nntp_poste, reply, hear, opinion, news",0.779262
8,8.0,17.0,"write, article, line, host, organization, nntp_poste, reply, hear, opinion, news",0.153616
9,9.0,17.0,"write, article, line, host, organization, nntp_poste, reply, hear, opinion, news",0.783723


### Расчёт tf / idf:

In [252]:
# собирает группы текстов (по их id) с одинаковым главным топиком
groups = []
for num in range(21):
    top = df.loc[df['Главный топик'] == num]
    groups.append(top['№ текста'].tolist())

In [298]:
joined = [] # список строк из лемм = весь корпус
for d in data_lemmatized:
    joined.append(' '.join(d))

In [380]:
def get_top_tf_idf_words(vector, feature_names, top_n):
    sorted_nzs = np.argsort(vector.data)[:-(top_n+1):-1]
    return feature_names[vector.indices[sorted_nzs]]

In [478]:
def count_tfidf(group): # для одной группы
    test_n = 0
    tfidf = {}
    group_joined = [] # тут список лем-х текстов одной группы
    for g in group: 
        group_joined.append(joined[int(g)])
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(group_joined)
    feature_names = np.array(vectorizer.get_feature_names())
    for i, (g, post) in enumerate(zip(group, group_joined)):
        response = vectorizer.transform([post])
        top_words = get_top_tf_idf_words(response, feature_names, 5)
        tfidf[g] = ', '.join(top_words)
    return tfidf # словарь айди: слова

In [481]:
new_column = {}
for group in groups:
    new_column.update(count_tfidf(group)) # добавить к общему результаты одной группы
# отсортировать и преврать в список для добавления в df:
new_c_list = list(map(lambda x: (x[1]), sorted(new_column.items(), key=lambda x: x[0])))

In [482]:
df['Топ-5 tf / idf'] = new_c_list

In [484]:
df.head(10)

Unnamed: 0,№ текста,Главный топик,Слова топика,Вес топика,Топ-5 tf / idf
0,0.0,19.0,"car, good, buy, pay, make, write, cost, line, money, article",0.255577,"car, door, lerxst, bricklin, where"
1,1.0,17.0,"write, article, line, host, organization, nntp_poste, reply, hear, opinion, news",0.218068,"poll, clock, upgrade, final, detailing"
2,2.0,17.0,"write, article, line, host, organization, nntp_poste, reply, hear, opinion, news",0.374354,"powerbook, display, machine, bunch, store"
3,3.0,17.0,"write, article, line, host, organization, nntp_poste, reply, hear, opinion, news",0.848174,"division, quadrilateral, chip, weitek, winter"
4,4.0,17.0,"write, article, line, host, organization, nntp_poste, reply, hear, opinion, news",0.3971,"error, warn, bug, waivere, memory"
5,5.0,17.0,"write, article, line, host, organization, nntp_poste, reply, hear, opinion, news",2.045345,"weapon, needless, individual, tavare, keep"
6,6.0,17.0,"write, article, line, host, organization, nntp_poste, reply, hear, opinion, news",0.153616,"treatment, astrocytoma, thank, tumor, accidentally"
7,7.0,17.0,"write, article, line, host, organization, nntp_poste, reply, hear, opinion, news",0.779262,"scsi, chip, range, esdi, bit"
8,8.0,17.0,"write, article, line, host, organization, nntp_poste, reply, hear, opinion, news",0.153616,"icon, win, wallpaper, bmp, appreciated"
9,9.0,17.0,"write, article, line, host, organization, nntp_poste, reply, hear, opinion, news",0.783723,"board, diskdoubler, autodoubler, sigma_design, licensing"


Как видно на датафрейме, у очень большого количества текстов после подсчёта весов слов, входящих в топик, главным топиком оказался топик №17. Это объясняется тем, что в 17 топике сразу трём словам маллет определяет очень большой вес: article, write и line (см. ниже). Если хотя бы одно из этих слов встретится в тексте, то почти навверняка перевесит все остальные слова, т.к. им маллет приписал веса в разы меньшие. Если в тексте встретятся два тяжёлых слова, то у остальных топиков просто нет шансов.

In [486]:
pprint(all_topics[17]) # топик с тяжелыми словами

(17,
 [('write', 0.23300657431321906),
  ('article', 0.16409368396337168),
  ('line', 0.15361587227048604),
  ('host', 0.06445174923691008),
  ('organization', 0.059843860061047194),
  ('nntp_poste', 0.022247006339516318),
  ('reply', 0.020163183845973234),
  ('hear', 0.014175862878610002),
  ('opinion', 0.008247241136417),
  ('news', 0.00810049307349143)])


In [488]:
pprint(all_topics[15]) # пример другого топика, где тяжелых слов нет

(15,
 [('key', 0.04125268989515132),
  ('system', 0.015841765486928255),
  ('encryption', 0.01524655464493384),
  ('chip', 0.01307174579918502),
  ('public', 0.01124032782381759),
  ('government', 0.011194542374433404),
  ('bit', 0.011125864200357126),
  ('message', 0.011034293301588755),
  ('technology', 0.01091982967812829),
  ('security', 0.010690902431207362)])


### Что такое Coherence score:  
Измеряет относительную близость слов внутри одного топика. 0 - близости нет, 1 - максимальная близкость (т.е. это вообще одно и то же слово)  

Пайплайн, состоящий из:  
* Segmentation - разделение топиков  
* Probability Estimation - количественное вычисление вероятностей относительно reference corpus  
* Confirmation Measure - преобразование этих вероятностей   
* Aggregation - обобщение данных в одно финальное число   