In [128]:
import nltk; nltk.download('stopwords')
!python3 -m spacy download en
import re
import numpy as np
import pandas as pd
from pprint import pprint
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
import spacy
import pyLDAvis
import pyLDAvis.gensim  
import matplotlib.pyplot as plt
%matplotlib inline
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf_vectorize = TfidfVectorizer(use_idf=True)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [3]:
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
print(df.target_names.unique())
df.head()

['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'
 'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'
 'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'
 'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'
 'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'
 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']


Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


In [4]:
data = df.content.values.tolist()
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]
data = [re.sub('\s+', ' ', sent) for sent in data]
data = [re.sub("\'", "", sent) for sent in data]
pprint(data[:1])

['From: (wheres my thing) Subject: WHAT car is this!? Nntp-Posting-Host: '
 'rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: '
 '15 I was wondering if anyone out there could enlighten me on this car I saw '
 'the other day. It was a 2-door sports car, looked to be from the late 60s/ '
 'early 70s. It was called a Bricklin. The doors were really small. In '
 'addition, the front bumper was separate from the rest of the body. This is '
 'all I know. If anyone can tellme a model name, engine specs, years of '
 'production, where this car is made, history, or whatever info you have on '
 'this funky looking car, please e-mail. Thanks, - IL ---- brought to you by '
 'your neighborhood Lerxst ---- ']


In [5]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
data_words = list(sent_to_words(data))
print(data_words[:1])

[['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp', 'posting', 'host', 'rac', 'wam', 'umd', 'edu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']]


In [6]:
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
print(trigram_mod[bigram_mod[data_words[0]]])



['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp_posting_host', 'rac_wam_umd_edu', 'organization', 'university', 'of', 'maryland_college_park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front_bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']


In [7]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [8]:
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)
nlp = spacy.load('en', disable=['parser', 'ner'])
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
print(data_lemmatized[:1])

[['where', 'thing', 'car', 'nntp_poste', 'host', 'park', 'line', 'wonder', 'could', 'enlighten', 'car', 'see', 'day', 'door', 'sport', 'car', 'look', 'late', 'early', 'call', 'bricklin', 'door', 'really', 'small', 'addition', 'separate', 'rest', 'body', 'know', 'tellme', 'model', 'name', 'engine', 'year', 'production', 'car', 'make', 'history', 'info', 'funky', 'look', 'car', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst']]


In [9]:
id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 5), (6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 2), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1)]]


In [10]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('addition', 1),
  ('body', 1),
  ('bricklin', 1),
  ('bring', 1),
  ('call', 1),
  ('car', 5),
  ('could', 1),
  ('day', 1),
  ('door', 2),
  ('early', 1),
  ('engine', 1),
  ('enlighten', 1),
  ('funky', 1),
  ('history', 1),
  ('host', 1),
  ('info', 1),
  ('know', 1),
  ('late', 1),
  ('lerxst', 1),
  ('line', 1),
  ('look', 2),
  ('mail', 1),
  ('make', 1),
  ('model', 1),
  ('name', 1),
  ('neighborhood', 1),
  ('nntp_poste', 1),
  ('park', 1),
  ('production', 1),
  ('really', 1),
  ('rest', 1),
  ('see', 1),
  ('separate', 1),
  ('small', 1),
  ('sport', 1),
  ('tellme', 1),
  ('thank', 1),
  ('thing', 1),
  ('where', 1),
  ('wonder', 1),
  ('year', 1)]]

In [11]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [12]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.051*"report" + 0.027*"black" + 0.020*"fire" + 0.020*"white" + '
  '0.016*"trial" + 0.016*"cover" + 0.015*"medium" + 0.013*"vote" + '
  '0.012*"minor" + 0.012*"title"'),
 (1,
  '0.021*"god" + 0.020*"accept" + 0.016*"member" + 0.015*"man" + '
  '0.014*"israeli" + 0.014*"season" + 0.012*"publish" + 0.012*"lebanese" + '
  '0.012*"jewish" + 0.011*"brain"'),
 (2,
  '0.017*"package" + 0.016*"press" + 0.015*"item" + 0.015*"break" + '
  '0.011*"level" + 0.010*"edge" + 0.009*"hole" + 0.007*"eye" + '
  '0.007*"equipment" + 0.007*"contribute"'),
 (3,
  '0.025*"pc" + 0.022*"contain" + 0.020*"input" + 0.020*"reality" + '
  '0.017*"picture" + 0.016*"object" + 0.016*"level" + 0.015*"box" + '
  '0.015*"quality" + 0.013*"greek"'),
 (4,
  '0.089*"ax" + 0.076*"max" + 0.032*"space" + 0.021*"launch" + 0.018*"di_di" + '
  '0.017*"orbit" + 0.016*"sphere" + 0.015*"satellite" + 0.014*"plane" + '
  '0.014*"mission"'),
 (5,
  '0.019*"people" + 0.017*"kill" + 0.015*"child" + 0.015*"government" + '
  '0.0

In [13]:
print('\nPerplexity: ', lda_model.log_perplexity(corpus))


Perplexity:  -8.348722848762439


In [14]:
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.4392813747423439


In [None]:
!wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
!unzip mallet-2.0.8.zip

In [None]:
mallet_path = '/content/mallet-2.0.8/bin/mallet'
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)
pprint(ldamallet.show_topics(formatted=False))
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()

In [17]:
print('\nCoherence Score: ', coherence_ldamallet)


Coherence Score:  0.5364273910935377


In [18]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

In [19]:
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=2, limit=40, step=10)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [39]:
limit=40; start=2; step=10;
all_topics = range(start, limit, step)
max = 0
optimal = 0
for i in range(len(all_topics)):
  if coherence_values[i]>max:
    max = coherence_values[i]
    optimal = all_topics[i]
optimal

32

In [41]:
model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=optimal, id2word=id2word)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [94]:
def main_topic(sent):  
  topic_num = {}
  for i in model.show_topics(formatted=False, num_topics=optimal):
    for word in sent:
      if word in dict(i[1]).keys():
        if i[0] in topic_num:
          topic_num[i[0]] += dict(i[1])[word]
        else:
          topic_num[i[0]] = dict(i[1])[word]
  top = Counter(topic_num).most_common(1)[0][0]
  return top

In [98]:
main_topics = {}
for text in texts:
  try: 
    main_topics[tuple(text)] = main_topic(text)
  except IndexError:
    main_topics[tuple(text)] = ''

In [115]:
df1 = pd.DataFrame()
df1['text'] = list(main_topics.keys())
df1['topic'] = list(main_topics.values())
df1

Unnamed: 0,text,topic
0,"(where, thing, car, nntp_poste, host, park, line, wonder, could, enlighten, car, see, day, door, sport, car, look, late, early, call, bricklin, door, really, small, addition, separate, rest, body, know, tellme, model, name, engine, year, production, car, make, history, info, funky, look, car, mail, thank, bring, neighborhood, lerxst)",16
1,"(poll, final, call, summary, final, call, clock, report, keyword, acceleration, line, host, fair, number, brave, soul, upgrade, clock, oscillator, share, experience, poll, send, brief, message, detailing, experience, procedure, top, speed, attain, cpu, rate, speed, add, card, hour, usage, day, functionality, floppy, especially, request, summarize, next, day, add, network, knowledge, base, do, clock, upgrade, answer, poll, thank)",16
2,"(engineering, computer, network, distribution_usa, line, well, folk, finally, give, ghost, weekend, start, life, way, back, be, market, new, machine, bit, sooner, intend, be, look, pick, powerbook, maybe, bunch, question, hopefully, answer, know, dirt, powerbook, introduction, expect, would, hear, suppose, make, appearence, summer, hear, anymore, since, access, macleak, wonder, info, hear, rumor, price, drop, line, one, duos, go, recently, s, impression, display, could, probably, swing, get, disk, rather, really, feel, much, well, display, yea, look, great, store, really, good, could, solicit, opinion, people, day, worth, take, disk, size, money, hit, get, active, display, realize, real, subjective, question, have, play, machine, computer, ...)",16
3,"(division, line, host, write, write, article, know, chip, far, stuff, go, look, pretty, nice, get, quadrilateral, fill, command, require, point, weitek, address, phone, number, would, information, chip, division, thing, really, scare, person, sense, humor, winter)",9
4,"(question, distribution, article, write, clear, caution, warn, memory, verify, unexpected, error, wonder, expect, error, may, sorry, really, dumb, question, parity_error, memory, previously, know, condition, waivere, error, already, know, would, curious, real, meaning, expect, error, basically, know, bug, warn, system, software, thing, check, right, value, yet, set, launch, suchlike, rather, fix, code, possibly, introduce, new, bug, tell, crew, see, warn, ignore)",9
...,...,...
11287,"(scan, city, reply, line, consultation, cheap, also, well, neurologist, make, differential, diagnosis, migraine, tension, headache, cluster, benign, intracranial, hypertension, chronic, syndrome, appear, normal, scan, neurologist, also, recommend, course, treatment, appropriate, diagnosis, also, many, people, convince, brain, tumor, dn, serious, pathology, may, cheap, come, week, dn, easy, take, time, reassure, patient, personally, think, ever, justify, sigh, may, never, justifiable, sometimes, even, try, show, thoroughness, detailed, history, neurologic, examination, discussion, diagnosis, salt, lot, reassurance, patient, still, ask, can, order, scan, absolutely, sure, often, get, conversation, ignore, aunt, millie, headache, year, die, brain, tumor, aneurysm, get, away, ever, order, imaging, patient, obviously, benign, ...)",16
11288,"(screen, medford, old, problem, screen, blank, sometimes, minor, physical, jolt, insert, floppy, internal, drive, sometimes, computer, leave, go, blank, replace, wire, connect, logic, board, board, seem, first, jiggle, wire, make, screen, come, back, worked, blanking, return, nee, new, new, new, computer, thank, advice)",20
11289,"(este, mount, mail, group, line, instal, try, mount, cool, chip, hour, weight, cool, enough, dislodge, end, bend, pin, cpu, luckily, power, yet, end, press, cpu, deeply, socket, put, cpu, cooler, back, far, good, other, problem, ensure, weight, fan, heatsink, eventually, work, cpu, socket, mount, motherboard, vertical, case, este, internet)",16
11290,"(line, article, write, boy, embarasse, trivial, faq, give, point, find, center, point, know, circle, point, immediately, see, straightforward, way, check, geometry, book, farin, still, loss, mercy, provide, solution, would, require, space, point, specifie, sphere, far, see, prove, point, exist, space, distant, point, may, necessarily, happen, correct, be, wrong, quite, possibly, email, visit, computer, graphic, researcher)",9


In [158]:
d = df1.groupby(['topic'])['text']

In [185]:
all_texts_grouped = []
for topic in set(df1['topic']):
  for i in d.get_group(topic).to_list():
    sent_str = ''
    for j in i:
      one = []
      sent_str += j + ' '
      one.append(sent_str)
    all_texts_grouped.append(one)

In [187]:
all_main_words = []
for g in all_grouped:
  tfIdfVectorizer=TfidfVectorizer(use_idf=True)
  tfIdf = tfIdfVectorizer.fit_transform(g)
  df = pd.DataFrame(tfIdf[0].T.todense(), index=tfIdfVectorizer.get_feature_names(), columns=["TF-IDF"])
  df = df.sort_values('TF-IDF', ascending=False)
  all_main_words.append(list(df.index[:5]))

In [192]:
df_tfidf = pd.DataFrame()
df_tfidf['main words'] = all_main_words
df_tfidf['text'] = all_texts_grouped
df_tfidf['topic'] = list(df1['topic'])
df_tfidf

Unnamed: 0,main words,text,topic
0,"[ice, afew, come, fish, know]",[night ice meaning symbolism use throw fish ice spokane afew never know come ],16
1,"[aftersleep, look]",[aftersleep look ],16
2,[fool],[fool ],16
3,[jumper],[jumper ],9
4,"[failure, iisi, iron, owner, report]",[report failure iisi owner slow iron ],9
...,...,...,...
11287,"[chigger, thing, good, get, lot]",[jigger originator host line may world greatest expert chigger type indigenous south certainly spend lot time contemplate little bugger year move observation gain painful experience reaction chigger vary greatly person person people get tiny red bite other sensitive get fairly large swollen sore affair bite gift keep give swear thing itch month lot folklore chigger think fiction try research critter effect book could find single book uncs special collection library yet go require get base experience family member old folk remedy fingernail simply work recall_reade theory base chigger burrow skin continue party false think likely reaction toxin sort little pest release speculation good approach prevention couple thing work well good insect repellent wood liberally apply waistband good start preparation call away sulfur kind cream cortisone originally prepare army commercially_available summer put ankle morning get weekend literally can go outside live country serious consequence apparently like sulfur much sulfur dust body clothing amount prevention completely successful forget fingernail polish finally settle treatment involve topical application combination cortisone reduce inflamation swell relieve will tell thing have try tell thing wife count minor surgery best mention also think gain swell itching also significantly relieve application hot pack seem speed recovery well doctor seem care much chigger urban suburban doctor apparently encounter much rural doctor seem regard force nature must endure suspect could come good treatment chigger would make lot money principal system development ],16
11288,"[thing, people, know, believe, be]",[line would share thought topic arrogance response encounter christian find dismay belief faith total truth accord belief come word truth thus know truth stance make difficult discuss faith hesitation see way truth see faith arise willful choice believe particular way choice part faith part reason seem choice discussion remind schoolyard discussion grade school kid would say policeman would ask know tell know be daddy right say s always argument usually stop right end kid grope declare belief false third time browse be cover tired old ground discussion topic pique interest welcome comment drawer be sort mystify may respond understand criticism say s enough evidence believe s good evidence religion agree clearly plenty intelligent people find evidence convincing seem point rather seem upset people believe also believe thing contradict false suggest model spiritual thing s rather different sound existentialist view people choose value follow there s actual independent spiritual reality way say specific choice unique sense right sort model modification sort may appropriate religion christianity essense historical religion base concept actual spiritual entity intervene history specific way see evidence history mundane world free choose thing work drop fall aside well define situation christian concept spiritual matter also actual external reality hope honest enough claim perfect understanding may think know confident know thing imply think thing contradict false see else could proceed need result arrogance be certainly interested talk people religion may thing teach even respect fellow get possible respect people also think matter wrong maybe even disasterously wrong clh ],20
11289,"[people, gay, even, personal, society]",[many homosexual state free net line nntp_posting tired debate many gay argument basically worthless imho would really matter million people regularly deny access housing employment personal security even death threat happen know personal experience gay people far likely receive base political veiws even personal philosophy relate issue ual week go personally friend physically verbally harrasse even appear gay garaunteed certain unalienable right current form government theory yet day gay people victimize local government police force part uninforme ignorant public democracy think sense judge basis treatment people make society people include gay lesbian bisexual crime victim vary diverse society wich part arab bassoonist unite ],16
11290,"[would, make, gainey, hockey, claim]",[stat organization sudbury line write gilmour take completely surprise gainey would say play technically smart hockey case claim gainey never make technical mistake absolutely ludicrous later post make reference put word people mouth would suggest last paragraph interpret way namely claim gainey never make technical mistake actually read have write find make claim soooo logic serve be contradict nonsense quite clearly state greg make claim gainey never make error make claim read hockey message would nonsense delete gainey good ever stand assessment good player belong ever watch play never make technical error nonsense delete good would display ignorance course be sure think ignorance really function lack knowledge formulating opinion need take cheap shot mean make feel better knowledgeable observer game mean obvious hockey education responsibility word vehemence poor old bother much effective player style play player laud find bothersome hate hate realize likely aesthetically pleasing player ever skate lifetime would talk gainey go around would rather check matter would take checking centre think could cover be really sorry roger lose completely ask would rather net play hockey high level good would bother bring talk hockey player can follow conversation follow say previously responsibility educate compare say example would balanced comparison sure journeyman big deal worth discuss be wrong hmmm let see wrong would take fuhr sanderson first place be guess rog feeling have setup be wrong macro key machine excellent idea decide waste time respond greg posting sure implement would suggest comment press run thing say star dynasty start hype demonstrate blanket disregard individual contributor game settle claim hockey god claim gainey eat thread know respond blanket disregard individual remember leaf team purely populate individual win team run around tell good hockey player world congenially always may consider develop style imitation sincerest form quite sure flattery intention cmon nice ring admit good laugh right get end posting realize complete joke future go respond posting would appreciate could present cogent argument support fact glean version reality rest would recognize cordially always ],9


## coherence score 

Показатели когерентности темы оценивают степень семантического сходства между словами с высокими показателями в теме. 
Для вычисления используются две метрики: внутрення UMass и внешняя UCI. Этот показатель можно сравнить с суммой ребер на полном графе.
Внутрення метрика сравнивает как соотносится слово с соседними, а внешняя - каждое слово попарно с каждым. Для рассчета используется логарифмическая формула.
*Источники*: 
https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0  
http://qpleple.com/topic-coherence-to-evaluate-topic-models/