In [1]:
import gensim
import pandas as pd

from gensim import corpora
from gensim import models

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [2]:
# Read the two input CSVs: the table of people and the per-page data.
table_csv = "../data/lgb_table_ids.csv"
pages_csv = "../data/lgb_pages_v2.csv"
lgb_table = pd.read_csv(table_csv)
lgb_pages = pd.read_csv(pages_csv)

In [7]:
# Remove exact duplicate page rows, then inner-join the table to the pages on pageid.
lgb_pages_nodups = lgb_pages.drop_duplicates()
lgb_merged = lgb_table.merge(lgb_pages_nodups, on='pageid', how='inner')

# Manual fixes to the Notes column: rewrite the 'G.' variant as 'G' and
# set the 'Kray twins' row(s) to 'B'.
notes_with_dot = lgb_merged['Notes'] == 'G.'
lgb_merged.loc[notes_with_dot, 'Notes'] = 'G'
is_kray_twins = lgb_merged['title'] == 'Kray twins'
lgb_merged.loc[is_kray_twins, 'Notes'] = 'B'

In [8]:
# Keep only the columns used downstream and drop rows that are identical across them.
keep_cols = ['Notes', 'title', 'pageid', 'url', 'summary', 'content']
lgb_merged_nodups = lgb_merged[keep_cols].drop_duplicates()

In [9]:
# Renumber rows 0..n-1 after de-duplication so that positional look-ups such as
# texts[0] work. Reassigning (instead of inplace=True) keeps the cell a plain
# "input -> output" step rather than a hidden mutation.
lgb_merged_nodups = lgb_merged_nodups.reset_index(drop=True)

## Preprocessing

In [11]:
# The raw document texts are the 'content' column of the de-duplicated frame.
texts = lgb_merged_nodups.loc[:, 'content']

In [12]:
# NLTK helpers: the English stop-word list and a Porter stemmer instance.
stops = stopwords.words(fileids='english')
pstemmer = PorterStemmer()

In [30]:
# Take the document labelled 0 as a scratch sample for trying out the filters.
test = texts.loc[0]

In [35]:
# Only the last expression of a cell is rendered: the punctuation-stripped text is
# computed but not shown, and the visible result is gensim's DEFAULT_FILTERS list.
preprocessing = gensim.parsing.preprocessing
preprocessing.strip_punctuation(test)
preprocessing.DEFAULT_FILTERS

[<function gensim.parsing.preprocessing.<lambda>>,
 <function gensim.parsing.preprocessing.strip_tags>,
 <function gensim.parsing.preprocessing.strip_punctuation>,
 <function gensim.parsing.preprocessing.strip_multiple_whitespaces>,
 <function gensim.parsing.preprocessing.strip_numeric>,
 <function gensim.parsing.preprocessing.remove_stopwords>,
 <function gensim.parsing.preprocessing.strip_short>,
 <function gensim.parsing.preprocessing.stem_text>]

In [13]:
# Run every document through gensim's default preprocessing filters.
texts_preprocessed = gensim.parsing.preprocessing.preprocess_documents(docs=texts)

In [76]:
# Build the token <-> integer id mapping from the preprocessed documents.
dictionary = corpora.Dictionary(documents=texts_preprocessed)

In [62]:
# Optional vocabulary pruning, left commented out, so this cell filters no tokens.
# dictionary.filter_extremes(no_below=5, no_above=1)

In [77]:
# Convert each preprocessed document to its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, texts_preprocessed))

In [78]:
# Vocabulary size: number of entries in the dictionary.
len(dictionary)

81055

In [79]:
# Fit the TF-IDF weighting on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus=corpus)

In [80]:
# Apply the fitted TF-IDF model to the bag-of-words corpus.
corpus_tfidf = tfidf[corpus]

## Models

In [18]:
# Two-topic LSI model on the TF-IDF-weighted corpus.
lsi = models.LsiModel(
    corpus=corpus_tfidf,
    num_topics=2,
    id2word=dictionary,
)

In [19]:
# Display both LSI topics with their top-weighted terms.
lsi.print_topics(num_topics=2)

[(0,
  '0.131*"film" + 0.111*"album" + 0.104*"music" + 0.104*"award" + 0.103*"elect" + 0.088*"theatr" + 0.082*"isbn" + 0.081*"plai" + 0.079*"book" + 0.076*"perform"'),
 (1,
  '-0.381*"elect" + -0.205*"democrat" + -0.165*"district" + 0.155*"album" + -0.150*"senat" + -0.129*"parliament" + -0.129*"vote" + -0.123*"repres" + -0.122*"legisl" + -0.118*"parti"')]

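The first topic is dominated by arts vocabulary (film, album, music, award), although "elect" also loads on it; the second is about politics, contrasting political terms (negative weights on "elect", "democrat", "senat") with music ("album").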

In [20]:
# Same LSI setup as before, but with five topics.
lsi2 = models.LsiModel(
    corpus=corpus_tfidf,
    num_topics=5,
    id2word=dictionary,
)

In [21]:
# Display all five LSI topics with their top-weighted terms.
lsi2.print_topics(num_topics=5)

[(0,
  '0.131*"film" + 0.111*"album" + 0.104*"music" + 0.104*"award" + 0.103*"elect" + 0.088*"theatr" + 0.082*"isbn" + 0.080*"plai" + 0.079*"book" + 0.076*"perform"'),
 (1,
  '-0.380*"elect" + -0.205*"democrat" + -0.165*"district" + 0.155*"album" + -0.151*"senat" + -0.130*"parliament" + -0.129*"vote" + -0.123*"repres" + -0.122*"legisl" + -0.118*"parti"'),
 (2,
  '-0.371*"album" + 0.188*"isbn" + -0.171*"song" + 0.162*"poetri" + 0.159*"novel" + -0.154*"record" + -0.148*"music" + -0.144*"singl" + 0.143*"book" + -0.142*"releas"'),
 (3,
  '0.404*"olymp" + 0.282*"championship" + 0.260*"team" + 0.211*"skate" + 0.207*"medal" + 0.183*"hockei" + 0.183*"cup" + 0.151*"game" + 0.147*"player" + 0.126*"athlet"'),
 (4,
  '0.275*"album" + -0.240*"film" + -0.214*"theatr" + -0.180*"broadwai" + 0.143*"poetri" + -0.141*"role" + -0.136*"actor" + -0.130*"episod" + -0.122*"star" + 0.121*"isbn"')]

In [86]:
# Peek at ten randomly chosen rows of the source table.
lgb_table.sample(n=10)

Unnamed: 0,Name,Title,Lifetime,Nationality,Notable as,Notes
719,Conrad Cummings,Conrad Cummings,b. 1948,American,Contemporary classical composer,G
2169,Phil Reed,Phil Reed,1949–2008,American,Politician,G
373,Joshua Boschee,Joshua Boschee,b. ?,American,Politician,G
967,Jackie Forster,Jackie Forster,1926–1998,English,"Actor, activist",L
319,Tobias Billström,Tobias Billström,b. 1973,Swedish,Politician,B
2025,Douglas Pearce,Douglas P.,p. 1956,English,Musician,G
2252,Sal Rosselli,Sal Rosselli,b. 1949,American,Labor leader,G
1467,Jennifer Knapp,Jennifer Knapp,b. 1974,American,Musician,L
2461,Justin Simien,Justin Simien,b. 1983,American,"Film director, screenwriter",G
1825,Carole Migden,Carole Migden,b. 1948,American,Politician,L


check out results

In [85]:
# Fit a 20-topic LDA model on the TF-IDF-weighted corpus (5 passes).
# LDA training is randomised; fixing random_state makes the topics reproducible
# across kernel restarts instead of changing on every run.
lda = models.ldamodel.LdaModel(
    corpus=corpus_tfidf,
    id2word=dictionary,
    num_topics=20,
    passes=5,
    random_state=42,
)

This is a bit slower to train...

In [58]:
# Display all 20 LDA topics with their top terms.
lda.print_topics(num_topics=20)

[(0,
  '0.001*"norwegian" + 0.001*"tam" + 0.001*"zealand" + 0.001*"tatchel" + 0.001*"auckland" + 0.000*"whittington" + 0.000*"schafer" + 0.000*"handbal" + 0.000*"willett" + 0.000*"labour"'),
 (1,
  '0.001*"oklahoma" + 0.000*"whishaw" + 0.000*"msp" + 0.000*"stasei" + 0.000*"wildeblood" + 0.000*"scagliotti" + 0.000*"stonei" + 0.000*"sexton" + 0.000*"sabin" + 0.000*"sahe"'),
 (2,
  '0.001*"norri" + 0.000*"microsoft" + 0.000*"wuorinen" + 0.000*"woollei" + 0.000*"rigg" + 0.000*"tenni" + 0.000*"savanna" + 0.000*"tewksburi" + 0.000*"schumach" + 0.000*"winterson"'),
 (3,
  '0.001*"webb" + 0.001*"rodriguez" + 0.001*"stipe" + 0.001*"warhol" + 0.001*"nba" + 0.001*"redgrav" + 0.000*"arvin" + 0.000*"preston" + 0.000*"fowler" + 0.000*"sappho"'),
 (4,
  '0.001*"der" + 0.001*"wolff" + 0.001*"und" + 0.001*"tovei" + 0.001*"ruggiero" + 0.001*"sherlock" + 0.000*"belfast" + 0.000*"tobia" + 0.000*"baldwin" + 0.000*"sade"'),
 (5,
  '0.001*"reed" + 0.001*"bartlett" + 0.001*"montana" + 0.001*"somervil" + 0.001

Lots of proper names (mostly surnames), all with near-zero weights, rather than recognisable themes... 20 topics is no good.

In [69]:
# Refit LDA on the TF-IDF-weighted corpus with only 5 topics (5 passes).
# random_state pins the random initialisation so the printed topics are
# reproducible from run to run.
lda = models.ldamodel.LdaModel(
    corpus=corpus_tfidf,
    id2word=dictionary,
    num_topics=5,
    passes=5,
    random_state=42,
)

In [70]:
# Display the five LDA topics with their top terms.
lda.print_topics(num_topics=5)

[(0,
  '0.003*"skate" + 0.002*"olymp" + 0.001*"championship" + 0.001*"rodriguez" + 0.001*"hockei" + 0.001*"tremblai" + 0.001*"skater" + 0.001*"handbal" + 0.001*"team" + 0.001*"cup"'),
 (1,
  '0.002*"film" + 0.002*"music" + 0.002*"album" + 0.001*"award" + 0.001*"isbn" + 0.001*"book" + 0.001*"novel" + 0.001*"song" + 0.001*"theatr" + 0.001*"perform"'),
 (2,
  '0.001*"riksdag" + 0.001*"shimizu" + 0.001*"pell" + 0.001*"prentic" + 0.000*"takei" + 0.000*"nilsen" + 0.000*"schumach" + 0.000*"beauchamp" + 0.000*"msp" + 0.000*"roper"'),
 (3,
  '0.001*"steadman" + 0.001*"schafer" + 0.001*"duse" + 0.000*"sheng" + 0.000*"rosenth" + 0.000*"ginal" + 0.000*"ferrandino" + 0.000*"moreno" + 0.000*"dominick" + 0.000*"ruggiero"'),
 (4,
  '0.004*"elect" + 0.002*"democrat" + 0.002*"district" + 0.002*"parti" + 0.001*"serv" + 0.001*"parliament" + 0.001*"vote" + 0.001*"senat" + 0.001*"law" + 0.001*"olymp"')]

In [24]:
# Show the topics of the current `lda` model using the default topic/word counts.
# NOTE(review): this cell's execution count (24) is lower than that of the training
# cell above (69), so the saved output probably belongs to an earlier fit — re-run to confirm.
lda.show_topics()

[(0,
  '0.000*"solmones" + 0.000*"sinha" + 0.000*"redner" + 0.000*"pillard" + 0.000*"tisei" + 0.000*"pureka" + 0.000*"sciortino" + 0.000*"rosendahl" + 0.000*"sherrin" + 0.000*"roffman"'),
 (1,
  '0.000*"shernoff" + 0.000*"schafer" + 0.000*"sinema" + 0.000*"willett" + 0.000*"steadman" + 0.000*"wallowitch" + 0.000*"sippl" + 0.000*"scavullo" + 0.000*"stolz" + 0.000*"roellig"'),
 (2,
  '0.000*"sandin" + 0.000*"tunnei" + 0.000*"sagat" + 0.000*"tiberiu" + 0.000*"salzgeb" + 0.000*"pintauro" + 0.000*"treaci" + 0.000*"rusch" + 0.000*"tinl" + 0.000*"rapino"'),
 (3,
  '0.000*"sabado" + 0.000*"studd" + 0.000*"thai" + 0.000*"tober" + 0.000*"secter" + 0.000*"redl" + 0.000*"pichler" + 0.000*"peltzer" + 0.000*"rantel" + 0.000*"westerwel"'),
 (4,
  '0.001*"film" + 0.001*"elect" + 0.001*"music" + 0.001*"album" + 0.001*"award" + 0.001*"isbn" + 0.001*"book" + 0.001*"gai" + 0.001*"plai" + 0.001*"novel"')]

In [81]:
# Fit a 5-topic LDA model on the raw bag-of-words counts (not TF-IDF), 5 passes.
# random_state pins the random initialisation so the topics are reproducible.
lda_counts = models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=5,
    passes=5,
    random_state=42,
)

In [82]:
# Show the count-based LDA topics; the arguments spell out show_topics' defaults.
lda_counts.show_topics(num_topics=10, num_words=10)

[(0,
  '0.007*"work" + 0.007*"new" + 0.006*"gai" + 0.006*"book" + 0.005*"time" + 0.005*"year" + 0.004*"univers" + 0.004*"publish" + 0.004*"isbn" + 0.003*"life"'),
 (1,
  '0.008*"elect" + 0.006*"state" + 0.005*"new" + 0.005*"gai" + 0.005*"year" + 0.004*"member" + 0.004*"school" + 0.004*"nation" + 0.004*"parti" + 0.004*"right"'),
 (2,
  '0.015*"film" + 0.010*"award" + 0.007*"new" + 0.007*"plai" + 0.006*"appear" + 0.006*"star" + 0.005*"seri" + 0.005*"work" + 0.005*"televis" + 0.005*"role"'),
 (3,
  '0.009*"work" + 0.006*"new" + 0.005*"year" + 0.005*"art" + 0.005*"york" + 0.004*"time" + 0.004*"life" + 0.003*"book" + 0.003*"publish" + 0.003*"isbn"'),
 (4,
  '0.011*"music" + 0.010*"album" + 0.009*"record" + 0.008*"song" + 0.007*"releas" + 0.006*"perform" + 0.005*"new" + 0.005*"year" + 0.004*"band" + 0.004*"work"')]

In [83]:
# Fit a 10-topic LDA model on the raw bag-of-words counts (5 passes).
# random_state pins the random initialisation so the topics are reproducible.
lda_counts2 = models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
    passes=5,
    random_state=42,
)

In [105]:
# Keep the topics as raw (word, probability) data instead of formatted strings.
t = lda_counts2.show_topics(
    formatted=False,
    num_words=10,
)

In [104]:
# Interactive look-up of the show_topics signature and argument meanings.
help(lda_counts2.show_topics)

Help on method show_topics in module gensim.models.ldamodel:

show_topics(num_topics=10, num_words=10, log=False, formatted=True) method of gensim.models.ldamodel.LdaModel instance
    Args:
        num_topics (int): show results for first `num_topics` topics.
            Unlike LSA, there is no natural ordering between the topics in LDA.
            The returned `num_topics <= self.num_topics` subset of all topics is
            therefore arbitrary and may change between two LDA training runs.
        num_words (int): include top `num_words` with highest probabilities in topic.
        log (bool): If True, log output in addition to returning it.
        formatted (bool): If True, format topics as strings, otherwise return them as
            `(word, probability)` 2-tuples.
    Returns:
        list: `num_words` most significant words for `num_topics` number of topics
        (10 words for top 10 topics, by default).



In [90]:
# Refit the count-based LDA model with 20 topics (5 passes); this rebinds `lda_counts`.
# random_state pins the random initialisation so the topics are reproducible.
lda_counts = models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=20,
    passes=5,
    random_state=42,
)

In [93]:
# Display all 20 topics (first positional argument of show_topics is num_topics).
lda_counts.show_topics(20)

[(0,
  '0.007*"wild" + 0.006*"work" + 0.005*"french" + 0.005*"pari" + 0.005*"year" + 0.004*"life" + 0.004*"time" + 0.003*"publish" + 0.003*"later" + 0.003*"franc"'),
 (1,
  '0.006*"work" + 0.004*"new" + 0.004*"bbc" + 0.004*"life" + 0.004*"present" + 0.004*"year" + 0.004*"time" + 0.003*"war" + 0.003*"radio" + 0.003*"later"'),
 (2,
  '0.021*"music" + 0.011*"tchaikovski" + 0.009*"work" + 0.008*"compos" + 0.007*"new" + 0.005*"bernstein" + 0.005*"perform" + 0.005*"symphoni" + 0.004*"russian" + 0.004*"time"'),
 (3,
  '0.014*"elect" + 0.009*"parti" + 0.008*"gai" + 0.008*"right" + 0.007*"member" + 0.006*"state" + 0.006*"polit" + 0.006*"govern" + 0.006*"new" + 0.005*"campaign"'),
 (4,
  '0.010*"ginsberg" + 0.010*"rio" + 0.008*"seel" + 0.008*"sprinkl" + 0.006*"brazilian" + 0.006*"paulo" + 0.005*"swedish" + 0.005*"barreto" + 0.005*"davi" + 0.004*"janeiro"'),
 (5,
  '0.011*"gai" + 0.010*"film" + 0.007*"televis" + 0.006*"award" + 0.006*"host" + 0.005*"new" + 0.005*"produc" + 0.005*"seri" + 0.005*"a

In [88]:
# Show topics of the current `lda_counts` model using the default topic/word counts.
# NOTE(review): this cell's execution count (88) is lower than that of the latest
# `lda_counts` training cell (90), so the saved output probably comes from an earlier fit.
lda_counts.show_topics()

[(0,
  '0.008*"work" + 0.005*"time" + 0.004*"publish" + 0.004*"year" + 0.004*"book" + 0.004*"life" + 0.004*"write" + 0.003*"new" + 0.003*"novel" + 0.003*"later"'),
 (1,
  '0.015*"film" + 0.007*"new" + 0.006*"year" + 0.005*"plai" + 0.005*"work" + 0.005*"star" + 0.004*"award" + 0.004*"york" + 0.004*"life" + 0.004*"appear"'),
 (2,
  '0.009*"gai" + 0.007*"elect" + 0.007*"new" + 0.006*"state" + 0.005*"univers" + 0.005*"right" + 0.005*"member" + 0.004*"school" + 0.004*"work" + 0.004*"year"'),
 (3,
  '0.015*"music" + 0.008*"album" + 0.007*"perform" + 0.007*"record" + 0.007*"new" + 0.006*"song" + 0.006*"award" + 0.006*"work" + 0.006*"releas" + 0.005*"plai"'),
 (4,
  '0.010*"new" + 0.008*"work" + 0.007*"art" + 0.006*"film" + 0.006*"york" + 0.004*"life" + 0.004*"book" + 0.004*"year" + 0.003*"includ" + 0.003*"american"'),
 (5,
  '0.007*"award" + 0.006*"book" + 0.005*"plai" + 0.005*"year" + 0.005*"team" + 0.004*"appear" + 0.004*"world" + 0.004*"time" + 0.004*"new" + 0.004*"seri"')]

In [86]:
# Show topics of the current `lda_counts` model using the default topic/word counts.
# NOTE(review): this cell's execution count (86) is lower than that of the latest
# `lda_counts` training cell (90), so the saved output probably comes from an earlier fit.
lda_counts.show_topics()

[(0,
  '0.009*"film" + 0.009*"new" + 0.008*"work" + 0.006*"book" + 0.005*"york" + 0.005*"award" + 0.005*"art" + 0.004*"life" + 0.004*"publish" + 0.004*"time"'),
 (1,
  '0.012*"music" + 0.007*"album" + 0.007*"record" + 0.007*"perform" + 0.006*"new" + 0.006*"song" + 0.005*"year" + 0.005*"releas" + 0.005*"work" + 0.005*"plai"'),
 (2,
  '0.007*"elect" + 0.006*"gai" + 0.005*"state" + 0.004*"new" + 0.004*"year" + 0.004*"work" + 0.004*"member" + 0.004*"univers" + 0.004*"right" + 0.003*"school"'),
 (3,
  '0.005*"year" + 0.005*"appear" + 0.005*"plai" + 0.004*"role" + 0.004*"seri" + 0.004*"time" + 0.003*"life" + 0.003*"televis" + 0.003*"film" + 0.003*"later"')]

In [87]:
# Refit LDA on the TF-IDF-weighted corpus with just 2 topics (5 passes).
# random_state pins the random initialisation so the topics are reproducible.
lda = models.ldamodel.LdaModel(
    corpus=corpus_tfidf,
    id2word=dictionary,
    num_topics=2,
    passes=5,
    random_state=42,
)

In [68]:
# Show each topic with its 20 highest-probability words (num_topics left at its default of 10).
lda.show_topics(num_topics=10, num_words=20)

[(0,
  '0.001*"film" + 0.001*"elect" + 0.000*"music" + 0.000*"album" + 0.000*"award" + 0.000*"isbn" + 0.000*"book" + 0.000*"gai" + 0.000*"plai" + 0.000*"theatr" + 0.000*"novel" + 0.000*"york" + 0.000*"smith" + 0.000*"perform" + 0.000*"record" + 0.000*"song" + 0.000*"lesbian" + 0.000*"televis" + 0.000*"london" + 0.000*"art"'),
 (1,
  '0.000*"saisio" + 0.000*"sillanpää" + 0.000*"lepenski" + 0.000*"vir" + 0.000*"pirkko" + 0.000*"honkasalo" + 0.000*"srejović" + 0.000*"lpga" + 0.000*"markström" + 0.000*"pirjo" + 0.000*"lavesson" + 0.000*"hashiguchi" + 0.000*"seger" + 0.000*"riksdag" + 0.000*"holma" + 0.000*"olof" + 0.000*"cermanović–kuzmanović" + 0.000*"aleksandrina" + 0.000*"beograd" + 0.000*"skz"')]