In [1]:
import wikipediaapi
# Shared Wikipedia API client used by the cells below to fetch pages.
# 'en' is passed as the first positional argument (presumably the language
# edition -- confirm against the installed wikipedia-api version).
# NOTE(review): recent wikipedia-api releases reportedly also require a user
# agent string -- verify before re-running in a fresh environment.
wiki_wiki = wikipediaapi.Wikipedia('en')

In [147]:
def get_categorymembers(category, level=0, max_level=0, verbose=False, pages=None):
    """
    Return a list of all wikipedia pages from a given category.
    Categories are themselves pages -> The list includes pages from subcategories that
    are members of a higher level category (recursively, down to `max_level`).
    
    NOTE: duplicates are not removed (a page that belongs to several visited
    categories is appended once per category).
    
    Args:
    ----
        category: category page object; its `categorymembers` mapping is iterated
        level (int): current level of the category (0 for the root call)
        max_level (int): maximum level of the recursion for subcategories;
            subcategories are only entered while `level < max_level`
        verbose (bool): print one line per added page / entered subcategory
        pages (list or None): accumulator shared by the recursive calls;
            a fresh list is created when None
        
    Returns:
    -------
        pages (list): list of `page`.
    
    """
    if pages is None:
        pages = []
    for page in category.categorymembers.values():
        info_str = "{0:s}: {1:s} (ns: {2:d})".format('*' * (level + 1), page.title, page.ns)
        if page.ns == wikipediaapi.Namespace.MAIN:
            pages.append(page)
            if verbose:
                print("{0:70.70}{1:>27}".format(info_str, 'PAGE ADDED'))
        elif page.ns == wikipediaapi.Namespace.CATEGORY and level < max_level:
            if verbose:
                print("(SUBCATEGORY) ", info_str)
            # max_level must be forwarded explicitly: otherwise the nested call
            # falls back to the default of 0 and never descends below the first
            # subcategory level, whatever the caller asked for.
            get_categorymembers(page, level + 1, max_level=max_level,
                                verbose=verbose, pages=pages)
    return pages

In [148]:
# Alternative root category, kept for reference:
# cat = wiki_wiki.page("Category:Science fiction novels by writer")
# Root category whose member pages form the document collection used below.
cat = wiki_wiki.page("Category:Novels by Isaac Asimov")
# max_level=1: also collect pages from the direct subcategories of `cat`.
pages = get_categorymembers(cat, max_level=1, verbose=True)

(SUBCATEGORY)  *: Category:Mystery novels by Isaac Asimov (ns: 14)
**: The Caves of Steel (ns: 0)                                                         PAGE ADDED
**: The Death Dealers (ns: 0)                                                          PAGE ADDED
**: Murder at the ABA (ns: 0)                                                          PAGE ADDED
**: The Naked Sun (ns: 0)                                                              PAGE ADDED
**: The Robots of Dawn (ns: 0)                                                         PAGE ADDED
(SUBCATEGORY)  *: Category:Science fiction novels by Isaac Asimov (ns: 14)
**: The Caves of Steel (ns: 0)                                                         PAGE ADDED
**: The Currents of Space (ns: 0)                                                      PAGE ADDED
**: David Starr, Space Ranger (ns: 0)                                                  PAGE ADDED
**: The End of Eternity (ns: 0)                                           

In [207]:
# Display the collected pages. Duplicates are kept: a page listed in several
# subcategories shows up more than once in this list.
pages

[The Caves of Steel (id: 60129, ns: 0),
 The Death Dealers (id: 1639433, ns: 0),
 Murder at the ABA (id: 1693075, ns: 0),
 The Naked Sun (id: 414331, ns: 0),
 The Robots of Dawn (id: 420130, ns: 0),
 The Caves of Steel (id: 60129, ns: 0),
 The Currents of Space (id: 474204, ns: 0),
 David Starr, Space Ranger (id: 1662948, ns: 0),
 The End of Eternity (id: 146188, ns: 0),
 Fantastic Voyage (id: 58377079, ns: 0),
 Fantastic Voyage II: Destination Brain (id: 1139964, ns: 0),
 Forward the Foundation (id: 371532, ns: 0),
 Foundation (Asimov novel) (id: 309409, ns: 0),
 Foundation and Earth (id: 60131, ns: 0),
 Foundation and Empire (id: 331675, ns: 0),
 Foundation's Edge (id: 60128, ns: 0),
 The Gods Themselves (id: 301587, ns: 0),
 Lucky Starr and the Big Sun of Mercury (id: 1664759, ns: 0),
 Lucky Starr and the Moons of Jupiter (id: 1664760, ns: 0),
 Lucky Starr and the Oceans of Venus (id: 1664763, ns: 0),
 Lucky Starr and the Pirates of the Asteroids (id: 1664765, ns: 0),
 Lucky Starr a

In [291]:
# [[pages[j].sections[i].title for i in range(len(pages[j].sections))] for j in range(len(pages))]
def check_section_title(title):
    """
    Tell whether a section title looks like it describes the content of the work.

    The lower-cased title is searched for any of the keyword fragments below.
    Matching is by substring, so e.g. 'Characters' matches 'character' and
    'History' matches 'story'.

    Args:
    ----
        title (str): section title

    Returns:
    -------
        bool: True if at least one keyword fragment occurs in the title.
    """
    keyword_pattern = 'plot|character|summary|topic|theme|summari|background|origin|introduction|concept|story|symbol'
    lowered_title = title.lower()
    return re.search(keyword_pattern, lowered_title) is not None
                
# [list(filter(check_section_title, [pages[j].sections[i].title for i in range(len(pages[j].sections))])) for j in range(len(pages))]

# For every collected page, print each entry of `p.sections` next to the
# verdict of the keyword filter, to eyeball which sections would be kept.
for p in pages:
    for s in p.sections:
        print(s.title, check_section_title(s.title))

Plot introduction True
Plot summary True
Characters True
Reception False
Adaptations False
References False
External links False
Plot summary True
Background science True
Characters True
External links False
Origins True
Plot outline True
Characters True
See also False
References and links False
External links False
Plot True
Reception False
Adaptations False
References False
External links False
Plot summary True
Characters True
References False
External links False
Plot introduction True
Plot summary True
Characters True
Reception False
Adaptations False
References False
External links False
Plot summary True
Reception False
Footnotes False
Sources False
External links False
Plot summary True
Themes True
Reception False
References False
External links False
Plot True
Concepts True
Major characters True
Origins True
Reception False
Role in Foundation series False
Translations False
Movie adaptations False
References False
Sources False
External links False
Plot True
Cast False
Product

In [278]:
# Scratch check: re.search with an alternation succeeds as soon as any one
# alternative occurs anywhere in the string (here only 'car' is present).
if re.search('plot|test|car', 'asdplcaroestcrs'.lower()):
    print('yes!')

yes!


In [150]:
import re
import nltk
import gensim
from nltk.corpus import stopwords
from nltk import word_tokenize
from gensim.parsing.preprocessing import STOPWORDS
from gensim import models, corpora
from pprint import pprint

# nltk.download('all')

In [151]:
# Compare the two stop-word lists:
# first the number of words only in gensim's STOPWORDS,
# then the number of words only in NLTK's English stop words.
print(len(set(STOPWORDS) - set(stopwords.words('english'))))
print(len(set(stopwords.words('english')) - set(STOPWORDS)))

211
53


In [152]:
# Union of the gensim and NLTK English stop-word lists, used by clean_text().
STOPWORDS_ALL = set(STOPWORDS) | set(stopwords.words('english'))
print(len(STOPWORDS_ALL))

390


In [318]:
from nltk.corpus import wordnet as wn

def get_lemma(word):
    """
    Return the WordNet base form of `word`, or `word` itself when
    `wn.morphy` has no answer (returns None).

    Args:
    ----
        word (str)

    Returns:
    -------
        str: the lemma if one is found, otherwise the unchanged input.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem import WordNetLemmatizer 
  
# Module-level WordNetLemmatizer instance; used further down to compare its
# output with wn.morphy / get_lemma.
lemmatizer = WordNetLemmatizer() 
    
def clean_text(text):
    """
    Tokenize `text` and return the lemmas of the tokens worth keeping.

    The text is lower-cased and split with `word_tokenize`; tokens that are in
    STOPWORDS_ALL or that have two characters or fewer are discarded, and each
    remaining token is mapped through `get_lemma`.

    Args:
    ----
        text (str)

    Returns:
    -------
        list of str: lemmas, in the order the tokens appear in the text.
    """
    lemmas = []
    for token in word_tokenize(text.lower()):
        # Skip short tokens and stop words before lemmatizing.
        if len(token) <= 2 or token in STOPWORDS_ALL:
            continue
        lemmas.append(get_lemma(token))
    return lemmas

def select_and_clean_text(page):
    """
    Collect the cleaned tokens of the sections of `page` whose title passes
    `check_section_title`.

    Only the entries of `page.sections` are visited, in order; each selected
    section's `text` is run through `clean_text` and the results are
    concatenated.

    Args:
    ----
        page: page object exposing `sections`, each with `title` and `text`

    Returns:
    -------
        list of str: tokens from all selected sections (empty if none match).
    """
    selected_sections = (
        section for section in page.sections
        if check_section_title(section.title)
    )
    collected = []
    for section in selected_sections:
        collected.extend(clean_text(section.text))
    return collected

In [329]:
# First 10 tokens when the whole text of the first page is cleaned ...
print(clean_text(pages[0].text)[:10])

# print(pages[0].text)
# print(pages[0].sections[0])
# ... versus the first 10 tokens when only the selected sections are used.
print(select_and_clean_text(pages[0])[:10])

['cave', 'steel', 'science', 'fiction', 'novel', 'american', 'writer', 'isaac', 'asimov', 'detective']
['novel', 'isaac', 'asimov', 'introduce', 'elijah', 'baley', 'daneel', 'olivaw', 'later', 'favorite']


In [174]:
# Compare three ways of lemmatizing the same sample words:
# WordNetLemmatizer, raw wn.morphy (None for unknown words), and get_lemma
# (which falls back to the input word when morphy returns None).
print([lemmatizer.lemmatize(s) for s in ['rocks', 'corpora', 'better', 'asdasdasd']])
print([wn.morphy(s) for s in ['rocks', 'corpora', 'better', 'asdasdasd']])
print([get_lemma(s) for s in ['rocks', 'corpora', 'better', 'asdasdasd']])

['rock', 'corpus', 'better', 'asdasdasd']
['rock', 'corpus', 'better', None]
['rock', 'corpus', 'better', 'asdasdasd']


In [337]:
# One token list per collected page, in the same order as `pages`.
tokenized_data = [select_and_clean_text(page) for page in pages]

In [338]:
# Build a Dictionary - association word to numeric id
id2word = corpora.Dictionary(tokenized_data)
 
# Transform the collection of texts to a numerical (bag-of-words) form
corpus = [id2word.doc2bow(text) for text in tokenized_data]
 
# Have a look at the 2nd document (index 1): [(word_id, count), ...]
# (only the first 2 entries)
print(corpus[1][:2])

[(54, 1), (59, 1)]


In [339]:
# First 5 (word_id, count) pairs of the first document ...
print(id2word.doc2bow(tokenized_data[0])[:5])
# ... and the words behind ids 0..99 (requires at least 100 dictionary entries).
print([id2word[i] for i in range(100)])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]
["'caves", "'how", "'olden", "'open", "'what", '1950s', '1953', 'able', 'accept', 'accident', 'accidentally', 'accomplish', 'actually', 'advance', 'advice', 'ago', 'agoraphobia', 'agree', 'air', 'ambassador', 'answer', 'antagonism', 'anthony', 'anti-robot', 'apartment', 'appearance', 'arrest', 'asimov', 'asimovian', 'aspect', 'assign', 'astonishment', 'attribute', 'aurora', 'average', 'away', 'baley', 'barrett', 'begin', 'beginning', 'behaviour', 'belief', 'bentley', 'biblical', 'billion', 'birth', 'blaster', 'block', 'book', 'brain', 'bring', 'call', 'campbell', 'capable', 'case', 'cause', 'cave', 'central', 'character', 'charge', 'chest', 'city', 'claustrophile', 'clear', 'close', 'clousarr', 'colonization', 'colonize', 'colonized—fifty', 'come', 'comfortably', 'commissioner', 'complex', 'concert', 'connect', 'consider', 'continue', 'convert', 'convince', 'cooperation', 'countenance', 'cover', 'create', 'crime', 'culture', 'curiosity', 'daneel

In [340]:
NUM_TOPICS = 3
# Fixed seed so that LDA training gives the same topics on every run.
RANDOM_SEED = 42

# Build the LDA model
lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=id2word,
                            random_state=RANDOM_SEED)
 
# Build the LSI model (the comparison cell below prints lsi_model's topics,
# so it has to be built here for the notebook to run top to bottom)
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=id2word)

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

In [341]:
# Bag-of-words for the second document (index 1); `bow` is reused later as the
# query vector for the similarity search.
text = tokenized_data[1]
bow = id2word.doc2bow(text)
 
# print(lsi_model[bow])
# Topic distribution of that document as (topic_id, probability) pairs
print(lda_model[bow])

[(1, 0.9975588)]


In [342]:
# Side-by-side dump of the topics found by both models.
# Requires both lda_model and lsi_model to have been built beforehand.
print("LDA Model:")

# Print the keywords of all topics

pprint(lda_model.print_topics())
# Per-document topic representation of the whole corpus under LDA
doc_lda = lda_model[corpus]

print("=" * 20)
 
print("LSI Model:")
 
pprint(lsi_model.print_topics())
# Per-document topic representation of the whole corpus under LSI
doc_lsi = lsi_model[corpus]
    
print("=" * 20)

LDA Model:
[(0,
  '0.011*"robot" + 0.011*"baley" + 0.010*"earth" + 0.009*"starr" + '
  '0.007*"asimov" + 0.005*"novel" + 0.004*"story" + 0.004*"daneel" + '
  '0.004*"seldon" + 0.004*"gladia"'),
 (1,
  '0.009*"starr" + 0.007*"robot" + 0.007*"baley" + 0.005*"asimov" + '
  '0.005*"novel" + 0.005*"earth" + 0.004*"time" + 0.004*"book" + '
  '0.004*"planet" + 0.003*"story"'),
 (2,
  '0.011*"starr" + 0.007*"earth" + 0.006*"robot" + 0.006*"asimov" + '
  '0.005*"foundation" + 0.005*"novel" + 0.005*"baley" + 0.004*"bigman" + '
  '0.004*"time" + 0.004*"story"')]
LSI Model:
[(0,
  '0.354*"foundation" + 0.297*"asimov" + 0.255*"novel" + 0.192*"robot" + '
  '0.189*"earth" + 0.170*"story" + 0.158*"series" + 0.157*"empire" + '
  '0.153*"book" + 0.144*"fiction"'),
 (1,
  '0.588*"foundation" + -0.252*"baley" + -0.230*"robot" + -0.176*"starr" + '
  '0.163*"seldon" + 0.157*"second" + -0.143*"novel" + 0.141*"mule" + '
  '-0.133*"earth" + 0.132*"hardin"'),
 (2,
  '-0.489*"starr" + 0.380*"baley" + 0.266*"robo

In [343]:
# Displaying doc_lda only shows the TransformedCorpus wrapper object,
# not the per-document topic vectors.
doc_lda

<gensim.interfaces.TransformedCorpus at 0x1a2e4e0a50>

In [344]:
from gensim import similarities
 
# Index the LDA topic vector of every document; querying the index computes the
# cosine similarity between the query vector and each vector in lda_model[corpus].
# num_features is passed explicitly so the index width always equals the number
# of topics, even if some topic never appears in the (sparse) document vectors.
lda_index = similarities.MatrixSimilarity(lda_model[corpus],
                                          num_features=lda_model.num_topics)
 
# Query the index with the topic vector of `bow`.
# Distinct names are used for the results so that the imported `similarities`
# module is not overwritten by its own output.
query_similarities = lda_index[lda_model[bow]]
# Sort the (document index, similarity) pairs, most similar first
ranked_similarities = sorted(enumerate(query_similarities), key=lambda item: -item[1])
 
# Full ranking of the documents:
print(ranked_similarities)
# The query document itself should be at (or tied for) the top...


# Let's see what's the most similar document
document_id, similarity = ranked_similarities[0]
# print(pages[document_id].text[:1000])
 

[(1, 1.0), (10, 1.0), (22, 1.0), (33, 1.0), (14, 0.57735026), (32, 0.57735026), (8, 0.32445747), (16, 0.30843654), (6, 0.08668428), (13, 0.011009008), (0, 0.0), (2, 0.0), (3, 0.0), (4, 0.0), (5, 0.0), (7, 0.0), (9, 0.0), (11, 0.0), (12, 0.0), (15, 0.0), (17, 0.0), (18, 0.0), (19, 0.0), (20, 0.0), (21, 0.0), (23, 0.0), (24, 0.0), (25, 0.0), (26, 0.0), (27, 0.0), (28, 0.0), (29, 0.0), (30, 0.0), (31, 0.0), (34, 0.0)]


In [345]:
# Interactive visualisation of the LDA topics.
# NOTE(review): newer pyLDAvis releases reportedly expose this module as
# pyLDAvis.gensim_models -- confirm against the installed version.
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [184]:
from nltk.stem import WordNetLemmatizer, SnowballStemmer
# NOTE(review): wildcard import; PorterStemmer is the only name from it used in
# this cell. Consider importing it explicitly once it is confirmed that no other
# cell relies on names pulled in by the `*`.
from nltk.stem.porter import *
# Shared Porter stemmer used by lemmatize_stemming()
stemmer = PorterStemmer()
# Fetch the WordNet data (no-op message if it is already up to date)
nltk.download('wordnet')

def preprocess(text):
    """
    Tokenize `text` with gensim's `simple_preprocess` and keep the tokens that
    are longer than two characters and not in gensim's STOPWORDS.

    Tokens are returned as produced by the tokenizer; no lemmatizing or
    stemming is applied (lemmatize_stemming is available but not called here).

    Args:
    ----
        text (str)

    Returns:
    -------
        list of str: the retained tokens, in order.
    """
    return [
        token
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 2 and token not in STOPWORDS
    ]

def lemmatize_stemming(text):
    """
    Lemmatize `text` as a verb with a fresh WordNetLemmatizer, then stem the
    result with the module-level `stemmer`.

    Args:
    ----
        text (str): a single token

    Returns:
    -------
        the value returned by `stemmer.stem` for the verb lemma.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/adrienbolens/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [147]:
# Tokenize the full text of the first page with the gensim-based preprocess()
res = preprocess(pages[0].text)

In [160]:
# Scratch check of the "letters or hyphens, at least 3 characters" token pattern.
# Raw string: '\-' in a normal string literal is an invalid escape sequence
# and triggers a warning on recent Python versions; the pattern is unchanged.
re.match(r'[a-zA-Z\-][a-zA-Z\-]{2,}', 'aad')

<_sre.SRE_Match object; span=(0, 3), match='aad'>