In [4]:
import wikipediaapi
# Shared Wikipedia API client used by the cells below.
# NOTE(review): 'en' is presumably the language edition (English) -- confirm
# against the constructor signature of the installed wikipediaapi version.
wiki_wiki = wikipediaapi.Wikipedia('en')

In [6]:
def get_categorymembers(category, level=0, max_level=0, verbose=False, pages=None):
    """
    Collect the article pages that belong to a Wikipedia category.

    Members in the main (article) namespace are appended to the result.
    Members that are themselves categories are descended into recursively
    for as long as ``level < max_level``.

    NOTE: duplicates are not removed -- a page that is a member of several
    visited categories appears once per membership.

    Args:
    ----
        category: category page object exposing a ``categorymembers`` mapping
            (e.g. the result of ``wiki_wiki.page("Category:...")``).
        level (int): depth of `category` relative to the starting category.
        max_level (int): maximum depth of subcategories to descend into
            (0 = only the direct members of the starting category).
        verbose (bool): print one line per visited subcategory / added page.
        pages (list, optional): accumulator that is extended in place;
            a new list is created when omitted.

    Returns:
    -------
        pages (list): list of `page`.

    """
    if pages is None:
        pages = []
    for page in category.categorymembers.values():
        if verbose:
            # Only needed for the progress output below.
            info_str = "{0:s}: {1:s} (ns: {2:d})".format('*' * (level + 1), page.title, page.ns)
        if page.ns == wikipediaapi.Namespace.MAIN:
            pages.append(page)
            if verbose:
                print("{0:70.70}{1:>27}".format(info_str, 'PAGE ADDED'))
        elif page.ns == wikipediaapi.Namespace.CATEGORY and level < max_level:
            if verbose:
                print("(SUBCATEGORY) ", info_str)
            # max_level has to be forwarded explicitly: without it the nested
            # call falls back to the default of 0 and the recursion can never
            # go deeper than one subcategory level.
            get_categorymembers(page, level + 1, max_level=max_level,
                                verbose=verbose, pages=pages)
    return pages

In [7]:
# Alternative starting category, currently disabled:
# cat = wiki_wiki.page("Category:Science fiction novels by writer")
# Fetch the category page, then gather its articles, descending one level
# into subcategories and printing progress along the way.
cat = wiki_wiki.page("Category:Novels by Isaac Asimov")
pages = get_categorymembers(cat, max_level=1, verbose=True)

(SUBCATEGORY)  *: Category:Mystery novels by Isaac Asimov (ns: 14)
**: The Caves of Steel (ns: 0)                                                         PAGE ADDED
**: The Death Dealers (ns: 0)                                                          PAGE ADDED
**: Murder at the ABA (ns: 0)                                                          PAGE ADDED
**: The Naked Sun (ns: 0)                                                              PAGE ADDED
**: The Robots of Dawn (ns: 0)                                                         PAGE ADDED
(SUBCATEGORY)  *: Category:Science fiction novels by Isaac Asimov (ns: 14)
**: The Caves of Steel (ns: 0)                                                         PAGE ADDED
**: The Currents of Space (ns: 0)                                                      PAGE ADDED
**: David Starr, Space Ranger (ns: 0)                                                  PAGE ADDED
**: The End of Eternity (ns: 0)                                           

In [17]:
import re
import nltk
import gensim
from nltk.corpus import stopwords
from nltk import word_tokenize
from gensim.parsing.preprocessing import STOPWORDS
from gensim import models, corpora
from pprint import pprint

# nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /home/bolensadrien/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /home/bolensadrien/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /home/bolensadrien/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to
[nltk_data]    |     /home/bolensadrien/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to
[nltk_data]    |     /home/bolensadrien/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to
[nltk_data]    |     /home/bolensadrien/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloadin

[nltk_data]    | Downloading package senseval to
[nltk_data]    |     /home/bolensadrien/nltk_data...
[nltk_data]    |   Unzipping corpora/senseval.zip.
[nltk_data]    | Downloading package sentiwordnet to
[nltk_data]    |     /home/bolensadrien/nltk_data...
[nltk_data]    |   Unzipping corpora/sentiwordnet.zip.
[nltk_data]    | Downloading package sentence_polarity to
[nltk_data]    |     /home/bolensadrien/nltk_data...
[nltk_data]    |   Unzipping corpora/sentence_polarity.zip.
[nltk_data]    | Downloading package shakespeare to
[nltk_data]    |     /home/bolensadrien/nltk_data...
[nltk_data]    |   Unzipping corpora/shakespeare.zip.
[nltk_data]    | Downloading package sinica_treebank to
[nltk_data]    |     /home/bolensadrien/nltk_data...
[nltk_data]    |   Unzipping corpora/sinica_treebank.zip.
[nltk_data]    | Downloading package smultron to
[nltk_data]    |     /home/bolensadrien/nltk_data...
[nltk_data]    |   Unzipping corpora/smultron.zip.
[nltk_data]    | Downloading package

True

In [16]:
def check_section_title(title):
    """Tell whether a section title points at narrative content of interest.

    The title is lower-cased and searched for any of the keyword stems
    listed below; a match anywhere inside the title counts.

    Args:
        title (str): section title to test.

    Returns:
        bool: True when at least one keyword stem occurs in the title.
    """
    keyword_stems = (
        'plot',
        'character',
        'summary',
        'topic',
        'theme',
        'summari',
        'background',
        'origin',
        'introduction',
        'concept',
        'symbol',
    )
    # Alternation of the stems: 'plot|character|...|symbol'
    pattern = '|'.join(keyword_stems)
    return re.search(pattern, title.lower()) is not None
                
# For every collected page, show the section titles that pass the keyword filter.
for p in pages:
    pprint([s.title for s in p.sections if check_section_title(s.title)])


['Plot introduction', 'Plot summary', 'Characters']
['Plot summary', 'Background science', 'Characters']
['Origins', 'Plot outline', 'Characters']
['Plot']
['Plot summary', 'Characters']
['Plot introduction', 'Plot summary', 'Characters']
['Plot summary']
['Plot summary', 'Themes']
['Plot', 'Concepts', 'Major characters', 'Origins']
['Plot', 'Similarly themed works']
['Conception', 'Plot']
['Plot']
['Origin and early publication history',
 'Background',
 'Plot summary',
 'Characters']
['Plot introduction', 'Plot summary']
['Plot summary', 'Characters']
['Plot summary']
['Plot summary']
['Plot summary', 'Themes']
['Plot summary', 'Themes']
['Plot summary', 'Themes']
['Plot summary', 'Themes']
['Plot summary', 'Themes']
[]
['Plot']
['Plot summary', 'Major characters']
['Background', 'Plot summary']
['Plot summary']
['Story background', 'Plot summary']
['Plot summary']
['Plot summary', 'Characters']
['Plot summary']
['Plot summary', 'Characters']
['Plot summary']
['Plot']
['Plot summary',

In [25]:
# Number of stopwords found only in gensim's list ...
print(len(set(STOPWORDS) - set(stopwords.words('english'))))
# ... and number found only in NLTK's English list.
print(len(set(stopwords.words('english')) - set(STOPWORDS)))

211
53


In [26]:
# Union of the gensim and NLTK English stopword lists; used by clean_text.
STOPWORDS_ALL = set(STOPWORDS).union(set(stopwords.words('english')))
print(len(STOPWORDS_ALL))

390


In [75]:
from nltk.corpus import wordnet as wn

# import spacy
# spacy.load('en')
# from spacy.lang.en import English
# parser = English()

def get_lemma(word):
    """Return the WordNet base form of `word`.

    Falls back to `word` itself when ``wn.morphy`` returns None, i.e. when
    no base form is found for it.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
# from nltk.stem import WordNetLemmatizer 
  
# lemmatizer = WordNetLemmatizer() 
    
def clean_text(text):
    """Turn raw text into a list of lemmatised, lower-cased content tokens.

    The text is lower-cased and split with ``word_tokenize``. Tokens of two
    characters or fewer and tokens listed in ``STOPWORDS_ALL`` are dropped;
    every remaining token is passed through ``get_lemma``.

    Args:
        text (str): raw text.

    Returns:
        list: lemmatised tokens, in their original order.
    """
    kept_tokens = []
    for token in word_tokenize(text.lower()):
        # Only length and stopword membership are checked. A stricter
        # alphabetic-only filter (e.g. re.match('[a-zA-Z\-]{3,}', t)) was
        # tried at some point and is left disabled.
        if len(token) <= 2 or token in STOPWORDS_ALL:
            continue
        kept_tokens.append(token)
    return [get_lemma(token) for token in kept_tokens]

def clean_text2(text):
    """Alternative tokeniser built on the module-level ``parser``.

    Whitespace-only tokens are skipped, URL-like tokens are replaced by
    'URL', tokens starting with '@' by 'SCREEN_NAME', and every other token
    by its lower-cased form. Each result is then passed through
    ``get_lemma``.

    NOTE(review): ``parser`` is not defined in the visible notebook (the
    spaCy setup that would create it is commented out), so calling this
    function raises NameError until that setup is restored.

    Args:
        text (str): raw text.

    Returns:
        list: lemmatised tokens / placeholders, in their original order.
    """
    def _normalise(token):
        # Map one parsed token to the string that is kept for it.
        if token.like_url:
            return 'URL'
        if token.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return token.lower_

    normalised = [
        _normalise(token)
        for token in parser(text)
        if not token.orth_.isspace()
    ]
    return [get_lemma(item) for item in normalised]

def select_and_clean_text(page, section_filter=None):
    """Collect the cleaned tokens of a page, section by section.

    Only the entries of ``page.sections`` are read; each selected section's
    ``text`` is passed through ``clean_text`` and the resulting tokens are
    concatenated in section order.

    Args:
        page: page object exposing ``sections``, each with ``title`` and
            ``text`` attributes.
        section_filter (callable, optional): predicate called with a section
            title; only sections for which it returns a truthy value are
            used (e.g. ``section_filter=check_section_title``). When omitted,
            every section is used.

    Returns:
        list: cleaned tokens of all selected sections.
    """
    tokens = []
    for section in page.sections:
        if section_filter is None or section_filter(section.title):
            tokens.extend(clean_text(section.text))
    return tokens

In [76]:
# First tokens obtained from the complete text of the first page ...
print(clean_text(pages[0].text)[:10])

# print(pages[0].text)
# print(pages[0].sections[0])
# ... compared with the first tokens gathered section by section.
print(select_and_clean_text(pages[0])[:10])

['cave', 'steel', 'science', 'fiction', 'novel', 'american', 'writer', 'isaac', 'asimov', 'detective']
['novel', 'isaac', 'asimov', 'introduce', 'elijah', 'baley', 'daneel', 'olivaw', 'later', 'favorite']


In [77]:
# Compare raw wn.morphy (None for unknown words) with get_lemma (falls back to the word).
# print([lemmatizer.lemmatize(s) for s in ['rocks', 'corpora', 'better', 'asdasdasd']])
print([wn.morphy(s) for s in ['rocks', 'corpora', 'better', 'asdasdasd']])
print([get_lemma(s) for s in ['rocks', 'corpora', 'better', 'asdasdasd']])

['rock', 'corpus', 'better', None]
['rock', 'corpus', 'better', 'asdasdasd']


In [78]:
# One token list per collected page, in the same order as `pages`.
tokenized_data = [select_and_clean_text(page) for page in pages]

In [79]:
# Build a Dictionary: maps every distinct token to an integer id
id2word = corpora.Dictionary(tokenized_data)

# Convert each tokenised document into bag-of-words form
corpus = [id2word.doc2bow(text) for text in tokenized_data]

# Peek at the 4th document: a list of (word_id, count) pairs
# (only the first three entries), followed by the words behind those ids
example_corpus = corpus[3][:3]
print(example_corpus)
print([id2word[i] for i, _ in example_corpus])

[(13, 3), (18, 1), (19, 2)]
['able', 'actually', 'adapt']


In [80]:
# First five bag-of-words entries of the first document.
print(id2word.doc2bow(tokenized_data[0])[:5])
# Words behind ids 0..99.
# NOTE(review): presumably fails if the dictionary holds fewer than 100 ids -- confirm.
print([id2word[i] for i in range(100)])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]
["'caves", "'how", "'olden", "'open", "'ve", "'what", '1950s', '1953', '1954', '1964', '1989', '2004', '2016', 'able', 'accept', 'accident', 'accidentally', 'accomplish', 'actually', 'adapt', 'adaptation', 'advance', 'advice', 'ago', 'agoraphobia', 'agree', 'air', 'akiva', 'ambassador', 'answer', 'antagonism', 'anthony', 'anti-robot', 'apartment', 'appearance', 'archive', 'arrest', 'asimov', 'asimovian', 'aspect', 'assign', 'astonishment', 'attempt', 'attribute', 'aurora', 'average', 'award', 'away', 'background', 'baley', 'barrett', 'bbc', 'begin', 'beginning', 'behaviour', 'belief', 'bentley', 'bert', 'best', 'biblical', 'billion', 'birth', 'bishop', 'blaster', 'block', 'book', 'boucher', 'brain', 'bring', 'bureaucratically', 'call', 'campbell', 'capable', 'case', 'cause', 'cave', 'central', 'character', 'charge', 'chest', 'city', 'claustrophile', 'clear', 'close', 'clousarr', 'colonization', 'colonize', 'colonized—fifty', 'combination', 'comb

In [81]:
NUM_TOPICS = 3
# LDA training is stochastic; a fixed seed makes a fresh
# "Restart Kernel -> Run All" train the same model every time.
RANDOM_SEED = 0

# Build the LDA model
lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=id2word,
                            random_state=RANDOM_SEED)

# Build the LSI model
# lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=id2word)

In [82]:
# Topic distribution of the second document, as (topic_id, weight) pairs.
text = tokenized_data[1]
bow = id2word.doc2bow(text)

# print(lsi_model[bow])
print(lda_model[bow])

[(0, 0.99767035)]


In [83]:
print("LDA Model:")

# Print the top keywords (with their weights) of every topic

pprint(lda_model.print_topics())
# Topic distributions of all documents in the corpus
doc_lda = lda_model[corpus]

print("=" * 20)

# print("LSI Model:")

# pprint(lsi_model.print_topics())
# doc_lsi = lsi_model[corpus]

# print("=" * 20)

LDA Model:
[(0,
  '0.009*"baley" + 0.008*"earth" + 0.008*"starr" + 0.007*"robot" + '
  '0.006*"asimov" + 0.005*"novel" + 0.005*"foundation" + 0.004*"time" + '
  '0.003*"world" + 0.003*"write"'),
 (1,
  '0.009*"asimov" + 0.008*"robot" + 0.008*"baley" + 0.007*"starr" + '
  '0.006*"novel" + 0.006*"earth" + 0.006*"story" + 0.004*"time" + 0.004*"book" '
  '+ 0.004*"foundation"'),
 (2,
  '0.009*"robot" + 0.009*"asimov" + 0.008*"starr" + 0.008*"novel" + '
  '0.006*"book" + 0.006*"earth" + 0.005*"story" + 0.005*"foundation" + '
  '0.004*"fiction" + 0.004*"seldon"')]


In [84]:
# Materialise and print the topic distribution of every document.
print([a for a in doc_lda])

[[(0, 0.99895793)], [(0, 0.9976705)], [(2, 0.9971428)], [(1, 0.010094799), (2, 0.98915887)], [(1, 0.9987263)], [(0, 0.99895805)], [(2, 0.9979732)], [(0, 0.99893624)], [(2, 0.9916454)], [(1, 0.015623452), (2, 0.9840673)], [(1, 0.087184444), (2, 0.91129506)], [(2, 0.99629533)], [(2, 0.99669194)], [(0, 0.82272005), (1, 0.026507914), (2, 0.15077206)], [(2, 0.99412894)], [(0, 0.026046062), (1, 0.13702397), (2, 0.83693)], [(1, 0.9687337), (2, 0.030332271)], [(2, 0.9985265)], [(2, 0.99762684)], [(2, 0.9964418)], [(1, 0.9987057)], [(2, 0.9988942)], [(2, 0.9966055)], [(2, 0.99020594)], [(0, 0.9980357)], [(1, 0.3650811), (2, 0.6344041)], [(2, 0.9969212)], [(2, 0.9989096)], [(2, 0.9966111)], [(2, 0.9985781)], [(1, 0.90264976), (2, 0.09663812)], [(1, 0.9987262)], [(2, 0.9969156)], [(1, 0.0122757405), (2, 0.9870892)], [(1, 0.9984863)]]


In [85]:
# Three ways of looking up the topic distribution of document `i`.
# NOTE(review): the recorded outputs differ slightly between the three calls,
# so inference is presumably not deterministic from call to call -- confirm.
i = 3
print([a for a in lda_model[corpus][i]])
print([a for a in lda_model[corpus[i]]])
print([a for a in lda_model[corpus]][i])

[(2, 0.9926275)]
[(2, 0.99147034)]
[(1, 0.011284761), (2, 0.9879689)]


In [86]:
# Page titles, in the same order as `pages` (and therefore as `corpus`).
titles = [p.title for p in pages]

In [87]:
# Pair each document's reported topic ids (weights dropped) with its page title.
list(zip([[i for i, _ in a] for a in list(doc_lda)], titles))

[([0], 'The Caves of Steel'),
 ([0], 'The Death Dealers'),
 ([2], 'Murder at the ABA'),
 ([2], 'The Naked Sun'),
 ([1], 'The Robots of Dawn'),
 ([0], 'The Caves of Steel'),
 ([2], 'The Currents of Space'),
 ([0], 'David Starr, Space Ranger'),
 ([2], 'The End of Eternity'),
 ([1, 2], 'Fantastic Voyage'),
 ([1, 2], 'Fantastic Voyage II: Destination Brain'),
 ([2], 'Forward the Foundation'),
 ([2], 'Foundation (Asimov novel)'),
 ([0, 1, 2], 'Foundation and Earth'),
 ([2], 'Foundation and Empire'),
 ([0, 1, 2], "Foundation's Edge"),
 ([1, 2], 'The Gods Themselves'),
 ([2], 'Lucky Starr and the Big Sun of Mercury'),
 ([2], 'Lucky Starr and the Moons of Jupiter'),
 ([2], 'Lucky Starr and the Oceans of Venus'),
 ([1], 'Lucky Starr and the Pirates of the Asteroids'),
 ([2], 'Lucky Starr and the Rings of Saturn'),
 ([2], 'Lucky Starr series'),
 ([1, 2], 'The Naked Sun'),
 ([0], 'Nemesis (Asimov novel)'),
 ([1, 2], 'Nightfall (Asimov novelette and novel)'),
 ([2], 'Norby, the Mixed-Up Robot'),
 

In [344]:
# Import the class directly: the result list below is called `similarities`,
# which would otherwise rebind the `gensim.similarities` module name.
from gensim.similarities import MatrixSimilarity

# Index the per-document topic vectors; querying the index yields the cosine
# similarity between the query vector and every indexed document.
# num_features is stated explicitly so the index width always equals the
# number of topics instead of being inferred from the sparse topic vectors.
lda_index = MatrixSimilarity(lda_model[corpus], num_features=lda_model.num_topics)

# Query the index with the topic vector of `bow` (built in an earlier cell)
similarity_scores = lda_index[lda_model[bow]]
# Rank the documents from most to least similar as (document_id, score) pairs
similarities = sorted(enumerate(similarity_scores), key=lambda item: -item[1])

# Full ranking; the query document itself should be at (or tied for) the top
print(similarities)


# Let's see what's the most similar document
document_id, similarity = similarities[0]
# print(pages[document_id].text[:1000])
 

[(1, 1.0), (10, 1.0), (22, 1.0), (33, 1.0), (14, 0.57735026), (32, 0.57735026), (8, 0.32445747), (16, 0.30843654), (6, 0.08668428), (13, 0.011009008), (0, 0.0), (2, 0.0), (3, 0.0), (4, 0.0), (5, 0.0), (7, 0.0), (9, 0.0), (11, 0.0), (12, 0.0), (15, 0.0), (17, 0.0), (18, 0.0), (19, 0.0), (20, 0.0), (21, 0.0), (23, 0.0), (24, 0.0), (25, 0.0), (26, 0.0), (27, 0.0), (28, 0.0), (29, 0.0), (30, 0.0), (31, 0.0), (34, 0.0)]


In [88]:
# Interactive topic visualisation. Requires the optional pyLDAvis package;
# the recorded run of this cell failed with ModuleNotFoundError because it
# was not installed.
# NOTE(review): newer pyLDAvis releases reportedly expose this module as
# `pyLDAvis.gensim_models` -- confirm against the installed version.
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
pyLDAvis.display(lda_display)

ModuleNotFoundError: No module named 'pyLDAvis'

In [184]:
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
# Porter stemmer instance used by lemmatize_stemming.
stemmer = PorterStemmer()
# Make sure the WordNet corpus is available locally (the recorded run
# reports it as already up-to-date).
nltk.download('wordnet')

def preprocess(text):
    """Tokenise `text` with gensim's ``simple_preprocess`` and filter it.

    Keeps the tokens that are longer than two characters and not listed in
    ``STOPWORDS``. Tokens are returned unchanged: the stemming/lemmatising
    step (``lemmatize_stemming``) is currently not applied.

    Args:
        text (str): raw text.

    Returns:
        list: surviving tokens, in their original order.
    """
    return [
        token
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 2 and token not in STOPWORDS
    ]

def lemmatize_stemming(text):
    """Lemmatise `text` as a verb with WordNet, then stem the result.

    Args:
        text (str): a single token.

    Returns:
        The output of ``stemmer.stem`` applied to the verb lemma.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/adrienbolens/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [147]:
# Try the gensim-based preprocessing on the full text of the first page.
res = preprocess(pages[0].text)

In [160]:
# Sanity check of the "letters or hyphens, at least three characters" token pattern.
# Raw string: '\-' inside a normal string literal is an invalid escape
# sequence and triggers a warning on current Python versions.
re.match(r'[a-zA-Z\-][a-zA-Z\-]{2,}', 'aad')

<_sre.SRE_Match object; span=(0, 3), match='aad'>