In [3]:
import wikipediaapi
# Wikipedia client created with 'en' as its argument; used below to fetch category pages.
# NOTE(review): some wikipedia-api releases may expect a user-agent string as the
# first argument — confirm against the installed version.
wiki_wiki = wikipediaapi.Wikipedia('en')

In [42]:
def get_categorymembers(category, level=0, max_level=0, verbose=False):
    """
    Return a list of all wikipedia pages from a given category.
    Categories are themselves pages -> members that are subcategories are
    descended into recursively while `level` < `max_level`, and the pages
    they contain are appended to the result.

    NOTE: duplicates are not removed.

    Args:
    ----
        category: category page object exposing a `categorymembers` mapping
            (a page object, not a plain string).
        level (int): current depth of `category`; 0 for the top-level call.
        max_level (int): maximum recursion depth for subcategories; with the
            default of 0 only direct members are returned.
        verbose (bool): print one line per page added / subcategory entered.

    Returns:
    -------
        pages (list): list of `page`.

    """

    pages = []
    for page in category.categorymembers.values():
        info_str = "{0:s}: {1:s} (ns: {2:d})".format('*' * (level + 1), page.title, page.ns)
        if page.ns == wikipediaapi.Namespace.MAIN:
            pages.append(page)
            if verbose:
                print("{0:70.70}{1:>27}".format(info_str, 'PAGE ADDED'))
        elif page.ns == wikipediaapi.Namespace.CATEGORY and level < max_level:
            if verbose:
                print("(SUBCATEGORY) ", info_str)
            # Keep the pages found in the subcategory and forward the recursion
            # settings; otherwise the recursive call has no effect on the result.
            pages.extend(
                get_categorymembers(
                    page,
                    level=level + 1,
                    max_level=max_level,
                    verbose=verbose,
                )
            )
    return pages

In [41]:
# Alternative starting category (disabled):
# cat = wiki_wiki.page("Category:Science fiction novels by writer")
cat = wiki_wiki.page("Category:Novels by Douglas Adams")
# max_level keeps its default of 0 here; verbose=True prints one line per page added.
pages = get_categorymembers(cat, verbose=True)

*: Dirk Gently's Holistic Detective Agency (ns: 0)                                     PAGE ADDED
*: The Hitchhiker's Guide to the Galaxy (ns: 0)                                        PAGE ADDED
*: The Hitchhiker's Guide to the Galaxy (novel) (ns: 0)                                PAGE ADDED
*: Life, the Universe and Everything (ns: 0)                                           PAGE ADDED
*: The Long Dark Tea-Time of the Soul (ns: 0)                                          PAGE ADDED
*: Mostly Harmless (ns: 0)                                                             PAGE ADDED
*: The Restaurant at the End of the Universe (ns: 0)                                   PAGE ADDED
*: So Long, and Thanks for All the Fish (ns: 0)                                        PAGE ADDED


In [171]:
import re
import nltk
import gensim
from nltk.corpus import stopwords
from nltk import word_tokenize
from gensim.parsing.preprocessing import STOPWORDS
from gensim import models, corpora

# Fetch the NLTK data packages: the stop-word lists and the 'punkt' package
# (presumably the tokenizer data word_tokenize relies on — confirm for the installed NLTK).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/bolensadrien/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/bolensadrien/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [71]:
# Compare the two stop-word lists: count the entries exclusive to each one.
gensim_stopwords = set(STOPWORDS)
nltk_stopwords = set(stopwords.words('english'))
print(len(gensim_stopwords - nltk_stopwords))  # only in gensim's list
print(len(nltk_stopwords - gensim_stopwords))  # only in NLTK's list

211
53


In [82]:
# Merge the gensim and NLTK stop-word lists into a single set used for token filtering.
STOPWORDS_ALL = set(STOPWORDS) | set(stopwords.words('english'))
print(len(STOPWORDS_ALL))

390


In [168]:
def clean_text(text):
    """
    Lower-case and tokenize `text`, then keep only the useful word tokens.

    A token is kept when it is not in STOPWORDS_ALL and it starts with at
    least three characters that are ASCII letters or hyphens. `re.match`
    anchors only at the start of the token, so characters after that prefix
    are not checked.

    Args:
    ----
        text (str): raw text to clean.

    Returns:
    -------
        clean_tokens (list): list of the tokens (str) that passed the filter.

    """
    tokens = word_tokenize(text.lower())
    # Raw string: '\-' is an invalid escape sequence in a normal string literal.
    clean_tokens = [
        t for t in tokens
        if t not in STOPWORDS_ALL and re.match(r'[a-zA-Z\-]{3,}', t)
    ]
    return clean_tokens

In [239]:
# One list of cleaned tokens per page, in the same order as `pages`.
tokenized_data = [clean_text(p.text) for p in pages]

In [268]:
# Map every distinct token to a numeric id.
dictionary = corpora.Dictionary(tokenized_data)

# Encode each tokenized page as a bag of words: [(word_id, count), ...]
corpus = list(map(dictionary.doc2bow, tokenized_data))

# Peek at the first 10 (word_id, count) pairs of the document at index 4.
print(corpus[4][:10])

[(15, 2), (16, 4), (17, 1), (18, 1), (21, 1), (22, 1), (34, 1), (45, 1), (46, 1), (50, 1)]


In [269]:
NUM_TOPICS = 6
# Fixed seed so the LDA fit produces the same topics on every run.
RANDOM_SEED = 42

# Build the LDA model
lda_model = models.LdaModel(
    corpus=corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    random_state=RANDOM_SEED,
)

# Build the LSI model
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

In [276]:
# For each model, print every topic with its 5 highest-weighted words,
# followed by a separator line.
words_per_topic = 5
for header, topic_model in (("LDA Model:", lda_model), ("LSI Model:", lsi_model)):
    print(header)

    for idx in range(NUM_TOPICS):
        print("Topic #%s:" % idx, topic_model.print_topic(idx, words_per_topic))

    print("=" * 20)

LDA Model:
Topic #0: 0.014*"series" + 0.013*"adams" + 0.012*"guide" + 0.010*"arthur" + 0.009*"hitchhiker"
Topic #1: 0.013*"series" + 0.009*"adams" + 0.008*"book" + 0.008*"arthur" + 0.007*"radio"
Topic #2: 0.010*"arthur" + 0.009*"series" + 0.007*"adams" + 0.007*"radio" + 0.006*"universe"
Topic #3: 0.011*"series" + 0.010*"arthur" + 0.008*"radio" + 0.008*"hitchhiker" + 0.008*"adams"
Topic #4: 0.016*"series" + 0.011*"guide" + 0.009*"hitchhiker" + 0.009*"adams" + 0.008*"radio"
Topic #5: 0.011*"radio" + 0.008*"series" + 0.008*"book" + 0.007*"guide" + 0.006*"hitchhiker"
LSI Model:
Topic #0: 0.419*"series" + 0.312*"radio" + 0.280*"guide" + 0.258*"hitchhiker" + 0.223*"adams"
Topic #1: -0.317*"gently" + -0.289*"dirk" + -0.208*"adams" + -0.195*"macduff" + -0.182*"ghost"
Topic #2: -0.505*"arthur" + -0.171*"krikkit" + 0.164*"radio" + -0.153*"robots" + 0.146*"series"
Topic #3: -0.246*"universe" + -0.236*"krikkit" + 0.230*"earth" + -0.201*"robots" + 0.182*"fish"
Topic #4: 0.370*"zaphod" + 0.270*"zarn

In [277]:
# Use the first page as the query document and encode it with the existing dictionary.
text = pages[0].text
bow = dictionary.doc2bow(clean_text(text))

# Project the query into each model's topic space; the output is a list of (topic_id, value) pairs.
print(lsi_model[bow])
print(lda_model[bow])

[(0, 36.92539958217678), (1, -73.92019664694953), (2, 2.8143817661235544), (3, -0.9842033820394213), (4, 1.153303105860656), (5, 0.4581425670460037)]
[(1, 0.39767718), (4, 0.6018241)]


In [278]:
from gensim import similarities
 
# Index every document of the corpus by its LDA topic representation.
lda_index = similarities.MatrixSimilarity(lda_model[corpus])

# Score the query document against every indexed document.
# Distinct names are used so the imported `similarities` module is not shadowed.
query_scores = lda_index[lda_model[bow]]
# Rank as (document_id, score) pairs, highest score first.
ranked_matches = sorted(enumerate(query_scores), key=lambda item: -item[1])

# Documents from most to least similar.
# `bow` was built from pages[0], so document 0 is expected to rank first.
print(ranked_matches)

# Let's see what's the most similar document
document_id, similarity = ranked_matches[0]
# print(pages[document_id].text[:1000])
 

[(0, 0.99997854), (4, 0.8601553), (2, 0.12233368), (6, 0.027768016), (3, 0.020850312), (1, 0.0), (5, 0.0), (7, 0.0)]


In [279]:
# Raw (unsorted) similarity of the query to each indexed document, in index order.
# NOTE(review): the values differ slightly from the sorted list printed in the previous
# cell — presumably LDA inference is not fully deterministic between calls; confirm.
lda_index[lda_model[bow]]

array([0.9993975 , 0.        , 0.12024248, 0.02183226, 0.84545153,
       0.        , 0.02907575, 0.        ], dtype=float32)

In [265]:
# Number of pages collected from the category (one document per page in the corpus).
len(pages)

8

In [146]:
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
# Module-level Porter stemmer, used by lemmatize_stemming().
stemmer = PorterStemmer()
# Download the WordNet data (presumably required by WordNetLemmatizer — confirm).
nltk.download('wordnet')

def preprocess(text):
    """
    Tokenize `text` with gensim's simple_preprocess and filter the tokens.

    Tokens found in gensim's STOPWORDS and tokens of two characters or fewer
    are dropped. Tokens are returned unchanged: the lemmatize_stemming step
    is currently disabled.

    Args:
    ----
        text (str): raw text to tokenize.

    Returns:
    -------
        result (list): list of the remaining tokens (str).

    """
    return [
        token
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 2 and token not in STOPWORDS
    ]

def lemmatize_stemming(text):
    """
    Lemmatize `text` as a verb (pos='v') with WordNetLemmatizer, then stem
    the resulting lemma with the module-level `stemmer`.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/bolensadrien/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [147]:
# Try preprocess() on the text of the first page.
res = preprocess(pages[0].text)

In [160]:
# Sanity check of the alternative token pattern: one letter/hyphen followed by
# at least two more. Raw string, because '\-' is an invalid escape sequence
# in a normal string literal.
re.match(r'[a-zA-Z\-][a-zA-Z\-]{2,}', 'aad')

<_sre.SRE_Match object; span=(0, 3), match='aad'>