In [1]:
from bs4 import BeautifulSoup
import bleach
import requests

In [2]:
#http://www.gyford.com/phil/writing/2015/03/25/wikipedia-parsing.php

class WikipediaFetcher(object):
    """
    Fetches the rendered body HTML of an English Wikipedia article.

    `fetch()` is the public entry point. The `_bleach_html()` and
    `_strip_html()` helpers implement HTML sanitising, but `_tidy_html()`
    currently does not call them (see its docstring).
    """

    def fetch(self, page_name):
        """
        Passed a Wikipedia page's URL fragment, like
        'Edward_Montagu,_1st_Earl_of_Sandwich', this will fetch the page's
        main contents, pass them through `_tidy_html()` and return the
        resulting HTML string.

        Returns a dict with two elements:
            'success' is either True or, if we couldn't fetch the page, False.
            'content' is the HTML if success==True, or else an error message.
        """
        result = self._get_html(page_name)

        if result['success']:
            result['content'] = self._tidy_html(result['content'])

        return result

    def _get_html(self, page_name):
        """
        Passed the name of a Wikipedia page (eg, 'Samuel_Pepys'), it fetches
        the HTML content (not the entire HTML page) and returns it.

        Request failures are never raised to the caller; they are reported
        through the returned dict instead.

        Returns a dict with two elements:
            'success' is either True or, if we couldn't fetch the page, False.
            'content' is the HTML if success==True, or else an error message.
        """
        url = 'https://en.wikipedia.org/wiki/%s' % page_name

        try:
            # action=render asks for the article body only.
            response = requests.get(
                url, params={'action': 'render'}, timeout=5)
        except requests.exceptions.ConnectionError:
            return {'success': False, 'content': "Can't connect to domain."}
        except requests.exceptions.Timeout:
            return {'success': False, 'content': "Connection timed out."}
        except requests.exceptions.TooManyRedirects:
            return {'success': False, 'content': "Too many redirects."}
        except requests.exceptions.RequestException:
            # Any other requests failure (invalid URL, broken encoding, ...).
            # Must come last: it is the base class of the exceptions above.
            return {'success': False,
                    'content': "Something unusual went wrong."}

        try:
            response.raise_for_status()
        except requests.exceptions.HTTPError:
            # 4xx or 5xx errors:
            return {'success': False,
                    'content': "HTTP Error: %s" % response.status_code}

        return {'success': True, 'content': response.text}

    def _tidy_html(self, html):
        """
        Hook for cleaning up the raw Wikipedia HTML.

        The cleaning steps are currently disabled, so the HTML is returned
        unchanged. Uncomment the two calls below to sanitise the markup with
        `_bleach_html()` and then remove unwanted elements with
        `_strip_html()`.
        """
        #html = self._bleach_html(html)
        #html = self._strip_html(html)
        return html

    def _bleach_html(self, html):
        """
        Ensures we have valid HTML; no unclosed or mis-nested tags.
        Removes any tags and attributes we don't want to let through.
        Doesn't remove the contents of any disallowed tags.

        Pass it an HTML string, it'll return the bleached HTML string.
        """

        # Pretty much most elements, but no forms or audio/video.
        allowed_tags = [
            'a', 'abbr', 'acronym', 'address', 'area', 'article',
            'b', 'blockquote', 'br',
            'caption', 'cite', 'code', 'col', 'colgroup',
            'dd', 'del', 'dfn', 'div', 'dl', 'dt',
            'em',
            'figcaption', 'figure', 'footer',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr',
            'i', 'img', 'ins',
            'kbd',
            'li',
            'map',
            'nav',
            'ol',
            'p', 'pre',
            'q',
            's', 'samp', 'section', 'small', 'span', 'strong', 'sub', 'sup',
            'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'time', 'tr',
            'ul',
            'var',
        ]

        # Only these attributes are kept on the allowed tags ('*' applies to
        # every tag); any attribute not listed here is removed.
        allowed_attributes = {
            '*':        ['class', 'id'],
            'a':        ['href', 'title'],
            'abbr':     ['title'],
            'acronym':  ['title'],
            'img':      ['alt', 'src', 'srcset'],
            # Ugh. Don't know why this page doesn't use .tright like others
            # http://127.0.0.1:8000/encyclopedia/5040/
            'table':    ['align'],
            'td':       ['colspan', 'rowspan'],
            'th':       ['colspan', 'rowspan', 'scope'],
        }

        # strip=True drops disallowed tags instead of escaping them.
        return bleach.clean(html, tags=allowed_tags,
                            attributes=allowed_attributes, strip=True)

    def _strip_html(self, html):
        """
        Takes out any tags, and their contents, that we don't want at all.
        And adds custom classes to existing tags (so we can apply CSS styles
        without having to multiply our CSS).

        Pass it an HTML string, it returns the stripped HTML string.
        """

        # CSS selectors. Strip these and their contents.
        selectors = [
            'div.hatnote',
            'div.navbar.mini', # Will also match div.mini.navbar
            # Bottom of https://en.wikipedia.org/wiki/Charles_II_of_England :
            'div.topicon',
            'a.mw-headline-anchor',
        ]

        # Strip any element that has one of these classes.
        classes = [
            # "This article may be expanded with text translated from..."
            # https://en.wikipedia.org/wiki/Afonso_VI_of_Portugal
            'ambox-notice',
            'magnify',
            # eg audio on https://en.wikipedia.org/wiki/Bagpipes
            'mediaContainer',
            'navbox',
            'noprint',
        ]

        # Any element has a class matching a key, it will have the classes
        # in the value added.
        add_classes = {
            # Give these tables standard Bootstrap styles.
            'infobox':   ['table', 'table-bordered'],
            'ambox':     ['table', 'table-bordered'],
            'wikitable': ['table', 'table-bordered'],
        }

        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(html, 'html.parser')

        for selector in selectors:
            for tag in soup.select(selector):
                tag.decompose()

        for clss in classes:
            for tag in soup.find_all(attrs={'class': clss}):
                tag.decompose()

        # .items() works on both Python 2 and Python 3.
        for clss, new_classes in add_classes.items():
            for tag in soup.find_all(attrs={'class': clss}):
                tag['class'] = tag.get('class', []) + new_classes

        # If the markup carried <html>/<body> wrappers, keep only what is
        # inside them; otherwise use the parsed fragment as it is.
        root = soup
        if soup.body is not None:
            root = soup.body
        elif soup.html is not None:
            root = soup.html

        # Put the content back into a string.
        return ''.join(str(tag) for tag in root.contents)

In [7]:
# Wikipedia page names to scrape, one request per entry. Spaces and
# underscores are interchangeable in a page name.
# "Dirichlet-multinomial distribution" used to be listed a second time (the
# underscore spelling below is the same page); the repeat was dropped because
# it only produced a redundant request and an identical document.
article_list = [
    "Beta distribution", "Bayesian network", "Concentration parameter",
    "Conjugate prior", "Directed acyclic graph", "Dirichlet distribution",
    "Elastic energy", "Elastic map", "Elastic net regularization",
    "Expectation–maximization algorithm", "Exponential family",
    "Gibbs sampling", "Graphical model", "Joint probability distribution",
    "Mixture model", "Pólya urn model", "Posterior probability",
    "Statistical inference", "Sufficient statistic",
    "Tikhonov regularization", "Variational Bayesian methods",
    "Accelerated failure time model", "Akaike information criterion",
    "Arnoldi iteration", "Artificial intelligence",
    "Association rule learning", "Automatic summarization", "Auxiliary verb",
    "Bag of words model", "Bateman–Horn conjecture", "Bell state",
    "Bernstein–von Mises theorem", "Beta-binomial distribution",
    "Beta prime distribution", "BHT algorithm", "Binary Independence Model",
    "Binomial distribution", "Birthday problem", "Bit array",
    "Bitwise operation", "Bloch sphere", "Boosting_(machine_learning)",
    "Bootstrapping", "Brachistochrone curve", "Bregman divergence",
    "Brown clustering", "Bunyakovsky conjecture", "Calculus of variations",
    "Canonical correlation", "CAP theorem", "Catenary",
    "Censored regression model", "Censoring_(statistics)",
    "Chi squared distribution", "Chi squared test",
    "Chinese restaurant process", "Cluster analysis", "Cohen's kappa",
    "Collinearity", "Collision problem",
    "Color wheel graphs of complex functions", "Competitive learning",
    "Complex-valued function", "Conditional probability",
    "Conditional quantum entropy", "Condorcet method", "Confusion matrix",
    "Coprime integers", "Cosine similarity", "Cross-validation_(statistics)",
    "Cross entropy", "Cross entropy method",
    "Cumulative distribution function", "Data compression", "Data science",
    "Decision engineering", "Decision tree learning", "Decision tree model",
    "Density estimation", "Diagonalizable matrix", "Digamma function",
    "Dirac delta function", "Dirichlet process",
    "Dirichlet-multinomial_distribution",
    "Dirichlet's theorem on arithmetic progressions", "Document clustering",
    "Domain coloring", "Doob's martingale convergence theorems",
    "Eigendecomposition of a matrix", "Elementary matrix",
    "Empirical distribution function", "Ensemble learning", "Entropy",
    "Erlang distribution", "Exponential distribution",
    "Exponential function", "F1 score", "Facility location problem",
    "Feedforward neural network", "Fermat's factorization method",
    "Fibonacci prime", "Fieller's theorem", "Floor and ceiling functions",
    "Formula for primes", "Fuzzy clustering", "Gamma distribution",
    "Gamma function", "Gaussian integer",
    "Generalized minimal residual method",
    "Generalized second price auction", "Generalized Tobit", "Genetic drift",
    "Geodesic", "Gini coefficient", "Goal programming",
    "Goldbach's conjecture", "Gradient boosting", "Grammatical modifier",
    "Grover's algorithm", "Hash table", "Heuristic", "Hinge loss",
    "Holevo's theorem", "Huber loss", "Hypercube", "Hyperrectangle",
    "Image processing", "Infimum and supremum",
    "Information gain in decision trees", "Information gain ratio",
    "Information retrieval", "Information theory", "Inner product space",
    "Integrated circuit", "Integration by substitution",
    "Inverse function theorem", "Inverse transform sampling",
    "Inverted index", "Irreducible polynomial", "Isoperimetric inequality",
    "Jackson network", "Jacobian matrix and determinant",
    "Karmarkar's algorithm", "Kendall's notation", "Kernel method",
    "Kronecker delta", "Kullback Leibler divergence", "Lagrange multiplier",
    "Lagrangian relaxation", "Lasso (statistics)",
    "Latent Dirichlet allocation", "Latent semantic analysis",
    "Latent semantic indexing", "Least absolute deviations", "Least squares",
    "Lemmatisation", "Lexeme", "Lexical hypothesis", "Lexicographical order",
    "Likelihood function", "Lindley equation",
    "Linear congruential generator", "Log normal distribution",
    "Logarithmic integral function", "Logistic regression", "Loss function",
    "Loss functions for classification", "Machine learning",
    "Markov chain Monte Carlo", "Mathematical notation",
    "Matthews correlation coefficient", "Mean sojourn time", "Meta learning",
    "Metaheuristic", "Method of moments (statistics)", "Minimal surface",
    "Minimax", "Minimax Condorcet", "Modal verb", "Modular arithmetic",
    "Moran process", "morpheme", "Morphological derivation",
    "Multidimensional scaling", "Multilevel model",
    "Mutilated chessboard problem", "Naive Bayes classifier",
    "Negative binomial distribution", "Neutral vector",
    "No teleportation theorem", "Nonlinear dimensionality reduction",
    "Normal distribution", "Notation in probability and statistics",
    "Null hypothesis", "Occam's razor", "Okapi BM25", "Ontology", "Open set",
    "Optimal control", "Orthogonal transformation", "Overfitting",
    "PageRank", "Pairwise comparison", "Pareto distribution",
    "Part of speech", "pattern", "Piecewise linear function", "Pink noise",
    "Place and route", "Plateau's problem", "Pochhammer symbol",
    "Poisson distribution", "Poisson limit theorem", "Poisson process",
    "Positive definite kernel", "Power iteration", "Precision and recall",
    "Prime element", "Prime number theorem", "Principal component analysis",
    "Prior probability", "Probabilistic latent semantic analysis",
    "Probabilistic relevance model", "Probability Generating Function",
    "Probability integral transform", "Proportional hazards model",
    "Psychometrics", "QR algorithm", "Quantum annealing",
    "Quantum field theory", "Quantum gate", "Quantum information",
    "Quantum information science", "Quantum Monte Carlo", "Qubit",
    "Query optimization", "Queueing theory", "Quotient ring",
    "Radial basis function", "Random forest", "Random variate", "Ranking",
    "Real number", "Receiver operating characteristic", "Relevance",
    "Residual sum of squares", "Revised simplex method", "Riemann surface",
    "Riemann zeta function", "Robust statistics", "Root of unity",
    "Run length encoding", "Scheffé's method",
    "Schönhage Strassen algorithm", "Search engine indexing",
    "Segmented regression", "Self information", "Self organizing map",
    "Semantic search", "Semantics", "Semiparametric model",
    "Sentiment analysis", "Sigmoid function", "Simpson's paradox",
    "Simulated annealing", "Singular value decomposition", "Skewness",
    "Softmax function", "Solved game", "Spectral theorem",
    "Standard deviation", "Standard error", "Stationary point",
    "Statistical ensemble", "Statistical power",
    "Stochastic gradient descent", "Stop words",
    "Stretched exponential function", "Suffix tree",
    "Support vector machine", "Survival analysis", "Swanson's law",
    "Symmetric derivative", "Synonym ring", "Text corpus", "Tf–idf",
    "Tobit model", "Topic model", "Torus", "Travelling salesman problem",
    "Triangular distribution", "Trigamma function",
    "Truncated regression model", "Truncation", "Twin prime",
    "Type I and type II errors", "Ulam spiral", "Variance",
    "Vector quantization", "Von Neumann entropy", "Web crawler",
    "Web search engine", "Web search query", "Weibull distribution",
    "Zipf's law", "F-test of equality of variances", "K-means_clustering",
    "n-gram", "Rayleigh–Ritz method", "Student's t-test",
]

# Maps article name -> the text of all its <p> elements, for every article
# that was fetched successfully and yielded more than 250 characters.
document_set = {}

# One fetcher serves every request; there is no need to build one per article.
fetcher = WikipediaFetcher()

separator = "----------------------------------------------------------------"

for article in article_list:
    scraped_article = fetcher.fetch(page_name=article)

    if not scraped_article['success']:
        print(separator)
        print(article + " failed to scrape")
        print(separator)
        print("")
        continue

    soup = BeautifulSoup(scraped_article['content'], 'html.parser')

    # Only paragraph text is kept; headings, lists and tables are ignored.
    paragraphs = [tag.get_text().strip() for tag in soup.find_all('p')]

    # Every paragraph is preceded by a single space, so the document keeps
    # the same leading space (and length) it had when built by concatenation.
    document_text = "".join(" " + text for text in paragraphs)

    if len(document_text) > 250:
        document_set[article] = document_text
    else:
        # Too little text to be useful (often a redirect or a stub page);
        # show what was retrieved so the article name can be corrected.
        print(separator)
        print(article + " was too short: ")
        print("")
        print(document_text)
        print(separator)
        print("")

In [None]:
# get_rows = soup.find_all(['li', 'p'])

# section_text = {}

# current_section = "Wiki_Introduction"
# section_text[current_section] = ""

# for row in get_rows:    
#     potential_section = row.find('span', { "class" : "toctext"})

#     if potential_section is not None:
#         section = (''.join(potential_section.findAll(text=True))).strip()
#         section_text[section] = ""
#         current_section = section
        
#     else:
#         text = (''.join(row.findAll(text=True))).strip()
#         section_text[current_section] = section_text[current_section] + " " + text
        
# for key, value in section_text.items():
#     print key
#     print value
#     print "---------------------------------------------------------------------------------------------------------------------------------------------------------"

In [14]:
# Check for duplicates and shuffle order
import collections
from random import shuffle

# Materialise the values as a list: on Python 3 dict.values() is a view,
# which random.shuffle() cannot reorder in place.
dataset = list(document_set.values())

# A set collapses identical documents, so a smaller set means that at least
# one document text occurs more than once.
unique_documents = set(dataset)
if len(unique_documents) < len(dataset):
    print("Duplicates found...")
    dataset = list(unique_documents)

# Shuffle in place whether or not duplicates were removed.
shuffle(dataset)

Duplicates found...


In [21]:
import nltk
import re

# Custom tokenizer
class tokenize_custom(object):
    """
    Callable tokenizer: splits a document into sentences, then into words,
    and keeps only the tokens that are long enough and contain a letter.

    min_length is the smallest number of characters a token may have and
    still be kept. The default of 4 matches the previously hard-coded rule
    (len(token) > 3).
    """

    # Compiled once for the class rather than on every token. Matches any
    # token containing at least one ASCII letter.
    _has_letter = re.compile('[a-zA-Z]')

    def __init__(self, min_length=4):
        self.min_length = min_length

    def _keep(self, token):
        """Returns True if the token passes the length and letter filters."""
        if len(token) < self.min_length:
            return False
        # Drops purely numeric tokens and raw punctuation.
        return self._has_letter.search(token) is not None

    def __call__(self, doc):
        """Returns the list of kept tokens of `doc`, in document order."""
        # First tokenize by sentence then by word
        tokens = (word
                  for sent in nltk.sent_tokenize(doc)
                  for word in nltk.word_tokenize(sent))
        return [token for token in tokens if self._keep(token)]

In [27]:
#http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html

from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
    
def print_top_words(model, feature_names, n_top_words):
    """
    Print, for each row of model.components_, a "Topic #<index>:" header
    followed by the n_top_words features with the largest values in that
    row, highest first and separated by spaces. A single blank line is
    printed after the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() is ascending: reverse it, then keep the leading entries.
        ranked = weights.argsort()[::-1][:n_top_words]
        print("Topic #%d:" % topic_idx)
        print(" ".join(feature_names[i] for i in ranked))
    print()

n_topics = 10      # number of topics fitted by both LDA and NMF
n_top_words = 15   # number of words printed per topic

# Use tf (raw term count) features for LDA.
# max_df=0.9 / min_df=2 drop terms found in more than 90% of the documents
# or in fewer than two documents.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.9, min_df=2,
                                tokenizer=tokenize_custom(),
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(dataset)
print("done in %0.3fs." % (time() - t0))

# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, min_df=2,
                                   tokenizer=tokenize_custom(),
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(dataset)
print("done in %0.3fs." % (time() - t0))

# NOTE(review): `n_topics` is the keyword of the scikit-learn release this
# notebook was written against; later releases call it `n_components`.
print("Fitting LDA")
t0 = time()
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=100,
                                learning_method='batch', random_state=1)
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

# Fit the NMF model
# (A stray exit() used to follow the fit, ending the run before the timing
# and the NMF topics below could be printed.)
print("Fitting NMF")
t0 = time()
nmf = NMF(n_components=n_topics, random_state=1,
          alpha=.1, l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

Extracting tf features for LDA...
done in 14.797s.
Extracting tf-idf features for NMF...
done in 14.723s.
Fitting LDA
done in 58.448s.

Topics in LDA model:
Topic #0:
distribution probability function distributions random parameter variables prior mean parameters variable likelihood given value beta
Topic #1:
hypothesis test null number example condorcet false positive precision candidate order result used error power
Topic #2:
algorithm data problem algorithms used method clustering compression time pagerank methods solution cluster number optimal
Topic #3:
search words document documents word used language information text verbs example index term terms relevant
Topic #4:
data model models regression learning used training decision loss analysis linear variables statistical using logistic
Topic #5:
entropy standard information mean deviation variance sample distribution population delta used time value given measure
Topic #6:
gini income hash table used coefficient number values inde