In [1]:
%matplotlib inline
from preamble import *

In [2]:
%%time
import mglearn

Wall time: 1e+03 µs


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Working with Text Data

### Types of data represented as strings
#### Example application: Sentiment analysis of movie reviews

In [None]:
!tree -dL 2 data/aclImdb

In [None]:
!rm -r data/aclImdb/train/unsup

### This process takes Wall time: approximately 21min 26s.  Go get some coffee

In [14]:
%%time
from sklearn.datasets import load_files

reviews_train = load_files("data/aclImdb/train/")
# load_files returns a bunch, containing training texts and training labels
text_train, y_train = reviews_train.data, reviews_train.target
print("type of text_train: {}".format(type(text_train)))
print("length of text_train: {}".format(len(text_train)))
print("text_train[6]:\n{}".format(text_train[6]))

type of text_train: <type 'list'>
length of text_train: 75000
text_train[6]:
Gloomy Sunday - Ein Lied von Liebe und Tod directed by Rolf Schübel in 1999 is a romantic, absorbing, beautiful, and heartbreaking movie. It started like Jules and Jim; it ended as one of Agatha Christie's books, and in between it said something about love, friendship, devotion, jealousy, war, Holocaust, dignity, and betrayal, and it did better than The Black Book which is much more popular. It is not perfect, and it made me, a cynic, wonder in the end on the complexity of the relationships and sensational revelations, and who is who to whom but the movie simply overwhelmed me. Perfect or not, it is unforgettable. All four actors as the parts of the tragic not even a triangle but a rectangle were terrific. I do believe that three men could fell deeply for one girl as beautiful and dignified as Ilona in a star-making performance by young Hungarian actress Erica Marozsán and who would not? The titular song is ha

In [15]:
%%time 
text_train = [doc.replace(b"<br />", b" ") for doc in text_train]

Wall time: 226 ms


In [16]:
np.unique(y_train)

array([0, 1, 2])

In [17]:
print("Samples per class (training): {}".format(np.bincount(y_train)))

Samples per class (training): [12500 12500 50000]


### This process takes Wall time: 10min 26s. Go get some coffee

In [18]:
%%time
reviews_test = load_files("data/aclImdb/test/")
text_test, y_test = reviews_test.data, reviews_test.target
print("Number of documents in test data: {}".format(len(text_test)))
print("Samples per class (test): {}".format(np.bincount(y_test)))
text_test = [doc.replace(b"<br />", b" ") for doc in text_test]

Number of documents in test data: 25000
Samples per class (test): [12500 12500]
Wall time: 4min 12s


### Representing text data as Bag of Words

![bag_of_words](images/bag_of_words.png)

#### Applying bag-of-words to a toy dataset

In [19]:
bards_words =["The fool doth think he is wise,",
              "but the wise man knows himself to be a fool"]

In [20]:
%%time
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
vect.fit(bards_words)

Wall time: 1e+03 µs


In [21]:
print("Vocabulary size: {}".format(len(vect.vocabulary_)))
print("Vocabulary content:\n {}".format(vect.vocabulary_))

Vocabulary size: 13
Vocabulary content:
 {u'fool': 3, u'be': 0, u'he': 4, u'himself': 5, u'wise': 12, u'knows': 7, u'is': 6, u'but': 1, u'to': 11, u'the': 9, u'doth': 2, u'think': 10, u'man': 8}


In [22]:
bag_of_words = vect.transform(bards_words)
print("bag_of_words: {}".format(repr(bag_of_words)))

bag_of_words: <2x13 sparse matrix of type '<type 'numpy.int64'>'
	with 16 stored elements in Compressed Sparse Row format>


In [23]:
print("Dense representation of bag_of_words:\n{}".format(
    bag_of_words.toarray()))

Dense representation of bag_of_words:
[[0 0 1 1 1 0 1 0 0 1 1 0 1]
 [1 1 0 1 0 1 0 1 1 1 0 1 1]]


### Bag-of-word for movie reviews

### This process takes approx: Wall time: 41.4 s

In [24]:
%%time
vect = CountVectorizer().fit(text_train)
X_train = vect.transform(text_train)
print("X_train:\n{}".format(repr(X_train)))

X_train:
<75000x124255 sparse matrix of type '<type 'numpy.int64'>'
	with 10315542 stored elements in Compressed Sparse Row format>
Wall time: 35.3 s


In [25]:
feature_names = vect.get_feature_names()
print("Number of features: {}".format(len(feature_names)))
print("First 20 features:\n{}".format(feature_names[:20]))
print("Features 20010 to 20030:\n{}".format(feature_names[20010:20030]))
print("Every 2000th feature:\n{}".format(feature_names[::2000]))

Number of features: 124255
First 20 features:
[u'00', u'000', u'0000', u'0000000000000000000000000000000001', u'0000000000001', u'000000001', u'000000003', u'00000001', u'000001745', u'00001', u'0001', u'00015', u'0002', u'0007', u'00083', u'000ft', u'000s', u'000th', u'001', u'002']
Features 20010 to 20030:
[u'cheapen', u'cheapened', u'cheapening', u'cheapens', u'cheaper', u'cheapest', u'cheapie', u'cheapies', u'cheapjack', u'cheaply', u'cheapness', u'cheapo', u'cheapozoid', u'cheapquels', u'cheapskate', u'cheapskates', u'cheapy', u'chearator', u'cheat', u'cheata']
Every 2000th feature:
[u'00', u'_require_', u'aideed', u'announcement', u'asteroid', u'banqui\xe8re', u'besieged', u'bollwood', u'btvs', u'carboni', u'chcialbym', u'clotheth', u'consecration', u'cringeful', u'deadness', u'devagan', u'doberman', u'duvall', u'endocrine', u'existent', u'fetiches', u'formatted', u'garard', u'godlie', u'gumshoe', u'heathen', u'honor\xe9', u'immatured', u'interested', u'jewelry', u'kerchner', u'k

### This process takes approx: Wall time: 37min 40s  ==> Hurry up and just wait!!!

In [26]:
%%time
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
scores = cross_val_score(LogisticRegression(), X_train, y_train, cv=5)
print("Mean cross-validation accuracy: {:.2f}".format(np.mean(scores)))

Mean cross-validation accuracy: 0.71
Wall time: 31min 54s


## This process takes approx: Wall time:  approximately 1h 23min 31s==> Hurry up and just wait!!!

In [27]:
%%time
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid.fit(X_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

Best cross-validation score: 0.72
('Best parameters: ', {'C': 0.1})
Wall time: 1h 17min 51s


### This process takes approx: Last it took Wall time: 48.1 s ==> Hurry up and just wait!!!

In [28]:
%%time
X_test = vect.transform(text_test)
print("Test score: {:.2f}".format(grid.score(X_test, y_test)))

Test score: 0.13
Wall time: 6.06 s


### This process takes approx: Last time it took Wall time: 2min 15s ==> Hurry up and just wait!!!

In [29]:
%%time
vect = CountVectorizer(min_df=5).fit(text_train)
X_train = vect.transform(text_train)
print("X_train with min_df: {}".format(repr(X_train)))

X_train with min_df: <75000x44532 sparse matrix of type '<type 'numpy.int64'>'
	with 10191240 stored elements in Compressed Sparse Row format>
Wall time: 36.3 s


In [30]:
feature_names = vect.get_feature_names()

print("First 50 features:\n{}".format(feature_names[:50]))
print("Features 20010 to 20030:\n{}".format(feature_names[20010:20030]))
print("Every 700th feature:\n{}".format(feature_names[::700]))

First 50 features:
[u'00', u'000', u'001', u'007', u'00am', u'00pm', u'00s', u'01', u'02', u'03', u'04', u'05', u'06', u'07', u'08', u'09', u'10', u'100', u'1000', u'1001', u'100k', u'100th', u'100x', u'101', u'101st', u'102', u'103', u'104', u'105', u'106', u'107', u'108', u'109', u'10am', u'10pm', u'10s', u'10th', u'10x', u'11', u'110', u'1100', u'110th', u'111', u'112', u'1138', u'115', u'116', u'117', u'11pm', u'11th']
Features 20010 to 20030:
[u'inert', u'inertia', u'inescapable', u'inescapably', u'inevitability', u'inevitable', u'inevitably', u'inexcusable', u'inexcusably', u'inexhaustible', u'inexistent', u'inexorable', u'inexorably', u'inexpensive', u'inexperience', u'inexperienced', u'inexplicable', u'inexplicably', u'inexpressive', u'inextricably']
Every 700th feature:
[u'00', u'accountability', u'alienate', u'appetite', u'austen', u'battleground', u'bitten', u'bowel', u'burton', u'cat', u'choreographing', u'collide', u'constipation', u'creatively', u'dashes', u'descended', u

### This process takes approx: Last time it took Wall time: 1 hr 15min 41s ==> Hurry up and just wait!!!

In [31]:
%%time
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid.fit(X_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))

Best cross-validation score: 0.72
Wall time: 1h 13min 52s


### Stop-words

In [32]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
print("Number of stop words: {}".format(len(ENGLISH_STOP_WORDS)))
print("Every 10th stopword:\n{}".format(list(ENGLISH_STOP_WORDS)[::10]))

Number of stop words: 318
Every 10th stopword:
['all', 'not', 'one', 'should', 'latterly', 'cannot', 'name', 'each', 'ten', 'beyond', 'mine', 'between', 'full', 'found', 'anything', 'became', 'formerly', 'everyone', 'three', 'anyone', 'was', 'becoming', 'he', 'besides', 'and', 'an', 'fill', 'when', 'becomes', 'hereupon', 'whereby', 'rather']


### This process takes approx: Last time it took Wall time: 36.4 s ==> Hurry up and just wait!!!

In [33]:
%%time
# Specifying stop_words="english" uses the built-in list.
# We could also augment it and pass our own.
vect = CountVectorizer(min_df=5, stop_words="english").fit(text_train)
X_train = vect.transform(text_train)
print("X_train with stop words:\n{}".format(repr(X_train)))

X_train with stop words:
<75000x44223 sparse matrix of type '<type 'numpy.int64'>'
	with 6577418 stored elements in Compressed Sparse Row format>
Wall time: 39 s


### This process takes approx: Last time it took Wall time: 30min 54s ==> Hurry up and just wait!!!

In [34]:
%%time
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid.fit(X_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))

Best cross-validation score: 0.72
Wall time: 31min 51s


### Rescaling the Data with tf-idf
\begin{equation*}
\text{tfidf}(w, d) = \text{tf} \log\big(\frac{N + 1}{N_w + 1}\big) + 1
\end{equation*}

### This process takes approx: Last time it took Wall time: approximately 57 min 32 s ==> Hurry up and just wait!!!

In [35]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(TfidfVectorizer(min_df=5, norm=None),
                     LogisticRegression())
param_grid = {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10]}

grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(text_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))

Best cross-validation score: 0.72
Wall time: 57min 32s


#### This process takes approx: Last time it took Wall time: 19.6 s ==> Hurry up and just wait!!!¶

In [36]:
%%time
vectorizer = grid.best_estimator_.named_steps["tfidfvectorizer"]
# transform the training dataset:
X_train = vectorizer.transform(text_train)
# find maximum value for each of the features over dataset:
max_value = X_train.max(axis=0).toarray().ravel()
sorted_by_tfidf = max_value.argsort()
# get feature names
feature_names = np.array(vectorizer.get_feature_names())

print("Features with lowest tfidf:\n{}".format(
      feature_names[sorted_by_tfidf[:20]]))

print("Features with highest tfidf: \n{}".format(
      feature_names[sorted_by_tfidf[-20:]]))

Features with lowest tfidf:
[u'remained' u'acclaimed' u'combines' u'rapidly' u'uniformly' u'diverse'
 u'avoiding' u'fills' u'feeble' u'admired' u'wherever' u'admission'
 u'abound' u'starters' u'assure' u'pivotal' u'comprehend' u'deliciously'
 u'strung' u'inadvertently']
Features with highest tfidf: 
[u'nukie' u'reno' u'dominick' u'taz' u'ling' u'rob' u'victoria' u'turtles'
 u'khouri' u'lorenzo' u'id' u'zizek' u'elwood' u'nikita' u'rishi' u'timon'
 u'titanic' u'zohan' u'pammy' u'godzilla']
Wall time: 18.9 s


#### This process takes approx: Last time it took Wall time: 11 ms ==> Quick

In [37]:
%%time
sorted_by_idf = np.argsort(vectorizer.idf_)
print("Features with lowest idf:\n{}".format(
       feature_names[sorted_by_idf[:100]]))

Features with lowest idf:
[u'the' u'and' u'of' u'to' u'this' u'is' u'it' u'in' u'that' u'but' u'for'
 u'with' u'was' u'as' u'on' u'movie' u'not' u'one' u'be' u'have' u'are'
 u'film' u'you' u'all' u'at' u'an' u'by' u'from' u'so' u'like' u'who'
 u'there' u'they' u'his' u'if' u'out' u'just' u'about' u'he' u'or' u'has'
 u'what' u'some' u'can' u'good' u'when' u'more' u'up' u'time' u'very'
 u'even' u'only' u'no' u'see' u'would' u'my' u'story' u'really' u'which'
 u'well' u'had' u'me' u'than' u'their' u'much' u'were' u'get' u'other'
 u'do' u'been' u'most' u'also' u'into' u'don' u'her' u'first' u'great'
 u'how' u'made' u'people' u'will' u'make' u'because' u'way' u'could' u'bad'
 u'we' u'after' u'them' u'too' u'any' u'then' u'movies' u'watch' u'she'
 u'think' u'seen' u'acting' u'its' u'characters']
Wall time: 8 ms


#### Investigating model coefficients

In [38]:
mglearn.tools.visualize_coefficients(
    grid.best_estimator_.named_steps["logisticregression"].coef_,
    feature_names, n_top_features=40)

ValueError: coeffients must be 1d array or column vector, got shape (3L, 44532L)

#### Bag of words with more than one word (n-grams)

In [39]:
print("bards_words:\n{}".format(bards_words))

bards_words:
['The fool doth think he is wise,', 'but the wise man knows himself to be a fool']


#### This process takes approx: Last time it took Wall time: 7 ms  ==> Hurry up and just wait!!!¶

In [40]:
%%time
cv = CountVectorizer(ngram_range=(1, 1)).fit(bards_words)
print("Vocabulary size: {}".format(len(cv.vocabulary_)))
print("Vocabulary:\n{}".format(cv.get_feature_names()))

Vocabulary size: 13
Vocabulary:
[u'be', u'but', u'doth', u'fool', u'he', u'himself', u'is', u'knows', u'man', u'the', u'think', u'to', u'wise']
Wall time: 7 ms


#### This process takes approx: Last time it took Wall time: XX s ==> Hurry up and just wait!!!¶

In [41]:
%%time
cv = CountVectorizer(ngram_range=(2, 2)).fit(bards_words)
print("Vocabulary size: {}".format(len(cv.vocabulary_)))
print("Vocabulary:\n{}".format(cv.get_feature_names()))

Vocabulary size: 14
Vocabulary:
[u'be fool', u'but the', u'doth think', u'fool doth', u'he is', u'himself to', u'is wise', u'knows himself', u'man knows', u'the fool', u'the wise', u'think he', u'to be', u'wise man']
Wall time: 7 ms


In [42]:
print("Transformed data (dense):\n{}".format(cv.transform(bards_words).toarray()))

Transformed data (dense):
[[0 0 1 1 1 0 1 0 0 1 0 1 0 0]
 [1 1 0 0 0 1 0 1 1 0 1 0 1 1]]


#### This process takes approx: Last time it took Wall time: 8 ms ==>  Quick!!¶

In [43]:
%%time
cv = CountVectorizer(ngram_range=(1, 3)).fit(bards_words)
print("Vocabulary size: {}".format(len(cv.vocabulary_)))
print("Vocabulary:\n{}".format(cv.get_feature_names()))

Vocabulary size: 39
Vocabulary:
[u'be', u'be fool', u'but', u'but the', u'but the wise', u'doth', u'doth think', u'doth think he', u'fool', u'fool doth', u'fool doth think', u'he', u'he is', u'he is wise', u'himself', u'himself to', u'himself to be', u'is', u'is wise', u'knows', u'knows himself', u'knows himself to', u'man', u'man knows', u'man knows himself', u'the', u'the fool', u'the fool doth', u'the wise', u'the wise man', u'think', u'think he', u'think he is', u'to', u'to be', u'to be fool', u'wise', u'wise man', u'wise man knows']
Wall time: 8 ms


#### This process takes approx: Last time it took Wall time: XX s ==> Hurry up and just wait!!!¶

In [None]:
%%time
pipe = make_pipeline(TfidfVectorizer(min_df=5), LogisticRegression())
# running the grid-search takes a long time because of the
# relatively large grid and the inclusion of trigrams
param_grid = {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10, 100],
              "tfidfvectorizer__ngram_range": [(1, 1), (1, 2), (1, 3)]}

grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(text_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters:\n{}".format(grid.best_params_))

In [None]:
%%time
# extract scores from grid_search
scores = grid.cv_results_['mean_test_score'].reshape(-1, 3).T
# visualize heat map
heatmap = mglearn.tools.heatmap(
    scores, xlabel="C", ylabel="ngram_range", cmap="viridis", fmt="%.3f",
    xticklabels=param_grid['logisticregression__C'],
    yticklabels=param_grid['tfidfvectorizer__ngram_range'])
plt.colorbar(heatmap)

In [None]:
%%time
# extract feature names and coefficients
vect = grid.best_estimator_.named_steps['tfidfvectorizer']
feature_names = np.array(vect.get_feature_names())
coef = grid.best_estimator_.named_steps['logisticregression'].coef_
mglearn.tools.visualize_coefficients(coef, feature_names, n_top_features=40)
plt.ylim(-22, 22)

In [None]:
%%time
# find 3-gram features
mask = np.array([len(feature.split(" ")) for feature in feature_names]) == 3
# visualize only 3-gram features
mglearn.tools.visualize_coefficients(coef.ravel()[mask],
                                     feature_names[mask], n_top_features=40)
plt.ylim(-22, 22)

#### Advanced tokenization, stemming and lemmatization

#### This process takes approx: Last time it took Wall time: 12.5 s 

In [4]:
%%time
import spacy
import nltk

# load spacy's English-language models
en_nlp = spacy.load('en')
# instantiate nltk's Porter stemmer
stemmer = nltk.stem.PorterStemmer()

# define function to compare lemmatization in spacy with stemming in nltk
def compare_normalization(doc):
    # tokenize document in spacy
    doc_spacy = en_nlp(doc)
    # print lemmas found by spacy
    print("Lemmatization:")
    print([token.lemma_ for token in doc_spacy])
    # print tokens found by Porter stemmer
    print("Stemming:")
    print([stemmer.stem(token.norm_.lower()) for token in doc_spacy])

Wall time: 12.5 s


In [5]:
compare_normalization(u"Our meeting today was worse than yesterday, "
                       "I'm scared of meeting the clients tomorrow.")

Lemmatization:
[u'-PRON-', u'meeting', u'today', u'be', u'bad', u'than', u'yesterday', u',', u'-PRON-', u'be', u'scared', u'of', u'meet', u'the', u'client', u'tomorrow', u'.']
Stemming:
[u'our', u'meet', u'today', u'wa', u'wors', u'than', u'yesterday', u',', u'i', u'am', u'scare', u'of', u'meet', u'the', u'client', u'tomorrow', u'.']


In [7]:
%%time
# Technicallity: we want to use the regexp based tokenizer
# that is used by CountVectorizer  and only use the lemmatization
# from SpaCy. To this end, we replace en_nlp.tokenizer (the SpaCy tokenizer)
# with the regexp based tokenization
import re
from sklearn.feature_extraction.text import CountVectorizer
# regexp used in CountVectorizer:
regexp = re.compile('(?u)\\b\\w\\w+\\b')

# load spacy language model and save old tokenizer
en_nlp = spacy.load('en')
old_tokenizer = en_nlp.tokenizer
# replace the tokenizer with the preceding regexp
en_nlp.tokenizer = lambda string: old_tokenizer.tokens_from_list(
    regexp.findall(string))

# create a custom tokenizer using the SpaCy document processing pipeline
# (now using our own tokenizer)
def custom_tokenizer(document):
    doc_spacy = en_nlp(document, entity=False, parse=False)
    return [token.lemma_ for token in doc_spacy]

# define a count vectorizer with the custom tokenizer
lemma_vect = CountVectorizer(tokenizer=custom_tokenizer, min_df=5)

Wall time: 739 ms


### This process takes approx: Last time it took Wall time: XX min XX s ==> Hurry up and just wait!!!

In [None]:
%%time
# transform text_train using CountVectorizer with lemmatization
X_train_lemma = lemma_vect.fit_transform(text_train)
print("X_train_lemma.shape: {}".format(X_train_lemma.shape))

# standard CountVectorizer for reference
vect = CountVectorizer(min_df=5).fit(text_train)
X_train = vect.transform(text_train)
print("X_train.shape: {}".format(X_train.shape))

### This process takes approx: Last time it took Wall time: XX min XX s ==> Hurry up and just wait!!!

In [None]:
%%time
# build a grid-search using only 1% of the data as training set:
from sklearn.model_selection import StratifiedShuffleSplit

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.99,
                            train_size=0.01, random_state=0)
grid = GridSearchCV(LogisticRegression(), param_grid, cv=cv)
# perform grid search with standard CountVectorizer
grid.fit(X_train, y_train)
print("Best cross-validation score "
      "(standard CountVectorizer): {:.3f}".format(grid.best_score_))
# perform grid search with Lemmatization
grid.fit(X_train_lemma, y_train)
print("Best cross-validation score "
      "(lemmatization): {:.3f}".format(grid.best_score_))

### Topic Modeling and Document Clustering
#### Latent Dirichlet Allocation

In [None]:
%%time
vect = CountVectorizer(max_features=10000, max_df=.15)
X = vect.fit_transform(text_train)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
import numpy as np
x = np.array([0, 1, 4, 5, 6, 8, 8, 9, 11, 10, 19, 19, 2, 19, 7, 3])
for i in [75, 90, 95, 99]:
    print i, np.percentile(x,i)

### This process takes approx: Last time it took Wall time: XX min XX s ==> Hurry up and just wait!!!

In [None]:
%%time
import pandas as pd
url = "https://www.fdic.gov/bank/individual/failed/banklist.html"
#dfs = pd.read_html(url)
#print dfs


import wget
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
dfs = wget.download(url)
print dfs

### This process takes approx: Last time it took Wall time: XX min XX s ==> Hurry up and just wait!!!

In [None]:
%%time
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_topics=10, learning_method="batch",
                                max_iter=25, random_state=0)
# be build the model and transform the data in one step
# computing transform takes some time,
# and we can save time by doing both at once.
document_topics = lda.fit_transform(X)

In [None]:
print("lda.components_.shape: {}".format(lda.components_.shape))

In [None]:
%%time
# for each topic (a row in the components_), sort the features (ascending).
# Invert rows with [:, ::-1] to make sorting descending
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
# get the feature names from the vectorizer:
feature_names = np.array(vect.get_feature_names())

In [None]:
%%time
# Print out the 10 topics:
mglearn.tools.print_topics(topics=range(10), feature_names=feature_names,
                           sorting=sorting, topics_per_chunk=5, n_words=10)

In [None]:
%%time
lda100 = LatentDirichletAllocation(n_topics=100, learning_method="batch",
                                   max_iter=25, random_state=0)
document_topics100 = lda100.fit_transform(X)

In [None]:
topics = np.array([7, 16, 24, 25, 28, 36, 37, 41, 45, 51, 53, 54, 63, 89, 97])

In [None]:
%%time
sorting = np.argsort(lda100.components_, axis=1)[:, ::-1]
feature_names = np.array(vect.get_feature_names())
mglearn.tools.print_topics(topics=topics, feature_names=feature_names,
                           sorting=sorting, topics_per_chunk=5, n_words=20)

In [None]:
%%time
# sort by weight of "music" topic 45
music = np.argsort(document_topics100[:, 45])[::-1]
# print the five documents where the topic is most important
for i in music[:10]:
    # show first two sentences
    print(b".".join(text_train[i].split(b".")[:2]) + b".\n")

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(10, 10))
topic_names = ["{:>2} ".format(i) + " ".join(words)
               for i, words in enumerate(feature_names[sorting[:, :2]])]
# two column bar chart:
for col in [0, 1]:
    start = col * 50
    end = (col + 1) * 50
    ax[col].barh(np.arange(50), np.sum(document_topics100, axis=0)[start:end])
    ax[col].set_yticks(np.arange(50))
    ax[col].set_yticklabels(topic_names[start:end], ha="left", va="top")
    ax[col].invert_yaxis()
    ax[col].set_xlim(0, 2000)
    yax = ax[col].get_yaxis()
    yax.set_tick_params(pad=130)
plt.tight_layout()

### Summary and Outlook