# Working with Text data

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Source of the movie review data used below: http://ai.stanford.edu/~amaas/data/sentiment/

## Example application: Sentiment analysis of movie reviews

In [None]:
# Shell call (needs the `tree` utility): list the data/aclImdb directory,
# limited to two levels of depth
!tree -L 2 data/aclImdb

In [None]:
from sklearn.datasets import load_files

# load_files returns a bunch; pull the training texts and training labels
# out of it
reviews_train = load_files("data/aclImdb/train/")
text_train = reviews_train.data
y_train = reviews_train.target

# Quick look at what was loaded: container type, size and one sample document
for report in ("type of text_train: {}".format(type(text_train)),
               "length of text_train: {}".format(len(text_train)),
               "text_train[1]:\n{}".format(text_train[1])):
    print(report)

In [None]:
# Swap every HTML line-break tag in the (bytes) training documents for a space
text_train = [review.replace(b"<br />", b" ") for review in text_train]

In [None]:
# Number of training documents for each label value
train_class_counts = np.bincount(y_train)
print("Samples per class (training):", train_class_counts)

In [None]:
# Load the test split and unpack its texts and labels
reviews_test = load_files("data/aclImdb/test/")
text_test = reviews_test.data
y_test = reviews_test.target
print("Number of documents in test data:", len(text_test))
test_class_counts = np.bincount(y_test)
print("Samples per class (test):", test_class_counts)
# Swap every HTML line-break tag in the (bytes) test documents for a space
text_test = [review.replace(b"<br />", b" ") for review in text_test]

#### Applying bag-of-words to a toy dataset

In [None]:
# Two-document toy corpus used for the bag-of-words examples
bards_words = [
    "The fool doth think he is wise,",
    "but the wise man knows himself to be a fool",
]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit a count vectorizer on the toy corpus; fit() returns the vectorizer
# itself, which the trailing expression displays
vect = CountVectorizer().fit(bards_words)
vect

In [None]:
# Report the size and the content of the fitted vocabulary_ mapping
toy_vocabulary = vect.vocabulary_
print("Vocabulary size:", len(toy_vocabulary))
print("Vocabulary content:", toy_vocabulary, sep="\n")

In [None]:
# Encode the toy corpus with the fitted vectorizer and show the result's repr
bag_of_words = vect.transform(bards_words)
matrix_summary = repr(bag_of_words)
print("bag_of_words:", matrix_summary)

In [None]:
# Convert to a regular array so the individual counts can be read
dense_counts = bag_of_words.toarray()
print("Dense representation of bag_of_words:\n{}".format(dense_counts))

In [None]:
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() (available since 1.0) is its replacement
vect.get_feature_names_out()

In [None]:
# Map the encoded documents back through the vectorizer and display the result
terms_per_document = vect.inverse_transform(bag_of_words)
terms_per_document

### Bag-of-words for movie reviews

In [None]:
# Fit a fresh count vectorizer on the training reviews, then encode them
vect = CountVectorizer()
vect.fit(text_train)
X_train = vect.transform(text_train)
print("X_train:\n{}".format(repr(X_train)))

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() and convert to a list so the slices below still
# print as plain lists
feature_names = list(vect.get_feature_names_out())
print("Number of features:", len(feature_names))
print("First 20 features:")
print(feature_names[:20])
print("Features 20010 to 20030:")
print(feature_names[20010:20030])
print("Every 2000th feature:")
print(feature_names[::2000])

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# 5-fold cross-validation of a default LogisticRegression on the count features
baseline_model = LogisticRegression()
scores = cross_val_score(baseline_model, X_train, y_train, cv=5)
mean_score = np.mean(scores)
print("Mean cross-validation accuracy: {:.2f}".format(mean_score))

In [None]:
from sklearn.model_selection import GridSearchCV

# Values of LogisticRegression's C parameter to try
param_grid = dict(C=[0.001, 0.01, 0.1, 1])
# fit() returns the fitted search object, so construction and fitting chain
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5).fit(X_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

In [None]:
# Encode the test reviews with the already-fitted vectorizer and score the
# fitted grid search on them
X_test = vect.transform(text_test)
test_score = grid.score(X_test, y_test)
print("Test score: {:.2f}".format(test_score))

In [None]:
# Refit the vectorizer with min_df=5 (per scikit-learn's documentation this
# drops terms found in fewer than 5 documents) and re-encode the training set
vect = CountVectorizer(min_df=5)
vect.fit(text_train)
X_train = vect.transform(text_train)
print("X_train with min_df:", repr(X_train))

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() and convert to a list so the slices below still
# print as plain lists
feature_names = list(vect.get_feature_names_out())

print("First 50 features:")
print(feature_names[:50])
print("Features 20010 to 20030:")
print(feature_names[20010:20030])
print("Every 700th feature:")
print(feature_names[::700])

In [None]:
# Re-run the same search over param_grid on the current X_train
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5).fit(X_train, y_train)
best_cv_score = grid.best_score_
print("Best cross-validation score: {:.2f}".format(best_cv_score))

### Stop-words

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Materialize the stop-word collection once so it can be counted and sliced
stop_word_list = list(ENGLISH_STOP_WORDS)
print("Number of stop words:", len(stop_word_list))
print("Every 10th stopword:")
print(stop_word_list[::10])

In [None]:
# stop_words="english" selects the built-in list.
# An augmented list of our own could be passed instead.
vect = CountVectorizer(min_df=5, stop_words="english")
vect.fit(text_train)
X_train = vect.transform(text_train)
print("X_train with stop words:", repr(X_train), sep="\n")

In [None]:
# Re-run the same search over param_grid on the current X_train
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5).fit(X_train, y_train)
best_cv_score = grid.best_score_
print("Best cross-validation score: {:.2f}".format(best_cv_score))

### Rescaling the data with TFIDF
\begin{equation*}
\text{tfidf}(w, d) = \text{tf} \cdot \Big(\log\big(\frac{N + 1}{N_w + 1}\big) + 1\Big)
\end{equation*}

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

# Chain vectorizer and classifier in one pipeline and search over the
# classifier's C; the search is fitted on the raw texts
tfidf_step = TfidfVectorizer(min_df=5)
classifier_step = LogisticRegression()
pipe = make_pipeline(tfidf_step, classifier_step)
c_candidates = [0.001, 0.01, 0.1, 1, 10]
param_grid = {'logisticregression__C': c_candidates}

grid = GridSearchCV(pipe, param_grid, cv=5).fit(text_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))

In [None]:
# Pull the fitted tf-idf step out of the best pipeline found by the search
vectorizer = grid.best_estimator_.named_steps["tfidfvectorizer"]
# transform the training dataset:
X_train = vectorizer.transform(text_train)
# find maximum value for each of the features over dataset:
max_value = X_train.max(axis=0).toarray().ravel()
sorted_by_tfidf = max_value.argsort()
# get feature names as an array so they can be indexed with index arrays;
# get_feature_names() was removed in scikit-learn 1.2, so use its
# replacement get_feature_names_out()
feature_names = np.asarray(vectorizer.get_feature_names_out())

print("Features with lowest tfidf:")
print(feature_names[sorted_by_tfidf[:20]])

print("Features with highest tfidf:")
print(feature_names[sorted_by_tfidf[-20:]])

In [None]:
# Feature indices ordered by increasing idf_ value
idf_values = vectorizer.idf_
sorted_by_idf = np.argsort(idf_values)
lowest_idf_names = feature_names[sorted_by_idf[:100]]
print("Features with lowest idf:")
print(lowest_idf_names)

#### Investigating model coefficients

In [None]:
def visualize_coefficients(coefficients, feature_names, n_top_features=25):
    """Visualize coefficients of a linear model.

    Opens a new 15x5 inch figure and draws a bar chart of the smallest (most
    negative) and largest (most positive) coefficients, labelled with their
    feature names. Negative bars are red, all others blue.

    Parameters
    ----------
    coefficients : array-like, shape (n_features,)
        Model coefficients. Row or column vectors such as ``(1, n_features)``
        are accepted and flattened.

    feature_names : list or nd-array of strings, shape (n_features,)
        Feature names for labeling the coefficients.

    n_top_features : int, default=25
        How many features to show. The function will show the largest (most
        positive) and smallest (most negative)  n_top_features coefficients,
        for a total of 2 * n_top_features coefficients. If fewer than
        n_top_features coefficients exist, each half shows all of them.

    Raises
    ------
    ValueError
        If ``coefficients`` still has more than one dimension after
        squeezing, or if its length differs from ``len(feature_names)``.
    """
    coefficients = np.asarray(coefficients).squeeze()
    if coefficients.ndim > 1:
        # this is not a row or column vector
        raise ValueError("coefficients must be 1d array or column vector, got"
                         " shape {}".format(coefficients.shape))
    # ravel() also turns the 0-d result of squeezing a single coefficient
    # back into a 1-d array
    coef = coefficients.ravel()

    if len(coef) != len(feature_names):
        raise ValueError("Number of coefficients {} doesn't match number of"
                         " feature names {}.".format(len(coef),
                                                     len(feature_names)))
    # indices of the most negative and most positive coefficients; sort once
    # and slice both ends
    order = np.argsort(coef)
    negative_coefficients = order[:n_top_features]
    # an explicit start index is used because order[-0:] would select
    # everything when n_top_features is 0
    positive_coefficients = order[max(len(order) - n_top_features, 0):]
    interesting_coefficients = np.hstack([negative_coefficients,
                                          positive_coefficients])
    # positions follow the number of bars actually selected, which is smaller
    # than 2 * n_top_features when there are few features
    n_bars = len(interesting_coefficients)
    # plot them
    plt.figure(figsize=(15, 5))
    colors = ['red' if c < 0 else 'blue'
              for c in coef[interesting_coefficients]]
    plt.bar(np.arange(n_bars), coef[interesting_coefficients],
            color=colors)
    feature_names = np.array(feature_names)
    plt.subplots_adjust(bottom=0.3)
    # NOTE(review): ticks start at 1 while bars start at 0; this offset is
    # kept from the original and presumably compensates for the
    # right-aligned, rotated labels -- confirm visually
    plt.xticks(np.arange(1, 1 + n_bars),
               feature_names[interesting_coefficients], rotation=60,
               ha="right")
    plt.ylabel("Coefficient magnitude")
    plt.xlabel("Feature")

In [None]:
# visualize_coefficients opens its own figure, so no plt.figure() call is made
# here: a figure created beforehand would be left empty and shown as a stray
# blank output
visualize_coefficients(
    grid.best_estimator_.named_steps["logisticregression"].coef_,
    feature_names, n_top_features=40)

# Bag of words with more than one word (n-grams)

In [None]:
# Show the toy corpus again before building n-gram vocabularies from it
print("bards_words:", bards_words, sep="\n")

In [None]:
# ngram_range=(1, 1): single tokens only
cv = CountVectorizer(ngram_range=(1, 1)).fit(bards_words)
print("Vocabulary size:", len(cv.vocabulary_))
print("Vocabulary:")
# get_feature_names() was removed in scikit-learn 1.2; its replacement
# returns an array, converted here so the output stays a plain list
print(list(cv.get_feature_names_out()))

In [None]:
# ngram_range=(2, 2): pairs of consecutive tokens only
cv = CountVectorizer(ngram_range=(2, 2)).fit(bards_words)
print("Vocabulary size:", len(cv.vocabulary_))
print("Vocabulary:")
# get_feature_names() was removed in scikit-learn 1.2; its replacement
# returns an array, converted here so the output stays a plain list
print(list(cv.get_feature_names_out()))

In [None]:
# Encode the toy corpus with the current `cv` and show the counts densely
dense_counts = cv.transform(bards_words).toarray()
print("Transformed data (dense):\n{}".format(dense_counts))

In [None]:
# ngram_range=(1, 3): single tokens, pairs and triples of consecutive tokens
cv = CountVectorizer(ngram_range=(1, 3)).fit(bards_words)
print("Vocabulary size:", len(cv.vocabulary_))
print("Vocabulary:")
# get_feature_names() was removed in scikit-learn 1.2; its replacement
# returns an array, converted here so the output stays a plain list
print(list(cv.get_feature_names_out()))

In [None]:
# The search below takes a long time: the grid is relatively large and
# includes trigrams
pipe = make_pipeline(TfidfVectorizer(min_df=5), LogisticRegression())
param_grid = {
    'logisticregression__C': [0.001, 0.01, 0.1, 1, 10, 100],
    "tfidfvectorizer__ngram_range": [(1, 1), (1, 2), (1, 3)],
}

# fit() returns the fitted search object, so construction and fitting chain
grid = GridSearchCV(pipe, param_grid, cv=5).fit(text_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters:", grid.best_params_, sep="\n")

In [None]:
# Vocabulary size with default settings. len(vocabulary_) is used because
# get_feature_names() was removed in scikit-learn 1.2; the vocabulary_
# mapping holds one entry per feature.
len(CountVectorizer().fit(text_train).vocabulary_)

In [None]:
# Vocabulary size with min_df=5. len(vocabulary_) is used because
# get_feature_names() was removed in scikit-learn 1.2; the vocabulary_
# mapping holds one entry per feature.
len(CountVectorizer(min_df=5).fit(text_train).vocabulary_)

In [None]:
# Vocabulary size with unigrams and bigrams. len(vocabulary_) is used because
# get_feature_names() was removed in scikit-learn 1.2; the vocabulary_
# mapping holds one entry per feature.
len(CountVectorizer(ngram_range=(1, 2)).fit(text_train).vocabulary_)

In [None]:
# Vocabulary size with unigrams and bigrams and min_df=5. len(vocabulary_) is
# used because get_feature_names() was removed in scikit-learn 1.2; the
# vocabulary_ mapping holds one entry per feature.
len(CountVectorizer(ngram_range=(1, 2), min_df=5).fit(text_train).vocabulary_)

In [None]:
# Vocabulary size with unigrams and bigrams, min_df=5 and the built-in English
# stop-word list. len(vocabulary_) is used because get_feature_names() was
# removed in scikit-learn 1.2; the vocabulary_ mapping holds one entry per
# feature.
len(CountVectorizer(ngram_range=(1, 2), min_df=5,
                    stop_words="english").fit(text_train).vocabulary_)

In [None]:
# extract scores from grid_search: reshape to one row per C value and one
# column per ngram_range, then transpose so rows are ngram ranges. The shape
# is derived from param_grid instead of a hard-coded 3.
c_values = param_grid['logisticregression__C']
ngram_ranges = param_grid['tfidfvectorizer__ngram_range']
scores = grid.cv_results_['mean_test_score'].reshape(
    len(c_values), len(ngram_ranges)).T

# visualize heatmap with plain matplotlib (the notebook never imports mglearn,
# so mglearn.tools.heatmap would raise a NameError)
fig, ax = plt.subplots()
heatmap = ax.pcolor(scores, cmap="viridis")
ax.set_xlabel("C")
ax.set_ylabel("ngram_range")
# pcolor cells span [i, i + 1], so their centers sit at i + 0.5
ax.set_xticks(np.arange(len(c_values)) + 0.5)
ax.set_yticks(np.arange(len(ngram_ranges)) + 0.5)
ax.set_xticklabels([str(c) for c in c_values])
ax.set_yticklabels([str(r) for r in ngram_ranges])
ax.set_aspect(1)
# annotate every cell with its score; pick a text color that contrasts with
# the dark (low) or bright (high) end of the colormap
midpoint = (scores.min() + scores.max()) / 2
for (row, col), value in np.ndenumerate(scores):
    ax.text(col + 0.5, row + 0.5, "%.3f" % value, ha="center", va="center",
            color="white" if value < midpoint else "black")
plt.colorbar(heatmap, ax=ax)

In [None]:
# extract feature names and coefficients from the best pipeline
vect = grid.best_estimator_.named_steps['tfidfvectorizer']
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
feature_names = np.asarray(vect.get_feature_names_out())
coef = grid.best_estimator_.named_steps['logisticregression'].coef_
# use the visualize_coefficients function defined in this notebook; mglearn
# is never imported here, so mglearn.tools.* would raise a NameError
visualize_coefficients(coef, feature_names, n_top_features=40)
plt.ylim(-22, 22)

In [None]:
# find 3-gram features: names made of exactly three space-separated tokens
mask = np.array([len(feature.split(" ")) for feature in feature_names]) == 3
# visualize only 3-gram features, using the visualize_coefficients function
# defined in this notebook (mglearn is never imported here, so
# mglearn.tools.* would raise a NameError)
visualize_coefficients(coef.ravel()[mask],
                       feature_names[mask], n_top_features=40)
plt.ylim(-22, 22)

# Exercise
Compare unigram and bigram models on the 20 newsgroups dataset.

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Newsgroups to load and the message parts to strip from every post
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
remove = ('headers', 'footers', 'quotes')

# Both splits are fetched with the same options; only `subset` differs
shared_options = dict(categories=categories, shuffle=True, random_state=42,
                      remove=remove)
data_train = fetch_20newsgroups(subset='train', **shared_options)
data_test = fetch_20newsgroups(subset='test', **shared_options)

In [None]:
# Display the first document of the training split
first_training_document = data_train.data[0]
first_training_document