In [None]:
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
#plt.rcParams["figure.dpi"] = 300
plt.rcParams["savefig.dpi"] = 300
plt.rcParams["savefig.bbox"] = "tight"

np.set_printoptions(precision=3, suppress=True)
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale, StandardScaler

In [None]:
# Shell out to `tree` to list only the directories (-d) of the dataset folder, two levels deep (-L 2).
!tree -dL 2 aclImdb

In [None]:
from sklearn.datasets import load_files

# Read the IMDb training folder. load_files returns a bunch that holds the
# training texts in .data and the training labels in .target.
reviews_train = load_files("aclImdb/train/")
text_trainval = reviews_train.data
y_trainval = reviews_train.target
# Sanity check: container type, number of documents, and one decoded review.
print(
    "type of text_trainval: {}".format(type(text_trainval)),
    "length of text_trainval: {}".format(len(text_trainval)),
    "text_trainval[1]:\n{}".format(text_trainval[1].decode()),
    sep="\n",
)

In [None]:
print(text_train[11451].decode())

In [None]:
print(text_train[16019].decode())

In [None]:
text_train = [doc.replace(b"<br />", b" ") for doc in text_train]

# Get some data from the European Parliament

In [None]:
import xml.etree.ElementTree as ET

In [None]:
import requests

In [None]:
# Download the European Parliament members XML feed.
# The timeout keeps the cell from hanging on an unresponsive server, and
# raise_for_status() reports an HTTP error here rather than as a confusing
# XML parse failure in the next cell.
response = requests.get(
    "http://www.europarl.europa.eu/meps/en/xml.html?query=full&filter=all",
    timeout=30,
)
response.raise_for_status()

In [None]:
data_xml = ET.fromstring(response.text)

In [None]:
members_xml = data_xml.getchildren()

In [None]:
members_dict = [{i.tag: i.text for i in member} for member in members_xml]
members = pd.DataFrame(members_dict)

In [None]:
members.head()

In [None]:
# Two-sentence toy corpus used by the small vectorizer demonstrations.
malory = ["Do you want ants?",
          "Because that’s how you get ants."]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary of the toy corpus with the default settings and print it.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() - confirm which version this notebook targets.
vect = CountVectorizer()
vect.fit(malory)
print(vect.get_feature_names())

In [None]:
# Encode the corpus with the fitted vocabulary; the bare name displays the result.
X = vect.transform(malory)
X

In [None]:
# Dense view of the same matrix (fine here because the corpus is tiny).
X.toarray()

In [None]:
# Map each encoded row back to the vocabulary terms it contains.
print(malory)
print(vect.inverse_transform(X)[0])
print(vect.inverse_transform(X)[1])

# Classification example

In [None]:
from sklearn.datasets import load_files

# Read the IMDb training folder. load_files returns a bunch that holds the
# training texts in .data and the training labels in .target.
reviews_train = load_files("aclImdb/train/")
text_trainval = reviews_train.data
y_trainval = reviews_train.target
# Sanity check: container type, number of documents, and one decoded review.
print(
    "type of text_trainval: {}".format(type(text_trainval)),
    "length of text_trainval: {}".format(len(text_trainval)),
    "text_trainval[1]:\n{}".format(text_trainval[1].decode()),
    sep="\n",
)

In [None]:
# Replace the HTML line breaks embedded in the raw reviews with spaces.
text_trainval = [review.replace(b"<br />", b" ") for review in text_trainval]

# Hold out a stratified validation set with a fixed seed.
text_train, text_val, y_train, y_val = train_test_split(
    text_trainval,
    y_trainval,
    stratify=y_trainval,
    random_state=0,
)
# Learn the vocabulary on the training documents only, then encode both parts.
vect = CountVectorizer()
X_train = vect.fit_transform(text_train)
X_val = vect.transform(text_val)

In [None]:
# Display the encoded training matrix.
X_train

In [None]:
# Sample the learned vocabulary: the first ten terms, a slice starting at
# index 20000, and every 2000th term.
feature_names = vect.get_feature_names()
print(feature_names[:10])
print(feature_names[20000:20020])
print(feature_names[::2000])

In [None]:
from sklearn.linear_model import LogisticRegressionCV
# Logistic regression on the bag-of-words counts; the CV variant selects the
# regularisation strength C by internal cross-validation.
lr = LogisticRegressionCV().fit(X_train, y_train)

In [None]:
# The C value that was selected.
lr.C_

In [None]:
# Score of the fitted model on the held-out validation documents.
lr.score(X_val, y_val)

In [None]:
def plot_important_features(coef, feature_names, top_n=20, ax=None, rotation=60):
    """Bar-plot the most negative and the most positive coefficients.

    The ``top_n`` smallest coefficients are drawn in red on the left and the
    ``top_n`` largest in blue on the right, each labelled with its feature name.

    Parameters
    ----------
    coef : array-like of shape (n_features,)
        One coefficient per feature.
    feature_names : array-like of shape (n_features,)
        Label for every coefficient, in the same order as ``coef``.
    top_n : int, default=20
        Number of coefficients to show from each end of the sorted order.
        If ``coef`` has fewer than ``2 * top_n`` entries the two groups are
        shrunk so that no coefficient is drawn twice.
    ax : matplotlib Axes, optional
        Axes to draw on. Defaults to the current axes (``plt.gca()``).
    rotation : float, default=60
        Rotation of the tick labels.

    Returns
    -------
    None
    """
    if ax is None:
        ax = plt.gca()
    # Both inputs are fancy-indexed below, so accept plain lists as well.
    coef = np.asarray(coef)
    feature_names = np.asarray(feature_names)

    order = np.argsort(coef)
    n_features = len(order)
    # Clamp the group sizes so the low and high groups never overlap.
    n_low = min(max(top_n, 0), n_features // 2)
    n_high = min(max(top_n, 0), n_features - n_low)
    low = order[:n_low]
    # Slice from an explicit start index: order[-0:] would select everything.
    high = order[n_features - n_high:]
    important = np.hstack([low, high])
    positions = range(len(important))
    colors = ['red'] * n_low + ['blue'] * n_high

    ax.bar(positions, coef[important], color=colors)
    ax.set_xticks(positions)
    ax.set_xticklabels(feature_names[important], rotation=rotation, ha="right")
    ax.set_xlim(-.7, len(important))
    ax.set_frame_on(False)

In [None]:
# Plot the 20 most negative and 20 most positive coefficients of the fitted
# model and write the figure to disk.
# NOTE(review): savefig assumes an images/ directory already exists - confirm.
plt.figure(figsize=(15, 6))
plot_important_features(lr.coef_.ravel(), np.array(feature_names), top_n=20, rotation=40)
ax = plt.gca()
plt.savefig("images/coefficients.png")


# Vectorization options

In [None]:
# Regular expression the current vectorizer uses to extract tokens.
print(vect.token_pattern)

In [None]:
# Custom pattern: a token is any run of one or more word characters.
vect = CountVectorizer(token_pattern=r"\b\w+\b")
vect.fit(malory)
print(vect.get_feature_names())

In [None]:
vect = CountVectorizer(token_pattern=r"\b\w[\w’]+\b")
# The quote character in the pattern is not an ASCII apostrophe but a
# Unicode quotation mark, because the quote was copied & pasted.
vect.fit(malory)
print(vect.get_feature_names())

In [None]:
# Build the toy vocabulary with scikit-learn's built-in English stop-word list applied.
vect = CountVectorizer(stop_words='english')
vect.fit(malory)
print(vect.get_feature_names())

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
# Print the complete built-in stop-word list.
print(list(ENGLISH_STOP_WORDS))

In [None]:
# Check whether "well" is part of the built-in list.
"well" in ENGLISH_STOP_WORDS

In [None]:
# min_df=2: keep only terms that appear in at least two documents of the toy corpus.
vect = CountVectorizer(min_df=2)
vect.fit(malory)
print(vect.get_feature_names())

In [None]:
# max_features=4: cap the vocabulary at four terms.
vect = CountVectorizer(max_features=4)
vect.fit(malory)
print(vect.get_feature_names())

In [None]:
# Re-vectorize the reviews with min_df=2 and compare the matrix shape with
# the unrestricted bag-of-words matrix X_train.
vect = CountVectorizer(min_df=2)
X_train_df2 = vect.fit_transform(text_train)
X_val_df2 = vect.transform(text_val)
print(X_train.shape)
print(X_train_df2.shape)

In [None]:
# Same comparison with the stricter threshold min_df=4.
vect = CountVectorizer(min_df=4)
X_train_df4 = vect.fit_transform(text_train)
X_val_df4 = vect.transform(text_val)
print(X_train.shape)
print(X_train_df2.shape)
print(X_train_df4.shape)

In [None]:
# Refit the classifier on the min_df=4 features.
# Note: this rebinds `lr`, replacing the model fitted on the full vocabulary.
lr = LogisticRegressionCV().fit(X_train_df4, y_train)

In [None]:
# Selected regularisation strength for the min_df=4 model.
lr.C_

In [None]:
# Validation score of the min_df=4 model.
lr.score(X_val_df4, y_val)

# n-grams

In [None]:
# Unigrams only on the toy corpus.
cv = CountVectorizer(ngram_range=(1, 1)).fit(malory)
print("Vocabulary size: {}".format(len(cv.vocabulary_)))
print("Vocabulary:\n{}".format(cv.get_feature_names()))

In [None]:
# Bigrams only.
cv = CountVectorizer(ngram_range=(2, 2)).fit(malory)
print("Vocabulary size: {}".format(len(cv.vocabulary_)))
print("Vocabulary:\n{}".format(cv.get_feature_names()))

In [None]:
# Unigrams and bigrams together.
cv = CountVectorizer(ngram_range=(1, 2)).fit(malory)
print("Vocabulary size: {}".format(len(cv.vocabulary_)))
print("Vocabulary:\n{}".format(cv.get_feature_names()))

In [None]:
# Report how the review vocabulary size changes as longer n-grams are allowed
# (min_df=4 throughout). Each iteration refits a vectorizer on text_train.
for ngram_range in [(1, 1), (1, 2), (1, 3), (1, 4)]:
    
    cv = CountVectorizer(ngram_range=ngram_range, min_df=4).fit(text_train)
    print("Vocabulary size {} (min_df=4): {}".format(ngram_range, len(cv.vocabulary_)))

In [None]:
# Vocabulary size for 1- to 4-grams without a document-frequency cut-off.
# NOTE(review): `text_train_sub` is not assigned anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel. It is presumably
# a subsample of text_train - define it explicitly or confirm the intent.
cv = CountVectorizer(ngram_range=(1, 4)).fit(text_train_sub)
print("Vocabulary size 1-4gram: {}".format(len(cv.vocabulary_)))

In [None]:
# Vocabulary size for 1-2 grams with min_df=4, without and with the built-in
# English stop-word list.
# NOTE(review): also depends on the undefined `text_train_sub` (see above).
cv = CountVectorizer(ngram_range=(1, 2), min_df=4).fit(text_train_sub)
print("Vocabulary size (1, 2), min_df=4: {}".format(len(cv.vocabulary_)))
cv = CountVectorizer(ngram_range=(1, 2), min_df=4, stop_words="english").fit(text_train_sub)
print("Vocabulary size (1, 2), stopwords, min_df=4: {}".format(len(cv.vocabulary_)))

In [None]:
# Fit 4-gram-only vocabularies (min_df=4) with and without English stop-word
# removal and print how many 4-grams each one keeps.
cv4 = CountVectorizer(ngram_range=(4, 4), min_df=4).fit(text_train)
cv4sw = CountVectorizer(ngram_range=(4, 4), min_df=4, stop_words="english").fit(text_train)
print(len(cv4.get_feature_names()))
print(len(cv4sw.get_feature_names()))

In [None]:
# Every 1000th 4-gram of the vocabulary that keeps stop words.
print(cv4.get_feature_names()[::1000])

In [None]:
# Every 10th 4-gram of the stop-word-filtered vocabulary.
print(cv4sw.get_feature_names()[::10])

In [None]:
# Encode the training reviews with the stop-word-filtered 4-gram vocabulary.
bla = cv4sw.transform(text_train)

In [None]:
# The 50 4-grams with the largest column sums: sum each column, sort the
# totals, reverse to descending order and take the first 50.
print(np.array(cv4sw.get_feature_names())[np.argsort(np.array(bla.sum(axis=0)).ravel())[::-1][:50]])

In [None]:
# Ten most frequent words across the stop-word-filtered 4-gram vocabulary.
# The 4-grams must be joined with a space: with "" the last word of one 4-gram
# fuses with the first word of the next, which corrupts the word counts.
pd.Series(" ".join(cv4sw.get_feature_names()).split()).value_counts()[:10]

In [None]:
# Features: all 1- to 3-grams that occur in at least 4 training reviews.
vect3 = CountVectorizer(ngram_range=(1, 3), min_df=4)
X_train3 = vect3.fit_transform(text_train)

In [None]:
# Fit the classifier on the 1-3 gram features.
lr3 = LogisticRegressionCV().fit(X_train3, y_train)

In [None]:
# Selected regularisation strength for the 1-3 gram model.
lr3.C_

In [None]:
# Encode the validation reviews with the same vocabulary and score the model.
X_val3 = vect3.transform(text_val)
lr3.score(X_val3, y_val)

In [None]:
# Plot the 40 most negative and 40 most positive coefficients and save the figure.
plt.figure(figsize=(15, 4))
plot_important_features(lr3.coef_.ravel(), np.array(vect3.get_feature_names()), top_n=40, rotation=70)
plt.title("Stopwords included (1-3 gram)")
plt.savefig("images/stopwords_1.png")

In [None]:
vect3sw = CountVectorizer(ngram_range=(1, 3), min_df=4, stop_words='english')
X_train3sw = vect3sw.fit_transform(text_train)
lr3sw = LogisticRegressionCV().fit(X_train3sw, y_train)
X_val3sw = vect3sw.transform(text_val)
lr3sw.score(X_val3sw, y_val)

In [None]:
lr.C_

In [None]:
plt.figure(figsize=(15, 4))
plot_important_features(lr3sw.coef_.ravel(), np.array(vect3sw.get_feature_names()), top_n=40)
plt.title("Stopwords excluded (1-3 gram)")
plt.savefig("images/stopwords_2.png")

In [None]:
# Start from a mutable copy of the built-in list, then keep "well" and "not"
# as features (remove them from the stop words) and add "ve" as a stop word.
my_stopwords = set(ENGLISH_STOP_WORDS)
my_stopwords.remove("well")
my_stopwords.remove("not")
my_stopwords.add("ve")

In [None]:
# 1-3 gram model using the adjusted stop-word set, scored on the validation set.
vect3msw = CountVectorizer(ngram_range=(1, 3), min_df=4, stop_words=my_stopwords)
X_train3msw = vect3msw.fit_transform(text_train)
lr3msw = LogisticRegressionCV().fit(X_train3msw, y_train)
X_val3msw = vect3msw.transform(text_val)
lr3msw.score(X_val3msw, y_val)

In [None]:
# Plot the 40 most negative and 40 most positive coefficients and save the figure.
plt.figure(figsize=(15, 4))
plt.title("Adjusted Stopwords (1-3 gram)")
plot_important_features(lr3msw.coef_.ravel(), np.array(vect3msw.get_feature_names()), top_n=40)
plt.savefig("images/stopwords_3.png")

# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

In [None]:
# TF-IDF encoding of the toy corpus in a single step.
malory_tfidf = TfidfVectorizer().fit_transform(malory)
malory_tfidf.toarray()

In [None]:
# The same encoding spelled out as two pipeline steps: raw counts first,
# then the TF-IDF re-weighting of those counts.
malory_tfidf = make_pipeline(CountVectorizer(), TfidfTransformer()).fit_transform(malory)
malory_tfidf.toarray()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF weighted 1-3 gram model with the adjusted stop-word set, scored on
# the validation set. Note: this rebinds `lr` once more.
tfidf = TfidfVectorizer(ngram_range=(1, 3), min_df=4, stop_words=my_stopwords)
X_train_tfidf = tfidf.fit_transform(text_train)
lr = LogisticRegressionCV().fit(X_train_tfidf, y_train)
X_val_tfidf = tfidf.transform(text_val)
lr.score(X_val_tfidf, y_val)

# Character n-grams

In [None]:
# Show the toy corpus again for reference.
print(malory)

In [None]:
# Character 2- and 3-grams taken over the raw text of each document.
cv = CountVectorizer(ngram_range=(2, 3), analyzer="char").fit(malory)
print("Vocabulary size: {}".format(len(cv.vocabulary_)))
print("Vocabulary:\n{}".format(cv.get_feature_names()))

In [None]:
# "char_wb" builds the character n-grams only from text inside word boundaries.
cv = CountVectorizer(ngram_range=(2, 3), analyzer="char_wb").fit(malory)
print("Vocabulary size: {}".format(len(cv.vocabulary_)))
print("Vocabulary:\n{}".format(cv.get_feature_names()))

In [None]:
# Character 2- to 5-grams (within word boundaries) on the reviews, min_df=4.
char_vect = CountVectorizer(ngram_range=(2, 5), min_df=4, analyzer="char_wb")
X_train_char = char_vect.fit_transform(text_train)

In [None]:
# Number of distinct character n-grams kept.
len(char_vect.vocabulary_)

In [None]:
# Fit on the character n-gram features and score on the validation set.
lr_char = LogisticRegressionCV().fit(X_train_char, y_train)
X_val_char = char_vect.transform(text_val)
lr_char.score(X_val_char, y_val)

In [None]:
# Plot the 40 most negative and 40 most positive coefficients and save the figure.
plt.figure(figsize=(15, 4))
plot_important_features(lr_char.coef_.ravel(), np.array(char_vect.get_feature_names()), top_n=40)
plt.savefig("images/imdb_char_ngrams.png")

# Predicting nationalities from names

In [None]:
# Recap of the members table built from the parliament XML.
members.head()

In [None]:
members.shape

In [None]:
# Target: the member's country. Input text: the member's full name.
# The bar chart shows the share of members per country.
y_mem = members.country
data_mem = members.fullName
plt.figure(figsize=(8, 4))
(y_mem.value_counts() / y_mem.size).plot(kind='bar')

In [None]:
# Member counts for the first eight entries of value_counts().
y_mem.value_counts()[:8]

In [None]:
# Names of those eight countries.
large = y_mem.value_counts()[:8].index
large

In [None]:
# Restrict both the names and the labels to members from those countries.
mask = y_mem.isin(large)
data_mem = data_mem[mask]
y_mem = y_mem[mask]

In [None]:
# Class proportions after filtering.
(y_mem.value_counts() / y_mem.size)

In [None]:
data_mem.shape

In [None]:
# Stratified train/test split of the names with a fixed seed.
text_mem_train, text_mem_test, y_mem_train, y_mem_test = train_test_split(data_mem, y_mem, stratify=y_mem, random_state=0)

In [None]:
# Baseline: word-level bag of words on the names, 5-fold macro-averaged F1.
bow_pipe = make_pipeline(CountVectorizer(), LogisticRegressionCV())
cross_val_score(bow_pipe, text_mem_train, y_mem_train, cv=5, scoring='f1_macro')

In [None]:
# Character n-grams within word boundaries, default n-gram range.
char_pipe = make_pipeline(CountVectorizer(analyzer="char_wb"), LogisticRegressionCV())
cross_val_score(char_pipe, text_mem_train, y_mem_train, cv=5, scoring='f1_macro')

In [None]:
# Character 1- to 4-grams within word boundaries.
char_pipe = make_pipeline(CountVectorizer(analyzer="char_wb", ngram_range=(1, 4)), LogisticRegressionCV())
cross_val_score(char_pipe, text_mem_train, y_mem_train, cv=5, scoring='f1_macro')

In [None]:
# cross_val_score fits clones, so fit the pipeline itself before inspecting it.
char_pipe.fit(text_mem_train, y_mem_train)

In [None]:
# One panel per class, showing the character n-grams with the most negative
# and most positive coefficients for that class.
lr = char_pipe.named_steps['logisticregressioncv']
feature_names = np.array(char_pipe.named_steps['countvectorizer'].get_feature_names())
n_classes = len(lr.classes_)
# Ceiling division for the row count: the former `n_classes // 3 + 1` added a
# spare empty row whenever the number of classes was a multiple of 3.
fig, axes = plt.subplots(-(-n_classes // 3), 3, figsize=(10, 4))
for ax, coef, label in zip(axes.ravel(), lr.coef_, lr.classes_):
    ax.set_title(label)
    plot_important_features(coef, feature_names, top_n=10, ax=ax)
# Hide grid cells that did not receive a panel instead of drawing empty axes.
for unused_ax in axes.ravel()[len(lr.coef_):]:
    unused_ax.set_visible(False)
    
plt.tight_layout()

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Normalizer

# Search space: regularisation strength, character n-gram range, minimum
# document frequency, and whether the count rows are normalised (setting the
# "normalizer" step to None makes the pipeline skip it).
param_grid = {"logisticregression__C": [100, 10, 1, 0.1, 0.001],
              "countvectorizer__ngram_range": [(1, 1), (1, 2), (1, 5), (1, 7),
                                               (2, 3), (2, 5), (3, 8), (5, 5)],
              "countvectorizer__min_df": [1, 2, 3],
              "normalizer": [None, Normalizer()]
             }
# return_train_score=True is needed because the results are later pivoted on
# 'mean_train_score'. Since scikit-learn 0.21 training scores are not computed
# by default, so without it that column is missing and the pivot fails.
grid = GridSearchCV(make_pipeline(CountVectorizer(analyzer="char"), Normalizer(), LogisticRegression()),
                    param_grid=param_grid, cv=10, scoring="f1_macro",
                    return_train_score=True
                   )


In [None]:
# Run the grid search on the training names.
grid.fit(text_mem_train, y_mem_train)

In [None]:
# Best mean cross-validated macro-F1.
grid.best_score_

In [None]:
# Parameter combination that achieved it.
grid.best_params_

In [None]:
# Tabulate the mean train/test scores by n-gram range, C and min_df.
# The normalizer setting is not part of the index, so pivot_table's default
# aggregation (mean) averages over it.
results = pd.DataFrame(grid.cv_results_)
res_pivot = results.pivot_table(values=['mean_test_score', 'mean_train_score'],
                                index=["param_countvectorizer__ngram_range", "param_logisticregression__C",
                                       "param_countvectorizer__min_df"])

In [None]:
res_pivot.mean_test_score

In [None]:
# Move the n-gram range into the columns, order the rows by (min_df, C) and
# colour the table by score.
bla = res_pivot.mean_test_score.unstack(["param_countvectorizer__ngram_range"])
bla = bla.swaplevel().sort_index()
bla.index.names = ['min_df', 'C']
bla.style.background_gradient(cmap="viridis")

In [None]:
# One stacked panel per class for the best pipeline found by the grid search,
# showing the 20 most negative and 20 most positive coefficients of each class.
# Note: `lr` and `feature_names` are rebound to the grid-search winner here.
lr = grid.best_estimator_.named_steps['logisticregression']
feature_names = np.array(grid.best_estimator_.named_steps['countvectorizer'].get_feature_names())
n_classes = len(lr.classes_)
fig, axes = plt.subplots(n_classes, figsize=(10, 20))
for ax, coef, label in zip(axes.ravel(), lr.coef_, lr.classes_):
    ax.set_title(label)
    plot_important_features(coef, feature_names, top_n=20, ax=ax)
    
plt.tight_layout()

# Hashing Vectorizer

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
# Encode the reviews with a hashing vectorizer; transform is called without
# a prior fit because no vocabulary is learned.
# NOTE(review): `text_train_sub` is not assigned anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel - define the
# subsample explicitly or confirm the intent.
# Note: this rebinds X_train / X_val, replacing the bag-of-words matrices.
hv = HashingVectorizer()
X_train = hv.transform(text_train_sub)
X_val = hv.transform(text_val)

In [None]:
X_train.shape

In [None]:
# Fit on the hashed features and score on the validation set.
# NOTE(review): `y_train_sub` is likewise undefined in the visible notebook;
# it must hold the labels matching `text_train_sub`.
lr = LogisticRegressionCV().fit(X_train, y_train_sub)
lr.score(X_val, y_val)