# Text Data in scikit-learn

In [None]:
import matplotlib.pyplot as plt
import sklearn

sklearn.set_config(display='diagram')

In [None]:
from pathlib import Path
import tarfile
from urllib import request

data_path = Path("data")
extracted_path = Path("data") / "aclImdb"
imdb_path = data_path / "aclImdb_v1.tar.gz"

def download_imdb():
    """Download the IMDB archive to ``imdb_path`` unless it already exists.

    The archive is streamed to a temporary ``.part`` file next to the target
    and renamed only after the download completes, so an interrupted run
    never leaves a truncated archive that a later run would mistake for a
    finished download.
    """
    import shutil

    if imdb_path.exists():
        print("imdb dataset already downloaded")
        return

    # On a fresh checkout the target directory may not exist yet.
    imdb_path.parent.mkdir(parents=True, exist_ok=True)

    url_path = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
    partial_path = imdb_path.with_name(imdb_path.name + ".part")
    try:
        with request.urlopen(url_path) as url_f:
            with partial_path.open('wb') as local_f:
                # Copy in chunks instead of holding the whole archive in memory.
                shutil.copyfileobj(url_f, local_f)
        partial_path.replace(imdb_path)
    finally:
        # After a successful rename the partial file is gone; on any failure
        # (including an interrupt) remove the incomplete download.
        if partial_path.exists():
            partial_path.unlink()

def download_and_untar_imdb():
    """Make sure the IMDB archive is downloaded and unpacked under ``data_path``.

    Calls ``download_imdb()`` first, then extracts the archive unless
    ``extracted_path`` already exists.
    """
    download_imdb()

    if not extracted_path.exists():
        with tarfile.open(imdb_path, "r") as archive:
            archive.extractall(data_path)
        return

    print("imdb dataset already extracted")
            
# This may take some time to run since it will download and extract the dataset
download_and_untar_imdb()

## CountVectorizer

In [None]:
sample_text = ["Can we go to the hill? I finished my homework.",
               "The hill is very tall. Please be careful"]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Learn the vocabulary of the sample sentences.
vect = CountVectorizer()
vect.fit(sample_text)

# `get_feature_names` was removed in scikit-learn 1.2; use the `_out` variant.
vect.get_feature_names_out()

In [None]:
X = vect.transform(sample_text)
X

In [None]:
X.toarray()

### Bag of words

In [None]:
sample_text

In [None]:
X_inverse = vect.inverse_transform(X)

In [None]:
X_inverse[0]

In [None]:
X_inverse[1]

## Loading text data with scikit-learn

In [None]:
from sklearn.datasets import load_files

reviews_train = load_files(extracted_path / "train", categories=["neg", "pos"])
raw_text_train, raw_y_train = reviews_train.data, reviews_train.target
raw_text_train = [doc.replace(b"<br />", b" ") for doc in raw_text_train]

In [None]:
import numpy as np
np.unique(raw_y_train)

In [None]:
np.bincount(raw_y_train)

In [None]:
len(raw_text_train)

In [None]:
raw_text_train[5]

## Split dataset

In [None]:
from sklearn.model_selection import train_test_split

text_train, text_val, y_train, y_val = train_test_split(
    raw_text_train, raw_y_train, stratify=raw_y_train, random_state=0)

### Transform training data

In [None]:
vect = CountVectorizer()
X_train = vect.fit_transform(text_train)

In [None]:
len(text_train)

In [None]:
X_train

### Transform validation set

In [None]:
len(text_val)

In [None]:
X_val = vect.transform(text_val)

In [None]:
X_val

### Extract feature names

In [None]:
# `get_feature_names` was removed in scikit-learn 1.2; use the `_out` variant.
feature_names = vect.get_feature_names_out()

In [None]:
feature_names[10000:10020]

In [None]:
feature_names[::3000]

### Linear model for classification

In [None]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='liblinear', random_state=42).fit(X_train, y_train)

In [None]:
lr.score(X_val, y_val)

In [None]:
def plot_important_features(coef, feature_names, top_n=20, ax=None, rotation=40):
    """Bar-plot the most negative and most positive coefficients.

    Parameters
    ----------
    coef : 1-D numpy array
        Coefficients, one per feature.
    feature_names : sequence of str
        Labels aligned with ``coef``.
    top_n : int, default=20
        Number of coefficients to take from each end of the sorted order.
    ax : axes object, optional
        Axes to draw on; ``plt.gca()`` is used when omitted.
    rotation : int, default=40
        Rotation passed to the x tick labels.
    """
    if ax is None:
        ax = plt.gca()
    feature_names = np.asarray(feature_names)
    inds = np.argsort(coef)
    # argsort is ascending, so the head holds the lowest coefficients and the
    # tail the highest ones.
    low = inds[:top_n]
    high = inds[-top_n:]
    important = np.hstack([low, high])
    myrange = range(len(important))
    # Size the colour list from the actual selection so it always matches the
    # number of bars: red for the low end, blue for the high end.
    colors = ['red'] * len(low) + ['blue'] * len(high)

    ax.bar(myrange, coef[important], color=colors)
    ax.set_xticks(myrange)
    ax.set_xticklabels(feature_names[important], rotation=rotation, ha="right")
    ax.set_xlim(-.7, len(important))
    ax.set_frame_on(False)

In [None]:
# `get_feature_names` was removed in scikit-learn 1.2; use the `_out` variant.
feature_names = vect.get_feature_names_out()

In [None]:
fig, ax = plt.subplots(figsize=(15, 6))
plot_important_features(lr.coef_.ravel(), feature_names, top_n=20, ax=ax)

## Exercise 1

1. Train a `sklearn.ensemble.RandomForestClassifier` on the training set, `X_train` and `y_train`.
2. Evaluate the accuracy on the validation set.
3. What are the top 20 important features according to `feature_importances_` of the random forest?

In [None]:
# %load solutions/01-ex01-solutions.py
from sklearn.ensemble import RandomForestClassifier

# 1. Train the random forest on the bag-of-words training matrix.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# 2. Validation accuracy. A bare expression in the middle of a cell is not
# displayed, so print it explicitly.
print(rf.score(X_val, y_val))

# 3. Indices of the 20 largest importances, in descending order.
rf_feature_importance = rf.feature_importances_
top_rf_importance_indices = rf_feature_importance.argsort()[::-1][:20]

top_rf_important_features = np.array(feature_names)[top_rf_importance_indices]
top_rf_important_features

## CountVectorizer Options

In [None]:
sample_text = ["Can we go to the hill? I finished my homework.",
               "The hill is very tall. Please be careful"]

In [None]:
# Default vectorizer: every token of two or more characters becomes a feature.
vect = CountVectorizer()
vect.fit(sample_text)
# `get_feature_names` was removed in scikit-learn 1.2; use the `_out` variant.
vect.get_feature_names_out()

### Stop words

In [None]:
# Drop scikit-learn's built-in English stop words from the vocabulary.
vect = CountVectorizer(stop_words='english')
vect.fit(sample_text)
# `get_feature_names` was removed in scikit-learn 1.2; use the `_out` variant.
vect.get_feature_names_out()

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
print(list(ENGLISH_STOP_WORDS))

### Max features

In [None]:
# Keep only the 4 most frequent terms that are not stop words.
vect = CountVectorizer(max_features=4, stop_words='english')
vect.fit(sample_text)
# `get_feature_names` was removed in scikit-learn 1.2; use the `_out` variant.
vect.get_feature_names_out()

### Min frequency on the imdb dataset

With `min_df=1`

In [None]:
X_train.shape

With `min_df=4`

In [None]:
vect = CountVectorizer(min_df=4)
X_train_min_df_4 = vect.fit_transform(text_train)

In [None]:
X_train_min_df_4.shape

In [None]:
lr_df_4 = LogisticRegression(solver='liblinear', random_state=42).fit(X_train_min_df_4, y_train)

In [None]:
X_val_min_df_4 = vect.transform(text_val)

#### Scores with different min frequencies

In [None]:
lr_df_4.score(X_val_min_df_4, y_val)

In [None]:
lr.score(X_val, y_val)

## Pipelines and Vectorizers

In [None]:
from sklearn.pipeline import Pipeline

log_reg = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', LogisticRegression(random_state=42, solver='liblinear'))
])

In [None]:
log_reg

In [None]:
text_train[:2]

In [None]:
log_reg.fit(text_train, y_train)

In [None]:
log_reg.score(text_train, y_train)

In [None]:
log_reg.score(text_val, y_val)

## Exercise 2

1. Create a pipeline with a `CountVectorizer` with `min_df=5` and `stop_words='english'` and a `RandomForestClassifier`.
2. What is the score of the random forest on the validation dataset?

In [None]:
# %load solutions/01-ex02-solutions.py

## Bigrams

`CountVectorizer` takes a `ngram_range` parameter

In [None]:
sample_text

In [None]:
# Unigrams only (the default n-gram range).
cv = CountVectorizer(ngram_range=(1, 1)).fit(sample_text)
print("Vocabulary size:", len(cv.vocabulary_))
# `get_feature_names` was removed in scikit-learn 1.2; use the `_out` variant.
print("Vocabulary:", cv.get_feature_names_out())

In [None]:
# Bigrams only.
cv = CountVectorizer(ngram_range=(2, 2)).fit(sample_text)
print("Vocabulary size:", len(cv.vocabulary_))
print("Vocabulary:")
# `get_feature_names` was removed in scikit-learn 1.2; use the `_out` variant.
print(cv.get_feature_names_out())

In [None]:
# Unigrams and bigrams together.
cv = CountVectorizer(ngram_range=(1, 2)).fit(sample_text)
print("Vocabulary size:", len(cv.vocabulary_))
print("Vocabulary:")
# `get_feature_names` was removed in scikit-learn 1.2; use the `_out` variant.
print(cv.get_feature_names_out())

## n-grams with stop words

In [None]:
cv_n_gram = CountVectorizer(ngram_range=(1, 3), min_df=4, stop_words="english")
cv_n_gram.fit(text_train)

In [None]:
len(cv_n_gram.vocabulary_)

In [None]:
# Show every 2000th n-gram of the vocabulary.
# `get_feature_names` was removed in scikit-learn 1.2; use the `_out` variant.
print(cv_n_gram.get_feature_names_out()[::2000])

In [None]:
pipe_cv_n_gram = Pipeline([
    ('vectorizer', cv_n_gram),
    ('classifier', LogisticRegression(random_state=42, solver='liblinear'))
])

In [None]:
pipe_cv_n_gram.fit(text_train, y_train)

In [None]:
pipe_cv_n_gram.score(text_train, y_train)

In [None]:
pipe_cv_n_gram.score(text_val, y_val)

In [None]:
# `get_feature_names` was removed in scikit-learn 1.2; use the `_out` variant.
feature_names = pipe_cv_n_gram['vectorizer'].get_feature_names_out()

In [None]:
fig, ax = plt.subplots(figsize=(15, 6))
plot_important_features(pipe_cv_n_gram['classifier'].coef_.ravel(), feature_names, top_n=20, ax=ax)

## Tf-idf rescaling

In [None]:
sample_text

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidvect = TfidfVectorizer().fit(sample_text)
tfid_trans = tfidvect.transform(sample_text)

In [None]:
tfid_trans.toarray()

## Train on the imdb dataset

In [None]:
log_reg_tfid = Pipeline([
   ('vectorizer', TfidfVectorizer(ngram_range=(1, 3), min_df=4,
                                  stop_words="english")),
   ('classifier', LogisticRegression(random_state=42, solver='liblinear'))
])

In [None]:
log_reg_tfid.fit(text_train, y_train)

In [None]:
log_reg_tfid.score(text_val, y_val)

## Exercise 3

1. How many samples are there in the training dataset and test dataset?
1. Construct a pipeline with a `TfidfVectorizer` and `LogisticRegression`.
1. Evaluate the pipeline on the test set.
1. Plot the feature importances using `plot_important_features`.

In [None]:
# %load solutions/01-ex03-solutions.py