In [None]:
# Fetch the IMDb review archive into data/ (-nc skips the download when the file already exists),
# then unpack the archive under data/.
! wget -nc http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz -P data
! tar -xzf data/aclImdb_v1.tar.gz -C data

In [None]:
# List directories only (-d), two levels deep (-L 2), to show the layout of the extracted dataset.
!tree -dL 2 data/aclImdb

In [None]:
!rm -r data/aclImdb/train/unsup


In [None]:
from sklearn.datasets import load_files


In [None]:
%matplotlib inline


# pip install mglearn # Note that for my new MacBook, I need to install this package.
import mglearn # https://github.com/amueller/introduction_to_ml_with_python/tree/master/mglearn
# from preamble import *

import numpy as np
# import matplotlib.pyplot as plt
import pandas as pd

from IPython.display import display



from sklearn.datasets import load_files

# load_files returns a bunch holding the training texts (.data) and the training labels (.target).
reviews_train = load_files("data/aclImdb/train/")
text_train = reviews_train.data
y_train = reviews_train.target
print("type of text_train: {}".format(type(text_train)))
print("length of text_train: {}".format(len(text_train)))

# Preview the first seven raw reviews, each framed by rows of asterisks.
print("*" * 20)
for i in range(7):
    print("*" * 20, f"text_train[{i}]:\n{text_train[i]}", "*" * 20, sep="\n")

In [None]:
# Swap every HTML line-break tag in the training reviews for a single space.
text_train = [
    review.replace(b"<br />", b" ")
    for review in text_train
]

In [None]:
# Print the first seven reviews again, now that the "<br />" tags have been replaced.
# The label is an f-string so each review is shown with its actual index
# instead of the literal text "text_train[i]".
print(20*"*")
for i in range(7):
    print(20*"*")
    print(f"text_train[{i}]:\n{text_train[i]}")
    print(20*"*")

In [None]:
# Show the Python type of the first training document.
type(text_train[0])

Recommended reading on strings and Unicode in Python: [Unicode HOWTO](https://docs.python.org/3/howto/unicode.html)

In [None]:
# Distinct label values present in the training targets.
np.unique(y_train)

In [None]:
# np.bincount tallies how often each non-negative integer label occurs, i.e. the class balance.
print("Samples per class (training): {}".format(np.bincount(y_train)))

In [None]:
# Load the test reviews and their labels.
reviews_test = load_files("data/aclImdb/test/")
y_test = reviews_test.target
# Replace the HTML line-break tags with spaces, as is done for the training reviews.
text_test = [
    review.replace(b"<br />", b" ")
    for review in reviews_test.data
]
print("Number of documents in test data: {}".format(len(text_test)))
print("Samples per class (test): {}".format(np.bincount(y_test)))

### 7.3. Representing Text Data as a Bag of Words

In [None]:
# Two-sentence toy corpus used to illustrate the bag-of-words encoding.
bards_words = [
    "The fool doth think he is wise,",
    "but the wise man knows himself to be a fool",
]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Build a vectorizer with default settings and learn the vocabulary of the toy corpus.
vect = CountVectorizer()
vect.fit(bards_words) # Assigns an integer index to every word in the vocabulary.

In [None]:
# vocabulary_ is the word -> feature-index mapping learned by fit; show its size and contents.
print("Vocabulary size: {}".format(len(vect.vocabulary_)))
print("Vocabulary content:\n {}".format(vect.vocabulary_))

In [None]:
# Each sentence is turned into a bag of words: a row of per-word counts whose column order
# follows the indices assigned in vect.vocabulary_.
bag_of_words = vect.transform(bards_words) 
# repr() shows the summary of the returned matrix rather than its individual entries.
print("bag_of_words: {}".format(repr(bag_of_words)))

In [None]:
# Convert the count matrix to a plain array so the individual word counts can be read directly.
print(
    "Dense representation of bag_of_words:\n{}".format(bag_of_words.toarray())
)

### 7.3.2 Bag-of-Words for Movie Reviews



In [None]:
# Learn the vocabulary from the training reviews, then encode those reviews as word counts.
vect = CountVectorizer()
vect.fit(text_train)
X_train = vect.transform(text_train)
print("X_train:\n{}".format(repr(X_train)))

In [None]:
# get_feature_names_out() is the current accessor; the older get_feature_names() has been
# deprecated since the book was published.
feature_names = vect.get_feature_names_out()
print("Number of features: {}".format(len(feature_names)))
print("First 20 features:\n{}".format(feature_names[:20]))
# The slice bounds must match the printed label: 20 names starting at index 20010.
print("Features 20010 to 20030:\n{}".format(feature_names[20010:20030]))
print("Every 2000th feature:\n{}".format(feature_names[::2000]))

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
# Baseline: 5-fold cross-validated accuracy of a default LogisticRegression on the count features.
scores = cross_val_score(
    estimator=LogisticRegression(),
    X=X_train,
    y=y_train,
    cv=5,
)
print("Mean cross-validation accuracy: {:.2f}".format(scores.mean()))

In [None]:
from sklearn.model_selection import GridSearchCV
# Search over the LogisticRegression parameter C with 5-fold cross-validation.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    cv=5,
).fit(X_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)


In [None]:
# Encode the test reviews with the already-fitted vectorizer (transform only, no refit),
# then score the fitted grid search on them.
X_test = vect.transform(text_test)
print("Test score: {:.2f}".format(grid.score(X_test, y_test)))

In [None]:
# Refit the vectorizer with min_df=5 and re-encode the training reviews with the new vocabulary.
vect = CountVectorizer(min_df=5)
vect.fit(text_train)
X_train = vect.transform(text_train)
print("X_train with min_df: {}".format(repr(X_train)))

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported accessor for the learned feature names.
feature_names = vect.get_feature_names_out()

print("First 50 features:\n{}".format(feature_names[:50]))
print("Features 20010 to 20030:\n{}".format(feature_names[20010:20030]))
print("Every 700th feature:\n{}".format(feature_names[::700]))

In [None]:
# Run the same 5-fold grid search over param_grid on the current X_train.
grid = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    cv=5,
).fit(X_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))

### 7.4 Stopwords

### [Start here](https://github.com/amueller/introduction_to_ml_with_python/blob/master/07-working-with-text-data.ipynb) with input 22 (p. 341 in the book).



In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
# ENGLISH_STOP_WORDS is a frozenset, so its iteration order is not guaranteed to be the same
# between kernel sessions; sorting first makes the "every 10th" sample reproducible.
print("Number of stop words: {}".format(len(ENGLISH_STOP_WORDS)))
print("Every 10th stopword:\n{}".format(sorted(ENGLISH_STOP_WORDS)[::10]))