<a href="https://colab.research.google.com/github/alextanhongpin/python-machine-learning/blob/master/08_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# https://ai.stanford.edu/~amaas/data/sentiment/
!curl https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz -o aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  48.5M      0  0:00:01  0:00:01 --:--:-- 48.5M


In [2]:
!tar -zxf aclImdb_v1.tar.gz

In [3]:
!pip install pyprind



In [4]:
import os

import pandas as pd
import pyprind

In [5]:
basepath = "aclImdb"

labels = {"pos": 1, "neg": 0}
pbar = pyprind.ProgBar(50000)
df = pd.DataFrame()

for s in ("test", "train"):
    for l in ("pos", "neg"):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), "r", encoding="utf-8") as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()
df.columns = ["review", "sentiment"]

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:01:29


In [6]:
import numpy as np

np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv("movie_data.csv", index=False, encoding="utf-8")

In [7]:
df = pd.read_csv("movie_data.csv", encoding="utf-8")
df.head(3)

Unnamed: 0,review,sentiment
0,"I loved the movie ""Northfork"". I knew nothing ...",1
1,publicity got me to the theatre<br /><br />adv...,0
2,I will never forget the night I saw this movie...,0


In [8]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer()
docs = np.array(
    [
        "The sun is shining",
        "The weather is sweet",
        "The sun is shining and the weather is sweet",
    ]
)

bag = count.fit_transform(docs)
bag.toarray()

array([[0, 1, 1, 1, 0, 1, 0],
       [0, 1, 0, 0, 1, 1, 1],
       [1, 2, 1, 1, 1, 2, 1]])

In [9]:
count.vocabulary_

{'and': 0, 'is': 1, 'shining': 2, 'sun': 3, 'sweet': 4, 'the': 5, 'weather': 6}

In [10]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer(use_idf=True, norm="l2", smooth_idf=True)
np.set_printoptions(precision=2)
tfidf.fit_transform(count.fit_transform(docs)).toarray()

array([[0.  , 0.43, 0.56, 0.56, 0.  , 0.43, 0.  ],
       [0.  , 0.43, 0.  , 0.  , 0.56, 0.43, 0.56],
       [0.4 , 0.48, 0.31, 0.31, 0.31, 0.48, 0.31]])

In [11]:
df.loc[0, "review"][-50:]  # Print last 50 characters from the review.

' watch it again... I know I will.<br /><br />Terry'

In [12]:
import re


def preprocessor(text):
    text = re.sub("<[^>]*>", "", text)
    emoticons = re.findall("(?::|;|=)(?:-)?(?:\)|\(|D|P)", text)
    text = re.sub("[\W]+", " ", text.lower()) + " ".join(emoticons).replace("-", "")
    return text

In [13]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [14]:
df.review = df.review.apply(preprocessor)

In [15]:
def tokenizer(text):
    return text.split()


tokenizer("runners like running and thus they run")

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [16]:
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()


def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]


tokenizer_porter("runners like running and thus they run")

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [17]:
import nltk

nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
from nltk.corpus import stopwords

stop = stopwords.words("english")
[
    w
    for w in tokenizer_porter("a runner likes runnign and runs a lot")[-10:]
    if w not in stop
]

['runner', 'like', 'runnign', 'run', 'lot']

# Training a logistic regression model for document classification

In [19]:
X_train = df.loc[:25000, "review"].values
y_train = df.loc[:25000, "sentiment"].values

X_test = df.loc[25000:, "review"].values
y_test = df.loc[25000:, "sentiment"].values

In [20]:
!pip install tune_sklearn

Collecting tune_sklearn
  Downloading tune_sklearn-0.4.1-py3-none-any.whl (40 kB)
[K     |████████████████████████████████| 40 kB 3.7 MB/s 
[?25hCollecting ray[tune]
  Downloading ray-1.10.0-cp37-cp37m-manylinux2014_x86_64.whl (59.6 MB)
[K     |████████████████████████████████| 59.6 MB 47 kB/s 
Collecting redis>=3.5.0
  Downloading redis-4.1.3-py3-none-any.whl (173 kB)
[K     |████████████████████████████████| 173 kB 41.2 MB/s 
Collecting tensorboardX>=1.9
  Downloading tensorboardX-2.4.1-py2.py3-none-any.whl (124 kB)
[K     |████████████████████████████████| 124 kB 46.4 MB/s 
Collecting deprecated>=1.2.3
  Downloading Deprecated-1.2.13-py2.py3-none-any.whl (9.6 kB)
Installing collected packages: deprecated, redis, tensorboardX, ray, tune-sklearn
Successfully installed deprecated-1.2.13 ray-1.10.0 redis-4.1.3 tensorboardX-2.4.1 tune-sklearn-0.4.1


In [21]:
# https://towardsdatascience.com/5x-faster-scikit-learn-parameter-tuning-in-5-lines-of-code-be6bdd21833c
from tune_sklearn import TuneGridSearchCV

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)
param_grid = [
    {
        "vect__ngram_range": [(1, 1)],
        "vect__stop_words": [stop, None],
        "vect__tokenizer": [tokenizer, tokenizer_porter],
        "clf__penalty": ["l1", "l2"],
        "clf__C": [1.0, 10.0, 100.0],
    },
    {
        "vect__ngram_range": [(1, 1)],
        "vect__stop_words": [stop, None],
        "vect__tokenizer": [tokenizer, tokenizer_porter],
        "vect__use_idf": [False],
        "vect__norm": [None],
        "clf__penalty": ["l1", "l2"],
        "clf__C": [1.0, 10.0, 100.0],
    },
]

lr_tfidf = Pipeline(
    [("vect", tfidf), ("clf", LogisticRegression(random_state=0, solver="liblinear"))]
)
# gs_lr_tfidf = GridSearchCV(
gs_lr_tfidf = TuneGridSearchCV(
    lr_tfidf, param_grid, scoring="accuracy", cv=5, verbose=2, n_jobs=-1,
    # Two new parameters.
    early_stopping=True,
    max_iters=10
)
gs_lr_tfidf.fit(X_train, y_train)

  "The `loggers` argument is deprecated. Please pass the respective "
save not implemented for Searcher. Skipping save.


[2m[36m(_PipelineTrainable pid=469)[0m   % sorted(inconsistent)












Trial _PipelineTrainable_c627b7dc reported average_test_score=0.88 with parameters={'early_stopping': True, 'early_stop_type': <EarlyStopping.WARM_START_ITER: 2>, 'X_id': ObjectRef(ffffffffffffffffffffffffffffffffffffffff0100000001000000), 'y_id': ObjectRef(ffffffffffffffffffffffffffffffffffffffff0100000002000000), 'groups': None, 'cv': StratifiedKFold(n_splits=5, random_state=None, shuffle=False), 'fit_params': {}, 'scoring': {'score': make_scorer(accuracy_score)}, 'max_iters': 10, 'return_train_score': False, 'n_jobs': 1, 'metric_name': 'average_test_score', 'estimator_ids': [ObjectRef(ffffffffffffffffffffffffffffffffffffffff0100000003000000), ObjectRef(ffffffffffffffffffffffffffffffffffffffff0100000004000000), ObjectRef(ffffffffffffffffffffffffffffffffffffffff0100000005000000), ObjectRef(ffffffffffffffffffffffffffffffffffffffff0100000006000000), ObjectRef(ffffffffffffffffffffffffffffffffffffffff0100000007000000)], 'clf__C': 1.0, 'clf__penalty': 'l1', 'vect__ngram_range': (1, 1), 've











Trial _PipelineTrainable_c627b7dc reported average_test_score=0.88 with parameters={'early_stopping': True, 'early_stop_type': <EarlyStopping.WARM_START_ITER: 2>, 'X_id': ObjectRef(ffffffffffffffffffffffffffffffffffffffff0100000001000000), 'y_id': ObjectRef(ffffffffffffffffffffffffffffffffffffffff0100000002000000), 'groups': None, 'cv': StratifiedKFold(n_splits=5, random_state=None, shuffle=False), 'fit_params': {}, 'scoring': {'score': make_scorer(accuracy_score)}, 'max_iters': 10, 'return_train_score': False, 'n_jobs': 1, 'metric_name': 'average_test_score', 'estimator_ids': [ObjectRef(ffffffffffffffffffffffffffffffffffffffff0100000003000000), ObjectRef(ffffffffffffffffffffffffffffffffffffffff0100000004000000), ObjectRef(ffffffffffffffffffffffffffffffffffffffff0100000005000000), ObjectRef(ffffffffffffffffffffffffffffffffffffffff0100000006000000), ObjectRef(ffffffffffffffffffffffffffffffffffffffff0100000007000000)], 'clf__C': 1.0, 'clf__penalty': 'l1', 'vect__ngram_range': (1, 1), 've











Trial _PipelineTrainable_c627b7dc reported average_test_score=0.88 with parameters={'early_stopping': True, 'early_stop_type': <EarlyStopping.WARM_START_ITER: 2>, 'X_id': ObjectRef(ffffffffffffffffffffffffffffffffffffffff0100000001000000), 'y_id': ObjectRef(ffffffffffffffffffffffffffffffffffffffff0100000002000000), 'groups': None, 'cv': StratifiedKFold(n_splits=5, random_state=None, shuffle=False), 'fit_params': {}, 'scoring': {'score': make_scorer(accuracy_score)}, 'max_iters': 10, 'return_train_score': False, 'n_jobs': 1, 'metric_name': 'average_test_score', 'estimator_ids': [ObjectRef(ffffffffffffffffffffffffffffffffffffffff0100000003000000), ObjectRef(ffffffffffffffffffffffffffffffffffffffff0100000004000000), ObjectRef(ffffffffffffffffffffffffffffffffffffffff0100000005000000), ObjectRef(ffffffffffffffffffffffffffffffffffffffff0100000006000000), ObjectRef(ffffffffffffffffffffffffffffffffffffffff0100000007000000)], 'clf__C': 1.0, 'clf__penalty': 'l1', 'vect__ngram_range': (1, 1), 've

In [None]:
print(f"Best parameter set: {gs_lr_tfidf.best_params_}")
print(f"CV Accuracy: {gs_lr_tfidf.best_score_:.3f}")

clf = gs_lr_tfidf.best_estimator_
print("Test Accuracy: {0:.3f}".format(clf.score(X_test, y_test)))

In [None]:
import os
import zipfile

if not os.path.isfile("movie_data.csv"):
    if not os.path.isfile("movie_data.csv.zip"):
        print(
            "Please place a copy of the movie_data.csv.gz"
            "in this directory. You can obtain it by"
            "a) executing the code in the beginning of this"
            "notebook or b) by downloading it from GitHub:"
            "https://github.com/rasbt/python-machine-learning-"
            "book-2nd-edition/blob/master/code/ch08/movie_data.csv.gz"
        )
    else:
        with zipfile.ZipFile("movie_data.csv.zip", "r") as zip_ref:
            zip_ref.extractall(".")

In [None]:
import re

import numpy as np
from nltk.corpus import stopwords

stop = stopwords.words("english")


def tokenizer(text):
    text = re.sub("<[^>]*>", "", text)
    emoticons = re.findall("(?::|;|=)(?:-)?(?:\)|\(|D|P)", text)
    text = re.sub("[\W]+", " ", text.lower()) + " ".join(emoticons).replace("-", "")
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

In [None]:
def stream_docs(path):
    with open(path, "r", encoding="utf-8") as csv:
        next(csv)  # Skip header
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

In [None]:
next(stream_docs(path="movie_data.csv"))

In [None]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vect = HashingVectorizer(
    decode_error="ignore", n_features=2 ** 21, preprocessor=None, tokenizer=tokenizer
)

clf = SGDClassifier(loss="log", random_state=1, max_iter=1)
doc_stream = stream_docs(path="movie_data.csv")

In [None]:
import pyprind

pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

In [None]:
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print("Accuracy: {0:.3f}".format(clf.score(X_test, y_test)))

In [None]:
clf = clf.partial_fit(X_test, y_test)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words="english", max_df=0.1, max_features=5000)
X = count.fit_transform(df["review"].values)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(
    n_components=10, random_state=123, learning_method="batch"
)
X_topics = lda.fit_transform(X)

In [None]:
lda.components_.shape

In [None]:
n_top_words = 5
feature_names = count.get_feature_names()

for topic_idx, topic in enumerate(lda.components_):
    print("Topic {0}".format(topic_idx + 1))
    print(
        " ".join(
            [feature_names[i] for i in topic.argsort()[: -n_top_words - 1 : -1]]
        )  # Sort in reverse orders
    )

In [None]:
horror = X_topics[:, 5].argsort()[::-1]
for iter_idx, movie_idx in enumerate(horror[:3]):
    print("Horror movie #{0}".format(iter_idx + 1))
    print(df["review"][movie_idx][:300], "...")

# Serializing fitted scikit-learn estimators

- training machine learning is computationally expensive
- pickled allow us to serialize/deserialize Python object structures to compact bytecode

In [None]:
import os
import pickle

dest = os.path.join("movieclassifier", "pkl_objects")
if not os.path.exists(dest):
    os.makedirs(dest)


pickle.dump(stop, open(os.path.join(dest, "stopwords.pkl"), "wb"), protocol=4)
pickle.dump(clf, open(os.path.join(dest, "classifier.pkl"), "wb"), protocol=4)