# Chapter 8: Applying Machine Learning to Sentiment Analysis

In [1]:
import pyprind
import pandas as pd
import os

KeyboardInterrupt: 

In [None]:
# State for loading the raw reviews: an empty frame to fill, the mapping from
# sentiment folder name to class label, and a progress bar sized for 50000 steps.
df = pd.DataFrame()
labels = dict(pos=1, neg=0)
pbar = pyprind.ProgBar(50000)

In [None]:
# Read every review file under ./aclImdb/<split>/<sentiment>/ and collect
# [text, label] rows. Rows are gathered in a plain list and turned into a
# DataFrame once: appending to a DataFrame per file copies the whole frame
# each time, and DataFrame.append no longer exists in pandas 2.x.
rows = []
for split in ('test', 'train'):
    for sentiment in ('pos', 'neg'):
        path = './aclImdb/{}/{}'.format(split, sentiment)
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # row order (and therefore the seeded shuffle later on) reproducible.
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path, file),
                      'r', encoding='utf8') as infile:
                rows.append([infile.read(), labels[sentiment]])
            pbar.update()
new_rows = pd.DataFrame(rows)
# Keep the original "append to df" semantics if df already holds rows.
df = new_rows if df.empty else pd.concat([df, new_rows], ignore_index=True)

In [None]:
import numpy as np

In [None]:
# Seed NumPy's global random generator so later random draws are repeatable.
np.random.seed(0)

In [None]:
# Shuffle the rows: reindex df by a random permutation of its own index.
df = df.reindex(np.random.permutation(df.index))

In [None]:
# Write the DataFrame to movie_data.csv as UTF-8, without the index column.
df.to_csv('movie_data.csv', index=False, encoding='utf8')

In [None]:
# Reload the dataset from movie_data.csv and show its first three rows.
df = pd.read_csv('movie_data.csv')
df.head(3)

## Bag of Words Model

In [None]:
from sklearn.feature_extraction.text import  CountVectorizer

In [None]:
# Count vectorizer created with no arguments; it is fitted on docs in a later cell.
count = CountVectorizer()

In [None]:
# Three toy sentences used to illustrate the bag-of-words representation.
docs = np.array((
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining and the weather is sweet',
))

In [None]:
# Fit count on docs and transform them in a single call; the result is kept in bag.
bag = count.fit_transform(docs)

In [None]:
# Print the fitted vectorizer's vocabulary_ attribute.
print(count.vocabulary_)

In [None]:
# Print bag as an array via toarray().
print(bag.toarray())

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# TF-IDF transformer created with no arguments; applied to the counts in a later cell.
tfidf = TfidfTransformer()

In [None]:
# Limit NumPy's printed floating-point output to a precision of 2.
np.set_printoptions(precision=2)

In [None]:
# Re-fit count on docs, pass the counts through tfidf, and print the result as an array.
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

### Cleaning Text Data

In [None]:
# Name the two columns of df: the review text and its sentiment label.
df.columns = ['review', 'sentiment']

In [None]:
# Inspect the last 50 characters of the review in the row labelled 0.
df.loc[0, 'review'][-50:]

In [None]:
import re

In [None]:
def preprocessor(text):
    """Strip markup and punctuation from *text* while keeping emoticons.

    Steps:
      1. Remove anything that looks like an HTML/XML tag (``<...>``).
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D``, ``:P``.
      3. Lowercase the text and collapse every run of non-word characters
         into a single space.
      4. Append the collected emoticons (with any ``-`` nose removed),
         separated from the cleaned text and from each other by a space.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text; unchanged from the previous behaviour when the
        input contains no emoticons.
    """
    # Raw strings: '\)' and '\W' are invalid escapes in normal string literals.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    if emoticons:
        # The explicit separator keeps the first emoticon from being glued
        # onto the last word when the cleaned text ends in a word character.
        cleaned += ' ' + ' '.join(emoticons).replace('-', '')
    return cleaned

In [None]:
# Quick check of preprocessor on a string containing tags and emoticons.
preprocessor('<a> This :) is a :( test </a>)')

In [None]:
# Clean every review by applying preprocessor to the 'review' column.
df['review'] = df['review'].apply(preprocessor)

## Processing documents into tokens

In [None]:
def tokenizer(text):
    """Split *text* on whitespace and return the resulting list of tokens."""
    tokens = text.split()
    return tokens

In [None]:
# Try tokenizer on a sample sentence.
tokenizer('runners like running and thus they run')

In [None]:
from nltk.stem.porter import PorterStemmer

In [None]:
# Stemmer instance used by tokenizer_porter.
porter = PorterStemmer()

In [None]:
def tokenizer_porter(text):
    """Split *text* on whitespace and return the list of stemmed tokens.

    Each token is passed through ``porter.stem`` (the module-level stemmer).
    """
    return list(map(porter.stem, text.split()))

In [None]:
# Same sample sentence, this time with every token stemmed.
tokenizer_porter('runners like running and thus they run')

#### Remove Stop Words

In [None]:
import nltk

In [None]:
# Download NLTK's 'stopwords' resource (used through nltk.corpus.stopwords below).
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords

In [None]:
# English stop-word list.
stop = stopwords.words('english')

In [None]:
# Stem the sample sentence and drop every token that appears in stop.
# NOTE(review): the [-10:] slice keeps at most the last ten tokens; the sample
# has only eight words, so it looks like a no-op here.
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

## Training a logistic regression model for document classification

In [None]:
# First 25000 rows for training, the remainder for testing.
# Positional slicing is used on purpose: label-based .loc slices include BOTH
# endpoints, so .loc[:25000] / .loc[25000:] put row 25000 in both splits.
X_train = df['review'].values[:25000]
X_test = df['review'].values[25000:]
y_train = df['sentiment'].values[:25000]
y_test = df['sentiment'].values[25000:]

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF vectorizer for the pipeline, with accent stripping, lowercasing and
# the preprocessor hook all switched off (the 'review' column was already run
# through preprocessor in an earlier cell).
# Note: this rebinds the name tfidf, which earlier held a TfidfTransformer.
tfidf = TfidfVectorizer(strip_accents=None,
                       lowercase=False,
                       preprocessor=None)

In [None]:
# Two grids sharing the same vectorizer/classifier options; the second one
# additionally switches off IDF weighting and normalisation.
param_grid = [
    {
        'vect__ngram_range': [(1, 1)],
        'vect__stop_words': [stop, None],
        'vect__tokenizer': [tokenizer, tokenizer_porter],
        'clf__penalty': ['l1', 'l2'],
        'clf__C': [1.0, 10.0, 100.0],
        **extra,
    }
    for extra in ({}, {'vect__use_idf': [False], 'vect__norm': [None]})
]

In [None]:
# Pipeline: TF-IDF vectorizer followed by logistic regression.
# The solver is pinned to 'liblinear' because param_grid searches both the
# 'l1' and 'l2' penalties; scikit-learn's default solver does not accept 'l1',
# which would make every l1 candidate fail during the grid search.
lr_tfidf = Pipeline([
    ('vect', tfidf),
    ('clf', LogisticRegression(random_state=0, solver='liblinear'))
])

In [None]:
# Grid search over param_grid for the lr_tfidf pipeline, scored by accuracy
# with cv=3 and progress output enabled (verbose=1).
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid=param_grid,
                          scoring='accuracy', cv=3,
                          verbose=1)

In [None]:
# Run the grid search on the training split.
gs_lr_tfidf.fit(X_train, y_train)

In [None]:
# Report the best parameter combination found by the search.
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)

## Working with bigger data

In [None]:
import numpy as np

In [None]:
import re

In [None]:
from nltk.corpus import stopwords

In [None]:
# English stop-word list used by the tokenizer defined in the next cell.
stop = stopwords.words('english')

In [None]:
def tokenizer(text):
    """Clean *text* and return its tokens with stop words removed.

    Tags (``<...>``) are stripped, emoticons such as ``:)`` or ``;-(`` are
    collected, the text is lowercased with every run of non-word characters
    collapsed to one space, and the emoticons (minus any ``-`` nose) are
    appended as separate tokens. Tokens found in the module-level ``stop``
    list are dropped.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The remaining tokens, in order.
    """
    # Raw strings: '\)' and '\W' are invalid escapes in normal string literals.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    if emoticons:
        # The explicit separator keeps the first emoticon from being glued
        # onto the last word when the cleaned text ends in a word character.
        cleaned += ' ' + ' '.join(emoticons).replace('-', '')
    # Set membership is O(1); scanning the stop list per token is not.
    stop_words = frozenset(stop)
    return [w for w in cleaned.split() if w not in stop_words]

In [None]:
def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a CSV file, one per line.

    The first line is treated as a header and skipped. Every following line
    is split at its LAST comma: everything before it is the text (returned
    exactly as stored, including any CSV quoting) and everything after it is
    parsed as the integer label.

    Parameters
    ----------
    path : str
        Path of the UTF-8 encoded CSV file.

    Yields
    ------
    tuple of (str, int)
        The raw text field and its label.
    """
    with open(path, 'r', encoding='utf8') as csv_file:
        # Default of None: an empty file yields nothing instead of raising
        # StopIteration inside the generator.
        next(csv_file, None)
        for line in csv_file:
            # Strip the line ending explicitly rather than assuming it is
            # there, so a final line without a newline still parses.
            text, _, label = line.rstrip('\r\n').rpartition(',')
            yield text, int(label)

In [None]:
# Pull the first (text, label) pair from a fresh stream as a sanity check.
next(stream_docs(path='./movie_data.csv'))

In [None]:
def get_minibatch(doc_stream, size):
    """Take up to *size* ``(text, label)`` pairs from *doc_stream*.

    Parameters
    ----------
    doc_stream : iterator
        Iterator yielding ``(text, label)`` pairs.
    size : int
        Maximum number of documents to take.

    Returns
    -------
    tuple
        ``(docs, y)`` — two parallel lists of texts and labels. If the
        stream ends early, the documents read so far are returned as a
        shorter batch. ``(None, None)`` is returned only when the stream was
        already exhausted and nothing could be read.
    """
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        # Keep a partially filled final batch instead of discarding it.
        if not docs:
            return None, None
    return docs, y

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

In [None]:
# Hashing vectorizer with 2**21 features that tokenizes documents with the
# tokenizer function defined above; decode errors are ignored.
vect = HashingVectorizer(decode_error='ignore',
                        n_features =2**21,
                        preprocessor=None,
                        tokenizer=tokenizer)

In [None]:
# SGD-trained classifier using the 'log' loss, with a fixed random seed.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' —
# confirm against the installed version.
clf = SGDClassifier(loss='log', random_state=1, max_iter=1)

In [None]:
# Fresh generator over movie_data.csv; the cells below consume it batch by batch.
doc_stream = stream_docs(path='./movie_data.csv')

In [None]:
import pyprind

In [None]:
# Progress bar for the 45 mini-batch iterations below, and the class labels
# that are passed to partial_fit.
pbar = pyprind.ProgBar(45)
classes = np.array([0,1])

In [None]:
# Incremental training: up to 45 mini-batches of 1000 documents each.
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    # Stop early when no batch could be read from the stream.
    if not X_train:
        break
    # Vectorize the raw texts, then update the model with this batch only.
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

In [None]:
# Take the next 5000 documents from the same stream as the test set.
X_test, y_test = get_minibatch(doc_stream=doc_stream, size=5000)

In [None]:
# Vectorize the test documents with the same vectorizer used for training.
X_test = vect.transform(X_test)

In [None]:
# Accuracy on the test batch, printed with three decimals.
print('Accuracy: {:.3f}'.format(clf.score(X_test, y_test)))

In [None]:
# Update the model with the test batch as well.
clf = clf.partial_fit(X_test, y_test)

In [None]:
# NOTE: clf was just trained on X_test in the previous cell, so this score is
# measured on data the model has already seen and is not a held-out estimate.
print('Accuracy: {:.3f}'.format(clf.score(X_test, y_test)))

In [None]:
import pickle
import os

In [None]:
# Target directory for the pickled objects: movieclassifier/pkl_objects.
dest = os.path.join('movieclassifier', 'pkl_objects')

In [None]:
# Create the target directory (and any missing parents). exist_ok=True makes
# this safe to re-run and avoids the race between checking and creating.
os.makedirs(dest, exist_ok=True)

In [None]:
# Persist the stop-word list. The with-block closes (and flushes) the file
# deterministically instead of leaving the handle to the garbage collector.
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as f:
    pickle.dump(stop, f, protocol=4)

In [None]:
# Persist the trained classifier. The with-block closes (and flushes) the file
# deterministically instead of leaving the handle to the garbage collector.
with open(os.path.join(dest, 'classifier.pkl'), 'wb') as f:
    pickle.dump(clf, f, protocol=4)