In this chapter you will:

* Clean and prepare text data
* Build feature vectors from text documents
* Train a machine learning model to classify positive and negative movie reviews
* Work with large text datasets using out-of-core learning

In [1]:
## Will be working with movie reviews from IMDB database
## Dataset is 50,000 reviews labeled as positive or negative
## Positive was rated with more than 6 stars on IMDb

## Read movie reviews into a Dataframe- may take 10 minutes

import pyprind
import pandas as pd
import os
pbar = pyprind.ProgBar(50000)
labels = {'pos':1, 'neg':0}
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = './aclImdb/%s/%s' % (s,l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r') as infile: 
                txt = infile.read()
                df = df.append([[txt, labels[l]]], ignore_index = True)
                pbar.update()
df.columns = ['review', 'sentiment']

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:02:27


In [2]:
import numpy as np 
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('./movie_data.csv', index = False)

In [3]:
df = pd.read_csv('./movie_data.csv')

In [4]:
df.head(3)

Unnamed: 0,review,sentiment
0,My family and I normally do not watch local mo...,1
1,"Believe it or not, this was at one time the wo...",0
2,"After some internet surfing, I found the ""Home...",0


### Bag-of-words model

1. Create a vocabulary of unique tokens i.e., words from the entire set of documents
2. Construct a feature vector from each document that contains the counts of how often each word occurs in the particular document

This will result in sparse vectors 

In [5]:
### Transforming words into feautre vectors

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining and the weather is sweet'])
bag = count.fit_transform(docs)

In [6]:
print (count.vocabulary_)

{u'and': 0, u'weather': 6, u'sweet': 4, u'sun': 3, u'is': 1, u'the': 5, u'shining': 2}


In [7]:
print(bag.toarray())
## referred to as "raw term frequencies": tf(t,d) the number of times
## term t appeared in document d

[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


In [8]:
### Assessing word relevancy via term frequency-inverse document frequeny
## TF-idf - downweights frequently occurring words
## Defined as the product of term freqency and inverse document frequency
## Inverse doc frequency is

## log [(n_d)/(1+df(d,t))]
## where n_d is the total number of docuements 
## and df(d,t) is the number of docs d that contain term t

## Sci-kit learn has a transofrmer for this

from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
np.set_printoptions(precision=2)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[ 0.    0.43  0.56  0.56  0.    0.43  0.  ]
 [ 0.    0.43  0.    0.    0.56  0.43  0.56]
 [ 0.4   0.48  0.31  0.31  0.31  0.48  0.31]]


### Cleaning Text Data

In [13]:
## strip out all unwanted characters

df.loc[0, 'review'][-50:]

'to Star Cinema!! Way to go, Jericho and Claudine!!'

In [9]:
### Had to edit from book- mistake in book near .join
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    return text

In [15]:
preprocessor(df.loc[0, 'review'][-50:])

'to star cinema way to go jericho and claudine '

In [16]:
preprocessor("</a>This :) is :( a test :-) !")

'this is a test :) :( :)'

In [17]:
df['review']= df['review'].apply(preprocessor)

### Processing documents into tokens

In [10]:
## Need to figure out how to split the text into individual elements
## Can tokenize words by splitting at the whitespace characters

def tokenizer(text):
    return text.split()

tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [11]:
## Word stemming is taking the word root and mapping words that are similar
## nltk uses the porter stemming alorithm

from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

tokenizer_porter('runners like running and thus they run')

[u'runner', 'like', u'run', 'and', u'thu', 'they', 'run']

In [12]:
## lemmatizaiton aims to obtain the canonical forms of individual words 
## stemming and lemmatization have little effect on performance

### Stop word removal 
## Because they are so common, stop words only have a minimal effect
## on the classification 

import nltk
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/andrew.moskowitz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

['runner', u'like', u'run', u'run', 'lot']

### Training a logistic regression model for document classification

In [22]:
## Divide dataframe into 25,000 training and 25,000 test

X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [23]:
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents = None,
                       lowercase = False,
                       preprocessor = None)
param_grid = [{'vect__ngram_range': [(1,1)],
              'vect__stop_words': [stop, None],
              'vect__tokenizer': [tokenizer, tokenizer_porter],
              'clf__penalty': ['l1', 'l2'],
              'clf__C': [1.0, 10.0, 100.0]},
             {'vect__ngram_range': [(1,1)],
              'vect__stop_words': [stop, None],
              'vect__tokenizer': [tokenizer, tokenizer_porter],
              'vect__use_idf': [False],
              'vect__norm':[None],
              'clf__penalty': ['l1', 'l2'],
              'clf__C': [1.0, 10.0, 100.0]}]

lr_tfidf = Pipeline([('vect', tfidf), ('clf', LogisticRegression(random_state = 0))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv = 5, verbose = 1, n_jobs = -1)
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 33.8min


KeyboardInterrupt: 

In [24]:
print('Best Parameter set: %s ' % gs_lr_tfidf.best_params_)
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

Naieve bayes classifiers are also popular for this kind of work. Can read about them in:

S.Raschka Naive Bayes and Text Classification I - introduction and Theory. Computing Research Repository (CoRR), abs/1410.5329, 2014. Http://arxiv.org/pdf/1410.5329v3.pdf

### Out of core learning

Can stream little bits of data at a time to train the model and update the estimates

In [14]:
import numpy as np
import re
from nltk.corpus import stopwords
stop = stopwords.words('english')
def tokenizer(text):
    text = re.sub('<[*>]*.', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

In [15]:
def stream_docs(path):
    with open(path, 'r') as csv:
        next(csv)
        for line in csv:
            text, label = line[:-3], int (line[-2])
            yield text, label

In [16]:
next(stream_docs(path='./movie_data.csv'))

('"My family and I normally do not watch local movies for the simple reason that they are poorly made, they lack the depth, and just not worth our time.<br /><br />The trailer of ""Nasaan ka man"" caught my attention, my daughter in law\'s and daughter\'s so we took time out to watch it this afternoon. The movie exceeded our expectations. The cinematography was very good, the story beautiful and the acting awesome. Jericho Rosales was really very good, so\'s Claudine Barretto. The fact that I despised Diether Ocampo proves he was effective at his role. I have never been this touched, moved and affected by a local movie before. Imagine a cynic like me dabbing my eyes at the end of the movie? Congratulations to Star Cinema!! Way to go, Jericho and Claudine!!"',
 1)

In [17]:
def get_minibatch(doc_stream, size):
    docs ,y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

In [18]:
### Use a hashing trick to be able to calculate counts out of memory

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
vect = HashingVectorizer(decode_error='ignore',
                        n_features = 2**21,
                        preprocessor=None,
                        tokenizer = tokenizer)

clf = SGDClassifier(loss = 'log', random_state = 1, n_iter = 1)
doc_stream = stream_docs(path='./movie_data.csv')

In [19]:
## Initialized progress bar with 45 minibatches of 1000 docs each
## use the las 5000 for performance 

import pyprind
pbar = pyprind.ProgBar(45)
classes = np.array([0,1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size = 1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes = classes)
    pbar.update()



0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:01:15


In [20]:
X_test, y_test = get_minibatch(doc_stream, size = 5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.866


In [21]:
## Add last 5000 docs to update the model
clf = clf.partial_fit(X_test, y_test)



A Popular extension of this model that accounts for structre and grammar is the LDA or Latent Dirichlet allocation

Word2vec is a more modern application of the bag-of-words model 
    uses neural networks to automatically learn relationships betweenw ords

# Chapter 9 Embedding a Machine Learning Model into a Web Application

Session was kept open at the suggestion of the authors as we use the same model that was geneated in the previous chapter

One way for "model persistence" (being able to reuse a trained model) is serializing and deserializing our python objects. this allows us to save and reload the current state of our model. 

In [22]:
import pickle
import os
dest = os.path.join('movieclassifier', 'pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)
pickle.dump(stop,
           open(os.path.join(dest, 'stopwords.pkl'), 'wb'), protocol = 2)
pickle.dump(clf, open(os.path.join(dest, 'classifier.pkl'), 'wb'), protocol = 2)

The next bit to test the serializer and vetorizer was done in an ipython session

### Setting up a SQLite database for data storage

In [23]:
### Create a new sql lite database inside movieclassifier to
### collect optional feedback about predictions from users

import sqlite3
import os
conn = sqlite3.connect('reviews.sqlite')
c = conn.cursor()
c.execute('CREATE TABLE review_db (review TEXT, sentiment INTEGER, date TEXT)')
example1 = 'I love this movie'
c.execute("INSERT INTO review_db (review, sentiment, date) VALUES (?, ?, DATETIME('now'))", (example1, 1))
example2 = 'I disliked this movie'
c.execute("INSERT INTO review_db (review, sentiment, date) VALUES (?, ?, DATETIME('now'))", (example2, 0))
conn.commit()
conn.close()

OperationalError: table review_db already exists

In [24]:
conn = sqlite3.connect('reviews.sqlite')
c = conn.cursor()
c.execute("SELECT * FROM review_db where date BETWEEN '2018-01-01 00:00:00' AND DATETIME('now')")
results = c.fetchall()
conn.close()
print(results)

[(u'I love this movie', 1, u'2018-03-19 03:49:07'), (u'I disliked this movie', 0, u'2018-03-19 03:49:07')]


In [45]:
import flask

In [47]:
import wtforms