# Creating an NLP estimator #

## Step 1: Reading the data ##

For this ML app, we will create a movie-review sentiment classifier using the dataset available at http://ai.stanford.edu/~amaas/data/sentiment/

In [24]:
import pandas as pd
import io
import pyprind
import os
import numpy as np
import pandas as pd

basepath = './aclImdb'
# positive and negative review is the label we are looking to predict
labels = {'pos': 1,
          'neg': 0}
pbar = pyprind.ProgBar(50000)
# Collect one [review_text, label] row per file in a plain list and build the
# DataFrame once at the end: calling DataFrame.append inside the loop copies the
# whole frame on every iteration (quadratic time) and the method no longer
# exists in recent pandas releases.
rows = []
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        # create the path directory /aclImdb/test/pos,/aclImdb/train/pos etc
        path = os.path.join(basepath, s, l)
        for f in os.listdir(path):
            with io.open(os.path.join(path, f), 'r', encoding='utf-8') as infile:
                txt = infile.read()

            # the label is stored as the folder name ('pos' / 'neg') at this stage
            rows.append([txt, l])
            pbar.update()
# dataframe to store the final preprocessed data
df = pd.DataFrame(rows, columns=['review', 'sentiment'])

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:03:14


In [27]:
# encode the string labels numerically: 'pos' becomes 1 and 'neg' becomes 0
sentiment_codes = {'pos' : 1, 'neg' : 0}
df['sentiment'] = df['sentiment'].map(sentiment_codes)
df.head(3)

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1


In [34]:
import numpy as np
import pandas as pd
# seed NumPy's global RNG so the shuffle below produces the same row order on
# every run of the notebook
np.random.seed(0)
# shuffling the dataframe
df = df.reindex(np.random.permutation(df.index))
# save data to csv
df.to_csv('./movie_data.csv', index = False, encoding="utf-8")
# read in data from csv (this also resets the index to 0..n-1 in shuffled order)
df = pd.read_csv('./movie_data.csv')
df.head(3)

Unnamed: 0,review,sentiment
0,This movie was one of the best I have ever see...,1
1,"An unjustly neglected classic, ""Intruder in th...",1
2,"IMDb lists this as 1972 for some reason, but t...",1


In [35]:
# peek at the last three rows of the reloaded, shuffled frame
df.tail(3)

Unnamed: 0,review,sentiment
49997,I really liked this film about love between tw...,1
49998,A few years back the same persons who created ...,0
49999,It's unfortunate that you can't go any lower t...,0


## Step 2: Cleaning the data ##

There are 3 main goals here:
* Remove all HTML markup.
* Remove all punctuation and non-word characters.
* Retain emoticons 

In [37]:
import re
def preprocessor(text):
    """Clean one raw review string.

    Steps:
      1. strip HTML markup,
      2. collect emoticons such as ``:)``, ``;-(``, ``=D``, ``:P``,
      3. lower-case the text and collapse every run of non-word characters
         into a single space,
      4. append the collected emoticons (without the '-' nose) at the end.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML tags.

    Returns
    -------
    str
        The cleaned text. When emoticons were found they follow the words,
        separated from them (and from each other) by a single space.
    """
    # remove all HTML markup
    text = re.sub(r'<[^>]*>', '', text)
    # retain emoticons but remove the '-' nose
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # remove all punctuation
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    if emoticons:
        # Make sure the first emoticon is not glued onto the last word: the
        # cleaned text only ends in a space when the raw text ended with
        # punctuation or whitespace.
        if not cleaned.endswith(' '):
            cleaned += ' '
        cleaned += ' '.join(emoticons).replace('-', '')
    return cleaned

In [39]:
# Run every review through preprocessor and store the cleaned text back
# in the 'review' column
cleaned_reviews = df['review'].apply(preprocessor)
df['review'] = cleaned_reviews

In [41]:
# spot-check the last three rows to confirm the reviews were cleaned
df.tail(3)

Unnamed: 0,review,sentiment
49997,i really liked this film about love between tw...,1
49998,a few years back the same persons who created ...,0
49999,it s unfortunate that you can t go any lower t...,0


## Step 3: Tokenization ##

In [42]:
from nltk.stem.porter import PorterStemmer
# single stemmer instance reused by tokenizer_porter for every word
porter = PorterStemmer()

# split words based on whitespace
def tokenizer(text):
    """Return the list of whitespace-separated tokens in ``text``."""
    return text.split()

# split on whitespace, then reduce every token to its Porter stem
def tokenizer_porter(text):
    """Return the whitespace-separated tokens of ``text``, each stemmed with ``porter``."""
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

In [44]:
# sanity-check the stemming tokenizer on a short sample sentence
tokenizer_porter('climbers like climbing and thus they climb')

[u'climber', u'like', u'climb', u'and', u'thu', u'they', u'climb']

In [43]:
# Fetch NLTK's 'stopwords' corpus; the stop-word list itself is loaded from it
# in a later cell
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adarshnair/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [49]:
from nltk.corpus import stopwords

# Load the English stop-word list and try it out: stem a sample sentence and
# drop every stem that appears in the list.
stop = stopwords.words('english')
# the [-10:] slice keeps the last ten tokens; the sample sentence has exactly
# ten words, so nothing is cut off here
[w for w in tokenizer_porter('the climbers like climbing and thus they climb a lot')[-10:]
if w not in stop]

[u'climber', u'like', u'climb', u'thu', u'climb', u'lot']

## Step 4: Training the logistic regression model ##

### Step 4.1: Split data ###

In [45]:
# Split the data into training and testing sets: the first 25,000 rows are used
# for training and the remaining rows for testing.
# Positional .iloc slicing is used on purpose: label-based .loc slices include
# BOTH endpoints, so .loc[:25000] and .loc[25000:] would put row 25000 into the
# training set and the test set at the same time.
X_train = df['review'].iloc[:25000].values
y_train = df['sentiment'].iloc[:25000].values

X_test = df['review'].iloc[25000:].values
y_test = df['sentiment'].iloc[25000:].values

### Step 4.2: Find optimal parameters using k-fold stratified cross validation ###

In [48]:
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# tf-idf vectorizer; its tokenizer, stop-word list and idf/normalisation
# settings are supplied by the parameter grids below
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

# settings explored by both grids
shared_grid = {
    'vect__ngram_range': [(1, 1)],
    'vect__stop_words': [stop, None],
    'vect__tokenizer': [tokenizer, tokenizer_porter],
    'clf__penalty': ['l1', 'l2'],
    'clf__C': [1.0, 10.0, 100.0],
}

# second grid: same settings, but with idf weighting and normalisation disabled
raw_tf_grid = dict(shared_grid)
raw_tf_grid['vect__use_idf'] = [False]
raw_tf_grid['vect__norm'] = [None]

# cycle through the following combination of parameters
param_grid = [shared_grid, raw_tf_grid]

# vectorizer followed by the logistic-regression classifier
lr_tfidf = Pipeline([
    ('vect', tfidf),
    ('clf', LogisticRegression(random_state=0)),
])

# 5-fold cross-validated search scored on accuracy, with n_jobs=-1
gs_lr_tfidf = GridSearchCV(estimator=lr_tfidf,
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)

In [60]:
# Fit the model on the training data: this runs the cross-validated grid search
# over every parameter combination in param_grid
gs_lr_tfidf.fit(X_train, y_train)

In [None]:
# Report the winning parameter combination found by the grid search
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)

# gs_lr_tfidf.best_score_ is the average k-fold cross-validation score:
# with the 5-fold cross-validation configured above, it is the mean accuracy
# over the 5 folds for the best parameter combination.
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)

In [None]:
# use the best estimator found by the grid search to score the held-out test set
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

As you will see, this process takes far too long on most machines with an ordinary amount of RAM. To alleviate this, we will proceed with an out-of-core learning approach.

## Step 5: Big data: Out of core learning ##

Out-of-core learning approaches allow us to work with data that does not fit into RAM. In the approach we followed earlier, it took us over an hour to train our model with 50,000 documents. In the following approach, we will use a generator function, `get_document`, that reads in and returns documents one at a time.

In [52]:
# Generator function
def get_document(path):
    """Lazily yield ``(text, label)`` pairs from a review CSV, one line at a time.

    The file is expected to have a header line followed by one document per
    line in the form ``<review text>,<integer label>``. Only the LAST comma on
    a line separates text from label, so commas inside the review are fine.

    Parameters
    ----------
    path : str
        Location of the UTF-8 encoded CSV file.

    Yields
    ------
    (text, label)
        ``text`` is everything before the last comma, exactly as stored in the
        file (any CSV quoting is kept); ``label`` is the trailing field
        converted to ``int``.
    """
    with io.open(path, 'r', encoding = 'utf-8') as csv_file:
        # skip the header line (the default keeps an empty file from raising
        # StopIteration inside the generator)
        next(csv_file, None)
        for line in csv_file:
            # Strip the line terminator instead of assuming it is present, so
            # a final line without a trailing newline is parsed correctly.
            line = line.rstrip('\r\n')
            if not line:
                # ignore blank lines
                continue
            text, label = line.rsplit(',', 1)
            yield text, int(label)

In [54]:
# Pull a single item from a fresh generator to check that it yields one
# (text, label) pair at a time rather than the whole file
next(get_document(path = './movie_data.csv'))

(u'"This movie was one of the best I have ever seen. Just the other day I was reminded of this movie by something on TV. It came back to me like a dam flooding over. I have never been more touched by a movie than by this one. After the movie was over I actually could not quit crying for about 2 hours. No movie has ever moved me that way before. I was 15 at the time of the movie and have not seen it since but am hoping I can find a copy to buy so that I can watch it whenever I want to. If someone suggests you see this movie with them, GO....you will not be disappointed.<br /><br />Peggy Fries"',
 1)

Now that we have a generator function that will return one document at a time, our next goal is to define a function that can return a set number of documents based on a parameter that we can set. 

In [55]:
def document_batch(get_document, size):
    """Pull up to ``size`` documents from a document stream.

    Parameters
    ----------
    get_document : iterator
        Iterator yielding ``(text, label)`` pairs, e.g. the generator returned
        by calling the ``get_document`` function.
    size : int
        Maximum number of documents to collect.

    Returns
    -------
    (docs, y)
        Two parallel lists with the texts and their labels. If the stream runs
        out part-way through, the documents collected so far are returned as a
        shorter batch instead of being thrown away. ``(None, None)`` is
        returned only when the stream is already exhausted and nothing could
        be collected.
    """
    # docs is defined for the text and y for their corresponding labels
    docs, y = [], []
    for _ in range(size):
        try:
            text, label = next(get_document)
        except StopIteration:
            # stream exhausted: keep whatever has been gathered so far
            break
        docs.append(text)
        y.append(label)
    if size > 0 and not docs:
        # nothing left in the stream at all -> signal exhaustion to the caller
        return None, None
    return docs, y

Out-of-core learning does come with its own set of caveats. For instance, we cannot use `CountVectorizer`, since it needs the complete vocabulary in memory. We also cannot use `TfidfVectorizer`, as it needs all the feature vectors of the training dataset in memory to calculate the inverse document frequencies.

Therefore we will be using the Hashing Vectorizer which is data independent and makes use of the 32 bit MurmurHash3 algorithm(https://sites.google.com/site/murmurhash/)

In [56]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# Hashing vectorizer using the whitespace tokenizer defined earlier.
# we set a large value for number of features to reduce hash collisions
vect = HashingVectorizer(
    decode_error='ignore',
    n_features=2**21,
    preprocessor=None,
    tokenizer=tokenizer,
)

# SGD classifier with logistic loss; it is trained incrementally via partial_fit
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)

# generator over the shuffled reviews saved in movie_data.csv
doc_stream = get_document(path='./movie_data.csv')

Now that we have our classifier, we can train our model.

In [57]:
# we iterate over 45 batches each of size 1000 documents
n_batches, batch_size = 45, 1000
pbar = pyprind.ProgBar(n_batches)

# partial_fit is given the full set of class labels on every call
classes = np.array([0, 1])
for _ in range(n_batches):
    X_train, y_train = document_batch(doc_stream, batch_size)
    if not X_train:
        # no batch was returned: stop training early
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()


0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:11


Once the incremental learning process is complete, we use the last 5000 documents to evaluate our performance.

In [58]:
# take the next 5000 documents from the stream as the evaluation set
X_test, y_test = document_batch(doc_stream,
                                size = 5000)
X_test = vect.transform(X_test)
# print() call instead of the Python 2 print statement, so the cell also runs on
# Python 3 and matches the other print calls in this notebook; the format string
# reproduces the statement's output ('Accuracy = ' + separator space + value).
print('Accuracy =  %s' % clf.score(X_test, y_test))

Accuracy =  0.8074


We use the last 5000 documents to update our model.

In [59]:
# feed the evaluation documents to the classifier as one more incremental update
clf = clf.partial_fit(X_test,
                      y_test)

Our NLP estimator predicts the sentiment of reviews with about 80% accuracy, which is lower than when we used an in-core approach, but we were able to come to a solution in 11 seconds as compared to >60 minutes.

## Step 6: Pickle our classifier for deploying onto the web app ##

In [62]:
import pickle
import os
dest = os.path.join('Movie_Classifier', 'pkl_objects')
# create the Movie_Classifier/pkl_objects directory tree if it does not exist yet
if not os.path.exists(dest):
    os.makedirs(dest)

# Each file is opened in a `with` block so the handle is flushed and closed as
# soon as the dump finishes, instead of being left open.
# protocol 2 is the newest pickle protocol that Python 2 can read.

# we serialize the stop words and store it in stopwords.pkl
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as stop_file:
    pickle.dump(stop, stop_file, protocol = 2)

# we serialize the classifier and store it in classifier.pkl
with open(os.path.join(dest, 'classifier.pkl'), 'wb') as clf_file:
    pickle.dump(clf, clf_file, protocol = 2)