<h1>
Sentiment analysis
</h1>
<ul>
<li>Dataset used: IMDb movies dataset(http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz)</li>

In [1]:
import pyprind
import pandas as pd
import os
import io

In [2]:
"""
Organise the given dataset into operatable datastructure
We shall use Pandas DataFrames
"""
pbar = pyprind.ProgBar(50000)
labels = {'pos':1, 'neg':0}

df = pd.DataFrame()

for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = './aclImdb/%s/%s' % (s, l)
        for file in os.listdir(path):
            with io.open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()
df.columns = ['review', 'sentiment']

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:09:08


In [3]:
'''
save the organised data in a csv file
'''
import numpy as np
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('./movie_data.csv', index=False, encoding='utf-8')

In [4]:
'''
load data from the csv file
'''
df = pd.read_csv('./movie_data.csv')
df.head(3)

Unnamed: 0,review,sentiment
0,Anne (Natalie Portman) tells us about how much...,1
1,First: a warning.<br /><br />I recently saw th...,0
2,"I saw Chan Is Missing when it first came out, ...",0


In [5]:
"""
preprocessing data
"""

import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)  #removes the HTML markup
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)   #finds emoticons
    text = re.sub('[\W]+', ' ', text.lower()) + ''.join(emoticons).replace('-', '')  
    return text

df['review'] = df['review'].apply(preprocessor)

In [6]:
preprocessor(df.loc[0, 'review'][-50:])    # example

's also great the movie is incredible 10 out of 10 '

In [7]:
preprocessor("</a>This :) is :( a test :-)!")   # example

'this is a test :):(:)'

In [8]:
#Processing documents into tokens
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

def tokenizer(text):
    return text.split()


def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

# exemplary run
tokenizer_porter('runners like running and thus they run')

[u'runner', u'like', u'run', u'and', u'thu', u'they', u'run']

In [9]:
import nltk
nltk.download('stopwords')
"""
stopwords are those words that are very common in a language
and are thus likely to bear little to no value
There are 127 such stopwords in nltk library for
english language."""

[nltk_data] Downloading package stopwords to /home/pc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


'\nstopwords are those words that are very common in a language\nand are thus likely to bear little to no value\nThere are 127 such stopwords in nltk library for\nenglish language.'

In [10]:
# example: how to use
from nltk.corpus import stopwords
stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs alot')[-10:] if w not in stop]

[u'runner', u'like', u'run', u'run', u'alot']

<h2>Training a logistic regression model for document classification</h2>

In [11]:
# Added version check for recent scikit-learn 0.18 checks
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version

In [12]:
#we will use simple bag-of-words model
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
if Version(sklearn_version) < '0.18':
    from sklearn.grid_search import GridSearchCV
else:
    from sklearn.model_selection import GridSearchCV

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]

lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)

In [13]:
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 18.1min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 25.8min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=Tru...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__tokenizer': [<function tokenizer at 0x7f709890d7d0>, <function tokenizer_porter at 0x7f709890d848>], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0], 'vect__stop_words': [[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves...709890d848>], 'vect__use_idf': [False], 'clf__C': [1.0, 10.0, 100.0], 'clf__penalty': ['l1', 'l2']}],
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=1

In [14]:
print('Best parameter set: %s '% gs_lr_tfidf.best_params_)
print('CV Accuracy: %.3f'%gs_lr_tfidf.best_score_)

Best parameter set: {'vect__ngram_range': (1, 1), 'vect__tokenizer': <function tokenizer at 0x7f709890d7d0>, 'clf__penalty': 'l2', 'clf__C': 10.0, 'vect__stop_words': None} 
CV Accuracy: 0.892


In [15]:
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

Test Accuracy: 0.901


<h4>Start Comment:</h4>

In [16]:
"""
Please note that gs_lr_tfidf.best_score_ is the average k-fold cross-validation score. 
I.e., if we have a GridSearchCV object with 5-fold cross-validation (like the one above), 
the best_score_ attribute returns the average score over the 5-folds of the best model.
"""

from sklearn.linear_model import LogisticRegression
import numpy as np
if Version(sklearn_version) < '0.18':
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.cross_validation import cross_val_score
else:
    from sklearn.model_selection import StratifiedKFold
    from sklearn.model_selection import cross_val_score

np.random.seed(0)
np.set_printoptions(precision=6)
y = [np.random.randint(3) for i in range(25)]
X = (y + np.random.randn(25)).reshape(-1, 1)

if Version(sklearn_version) < '0.18':
    cv5_idx = list(StratifiedKFold(y, n_folds=5, shuffle=False, random_state=0))

else:
    cv5_idx = list(StratifiedKFold(n_splits=5, shuffle=False, random_state=0).split(X, y))
    
cross_val_score(LogisticRegression(random_state=123), X, y, cv=cv5_idx)

array([ 0.6,  0.4,  0.6,  0.2,  0.6])

In [17]:
if Version(sklearn_version) < '0.18':
    from sklearn.grid_search import GridSearchCV
else:
    from sklearn.model_selection import GridSearchCV

gs = GridSearchCV(LogisticRegression(), {}, cv=cv5_idx, verbose=3).fit(X, y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................
[CV] ....................................... , score=0.600000 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.400000 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.600000 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.200000 -   0.0s
[CV]  ................................................................
[CV] ....................................... , score=0.600000 -   0.0s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished


In [18]:
gs.best_score_

0.47999999999999998

In [19]:
cross_val_score(LogisticRegression(), X, y, cv=cv5_idx).mean()

0.47999999999999998

<h4>End Comment</h4>




<h1>Working with bigger data - online algorithms and out-of-core learning</h1>

In [20]:
import numpy as np
import re
from nltk.corpus import stopwords

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

def stream_docs(path):
    with io.open(path, 'r', encoding='utf-8') as csv:
        next(csv)  # skip header
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

In [21]:
next(stream_docs(path='./movie_data.csv'))

(u'"Anne (Natalie Portman) tells us about how much she hates her mother, Adele (Susane Sarandon). That\'s how the movie begins. Adele decided that her and her daughter were moving to California without asking anyone and leaving her husband without any reason. The story is about the relationship between the mother and the daughter. It\'s really deep and touching, thanks to the great work of the actresses. Natalie was nominated to a Golden Globe for that role. She is one of the most talented actresses I ever saw, and so is Sarandon. They really look like mother daughter. The soundtrack is also great. The movie is incredible. *10 out of 10*"',
 1)

In [22]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

In [23]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
vect = HashingVectorizer(decode_error='ignore',
                         n_features = 2**21,
                         preprocessor = None,
                         tokenizer=tokenizer
                        )
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
doc_stream = stream_docs(path='./movie_data.csv')

In [24]:
pbar = pyprind.ProgBar(45)
classes = np.array([0,1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:36


In [25]:
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.883


In [26]:
clf = clf.partial_fit(X_test, y_test)

<h3>Serializing fitted scikit-learn estimators</h3>

In [None]:
'''
serialize the classifier as a pickle file
'''

import pickle
import os

dest = os.path.join('movieclassifier', 'pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)
    
# we serialize our stopwords so that we do not have to install NLTK on our servers    
pickle.dump(stop,
           open(os.path.join(dest, 'stopwords.pkl'), 'wb')
           )

pickle.dump(clf,
            open(os.path.join(dest, 'classifier.pkl'), 'wb')
           )

In [1]:
%%writefile movieclassifier/vectorizer.py
"""
Since 'HashingVectorizer' does not need to be fitted, we dont have to pickle it.
Rather this script will be used to import the vectorizer in the main file.
"""
import re
import os
import pickle

from sklearn.feature_extraction.text import HashingVectorizer

print os.getcwd()
cur_dir = os.path.dirname(__file__)
stop = pickle.load(open(
                os.path.join(cur_dir, 
                'pkl_objects', 
                'stopwords.pkl'), 'rb'))

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) \
                   + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)

Overwriting movieclassifier/vectorizer.py


In [None]:
os.chdir('movieclassifier')

In [6]:
import os

import pickle
import re
from vectorizer import vect

clf = pickle.load(open(os.path.join('pkl_objects', 'classifier.pkl'), 'rb'))

In [7]:
import numpy as np
label = {0:'negative', 1:'positive'}

example = ['I love this movie']
X = vect.transform(example)
print('Prediction: {}\nProbability: {:.2f}'.format(label[clf.predict(X)[0]], clf.predict_proba(X).max()*100))

Prediction: positive
Probability: 85.93


<h3>Finished</h3>