In [6]:
import os
import sys
import tarfile
import time

import pyprind       # pip install pyprind, if you haven't used it before
import pandas as pd
import numpy as np


# URL of the compressed dataset archive and the local path it is downloaded to.
source = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
target = '../../data/aclImdb_v1.tar.gz'

# Directory whose existence is used below to decide whether the archive
# still has to be downloaded/extracted.
aclImdb_extracted_dir = '../../data/aclImdb'

# Path of the cached CSV with all reviews (a file, despite the `_dir` suffix).
movie_data_dir = '../../data/movie_data.csv'


def reporthook(count, block_size, total_size):
    """Progress callback for ``urlretrieve``; rewrites one status line on stdout.

    Parameters
    ----------
    count : int
        Number of blocks transferred so far. The call with ``count == 0``
        only starts the timer and prints nothing.
    block_size : int
        Size of one block in bytes.
    total_size : int
        Total size of the download in bytes. When it is not positive
        (size unknown), the percentage is omitted from the status line.
    """
    global start_time
    if count == 0:
        start_time = time.time()
        return
    duration = time.time() - start_time
    if duration == 0:
        # Avoid a division by zero when the clock has not advanced yet.
        duration = 10**-3
    progress_size = int(count * block_size)
    if total_size > 0:
        # count * block_size can exceed total_size when the last block is
        # only partially filled; clamp so the display never passes 100 %.
        progress_size = min(progress_size, total_size)
        percent_text = "%d%% | " % (progress_size * 100. / total_size)
    else:
        percent_text = ""
    speed = progress_size / (1024.**2 * duration)
    status = "%d MB | %.2f MB/s | %d sec elapsed" % (
        progress_size / (1024.**2), speed, duration)
    sys.stdout.write("\r" + percent_text + status)
    sys.stdout.flush()

In [7]:
# Download the dataset archive unless it, or the extracted directory, already exists.
# This download takes a couple of seconds at NMBU (<30)
if not os.path.isdir(aclImdb_extracted_dir) and not os.path.isfile(target):

    if (sys.version_info < (3, 0)):
        from urllib import urlretrieve
    else:
        from urllib.request import urlretrieve

    # urlretrieve does not create missing parent directories.
    download_dir = os.path.dirname(target)
    if download_dir and not os.path.isdir(download_dir):
        os.makedirs(download_dir)

    # Download under a temporary name and rename afterwards, so that an
    # interrupted transfer is not mistaken for a complete archive by the
    # `os.path.isfile(target)` check on the next run.
    partial_target = target + '.part'
    urlretrieve(source, partial_target, reporthook)
    os.rename(partial_target, target)

In [8]:
# The extraction can take several minutes as all 50,000 reviews are stored as separate text files
# (101,111 files).
# Extracting to a synced folder (Dropbox, Google Drive, OneDrive, ...) may slow the process further.
if not os.path.isdir(aclImdb_extracted_dir):

    # Extract next to where `aclImdb_extracted_dir` is expected to appear.
    # Without an explicit path, extractall() unpacks into the current working
    # directory and the directory checked above would never be created.
    # (Assumes the archive's top-level folder is named `aclImdb`.)
    extract_root = os.path.dirname(aclImdb_extracted_dir) or '.'

    # NOTE(review): the archive comes from the network; on Python versions
    # that support it, consider passing filter='data' to extractall().
    with tarfile.open(target, 'r:gz') as tar:
        tar.extractall(path=extract_root)

In [10]:
# Build the review DataFrame from the extracted text files (and cache it as
# CSV), or load the cached CSV when it already exists.
if not os.path.isfile(movie_data_dir):
    # change the `basepath` to the directory of the
    # unzipped movie dataset

    basepath = aclImdb_extracted_dir

    labels = {'pos': 1, 'neg': 0}
    pbar = pyprind.ProgBar(50000)

    # Collect the rows in a list and build the frame once: DataFrame.append
    # was removed in pandas 2.0, and growing a frame row by row is quadratic.
    rows = []
    for subset in ('test', 'train'):
        for label_name in ('pos', 'neg'):
            path = os.path.join(basepath, subset, label_name)
            for filename in sorted(os.listdir(path)):
                with open(os.path.join(path, filename),
                          'r', encoding='utf-8') as infile:
                    rows.append([infile.read(), labels[label_name]])
                pbar.update()
    df = pd.DataFrame(rows, columns=['review', 'sentiment'])

    # shuffle it; reset the index so the in-memory frame has the same
    # 0..n-1 index as the frame read back from the CSV in the other branch
    np.random.seed(0)
    df = df.reindex(np.random.permutation(df.index)).reset_index(drop=True)
    df.to_csv(movie_data_dir, index=False, encoding='utf-8')
else:
    df = pd.read_csv(movie_data_dir, encoding='utf-8')

# Kept at top level so the notebook actually displays the preview
# (an expression nested inside if/else is not rendered as cell output).
df.head(3)

In [17]:
import re
def preprocessor(text):
    """Clean one review: strip HTML tags, lower-case, keep emoticons.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The lower-cased text with every run of non-word characters replaced
        by a single space, followed by the emoticons found in the text
        (with any '-' nose removed), joined by spaces.
    """
    # Regular expression for HTML tags
    text = re.sub(r'<[^>]*>', '', text)

    # Most typical emoticons (smileys). Raw strings are used for all patterns
    # so that `\)`, `\(` and `\W` are not treated as (invalid) string escapes.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)

    # Remove all non-word characters, convert to lower-case and add possible emoticons to the end.
    text = (re.sub(r'[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

In [18]:
# This takes a few seconds
# Note: overwrites the 'review' column in place with the cleaned text.
df['review'] = df['review'].apply(preprocessor)

In [19]:
# Positional split: first 25,000 rows for training, the rest for testing.
# `.iloc` is used because `.loc[:25000]` slices by label and includes the
# end label, which put row 25000 into both the training and the test set.
X_train = df.iloc[:25000]['review'].values
y_train = df.iloc[:25000]['sentiment'].values
X_test = df.iloc[25000:]['review'].values
y_test = df.iloc[25000:]['sentiment'].values

In [20]:
def to_vw_format(document, label=None):
    """Format one document as a Vowpal Wabbit input line.

    Parameters
    ----------
    document : str
        Text of the document. It is lower-cased and only runs of three or
        more word characters are kept as features.
    label : optional
        Label written in front of the ``|text`` namespace. ``None`` (the
        default) produces an unlabeled line; any other value, including
        ``0``, is written via ``str(label)``.

    Returns
    -------
    str
        ``'<label> |text <tokens>'`` terminated by a newline.
    """
    # Test against None explicitly: `label or ''` would drop a label of 0.
    label_part = '' if label is None else str(label)
    tokens = re.findall(r'\w{3,}', document.lower())
    return label_part + ' |text ' + ' '.join(tokens) + '\n'


In [21]:
# Preview the VW line for the first training review; the label is written as 1 when the sentiment equals 1 and as -1 otherwise.
to_vw_format(str(X_train[0]), 1 if y_train[0] == 1 else -1)

'1 |text 1974 the teenager martha moxley maggie grace moves the high class area belle haven greenwich connecticut the mischief night eve halloween she was murdered the backyard her house and her murder remained unsolved twenty two years later the writer mark fuhrman christopher meloni who former detective that has fallen disgrace for perjury simpson trial and moved idaho decides investigate the case with his partner stephen weeks andrew mitchell with the purpose writing book the locals squirm and not welcome them but with the support the retired detective steve carroll robert forster that was charge the investigation the they discover the criminal and net power and money cover the murder murder greenwich good movie with the true story murder fifteen years old girl that was committed wealthy teenager whose mother was kennedy the powerful and rich family used their influence cover the murder for more than twenty years however snoopy detective and convicted perjurer disgrace was able disc

In [22]:
def _write_vw_file(path, texts, sentiments):
    """Write one VW-formatted line per (text, sentiment) pair to `path`.

    A sentiment equal to 1 is written as label 1, anything else as -1.
    """
    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding (the reviews were read as UTF-8 and may contain non-ASCII text).
    with open(path, 'w', encoding='utf-8') as vw_file:
        # The loop variable is deliberately not called `target`: that name is
        # the module-level archive path used by the download/extraction cells.
        for text, sentiment in zip(texts, sentiments):
            vw_file.write(to_vw_format(str(text), 1 if sentiment == 1 else -1))


_write_vw_file('../../data/movie_reviews_train.vw', X_train, y_train)
_write_vw_file('../../data/movie_reviews_test.vw', X_test, y_test)

In [23]:
# Show the first two lines of the generated training file.
!head -2 ../../data/movie_reviews_train.vw

1 |text 1974 the teenager martha moxley maggie grace moves the high class area belle haven greenwich connecticut the mischief night eve halloween she was murdered the backyard her house and her murder remained unsolved twenty two years later the writer mark fuhrman christopher meloni who former detective that has fallen disgrace for perjury simpson trial and moved idaho decides investigate the case with his partner stephen weeks andrew mitchell with the purpose writing book the locals squirm and not welcome them but with the support the retired detective steve carroll robert forster that was charge the investigation the they discover the criminal and net power and money cover the murder murder greenwich good movie with the true story murder fifteen years old girl that was committed wealthy teenager whose mother was kennedy the powerful and rich family used their influence cover the murder for more than twenty years however snoopy detective and convicted perjurer disgrace was able discl

In [24]:
# Show the first two lines of the generated test file.
!head -2 ../../data/movie_reviews_test.vw

-1 |text there part that would like give this movie high rating considering that was made 1953 this very courageous movie about transvestites tackling the issue fairly seriously and sympathetically and offering the viewer lot information the subject and trying very hard not stereotype the movie clearly makes the point that transvestites are not homosexuals and that aside from wearing women clothing they lead relatively normal life deals with the pain not being accepted society the plot revolves around police officer lyle talbot desperately trying understand the issue because the recent suicide transvestite you have give everyone involved with this movie credit for taking such controversial the context 1953 subject having said all that also sorry say that this movie absolutely dreadful trying portray glen glenda edward wood pain the movie falls into silly and times surprisingly again given the era sensual fantasies that make the story very hard follow the acting wooden best none the dia

In [25]:
# Train on the training file with hinge loss and save the model to
# movie_reviews_model.vw; --quiet suppresses the progress log.
!vw -d ../../data/movie_reviews_train.vw \
--loss_function hinge -f movie_reviews_model.vw --quiet

In [26]:
# Load the saved model, run it in test-only mode (-t) on the test file and
# write one prediction per line to movie_test_pred.txt.
!vw -i movie_reviews_model.vw -t -d ../../data/movie_reviews_test.vw \
-p movie_test_pred.txt --quiet

In [27]:
# verbose mode (training)
# Same training command as before without --quiet, so VW prints its progress
# table; this retrains and overwrites movie_reviews_model.vw.

!vw -d ../../data/movie_reviews_train.vw \
--loss_function hinge -f movie_reviews_model.vw

final_regressor = movie_reviews_model.vw
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = ../../data/movie_reviews_train.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            1            1.0   1.0000   0.0000      189
1.309170 1.618340            2            2.0  -1.0000   0.6183      178
1.188423 1.067676            4            4.0   1.0000  -0.3807       66
1.147236 1.106050            8            8.0   1.0000   0.2340      103
1.106413 1.065589           16           16.0  -1.0000  -0.2138      101
1.026719 0.947025           32           32.0  -1.0000   0.6213      524
0.900352 0.773986           64           64.0  -1.0000   0.1053      382
0.921935 0.943518          128          128.0   1.0000   0.1555      163
0.820037 0.718139          256          256.0   1.0000   0.5732      125
0.7

In [42]:
# verbose mode (testing)
# Same test-only command as before without --quiet, so VW prints its progress
# table; movie_test_pred.txt is written again.

!vw -i movie_reviews_model.vw -t -d ../../data/movie_reviews_test.vw \
-p movie_test_pred.txt

only testing
predictions = movie_test_pred.txt
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = ../../data/movie_reviews_test.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.000000 0.000000            1            1.0  -1.0000  -1.0000      253
0.000000 0.000000            2            2.0   1.0000   1.0000       93
0.000000 0.000000            4            4.0  -1.0000  -1.0000      107
0.001769 0.003538            8            8.0   1.0000   1.0000      311
0.231365 0.460960           16           16.0   1.0000   1.0000       96
0.418271 0.605178           32           32.0  -1.0000  -1.0000      256
0.293433 0.168594           64           64.0  -1.0000  -1.0000       81
0.221601 0.149770          128          128.0  -1.0000  -1.0000      236
0.279010 0.336419          256          256.0   1.0000  -0.8504      55

In [28]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [29]:
# Read the raw VW scores, one float per line of the prediction file.
with open('movie_test_pred.txt') as pred_file:
    test_prediction = [float(line) for line in pred_file]

# Accuracy uses scores thresholded at 0 (positive -> 1, otherwise 0);
# AUC is computed on the raw scores.
predicted_labels = [int(score > 0) for score in test_prediction]
accuracy = round(accuracy_score(y_test, predicted_labels), 3)
auc = round(roc_auc_score(y_test, test_prediction), 3)

print("Accuracy: {}".format(accuracy))
print("AUC: {}".format(auc))

Accuracy: 0.894
AUC: 0.95
