In [1]:
import pyprind
import pandas as pd
import os

# Read every review file under ./<split>/<label> and pair its text with a
# numeric class label (1 for 'pos', 0 for 'neg').
pbar = pyprind.ProgBar(50000)  # progress bar sized for 50000 update() calls
labels = {'pos':1, 'neg':0}
rows = []

for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = './%s/%s' % (s, l)
        # NOTE: os.listdir returns entries in arbitrary order, so the row
        # order of the resulting frame can differ between machines.
        for fname in os.listdir(path):
            with open(os.path.join(path, fname), 'r') as infile:
                txt = infile.read()
            # Collect plain Python rows; growing a DataFrame with .append()
            # inside the loop copies the whole frame for every file.
            rows.append([txt, labels[l]])
            pbar.update()

# Build the frame once, directly with its final column names.
df = pd.DataFrame(rows, columns=['review', 'sentiment'])

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:02:59


In [2]:
import numpy as np

# Shuffle the rows using a fixed seed, then write the shuffled frame to disk
# without its index column.
np.random.seed(0)
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)
df.to_csv('movie_data.csv', index=False)

In [3]:
# Reload the reviews from the CSV file; read_csv assigns a fresh default
# integer index because no index column is requested.
df = pd.read_csv('movie_data.csv')
# Preview the first five rows.
df.head()

Unnamed: 0,review,sentiment
0,I'll dispense with the obvious review of factu...,0
1,"At a risk of sounding slightly sacrilegious, o...",1
2,"This show is terrible, the jokes are all terri...",0
3,Always enjoy the great acting of Drew Barrymor...,1
4,"There is a difference between a ""film,"" and a ...",0


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Three toy sentences used to illustrate the bag-of-words representation.
docs = np.array(['The sun is shining',
                 'The weather is sweet',
                 'The sun is shining and the weather is sweet'])
count = CountVectorizer()
# Learn the vocabulary and build the document-term count matrix in one call.
bag = count.fit_transform(docs)

In [5]:
# Show the learned term -> column-index mapping.
# A parenthesised single-argument print behaves identically on Python 2 and 3.
print(count.vocabulary_)

{u'and': 0, u'weather': 6, u'sweet': 4, u'sun': 3, u'is': 1, u'the': 5, u'shining': 2}


In [6]:
# Show the dense count matrix: one row per document, one column per vocabulary term.
# A parenthesised single-argument print behaves identically on Python 2 and 3.
print(bag.toarray())

[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


In [7]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw term counts of the toy corpus with a default TfidfTransformer.
tfidf = TfidfTransformer()
np.set_printoptions(precision=2)  # print floats with two decimals
# A parenthesised single-argument print behaves identically on Python 2 and 3.
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[ 0.    0.43  0.56  0.56  0.    0.43  0.  ]
 [ 0.    0.43  0.    0.    0.56  0.43  0.56]
 [ 0.4   0.48  0.31  0.31  0.31  0.48  0.31]]


In [8]:
# Look at the last 50 characters of the review stored under row label 0.
df.loc[0, 'review'][-50:]

'ut there.<br /><br />God works in mysterious ways.'

In [9]:
# Cleaning the text data

import re

def preprocessor(text):
    """Strip markup and punctuation from ``text`` while keeping emoticons.

    Steps:
      1. remove anything that looks like an HTML tag (``<...>``);
      2. collect emoticons such as ``:)``, ``;-(`` or ``=D`` from the result;
      3. lower-case the text and collapse every run of non-word characters
         into a single space;
      4. append the collected emoticons (with the ``-`` nose removed),
         separated by single spaces.

    Parameters
    ----------
    text : str
        Raw review text. A non-string value is not handled here and is
        rejected by ``re.sub`` with a ``TypeError``.

    Returns
    -------
    str
        The cleaned text followed by the emoticons found in it.
    """
    # Raw strings keep the regex escapes (\W, \), \() out of Python's own
    # string-escape processing.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    faces = ' '.join(emoticons).replace('-', '')
    # Keep the first emoticon from fusing with the last word when the cleaned
    # text does not already end in a space (e.g. ':) hello' -> ' hello :)').
    if faces and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + faces

In [10]:
# Run the cleaner on the last 50 characters of the review under row label 0.
preprocessor(df.loc[0, 'review'][-50:])

'ut there god works in mysterious ways '

In [11]:
# Check on a hand-written sample that tags are removed and emoticons are kept.
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [14]:
# Preview the first five rows of df again.
df.head()

Unnamed: 0,review,sentiment
0,I'll dispense with the obvious review of factu...,0
1,"At a risk of sounding slightly sacrilegious, o...",1
2,"This show is terrible, the jokes are all terri...",0
3,Always enjoy the great acting of Drew Barrymor...,1
4,"There is a difference between a ""film,"" and a ...",0


In [69]:
# Inspect the review stored under row label 2194 (the saved output shows nan).
df['review'][2194]

nan

In [99]:
# Smoke-test the cleaner on the reviews labelled 0..399; the results are discarded.
for row_label in range(400):
    preprocessor(df.loc[row_label, 'review'])

In [84]:
# Drop two rows by index label before cleaning. Row 2194 shows up as nan in
# the saved output; 29359 is presumably another unusable review -- TODO confirm
# (df.dropna(subset=['review']) would avoid the hard-coded labels).
df_clean = df.drop(labels=[2194, 29359], axis=0)

In [98]:
# Replace every remaining review with its cleaned form, element by element.
df_clean['review'] = df_clean['review'].map(preprocessor)

In [100]:
def tokenizer(text):
    """Return the whitespace-separated tokens of ``text`` as a list."""
    tokens = text.split()
    return tokens

In [101]:
# Quick check of the whitespace tokenizer on a sample sentence.
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [102]:
# Perform word stemming

# The module is `nltk.stem.snowball` (lower case) and the stemmer expects the
# lower-case language name; the capitalised spellings fail on a fresh kernel.
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

def tokenizer_snowball(text):
    """Split ``text`` on whitespace and return the Snowball stem of each token."""
    return [stemmer.stem(word) for word in text.split()]
# Quick check of the stemming tokenizer on a sample sentence.
tokenizer_snowball('runners like running and thus they run')

[u'runner', u'like', u'run', u'and', u'thu', u'they', u'run']

In [103]:
import nltk

In [104]:
# Make sure NLTK's 'stopwords' package is available locally (downloads it if needed).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mingmingguo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [105]:
# Filtering Stopwords in the data

from nltk.corpus import stopwords

# English stop-word list from NLTK; kept as a list because it is also passed
# to the vectorizer's stop_words option further down.
stop = stopwords.words('english')
# Stem a sample sentence, then keep only the tokens that are not stop words.
sample_tokens = tokenizer_snowball('a runner likes running and runs a lot')
[token for token in sample_tokens[-10:] if token not in stop]

[u'runner', u'like', u'run', u'run', u'lot']

In [106]:
# Hold-out split: the first 25000 rows train the model, the rest are for testing.
# Label-based .loc slices include BOTH end labels, so `.loc[:25000]` and
# `.loc[25000:]` would place row 25000 in the training and the test set at the
# same time. Positional .iloc slices are half-open and keep the sets disjoint.
train_rows = df_clean.iloc[:25000]
test_rows = df_clean.iloc[25000:]
X_train = train_rows['review'].values
y_train = train_rows['sentiment'].values
X_test = test_rows['review'].values
y_test = test_rows['sentiment'].values

In [107]:
# GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18; the
# old sklearn.grid_search module was removed in 0.20. Fall back to it only on
# installations that predate model_selection.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# The reviews were already cleaned and lower-cased by preprocessor, so the
# vectorizer's own normalisation steps are switched off.
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

# Two sub-grids of 24 candidates each (48 in total, 240 fits with cv=5):
#   1) unigrams with the vectorizer's default tf-idf weighting;
#   2) bigrams with idf and normalisation disabled (raw term frequencies).
param_grid = [{'vect__ngram_range': [(1,1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer,
                                   tokenizer_snowball],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(2,2)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer,
                                   tokenizer_snowball],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]}
             ]

# solver='liblinear' is spelled out because the grid searches over the 'l1'
# penalty: liblinear supports both 'l1' and 'l2', whereas the default solver
# of newer scikit-learn releases (lbfgs) rejects 'l1'.
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf',
                      LogisticRegression(random_state=0,
                                         solver='liblinear'))])

# Exhaustive search scored by accuracy with 5-fold CV on all available cores.
# The saved run of this cell took about 40 minutes.
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5, verbose=1,
                           n_jobs=-1)
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:   19.2s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed: 32.9min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 40.1min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=Tru...ovr',
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0))]),
       fit_params={}, iid=True, loss_func=None, n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__tokenizer': [<function tokenizer at 0x10cb6b500>, <function tokenizer_porter at 0x10ef310c8>], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0], 'vect__stop_words': [[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'y...x10ef310c8>], 'vect__use_idf': [False], 'clf__C': [1.0, 10.0, 100.0], 'clf__penalty': ['l1', 'l2']}],
       pre_dispatch='2*n_jobs', refit=True, score_func=No

In [108]:
# Report the hyper-parameter combination with the best mean CV accuracy.
# A parenthesised single-argument print behaves identically on Python 2 and 3.
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)

Best parameter set: {'vect__ngram_range': (1, 1), 'vect__tokenizer': <function tokenizer at 0x10cb6b500>, 'clf__penalty': 'l2', 'clf__C': 10.0, 'vect__stop_words': None} 


In [109]:
# Report the best mean cross-validated accuracy, then score the best pipeline
# found by the search on the held-out test rows.
# A parenthesised single-argument print behaves identically on Python 2 and 3.
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

CV Accuracy: 0.895
Test Accuracy: 0.896
