In [2]:
# Preprocessing of imdb data

import numpy as np
import pandas as pd

# Load the reviews and give the columns named "0" and "1" readable names.
df = (
    pd.read_csv('movie_data.csv', encoding='utf-8')
    .rename(columns={"0": "review", "1": "sentiment"})
)

# Peek at the first three rows.
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(50000, 2)

In [7]:
# use CountVectorizer to construct bag-of-words model

import numpy as np

from sklearn.feature_extraction.text import CountVectorizer

# Three toy documents; the third repeats several words so some counts exceed 1.
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet and one and one is two',
])

# Learn the vocabulary and build the term-count matrix in a single step.
count = CountVectorizer()
bag = count.fit_transform(docs)

In [8]:
# Mapping from each learned term to its column index in the count matrix.
print( count.vocabulary_ )

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [9]:
# Dense view of the count matrix: one row per document, one column per vocabulary term.
print(bag.toarray())

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


In [10]:
# transform data into tf-idfs (term frequency-inverse document frequency)

from sklearn.feature_extraction.text import TfidfTransformer

# Show two decimals so the tf-idf matrix prints compactly.
np.set_printoptions(precision=2)

# L2-normalised tf-idf weights with smoothed idf.
tfidf = TfidfTransformer(
    use_idf=True,
    norm='l2',
    smooth_idf=True,
)

# Re-count the toy documents, re-weight the counts and print them densely.
print(
    tfidf.fit_transform(
        count.fit_transform(docs)
    ).toarray()
)

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


In [11]:
# Last 50 characters of the first review, to inspect the raw text before cleaning.
df.loc[0, 'review'][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

In [12]:
import re
def preprocessor(text):
    """Clean one review: drop HTML tags and punctuation, keep emoticons.

    The text is lower-cased and every run of non-word characters is collapsed
    into a single space. Emoticons such as ``:)``, ``;-(`` or ``=D`` are
    collected from the original (not lower-cased) text, stripped of their
    ``-`` nose and appended, space-separated, after the cleaned words.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    str
        The cleaned text followed by any emoticons that were found.
    """
    # Raw string literals: '\)', '\(' and '\W' are regex escapes, not valid
    # Python string escapes, and non-raw literals trigger invalid-escape warnings.
    text = re.sub(r'<[^>]*>', '', text)

    # Collect emoticons first; the punctuation clean-up below would destroy them.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)

    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    if not emoticons:
        return cleaned

    # Add a separator only when the cleaned text does not already end in one;
    # otherwise the first emoticon would fuse with the last word ("movie:)").
    separator = '' if cleaned.endswith(' ') else ' '
    return cleaned + separator + ' '.join(emoticons).replace('-', '')

In [13]:
# test preprocessor on the tail of the first review (the same slice inspected above)

preprocessor( df.loc[0, 'review'][-50:])

'is seven title brazil not available'

In [14]:
# Emoticon check: the tag is removed and the emoticons are moved to the end, without their '-' nose.
preprocessor( "</a> This :) is :( a test :-)!")

' this is a test :) :( :)'

In [15]:
# apply to dataframe
# NOTE: this overwrites the raw 'review' column in place, so re-running the
# cell feeds already-cleaned text through `preprocessor` a second time.

df['review'] = df['review'].apply( preprocessor )

In [16]:
def tokenizer( text ):
    """Split `text` on runs of whitespace and return the list of tokens.

    NOTE(review): a later cell defines another function named `tokenizer`
    (HTML/emoticon cleaning plus stop-word removal); once that cell has run,
    it replaces this one in the kernel.
    """
    return text.split()

# Quick check on a sample sentence.
tokenizer('runners like running and this they run')

['runners', 'like', 'running', 'and', 'this', 'they', 'run']

In [17]:
from nltk.stem.porter import PorterStemmer

# One shared stemmer instance, reused for every token.
porter = PorterStemmer()

def tokenizer_porter(text):
    """Split `text` on whitespace and reduce each token to its Porter stem."""
    return list(map(porter.stem, text.split()))

# Quick check on a sample sentence.
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [18]:
# download then apply stopwords

import nltk

# Fetch the NLTK 'stopwords' corpus, which `stopwords.words` reads further down.
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to C:\Users\Andrew
[nltk_data]     Solis\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [19]:
from nltk.corpus import stopwords

# English stop words, kept as the list that NLTK returns.
stop = stopwords.words( 'english' )

# Stem a sample sentence, then drop every stem that is a stop word.
list(filter(lambda stem: stem not in stop,
            tokenizer_porter('a runner likes running and runs a lot')))

['runner', 'like', 'run', 'run', 'lot']

In [20]:
# separate reviews into training and test data
#
# Slice by position with `iloc`: label-based `df.loc[:25000]` includes its end
# label, so together with `df.loc[25000:]` row 25000 ended up in BOTH splits
# (test-set leakage). Half-open positional slices keep the two halves disjoint.
n_train = 25000

X_train = df['review'].iloc[:n_train].values
y_train = df['sentiment'].iloc[:n_train].values

X_test = df['review'].iloc[n_train:].values
y_test = df['sentiment'].iloc[n_train:].values

# Every review must land in exactly one split.
assert len(X_train) + len(X_test) == len(df)



In [None]:
# use gridsearchCV with logistic regression

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# The reviews were already run through `preprocessor`, so the vectorizer's own
# lowercasing / accent stripping / preprocessing is switched off.
# `token_pattern=None`: every grid entry below supplies a custom tokenizer, and
# leaving the default pattern in place makes scikit-learn warn on each fit that
# the pattern will not be used.
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None,
                        token_pattern=None)

small_param_grid = [
    # tf-idf weighting (vectorizer defaults): plain vs. Porter-stemmed tokens
    {
        'vect__ngram_range' : [(1, 1)],
        'vect__stop_words'  : [None],
        'vect__tokenizer'   : [tokenizer, tokenizer_porter],
        'clf__penalty'      : ['l2'],
        'clf__C'            : [1.0, 10.0]
    },
    # raw term counts (idf and normalisation off): with vs. without stop words
    {
        'vect__ngram_range': [(1, 1)],
        'vect__stop_words' : [stop, None],
        'vect__tokenizer'  : [tokenizer],
        'vect__use_idf'    : [False],
        'vect__norm'       : [None],
        'clf__penalty'     : ['l2'],
        'clf__C'           : [1.0, 10.0]
    }
]


lr_tfidf = Pipeline([
    ('vect', tfidf ),
    # Fixed seed so repeated runs of the search give the same scores.
    ('clf', LogisticRegression(solver='liblinear', random_state=0))
])

# 5-fold cross-validated search over the grid, scored by accuracy, single process.
gs_lr_tfidf = GridSearchCV(lr_tfidf, small_param_grid, scoring='accuracy', cv=5, verbose=2, n_jobs=1)

gs_lr_tfidf.fit(X_train, y_train)

In [22]:
# Parameter combination that achieved the highest mean cross-validation accuracy.
print(f'Best parameter set: {gs_lr_tfidf.best_params_}')

Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x00000180BDC2A160>}


In [23]:
# Report the mean 5-fold cross-validation accuracy of the best parameter set,
# then score the best estimator from the search on the held-out test reviews.

print(f'Accuracy: {gs_lr_tfidf.best_score_:.3f}')

clf = gs_lr_tfidf.best_estimator_

print(f'Test Accuracy: {clf.score(X_test, y_test ):.3f}')

Accuracy: 0.897
Test Accuracy: 0.899


In [24]:
# clean unprocessed data

import numpy as np
import re

from nltk.corpus import stopwords

stop = stopwords.words('english')

def tokenizer(text):
    """Clean a raw review and return its tokens with stop words removed.

    HTML tags are dropped, the text is lower-cased and split on runs of
    non-word characters, and emoticons such as ``:)``, ``;-(`` or ``=D`` are
    kept (without their ``-`` nose) as separate tokens after the words. Tokens
    found in the module-level ``stop`` list are filtered out.

    NOTE(review): this reuses the name of the earlier whitespace-only
    ``tokenizer`` and replaces it once this cell has run.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML markup.

    Returns
    -------
    list of str
        Word tokens followed by emoticon tokens, stop words excluded.
    """
    # Raw string literals: '\)', '\(' and '\W' are regex escapes, not valid
    # Python string escapes, and non-raw literals trigger invalid-escape warnings.
    text = re.sub(r'<[^>]*>', '', text)

    # Collect emoticons first; the punctuation clean-up below would destroy them.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)

    words = re.sub(r'[\W]+', ' ', text.lower()).split()

    # Emoticons become their own tokens. Concatenating them onto the cleaned
    # string without a separator fused the first one with the last word
    # whenever the text did not end in punctuation or whitespace.
    tokens = words + [emoticon.replace('-', '') for emoticon in emoticons]

    # Set membership instead of scanning the stop-word list for every token.
    stop_words = set(stop)
    return [w for w in tokens if w not in stop_words]

In [None]:
# read and return one doc at a time
def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        