In [1]:
import pandas as pd
import numpy as np
# Load the labelled reviews: a tab-separated file decoded as latin-1.
dataset = pd.read_csv(
    "data/labeledTrainData.tsv",
    sep='\t',
    encoding="latin-1",
)

# Preview ten randomly chosen rows (unseeded, so the rows differ between runs).
dataset.sample(n=10)

Unnamed: 0,id,sentiment,review
2057,4943_2,0,"I have watched 3 episodes of Caveman, and I ha..."
8195,5244_7,1,"\The Ladies Man\"" suffers a common problem amo..."
23361,11602_8,1,Now I had the pleasure of first viewing Contam...
12231,12491_8,1,This movie was very good. If you are one who l...
12838,7210_8,1,New York playwright Michael Caine (as Sidney B...
21638,7608_10,1,"I watched the first series avidly, but wondere..."
1382,656_10,1,"Of all the reviews I've read, most people have..."
4312,12451_10,1,Although I bought the DVD when it first came o...
3314,3022_7,1,"Never realized that Charles Boyer, (Luis Denar..."
11587,7662_3,0,This should be my kind of movie. Even if it su...


In [2]:
import nltk
from nltk.corpus import stopwords

# Fetch the stop-word corpus (nltk reports when it is already up to date).
nltk.download('stopwords')

# NLTK's English stop-word list, used later by the TF-IDF vectorizer.
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/arturolocsin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from nltk.stem import PorterStemmer
# One shared stemmer instance, reused for every document.
porter = PorterStemmer()


def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the Porter stem of each token.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list
        One stem per whitespace-separated token, in the original order.
    """
    return list(map(porter.stem, text.split()))

In [4]:
import re
def preprocessor(text):
    # Remove HTML markup
    text = re.sub('<[^>]*>', '', text)
    # Save emoticons for later appending
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # Remove any non-word character and append the emoticons,
    # removing the nose character for standarization. Convert to lower case
    text = (re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', ''))
    
    return text

In [5]:
# Select the features (review text) and labels (sentiment) for training.
# No train/test split is performed here: both series cover the whole dataset.
from sklearn.model_selection import train_test_split
X_train, y_train = dataset['review'], dataset['sentiment']

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer removes stop words *after* preprocessing and tokenization,
# so the stop list must contain tokens in the form the tokenizer produces.
# With a stemming tokenizer, raw entries such as "this" or "very" never match
# their stems (e.g. "thi", "veri") and would stay in the vocabulary. Extend
# the list with the preprocessed + stemmed form of every stop word; the
# original entries are kept as well.
stop_tokens = sorted(
    set(stop)
    | {token for word in stop for token in tokenizer_porter(preprocessor(word))}
)

# TF-IDF features built with the custom cleaning and stemming functions.
tfidf = TfidfVectorizer(stop_words=stop_tokens,
                        tokenizer=tokenizer_porter,
                        preprocessor=preprocessor)

# Vectorizer followed by a logistic-regression classifier (fixed seed).
clf = Pipeline([('vect', tfidf), ('clf', LogisticRegression(random_state=0))])
clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function preproc...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [7]:
import pickle
import os

# Persist the fitted pipeline. The file is opened in a `with` block so the
# handle is flushed and closed even if pickling raises.
# NOTE(review): pickle stores functions by reference, so whatever loads this
# file must be able to resolve `tokenizer_porter` and `preprocessor` under the
# same module name they have here — confirm against the consuming app.
model_path = os.path.join('data', '../SentimentAnalysisApp/MovieReviewSentiments.pkl')
with open(model_path, 'wb') as model_file:
    pickle.dump(clf, model_file, protocol=4)