In [31]:
"""Build a sentiment analysis / polarity model

Sentiment analysis can be cast as a binary text classification problem,
that is, fitting a linear classifier on features extracted from the text
of the user messages so as to guess whether the opinion of the author is
positive or negative.

In this example we will use a movie review dataset.

"""
# Author: Olivier Grisel <olivier.grisel@ensta.org>
# License: Simplified BSD
# Edited by: Carlos Ezequiel <cafezequiel@gmail.com>

import os
import pickle
import sys
from os import path

from sklearn import metrics
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [6]:
# Load the data
# The training data folder is resolved relative to the current user's home
# directory rather than one hard-coded account, so the notebook is not tied
# to a single machine. Edit this path if the scikit-learn checkout lives
# somewhere else.
movie_reviews_data_folder = path.expanduser(path.join(
    '~', 'src', 'scikit-learn', 'doc', 'tutorial', 'text_analytics',
    'data', 'movie_reviews', 'txt_sentoken'))
# shuffle=False keeps the documents in the order load_files reads them.
dataset = load_files(movie_reviews_data_folder, shuffle=False)
print("n_samples: %d" % len(dataset.data))

n_samples: 2000


In [4]:
# Split the dataset into a training set (75%) and a test set (25%).
# A fixed random_state makes the split, and therefore every score reported
# further down, reproducible across kernel restarts.
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.25, random_state=42)

In [11]:
# TASK: Build a vectorizer / classifier pipeline that filters out tokens
# that are too rare or too frequent
# The TF-IDF step applies the min_df / max_df document-frequency cut-offs;
# its output feeds a linear SVM.
vectorizer = TfidfVectorizer(max_df=0.95, min_df=3)
classifier = LinearSVC(C=1000)
pipeline = Pipeline(steps=[('vect', vectorizer), ('clf', classifier)])

In [12]:
# TASK: Build a grid search to find out whether unigrams or bigrams are
# more useful.
# The only parameter searched is the n-gram range of the 'vect' pipeline
# step: unigrams only, (1, 1), versus unigrams plus bigrams, (1, 2).
parameters = dict(vect__ngram_range=[(1, 1), (1, 2)])

# Fit the pipeline on the training set, trying every candidate in the grid.
grid_search = GridSearchCV(estimator=pipeline, param_grid=parameters,
                           n_jobs=-1)
grid_search.fit(docs_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.95, max_features=None, min_df=3,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=Tru...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'vect__ngram_range': [(1, 1), (1, 2)]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [21]:
# TASK: print the cross-validated scores for each parameter set
# explored by the grid search
cv_results = grid_search.cv_results_
n_candidates = len(cv_results['params'])
for i in range(n_candidates):
    # Build one formatted string instead of passing several arguments to
    # print(): with multiple arguments Python 2 prints a tuple while
    # Python 3 prints space-separated values. A single string renders the
    # same line under both.
    print('%d params - %s; mean - %0.2f; std - %0.2f'
          % (i,
             cv_results['params'][i],
             cv_results['mean_test_score'][i],
             cv_results['std_test_score'][i]))

(0, "params - {'vect__ngram_range': (1, 1)}; mean - 0.85; std - 0.01")
(1, "params - {'vect__ngram_range': (1, 2)}; mean - 0.85; std - 0.01")


In [34]:
# TASK: Predict the outcome on the testing set and store it in a variable
# named y_predicted
y_predicted = grid_search.predict(docs_test)

# Per-class precision / recall / f1 report, labelled with the dataset's
# class names.
report = metrics.classification_report(
    y_test, y_predicted, target_names=dataset.target_names)
print(report)

# Confusion matrix of true labels against predictions.
# (It is only printed here; matplotlib's plt.matshow(cm) can be used to
# plot it.)
cm = metrics.confusion_matrix(y_test, y_predicted)
print(cm)

             precision    recall  f1-score   support

        neg       0.88      0.84      0.86       259
        pos       0.83      0.88      0.86       241

avg / total       0.86      0.86      0.86       500

[[217  42]
 [ 29 212]]


In [32]:
# Save model to a file
# The whole fitted grid-search object is pickled, not just one estimator.
model_dir = 'models'
model_filename = 'sentiment_analysis_movie_reviews.pkl'
model_filepath = path.join(model_dir, model_filename)

# Create the output directory on first use; without it open() fails when
# 'models' does not exist yet. The explicit isdir() check keeps this
# working on both Python 2 and Python 3.
if not path.isdir(model_dir):
    os.makedirs(model_dir)

with open(model_filepath, 'wb') as f:
    pickle.dump(grid_search, f)