In [2]:
import pickle
import joblib as joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from nltk.corpus import movie_reviews


# model_building.ipynb

# File identifiers of the negative and positive reviews in the corpus.
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

# One token sequence per review file, negatives and positives kept apart.
negfeats = [movie_reviews.words(fileids=[file_id]) for file_id in negids]
posfeats = [movie_reviews.words(fileids=[file_id]) for file_id in posids]

# Re-join each token sequence into a single space-separated string;
# negative reviews come first, followed by the positive ones.
allfeats = [' '.join(tokens) for tokens in negfeats + posfeats]
# Labels are aligned with allfeats: 0 for each negative review, 1 for each positive.
labels = [0 for _ in negfeats] + [1 for _ in posfeats]

In [3]:
# review_classification.ipynb

# Seed value passed as `random_state` to the LinearSVC classifier in this cell.
rs = 42

def make_pipeline(vectorizer, transformer, classifier):
    """Assemble a three-stage ``Pipeline`` from the given components.

    The stages are registered, in this order, under the step names
    ``'vectorizer'``, ``'transformer'`` and ``'classifier'``.

    Args:
        vectorizer: object used as the first pipeline step.
        transformer: object used as the second pipeline step.
        classifier: object used as the final pipeline step.

    Returns:
        The ``Pipeline`` built from the three named steps.
    """
    step_names = ('vectorizer', 'transformer', 'classifier')
    stages = (vectorizer, transformer, classifier)
    # Pair each name with its stage, yielding the list of (name, step) tuples.
    return Pipeline(list(zip(step_names, stages)))

# using the best model (linear SVC):
# word 1- to 5-gram counts -> TF-IDF weighting -> linear SVM
lsvc = make_pipeline(
    CountVectorizer(ngram_range=(1, 5), min_df=1, max_df=0.9, stop_words=None),
    TfidfTransformer(),
    LinearSVC(C=1.0, loss='squared_hinge', tol=0.001, max_iter=400, random_state=rs),
)
# fitting (Pipeline.fit returns the pipeline itself, so clf and lsvc are the same object)
lsvc.fit(allfeats, labels)
clf = lsvc

In [4]:
# Bare expression: the notebook shows the pipeline's repr as this cell's output.
clf

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.9,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 5), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('transformer',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifier',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
  

In [5]:
# Persist the fitted pipeline to disk as a standard pickle (protocol 4).
with open('/home/lenferdetroud/demo/clf_sa.pkl', 'wb') as f:
    pickler = pickle.Pickler(f, protocol=4)
    pickler.dump(clf)

In [4]:
# Reload the classifier with pickle, the serializer this notebook used to write
# the file; joblib.load is only documented for files produced by joblib.dump.
# NOTE: unpickling can execute arbitrary code -- only load files you created or trust.
with open('/home/lenferdetroud/demo/clf_sa.pkl', 'rb') as f:
    clf = pickle.load(f)