In [2]:
from nltk.corpus import movie_reviews
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
import numpy as np

In [5]:
def _per_file(read, ids):
    """Call the corpus accessor `read` once per file id and collect the results in order."""
    return [read(fileids=[file_id]) for file_id in ids]


# File ids of the reviews in the 'neg' and 'pos' categories.
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

# Word sequences, one entry per review file.
negfeats = _per_file(movie_reviews.words, negids)
posfeats = _per_file(movie_reviews.words, posids)

# Raw contents, one entry per review file.
negfeats_raw = _per_file(movie_reviews.raw, negids)
posfeats_raw = _per_file(movie_reviews.raw, posids)

# Glue each review's words back together into a single space-separated string,
# which is the document form handed to the vectorizers further down.
negfeats_imploded = [' '.join(tokens) for tokens in negfeats]
posfeats_imploded = [' '.join(tokens) for tokens in posfeats]

# Documents and labels share one ordering: negatives (0) first, then positives (1).
imploded = negfeats_imploded + posfeats_imploded
labels = [0] * len(negfeats) + [1] * len(posfeats)

In [6]:
# Same classifier, two document representations:
#   pipeline1 -> CountVectorizer features
#   pipeline2 -> TfidfVectorizer features
pipeline1 = Pipeline(steps=[
    ("vectorizer", CountVectorizer()),
    ("classifier", LogisticRegression()),
])
pipeline2 = Pipeline(steps=[
    ("vectorizer", TfidfVectorizer()),
    ("classifier", LogisticRegression()),
])

# Cross-validate each pipeline with cv=10 on the joined review strings.
scores1 = cross_val_score(pipeline1, imploded, y=labels, cv=10)
scores2 = cross_val_score(pipeline2, imploded, y=labels, cv=10)

In [18]:
# One line of output: mean and standard deviation of the fold scores for
# pipeline1, followed by the same two figures for pipeline2, each rounded to
# two decimals and separated by three spaces.
print('   '.join(
    str(round(value, 2))
    for value in (scores1.mean(), scores1.std(), scores2.mean(), scores2.std())
))

0.84   0.02   0.82   0.02


In [23]:
# Count features with min_df=50 passed to the vectorizer; classifier unchanged.
# NOTE: this rebinds pipeline1 and scores1, so anything reading those names
# afterwards sees this run rather than the earlier default-vectorizer run.
pipeline1 = Pipeline(steps=[
    ("vectorizer", CountVectorizer(min_df=50)),
    ("classifier", LogisticRegression()),
])
scores1 = cross_val_score(pipeline1, imploded, y=labels, cv=10)
print(round(scores1.mean(), 2))

0.82


In [25]:
# Baseline: default CountVectorizer feeding a default LogisticRegression.
pipeline = Pipeline(steps=[
    ("vectorizer", CountVectorizer()),
    ("classifier", LogisticRegression()),
])
scores = cross_val_score(pipeline, imploded, y=labels, cv=10)
print(round(scores.mean(), 2))

0.84


In [29]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier

In [27]:
# Swap the classifier for LinearSVC; features stay as default CountVectorizer output.
pipeline = Pipeline(steps=[
    ("vectorizer", CountVectorizer()),
    ("classifier", LinearSVC()),
])
scores = cross_val_score(pipeline, imploded, y=labels, cv=10)
print(round(scores.mean(), 2))

0.83


In [30]:
# Swap the classifier for SGDClassifier; features stay as default CountVectorizer output.
#
# SGDClassifier draws on a random number generator while fitting, so without a
# fixed seed the cross-validated score differs from run to run. random_state is
# pinned here to make the reported figure repeatable.
# NOTE(review): the score recorded under this cell came from the unseeded call
# and may shift slightly once the cell is re-executed.
pipeline = Pipeline(steps=[
    ("vectorizer", CountVectorizer()),
    ("classifier", SGDClassifier(random_state=0)),
])
scores = cross_val_score(pipeline, imploded, y=labels, cv=10)
print(round(scores.mean(), 2))

0.77


In [31]:
from nltk.corpus import stopwords

In [36]:
# Count features with the NLTK English stop-word list supplied to the vectorizer.
pipeline = Pipeline(steps=[
    ("vectorizer", CountVectorizer(stop_words=stopwords.words('english'))),
    ("classifier", LogisticRegression()),
])
scores = cross_val_score(pipeline, imploded, y=labels, cv=10)
print(round(scores.mean(), 2))

0.85


In [37]:
# Count features with the vectorizer's own 'english' stop-word setting
# (as opposed to passing an explicit word list).
pipeline = Pipeline(steps=[
    ("vectorizer", CountVectorizer(stop_words='english')),
    ("classifier", LogisticRegression()),
])
scores = cross_val_score(pipeline, imploded, y=labels, cv=10)
print(round(scores.mean(), 2))

0.85


In [9]:
# Word-level features with ngram_range=(1, 2) and 'english' stop words.
pipeline = Pipeline(steps=[
    ("vectorizer", CountVectorizer(
        analyzer='word',
        ngram_range=(1, 2),
        stop_words='english',
    )),
    ("classifier", LogisticRegression()),
])
scores = cross_val_score(pipeline, imploded, y=labels, cv=10)
print(round(scores.mean(), 2))

0.85


In [10]:
# Character-level features: analyzer='char_wb' with ngram_range=(3, 5).
#
# stop_words is intentionally not passed. CountVectorizer applies stop_words
# only when analyzer == 'word', so with 'char_wb' the argument had no effect on
# the features; leaving it in wrongly suggested that stop words were filtered.
# Dropping it does not change the fitted model or the score.
pipeline = Pipeline(steps=[
    ("vectorizer", CountVectorizer(analyzer='char_wb', ngram_range=(3, 5))),
    ("classifier", LogisticRegression()),
])
scores = cross_val_score(pipeline, imploded, y=labels, cv=10)
print(round(scores.mean(), 2))

0.83
