In [15]:
import numpy as np
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

In [16]:
# Read the review corpus from disk with a fixed seed so the document order
# is repeatable, then echo the class label names.
corpus_dir = 'sentiment/data/imdb1/'
sentiment = load_files(corpus_dir, random_state=41)
sentiment.target_names

['neg', 'pos']

In [17]:
# Hold out part of the corpus for a single final accuracy check.  The seed is
# fixed so the same documents land in each partition on every run.
split_parts = train_test_split(sentiment.data, sentiment.target, random_state=41)
train_data, test_data, train_target, test_target = split_parts

In [18]:
# Bag-of-words features: the vocabulary is learned from the training
# documents only, and those same documents are encoded in one pass.
vectorizer = CountVectorizer()
vectorized_train_data = vectorizer.fit_transform(train_data)

In [19]:
# Number of distinct terms the vectorizer learned from the training split.
vocabulary_size = len(vectorizer.vocabulary_)
vocabulary_size

35441

In [20]:
# Multinomial naive Bayes with default hyper-parameters, trained on the
# vectorized training split.  `fit` returns the estimator itself, so `clf`
# is the fitted model; echo it to show the settings in use.
clf = MultinomialNB().fit(vectorized_train_data, train_target)
clf

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [21]:
# Encode the held-out documents with the vocabulary learned on the training
# split (transform only, no refit), then report the classifier's score.
vectorized_test_data = vectorizer.transform(test_data)
clf.score(vectorized_test_data, test_target)

0.81799999999999995

In [22]:
# 10-fold cross-validation of CountVectorizer + MultinomialNB over the whole
# corpus, collecting one F1 score per fold in `f1s`.
#
# Each fold builds its own vectorizer and classifier instead of refitting the
# notebook-level `vectorizer` / `clf` and overwriting `vectorized_train_data`.
# Those objects belong to the train/test-split cells above; clobbering them
# here would make re-running those cells silently use a model trained on the
# last fold.  Default-constructed estimators fitted from scratch give the
# same per-fold scores as refitting the shared ones did.
f1s = []
kf = KFold(len(sentiment.data), n_folds=10)
for train, test in kf:
    # `train` / `test` are index arrays; the documents are a plain list, so
    # they are gathered by index, while the target array is indexed directly.
    train_fold = [sentiment.data[i] for i in train]
    test_fold = [sentiment.data[i] for i in test]

    # The vocabulary is learned from this fold's training documents only.
    fold_vectorizer = CountVectorizer()
    vectorized_train_fold = fold_vectorizer.fit_transform(train_fold)

    fold_clf = MultinomialNB()
    fold_clf.fit(vectorized_train_fold, sentiment.target[train])

    y_pred = fold_clf.predict(fold_vectorizer.transform(test_fold))
    f1 = f1_score(sentiment.target[test], y_pred)
    # Parenthesised single-argument form prints identically under Python 2.
    print(f1)
    f1s.append(f1)

0.819047619048
0.816143497758
0.85
0.782608695652
0.835978835979
0.78612716763
0.817777777778
0.788571428571
0.78021978022
0.820754716981


In [23]:
# Average F1 over the ten folds for the baseline (no stop-word removal) model.
mean_f1 = np.mean(f1s)
mean_f1

0.80972295196166522

In [24]:
# Same 10-fold cross-validation as the baseline, but with English stop words
# removed by the vectorizer; one F1 score per fold is collected in `f1s2`.
vectorizer2 = CountVectorizer(stop_words = 'english')
clf2 = MultinomialNB()

f1s2 = []
for train, test in KFold(len(sentiment.data), n_folds=10):
    # `train` / `test` are index arrays; the documents are a plain list, so
    # they are gathered by index, while the target array is indexed directly.
    train_fold = [sentiment.data[i] for i in train]
    test_fold = [sentiment.data[i] for i in test]

    # Use a fold-local name so the `vectorized_train_data` matrix built for
    # the train/test-split model earlier in the notebook is not overwritten.
    vectorized_train_fold = vectorizer2.fit_transform(train_fold)
    clf2.fit(vectorized_train_fold, sentiment.target[train])

    y_pred = clf2.predict(vectorizer2.transform(test_fold))
    f1 = f1_score(sentiment.target[test], y_pred)
    # Parenthesised single-argument form prints identically under Python 2.
    print(f1)
    f1s2.append(f1)

0.819047619048
0.821428571429
0.847290640394
0.778378378378
0.789189189189
0.78612716763
0.826666666667
0.784090909091
0.780748663102
0.816901408451


In [25]:
# Average F1 over the ten folds for the stop-word-filtered model.
mean_f1_stop_words = np.mean(f1s2)
mean_f1_stop_words

0.80498692133777894