In [90]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
import random


# Corpus setup: bind the `movie_reviews` corpus reader used by the cells below
# and make sure its data is available locally.
import nltk
# The reader is a lazy loader, so importing it before the download is safe:
# the corpus files are only touched on first access.
from nltk.corpus import movie_reviews

# Kept as the last expression so the cell still displays the download status.
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/lenferdetroud/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [15]:
# File identifiers of the negative and the positive reviews
negids, posids = (movie_reviews.fileids(category) for category in ('neg', 'pos'))

# One token sequence per review, in the same order as the file identifiers
negfeats, posfeats = (
    [movie_reviews.words(fileids=[fid]) for fid in ids]
    for ids in (negids, posids)
)

# Peek at the first five reviews of each group
print(negfeats[:5], posfeats[:5], sep='\n')

[['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], ['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], ['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], ['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], ['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...]]
[['films', 'adapted', 'from', 'comic', 'books', 'have', ...], ['every', 'now', 'and', 'then', 'a', 'movie', 'comes', ...], ['you', "'", 've', 'got', 'mail', 'works', 'alot', ...], ['"', 'jaws', '"', 'is', 'a', 'rare', 'film', 'that', ...], ['moviemaking', 'is', 'a', 'lot', 'like', 'being', ...]]


In [31]:
# Each review as a single space-joined string: negative reviews first, then positive
allfeats = [' '.join(tokens) for group in (negfeats, posfeats) for tokens in group]

# total review count
print(len(allfeats))

2000


In [30]:
# Show one example review; the index is fixed, so the same text is printed on every run
sample_review = allfeats[42]
print(sample_review)

a pseudo - intellectual film about the pseudo - intellectual world of art magazines , high art is as wasted as its drug - addled protagonists . in the only notable part of the movie , ally sheedy and radha mitchell deliver nice performances in the two leading roles , not that lisa cholodenko ' s script or direction makes you care much about either character . living in a world of heroin induced highs , they float along until they fall in love with each other . this uninviting picture , full of pretentious minor characters , has a receptionist that reads dostoevski and a woman in the restroom line who is a certified genius , having recently been awarded a prestigious mcarthur grant . 24 - year - old syd ( radha mitchell ) , who has a rather bland , live - in boyfriend , was just promoted to assistant editor at the artistic photography magazine " frame . " although the receptionist is impressed , syd is mainly a gofer for her boss until she meets famous photographer lucy berliner ( ally 

In [68]:
# Targets aligned with allfeats: 0 for every negative review, then 1 for every positive one
labels = [0 for _ in negfeats] + [1 for _ in posfeats]

# Class balance: count of negative (0) labels, then of positive (1) labels
print(labels.count(0), labels.count(1), sep='\n')

1000
1000


In [49]:
# Learn the vocabulary from all reviews and build the document-term count matrix
vectorizer = CountVectorizer()
data = vectorizer.fit_transform(allfeats)  # training data

# Second dimension of the matrix = number of features produced by CountVectorizer
_, n_features = data.shape
print(n_features)

39659


In [63]:
# Logistic regression classifier using the liblinear solver
logreg = LogisticRegression(solver='liblinear')

# Chain the vectorizer and the classifier into a single estimator
pipe = make_pipeline(vectorizer, logreg)

# Mean accuracy over 3 cross-validation folds
accuracy_per_fold = cross_val_score(pipe, allfeats, labels, scoring='accuracy', cv=3)
accuracy_per_fold.mean()

0.8360054207130668

In [64]:
# Mean ROC AUC of the same pipeline over 3 cross-validation folds
roc_auc_per_fold = cross_val_score(pipe, allfeats, labels, scoring='roc_auc', cv=3)
roc_auc_per_fold.mean()

0.9107670094794944

In [66]:
# Train the classifier on the full count matrix and all labels
logreg.fit(X=data, y=labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [148]:
# Vocabulary terms, positioned to match the columns of `data`.
# get_feature_names() was removed in newer scikit-learn releases, so prefer
# get_feature_names_out() when it exists and fall back to the old accessor.
# Stored as a plain list so it can be indexed the same way in either case.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = list(vectorizer.get_feature_names())

# Print ten randomly chosen features (repeats are possible).
# random.choice draws from the actual list, so no vocabulary size is hard-coded.
for _ in range(10):
    print(random.choice(feature_names))

pretense
pure
wringing
jeweller
rationality
riddler
dowdy
greeting
last
humanity


In [155]:
# Learned weight of every feature (first row of the fitted coefficient matrix)
coeffs = logreg.coef_[0, :]
print(coeffs)

[ 1.13520250e-02 -1.78937615e-02  2.51639163e-06 ... -7.15499215e-03
  3.79017830e-04 -1.40853413e-03]


In [162]:
# The five features with the most negative coefficients, i.e. the terms that
# push a prediction hardest towards label 0 (negative reviews).
# argsort yields the feature indices directly: this avoids rebuilding a Python
# list and scanning it for every feature, and it stays correct when two
# coefficients are equal (a value lookup would return the same index twice).
for idx in coeffs.argsort()[:5]:
    print(feature_names[idx])

bad
unfortunately
worst
waste
nothing


In [163]:
# Values of the five smallest (most negative) coefficients, in ascending order
ascending_coeffs = list(coeffs)
ascending_coeffs.sort()
print(ascending_coeffs[:5])

[-0.7821764783709629, -0.6366186437795317, -0.5929016766182736, -0.5081785851276233, -0.5039889411126592]
