In [None]:
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np

# Text Classification of Movie Reviews

In [None]:
from sklearn.datasets import load_files

# Read the training split from disk, then keep the raw documents and their
# labels under short names for the cells that follow.
reviews_train = load_files("data/aclImdb/train/")
text_train = reviews_train.data
y_train = reviews_train.target

In [None]:
# Corpus size on the first line, per-label document counts on the second.
print(
    "Number of documents in training data: %d" % len(text_train),
    np.bincount(y_train),
    sep="\n",
)

In [None]:
# Read the held-out test split the same way as the training split.
reviews_test = load_files("data/aclImdb/test/")
text_test = reviews_test.data
y_test = reviews_test.target

# Corpus size on the first line, per-label document counts on the second.
print(
    "Number of documents in test data: %d" % len(text_test),
    np.bincount(y_test),
    sep="\n",
)

In [None]:
# Show the raw content of the training document at index 1.
print(text_train[1])

In [None]:
# Show the class label stored for the training document at index 1.
print(y_train[1])

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the bag-of-words vocabulary from the training text. fit() returns the
# vectorizer itself, so construction and fitting are chained.
cv = CountVectorizer().fit(text_train)

# Size of the learned vocabulary (displayed as the cell's output).
len(cv.vocabulary_)

In [None]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer get_feature_names_out() when the installed version
# provides it and fall back to the old accessor otherwise.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Look the vocabulary up once and slice it twice; list() keeps the printed
# output in plain-list form whichever accessor was used.
print(list(feature_names[:50]))
print(list(feature_names[50000:50050]))

In [None]:
# Encode the training documents with the fitted vectorizer. The bare name on
# the last line makes the notebook display the resulting object's repr.
X_train = cv.transform(text_train)
X_train

In [None]:
# Show the raw content of the training document at index 19726.
print(text_train[19726])

In [None]:
# Second element of nonzero() for row 19726: presumably the column
# (vocabulary) indices with a non-zero count in that document -- assumes
# X_train is a 2-D sparse matrix; TODO confirm.
X_train[19726].nonzero()[1]

In [None]:
# Encode the test documents with the vectorizer that was fitted on the
# training text, so both splits share the same vocabulary.
X_test = cv.transform(text_test)

In [None]:
from sklearn.linear_model import LogisticRegression

# Train a logistic regression with C=0.01 on the encoded training data.
# fit() returns the estimator, so creation and training are chained; the
# trailing bare name keeps the estimator as the cell's displayed output.
lr = LogisticRegression(C=0.01).fit(X_train, y_train)
lr

In [None]:
# Score the fitted model on the data it was trained on.
lr.score(X_train, y_train)

In [None]:
# Score the fitted model on the held-out test data.
lr.score(X_test, y_test)

In [None]:
def visualize_coefficients(classifier, feature_names, n_top_features=25):
    """Plot the most negative and most positive model coefficients as bars.

    The ``n_top_features`` smallest and the ``n_top_features`` largest entries
    of ``classifier.coef_`` are drawn in ascending order on a new pyplot
    figure. Negative bars are red, all others blue, and each bar is labelled
    with the matching entry of ``feature_names``.

    Parameters
    ----------
    classifier : object
        Fitted model exposing ``coef_``. The coefficients are flattened with
        ``ravel()``, so this presumably expects a single coefficient vector
        (e.g. a binary linear model) -- confirm before using it on
        multi-class models.
    feature_names : sequence of str
        Names indexed the same way as the flattened coefficients.
    n_top_features : int, default=25
        How many coefficients to take from each end of the sorted order.

    Returns
    -------
    None
        The plot is drawn on a newly created pyplot figure.
    """
    coef = classifier.coef_.ravel()
    feature_names = np.asarray(feature_names)

    # argsort is ascending, so one sort serves both ends: the head holds the
    # most negative coefficients and the tail the most positive ones.
    order = np.argsort(coef)
    interesting_coefficients = np.hstack(
        [order[:n_top_features], order[-n_top_features:]]
    )
    selected = coef[interesting_coefficients]

    # Derive the x positions from what was actually selected so that bars,
    # colours and tick labels always have the same length.
    positions = np.arange(len(interesting_coefficients))
    colors = ["red" if c < 0 else "blue" for c in selected]

    plt.figure(figsize=(10, 5))
    # align="center" pins each bar's centre to its position on every
    # matplotlib version.
    plt.bar(positions, selected, color=colors, align="center")
    plt.subplots_adjust(bottom=0.3, left=0.05, right=.95)
    # Ticks go at the bar positions themselves; placing them one unit further
    # right would attach each label to the neighbouring bar.
    plt.xticks(positions, feature_names[interesting_coefficients],
               rotation=60, ha="right")


In [None]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when available and fall back on older versions.
feature_names = (cv.get_feature_names_out()
                 if hasattr(cv, "get_feature_names_out")
                 else cv.get_feature_names())
visualize_coefficients(lr, feature_names)

# N-Grams

In [None]:
# Rebuild the vocabulary with ngram_range=(1, 2) on the training text.
# fit() returns the vectorizer, so construction and fitting are chained.
cv = CountVectorizer(ngram_range=(1, 2)).fit(text_train)

# Size of the new vocabulary (displayed as the cell's output).
len(cv.vocabulary_)

In [None]:
# Re-encode both splits with the refitted vectorizer; the two calls are
# independent of each other, training split first here.
X_train = cv.transform(text_train)
X_test = cv.transform(text_test)

In [None]:
# Train a logistic regression with C=0.1 on the re-encoded training data.
# fit() returns the estimator, so creation and training are chained; the
# trailing bare name keeps the estimator as the cell's displayed output.
lr = LogisticRegression(C=0.1).fit(X_train, y_train)
lr

In [None]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when available and fall back on older versions.
feature_names = (cv.get_feature_names_out()
                 if hasattr(cv, "get_feature_names_out")
                 else cv.get_feature_names())
visualize_coefficients(lr, feature_names)

In [None]:
# Score the n-gram model on the held-out test data.
lr.score(X_test, y_test)

In [None]:
# Score the n-gram model on the data it was trained on.
lr.score(X_train, y_train)