In [1]:
import numpy as np
import pandas as pd
import sklearn
import nltk
from nltk.corpus import movie_reviews
from sklearn.datasets import load_files
from sklearn import tree
import random
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from nltk.classify import MaxentClassifier

In [2]:
def train_test(directory,stopwords,idf):
    movie_train = load_files(directory, shuffle=True)
    # No IDF
    movie_vec = CountVectorizer(binary=True,tokenizer=nltk.word_tokenize,stop_words=stopwords)     
    movie_counts = movie_vec.fit_transform(movie_train.data[:2000])

    tf_transformer = TfidfTransformer(use_idf=idf)
    movie_tf = tf_transformer.fit_transform(movie_counts)

    docs_trn, docs_tst, y_trn, y_tst = train_test_split(movie_tf, movie_train.target)
    return docs_trn, docs_tst, y_trn, y_tst

def results(train_doc,train_target,test_doc,test_target):
    # naive bayes
    clfb = MultinomialNB().fit(train_doc, train_target)
    y_pred = clfb.predict(test_doc)
    # Decision Tree
    clf2 = tree.DecisionTreeClassifier().fit(docs_train, y_train)
    y_pred2 = clf2.predict(docs_test)
    # Max Entropy
    clf3=LogisticRegression().fit(docs_train, y_train)
    y_pred3 = clf3.predict(docs_test)

    print("Naïve Bayes")
    print("Accuracy: {}".format(sklearn.metrics.accuracy_score(test_target, y_pred)))
    print("F1: {}".format(round(sklearn.metrics.f1_score(test_target, y_pred),3)))
    print("\n")
    print("Decision Tree")
    print("Accuracy: {}".format(sklearn.metrics.accuracy_score(test_target, y_pred2)))
    print("F1: {}".format(round(sklearn.metrics.f1_score(test_target, y_pred2),3)))
    print("\n")
    print("Logit (MaxEnt)")
    print("Accuracy: {}".format(sklearn.metrics.accuracy_score(test_target, y_pred3)))
    print("F1: {}".format(round(sklearn.metrics.f1_score(test_target, y_pred3),3)))
    

## Including stopwords, no *IDF

In [3]:
docs_train, docs_test, y_train, y_test= train_test("/Users/anthonyvicario/movie_reviews",None,False)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [4]:
results(docs_train, y_train, docs_test, y_test)

Naïve Bayes
Accuracy: 0.752
F1: 0.702


Decision Tree
Accuracy: 0.634
F1: 0.642


Logit (MaxEnt)
Accuracy: 0.82
F1: 0.818


## Removing stopwords, no *IDF

In [6]:
docs_train, docs_test, y_train, y_test= train_test("/Users/anthonyvicario/movie_reviews","english",False)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [7]:
results(docs_train, y_train, docs_test, y_test)

Naïve Bayes
Accuracy: 0.82
F1: 0.83


Decision Tree
Accuracy: 0.628
F1: 0.612


Logit (MaxEnt)
Accuracy: 0.834
F1: 0.831


## Including Stop words + *IDF

In [8]:
docs_train, docs_test, y_train, y_test= train_test("/Users/anthonyvicario/movie_reviews",None,True)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [9]:
results(docs_train, y_train, docs_test, y_test)

Naïve Bayes
Accuracy: 0.828
F1: 0.833


Decision Tree
Accuracy: 0.626
F1: 0.641


Logit (MaxEnt)
Accuracy: 0.842
F1: 0.842


## Removing Stop words + *IDF

In [11]:
docs_train, docs_test, y_train, y_test= train_test("/Users/anthonyvicario/movie_reviews","english",True)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [12]:
results(docs_train, y_train, docs_test, y_test)

Naïve Bayes
Accuracy: 0.854
F1: 0.857


Decision Tree
Accuracy: 0.608
F1: 0.627


Logit (MaxEnt)
Accuracy: 0.868
F1: 0.873
