In [1]:
import numpy as np
import pandas as pd
import sklearn
import nltk
from nltk.corpus import movie_reviews
from sklearn.datasets import load_files
from sklearn import tree
import random
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from nltk.classify import MaxentClassifier

In [23]:
def train_test(directory):
    movie_train = load_files(directory, shuffle=True)
    # No IDF
    movie_vec = CountVectorizer(binary=True,tokenizer=nltk.word_tokenize)     
    movie_counts = movie_vec.fit_transform(movie_train.data[:2000])

    tf_transformer = TfidfTransformer(use_idf=False)
    movie_tf = tf_transformer.fit_transform(movie_counts)

    docs_trn, docs_tst, y_trn, y_tst = train_test_split(movie_tf, movie_train.target)
    return docs_trn, docs_tst, y_trn, y_tst

# Stop word removed
def train_test_2(directory):
    movie_train = load_files(directory, shuffle=True)
    # No IDF
    movie_vec = CountVectorizer(binary=True,tokenizer=nltk.word_tokenize,stop_words="english")     
    movie_counts = movie_vec.fit_transform(movie_train.data[:2000])

    tf_transformer = TfidfTransformer(use_idf=False)
    movie_tf = tf_transformer.fit_transform(movie_counts)

    docs_trn, docs_tst, y_trn, y_tst = train_test_split(movie_tf, movie_train.target)
    return docs_trn, docs_tst, y_trn, y_tst
    
    
# Including Stop words + *IDF
def train_test_3(directory):
    movie_train = load_files(directory, shuffle=True)
    # No IDF
    movie_vec = CountVectorizer(binary=True,tokenizer=nltk.word_tokenize)     
    movie_counts = movie_vec.fit_transform(movie_train.data[:2000])

    tf_transformer = TfidfTransformer(use_idf=True)
    movie_tf = tf_transformer.fit_transform(movie_counts)

    docs_trn, docs_tst, y_trn, y_tst = train_test_split(movie_tf, movie_train.target)
    return docs_trn, docs_tst, y_trn, y_tst
    
# removing Stop words + *IDF
def train_test_4(directory):
    movie_train = load_files(directory, shuffle=True)
    # No IDF
    movie_vec = CountVectorizer(binary=True,tokenizer=nltk.word_tokenize,stop_words="english")     
    movie_counts = movie_vec.fit_transform(movie_train.data[:2000])

    tf_transformer = TfidfTransformer(use_idf=True)
    movie_tf = tf_transformer.fit_transform(movie_counts)

    docs_trn, docs_tst, y_trn, y_tst = train_test_split(movie_tf, movie_train.target)
    return docs_trn, docs_tst, y_trn, y_tst
    
def results(train_doc,train_target,test_doc,test_target):
    # naive bayes
    clfb = MultinomialNB().fit(train_doc, train_target)
    y_pred = clfb.predict(test_doc)
    # Decision Tree
    clf2 = tree.DecisionTreeClassifier().fit(docs_train, y_train)
    y_pred2 = clf2.predict(docs_test)
    # Max Entropy
    clf3=LogisticRegression().fit(docs_train, y_train)
    y_pred3 = clf3.predict(docs_test)

    print("Naïve Bayes")
    print("Accuracy: {}".format(sklearn.metrics.accuracy_score(test_target, y_pred)))
    print("F1: {}".format(round(sklearn.metrics.f1_score(test_target, y_pred),3)))
    print("\n")
    print("Decision Tree")
    print("Accuracy: {}".format(sklearn.metrics.accuracy_score(test_target, y_pred2)))
    print("F1: {}".format(round(sklearn.metrics.f1_score(test_target, y_pred2),3)))
    print("\n")
    print("Logit (MaxEnt)")
    print("Accuracy: {}".format(sklearn.metrics.accuracy_score(test_target, y_pred3)))
    print("F1: {}".format(round(sklearn.metrics.f1_score(test_target, y_pred3),3)))
    

## Including stopwords, no *IDF

In [17]:
docs_train, docs_test, y_train, y_test= train_test("/Users/anthonyvicario/movie_reviews")

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [18]:
results(docs_train, y_train, docs_test, y_test)

Naïve Bayes
Accuracy: 0.83
F1: 0.832


Decision Tree
Accuracy: 0.632
F1: 0.621


Logit (MaxEnt)
Accuracy: 0.808
F1: 0.803


## Removing stopwords, no *IDF

In [19]:
docs_train, docs_test, y_train, y_test= train_test_2("/Users/anthonyvicario/movie_reviews")

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [20]:
results(docs_train, y_train, docs_test, y_test)

Naïve Bayes
Accuracy: 0.854
F1: 0.858


Decision Tree
Accuracy: 0.604
F1: 0.594


Logit (MaxEnt)
Accuracy: 0.85
F1: 0.851


## Including Stop words + *IDF

In [21]:
docs_train, docs_test, y_train, y_test= train_test_3("/Users/anthonyvicario/movie_reviews")

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [22]:
results(docs_train, y_train, docs_test, y_test)

Naïve Bayes
Accuracy: 0.806
F1: 0.791


Decision Tree
Accuracy: 0.596
F1: 0.604


Logit (MaxEnt)
Accuracy: 0.862
F1: 0.868


## Removing Stop words + *IDF

In [24]:
docs_train, docs_test, y_train, y_test= train_test_3("/Users/anthonyvicario/movie_reviews")

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [25]:
results(docs_train, y_train, docs_test, y_test)

Naïve Bayes
Accuracy: 0.8
F1: 0.815


Decision Tree
Accuracy: 0.596
F1: 0.591


Logit (MaxEnt)
Accuracy: 0.872
F1: 0.866
