## Import Libraries & Download Data

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import movie_reviews,stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
import string
from nltk import pos_tag
from nltk.stem import wordnet
import re
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.model_selection import cross_val_score
from nltk.corpus import wordnet
from sklearn.ensemble import RandomForestClassifier



In [2]:
nltk.download('movie_reviews')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\carte\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\carte\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\carte\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\carte\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Parse Data

In [3]:
documents = [(movie_reviews.raw(fileid), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

In [4]:
documents[1]

('the happy bastard\'s quick movie review \ndamn that y2k bug . \nit\'s got a head start in this movie starring jamie lee curtis and another baldwin brother ( william this time ) in a story regarding a crew of a tugboat that comes across a deserted russian tech ship that has a strangeness to it when they kick the power back on . \nlittle do they know the power within . . . \ngoing for the gore and bringing on a few action sequences here and there , virus still feels very empty , like a movie going for all flash and no substance . \nwe don\'t know why the crew was really out in the middle of nowhere , we don\'t know the origin of what took over the ship ( just that a big pink flashy thing hit the mir ) , and , of course , we don\'t know why donald sutherland is stumbling around drunkenly throughout . \nhere , it\'s just " hey , let\'s chase these people around with some robots " . \nthe acting is below average , even from the likes of curtis . \nyou\'re more likely to get a kick out of 

In [5]:
X, y = zip(*documents)

## Data Cleaning

In [6]:
documents = []
from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

for doc in range(0, len(X)):
    #basic cleaning
    document = re.sub(r'\W', ' ', str(X[doc]))
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    document = re.sub(r'^b\s+', '', document)
    document = document.lower()

    #lemmatization by splitting then joining
    document = document.split()
    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    
    documents.append(document)

In [7]:
documents[1]

'the happy bastard quick movie review damn that y2k bug it got head start in this movie starring jamie lee curtis and another baldwin brother william this time in story regarding crew of tugboat that come across deserted russian tech ship that ha strangeness to it when they kick the power back on little do they know the power within going for the gore and bringing on few action sequence here and there virus still feel very empty like movie going for all flash and no substance we don know why the crew wa really out in the middle of nowhere we don know the origin of what took over the ship just that big pink flashy thing hit the mir and of course we don know why donald sutherland is stumbling around drunkenly throughout here it just hey let chase these people around with some robot the acting is below average even from the like of curtis you re more likely to get kick out of her work in halloween h20 sutherland is wasted and baldwin well he acting like baldwin of course the real star her

## Feature Selection

Create features from text:

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidfconverter = TfidfVectorizer(max_features=3000, stop_words=stopwords.words('english'))
X = tfidfconverter.fit_transform(documents).toarray()

Split into training and testing data:

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

In [10]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

## Create Model

Define model function to allow for testing multiple models, while outputting evaluation results:

In [11]:
def train_model(classifier, X_train = X_train, y_train= y_train, X_test = X_test, y_test = y_test):
    # fit the training dataset on the classifier
    classifier.fit(X_train, y_train)
    # predict the labels on validation dataset
    y_pred = classifier.predict(X_test)
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print(accuracy_score(y_test, y_pred))

Create Naive Bayes Model:

In [12]:
train_model(naive_bayes.MultinomialNB(alpha=1))

[[83 16]
 [24 77]]
              precision    recall  f1-score   support

         neg       0.78      0.84      0.81        99
         pos       0.83      0.76      0.79       101

    accuracy                           0.80       200
   macro avg       0.80      0.80      0.80       200
weighted avg       0.80      0.80      0.80       200

0.8


Comparison Models:

In [13]:
train_model(linear_model.LogisticRegression())

[[77 22]
 [13 88]]
              precision    recall  f1-score   support

         neg       0.86      0.78      0.81        99
         pos       0.80      0.87      0.83       101

    accuracy                           0.82       200
   macro avg       0.83      0.82      0.82       200
weighted avg       0.83      0.82      0.82       200

0.825


In [14]:
train_model(svm.SVC())

[[78 21]
 [13 88]]
              precision    recall  f1-score   support

         neg       0.86      0.79      0.82        99
         pos       0.81      0.87      0.84       101

    accuracy                           0.83       200
   macro avg       0.83      0.83      0.83       200
weighted avg       0.83      0.83      0.83       200

0.83
