In [1]:
import os
import re
import math
import numpy as np

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import PlaintextCorpusReader

import warnings
warnings.filterwarnings("ignore")

#### Define functions

In [2]:
def is_bern(doc_name: str) -> str:
    """Return the review author inferred from a document's file name.

    A name made of exactly four digits followed by ``.txt`` (for example
    ``1234.txt``) is attributed to 'Berardinelli'; any other name is
    attributed to 'Schwartz'.

    Args:
        doc_name: File name (file id) of the document.

    Returns:
        'Berardinelli' or 'Schwartz'.
    """
    # Raw string literal: in a normal string '\d' and '\.' are invalid
    # escape sequences and trigger a warning on current Python versions.
    match = re.search(r'^\d{4}\.txt$', doc_name)
    return 'Berardinelli' if match is not None else 'Schwartz'

#### Define system constants

In [3]:
# working directory
# os.path.expanduser('~') resolves the home directory even when the HOME
# environment variable is not set, a case where os.environ['HOME'] raises
# KeyError.
pwd: str = os.path.join(os.path.expanduser('~'), 'work', 'assignment', 'assignment-7')
# directory that holds the corpus files
corpus_root: str = os.path.join(pwd, 'MovieReviews')

#### Read all the files

In [4]:
# Corpus reader over corpus_root; the pattern '.*' accepts every file id,
# and the files are decoded as latin-1.
files: PlaintextCorpusReader = PlaintextCorpusReader(corpus_root, '.*', encoding='latin-1')
doc_ids = files.fileids()
print('Count of files:', len(doc_ids))

# Raw text of each document, in file-id order.
file_contents: list = list(map(files.raw, doc_ids))
print('File contents:', len(file_contents))

# Author label of each document, derived from its file id.
authors: list = list(map(is_bern, doc_ids))
print('Count of Author names:', len(authors))

Count of files: 180
File contents: 180
Count of Author names: 180


#### Vectorize the corpus
- This step vectorizes the text corpus. After this code runs, the X object will be the input matrix for the machine learning models.
- When transforming documents into vectors, we do NOT use the raw count of a word in a document. Instead, we use the word's tf-idf score in that document.
- max_df=0.5 means ignoring words that appear in more than 50% of the documents; min_df=2 means ignoring words that appear in fewer than 2 documents.

In [5]:
# TF-IDF weighting with English stop words removed; max_df / min_df bound
# the document frequency of the terms that are kept.
vectorizer = TfidfVectorizer(
    max_df=0.5,
    min_df=2,
    stop_words='english',
    use_idf=True,
)

# Learn the vocabulary and build the document-term matrix in one pass.
X = vectorizer.fit_transform(file_contents)
print('Original dimension (before dim reduction):', X.shape)

Original dimension (before dim reduction): (180, 6704)


#### Reduce the dimensionality
- Use SVD (Singular Value Decomposition), a common matrix decomposition technique, to reduce the dimensionality.
- We have to re-normalize after we run our SVD on the dataset.

In [6]:
# Number of dimensions kept after the reduction.
n_components = 3
# Without random_state the decomposition can differ between runs; pinning it
# makes the reduced matrix identical on every Restart & Run All.
svd = TruncatedSVD(n_components=n_components, random_state=42)
# copy=False lets the normalizer work in place on the SVD output.
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

# NOTE(review): the projection is fitted on all documents, including the rows
# that are later held out for testing; consider fitting on training rows only.
X_reduced = lsa.fit_transform(X)
print('Reduced dimension (after dim reduction):', X_reduced.shape)

Reduced dimension (after dim reduction): (180, 3)


#### Select training data set (random 80% data)

In [7]:
# Number of rows used for training: 80% of the data, rounded up.
training_size: int = math.ceil(len(X_reduced) * 0.8)

# A seeded generator draws the same training rows on every run, so the
# split and every metric computed from it are reproducible.
split_rng = np.random.default_rng(42)
# replace=False: each row index is drawn at most once.
training_idx = split_rng.choice(X_reduced.shape[0], size=training_size, replace=False)
X_training = X_reduced[training_idx, :]
print('Training data set:', X_training.shape)

Training data set: (144, 3)


#### Select test data set (remaining 20% data)

In [8]:
# Every row index that was not drawn for training belongs to the test set.
# sorted() pins the row order, which would otherwise depend on set iteration
# order (an implementation detail).
test_idx = sorted(set(range(X_reduced.shape[0])) - set(training_idx))
X_test = X_reduced[test_idx, :]
print('Test data set:', X_test.shape)

Test data set: (36, 3)


#### Split the label into training & test set

In [9]:
def _labels_at(indices) -> list:
    """Return the author label for each row index, in the order given."""
    return [authors[pos] for pos in indices]


# Labels follow the same row order as X_training / X_test.
labels_training = _labels_at(training_idx)
labels_test = _labels_at(test_idx)
print('Label test data set:', labels_test)

Label test data set: ['Schwartz', 'Berardinelli', 'Berardinelli', 'Berardinelli', 'Berardinelli', 'Schwartz', 'Schwartz', 'Schwartz', 'Berardinelli', 'Schwartz', 'Berardinelli', 'Berardinelli', 'Berardinelli', 'Berardinelli', 'Berardinelli', 'Schwartz', 'Berardinelli', 'Schwartz', 'Berardinelli', 'Schwartz', 'Schwartz', 'Schwartz', 'Schwartz', 'Schwartz', 'Schwartz', 'Schwartz', 'Schwartz', 'Schwartz', 'Schwartz', 'Schwartz', 'Berardinelli', 'Berardinelli', 'Berardinelli', 'Berardinelli', 'Berardinelli', 'Berardinelli']


#### Use Naive Bayes Classifier
- Create the classifier
- Fit the classifier with training data & labels
- Predict the labels for the test data

In [10]:
# fit() returns the fitted estimator, so creation and training are chained.
gnb = GaussianNB().fit(X_training, labels_training)
# Bare last expression: the notebook displays the predicted test labels.
gnb.predict(X_test)

array(['Schwartz', 'Berardinelli', 'Berardinelli', 'Berardinelli',
       'Berardinelli', 'Schwartz', 'Schwartz', 'Schwartz', 'Berardinelli',
       'Schwartz', 'Berardinelli', 'Berardinelli', 'Berardinelli',
       'Berardinelli', 'Berardinelli', 'Schwartz', 'Berardinelli',
       'Schwartz', 'Berardinelli', 'Schwartz', 'Schwartz', 'Schwartz',
       'Schwartz', 'Schwartz', 'Schwartz', 'Schwartz', 'Schwartz',
       'Schwartz', 'Schwartz', 'Schwartz', 'Berardinelli', 'Berardinelli',
       'Berardinelli', 'Berardinelli', 'Berardinelli', 'Berardinelli'],
      dtype='<U12')

#### Prediction metrics
- Calculate Confusion Matrix
- Calculate Accuracy Score

In [11]:
# Predict once and reuse the result for both metrics.
predicted_labels = gnb.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). With the arguments swapped the
# confusion matrix is transposed: rows must be the true labels and columns
# the predicted labels.
print('Confusion Matrix:', confusion_matrix(labels_test, predicted_labels))

print('Accuracy:', accuracy_score(labels_test, predicted_labels))


Confusion Matrix: [[18  0]
 [ 0 18]]
Accuracy: 1.0
