# Bag of Words Baseline Classifier

To create a simple baseline for measurement of the neural accuracy, I am using a basic bag-of-words classification based on `classifier.py` in the tutotrial found at https://github.com/CharlesRajendran/TextClassification.

The approach uses lower case words only to match the uncased versions we are using, removes stop words and converts words to base lemmas.

In [1]:
## Usual Imports
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize as wt 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

import sys
sys.path.insert(0, '../python')
import debug
from jbyrne_utils import load_data

In [2]:
dataset = load_data("../data/3xNCS.json")

nltk.download('punkt')
nltk.download('stopwords')

Loaded 11056 data records.


[nltk_data] Downloading package punkt to /home/james/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/james/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
dataset


array([{'sentence_id': 8119, 'label': 0, 'text': 'Therefore, I think the question before the American people is: Are we doing as much as we can do?'},
       {'sentence_id': 3595, 'label': 0, 'text': "The worst thing we could do in this economic climate is to raise people's taxes."},
       {'sentence_id': 23764, 'label': 0, 'text': 'And I ask you for the privilege of leading our nation to be stronger at home and respected again in the world.'},
       ...,
       {'sentence_id': 21456, 'label': 0, 'text': 'I think that has to be a part of a candidate for president or being president.'},
       {'sentence_id': 13844, 'label': 1, 'text': 'When I was asked by our government to do the POW project, within a year the Vietnamese had sent people into Canada to make arrangements to have me and my family killed.'},
       {'sentence_id': 27495, 'label': 0, 'text': 'But over time, what they can do is regain credibility.'}],
      dtype=object)

In [4]:
stemmer = PorterStemmer()

data = []


for i in range(len(dataset)):
    text = dataset[i]["text"]

    # remove non alphabatic characters
    text = re.sub('[^A-Za-z]', ' ', text)

    # make words lowercase, because Go and go will be considered as two words
    text = text.lower()

    # tokenising
    tokenized_text = wt(text)

    # remove stop words and stemming
 
    text_processed = []
    for word in tokenized_text:
        if word not in set(stopwords.words('english')):
            text_processed.append(stemmer.stem(word))

    new_text = " ".join(text_processed)
    data.append(new_text)

In [5]:
# creating the feature matrix 

matrix = CountVectorizer(max_features=1000)
X = matrix.fit_transform(data).toarray()
y = [ i["label"] for i in dataset]

# split train and test data
train_len = int(0.8 * len(dataset))
val_len = int(0.2 * len(dataset))

X_train = X[:train_len]
X_test  = X[train_len:]
y_train = y[:train_len]
y_test  = y[train_len:]

# Naive Bayes 
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# predict class
y_pred = classifier.predict(X_test)

# Confusion matrix
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

accuracy = accuracy_score(y_test, y_pred)

In [6]:
accuracy

0.6980108499095841