## Evaluating classification models

- accuracy
- precision
- recall
- f1 score


In [1]:
from sklearn import metrics
import numpy as np
import pandas as pd
from collections import Counter

In [8]:
actual_labels = ['spam', 'ham', 'spam', 'spam', 'spam',
                 'ham', 'ham', 'spam', 'ham', 'spam',
                 'spam', 'ham', 'ham', 'ham', 'spam',
                 'ham', 'ham', 'spam', 'spam', 'ham']

predicted_labels = ['spam', 'spam', 'spam', 'ham', 'spam',
                    'spam', 'ham', 'ham', 'spam', 'spam',
                    'ham', 'ham', 'spam', 'ham', 'ham',
                    'ham', 'spam', 'ham', 'spam', 'spam']

In [9]:
ac = Counter(actual_labels)
pc = Counter(predicted_labels)

In [10]:
ac.most_common()

[('spam', 10), ('ham', 10)]

In [11]:
pc.most_common()

[('spam', 11), ('ham', 9)]

In [12]:
cm = metrics.confusion_matrix(y_true=actual_labels,
                              y_pred=predicted_labels,
                              labels=['spam', 'ham'])

In [14]:
pd.DataFrame(data=cm, 
             columns=pd.MultiIndex(levels=[['Predicted:'],
                                           ['spam', 'ham']],
                                   codes=[[0, 0], [0, 1]]),
             index=pd.MultiIndex(levels=[['Actual:'],
                                         ['spam', 'ham']],
                                 codes=[[0, 0], [0, 1]]))

Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted:,Predicted:
Unnamed: 0_level_1,Unnamed: 1_level_1,spam,ham
Actual:,spam,5,5
Actual:,ham,6,4


In [19]:
positive_class = 'spam'
true_positive = 5.
false_positive = 6.
false_negative = 5.
true_negative = 4.

## Accuracy

The overall accuracy of proportion of correct predictions of the model, which can be depicted by the formula:
    
$Accuracy = \frac{TP + TN}{TP + FP + FN + TN}$

In [27]:
accuracy = np.round(metrics.accuracy_score(y_true=actual_labels,
                                           y_pred=predicted_labels), 2)
num = true_positive + true_negative
den = true_positive + true_negative + false_negative + false_positive
accuracy_manual = np.round(num/den, 2)
print(f'Accuracy: {accuracy}')
print(f'Manually computed accuracy: {accuracy_manual}')

Accuracy: 0.45
Manually computed accuracy: 0.45


## Precision

The number of predictions made that are actually correct or relevant out of all the predictions based on the positive class. This is also known as positive predictive value and can be depicted by the formula.

$Precision = \frac{TP}{TP + FP}$

In [28]:
precision = np.round(metrics.precision_score(y_true=actual_labels,
                                             y_pred=predicted_labels,
                                             pos_label=positive_class), 2)

num = true_positive
den = true_positive + false_positive
precision_manual = np.round(num/den, 2)
print(f'Precision: {precision}')
print(f'Precision Manual: {precision_manual}')

Precision: 0.45
Precision Manual: 0.45


## Recall

Recall is defined as the number of instances of positive class that were correctly predicted. This is also known as hit rate, coverage, sensitivity, and can be depicted by the formula:
    
$Recall = \frac{TP}{TP + FN}$

In [30]:
recall = np.round(metrics.recall_score(y_true=actual_labels,
                                       y_pred=predicted_labels,
                                       pos_label=positive_class), 2)
num = true_positive
den = true_positive + false_negative
recall_manual = np.round(num/den, 2)
print(f'Recall: {recall}')
print(f'Recall Manual: {recall_manual}')

Recall: 0.5
Recall Manual: 0.5


## F1 Score

Another accuracy measure that is computed by taking the harmonic mean of the precision and recall and can be represented as follows:
    
$F1 Score = \frac{2 x Precision x Recall}{Precision + Recall}$

In [31]:
f1_score = np.round(metrics.f1_score(y_true=actual_labels, 
                                     y_pred=predicted_labels,
                                     pos_label=positive_class), 2)
num = 2 * precision * recall 
den = precision + recall
f1_score_manual = np.round(num/den, 2)

print(f'F1 Score: {f1_score}')
print(f'Manually computed F1 Score: {f1_score_manual}')

F1 Score: 0.48
Manually computed F1 Score: 0.47


## Building a multi-class classification system

In [36]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

In [37]:
def get_data():
    data = fetch_20newsgroups(subset='all',
                              shuffle=True,
                              remove=('headers', 'footers', 'quotes'))
    return data

In [38]:
def prepare_datasets(corpus, labels, test_data_proportions=0.3):
    train_X, test_X, train_Y, test_Y = train_test_split(corpus, labels, test_size=0.33, random_state=42)
    return train_X, test_X, train_Y, test_Y

In [39]:
def remove_empty_docs(corpus, labels):
    filtered_corpus = []
    filtered_labels = []
    
    for doc, label in zip(corpus, labels):
        if doc.strip():
            filtered_corpus.append(doc)
            filtered_labels.append(label)

    return filtered_corpus, filtered_labels

In [40]:
# Get the data.
dataset = get_data()

# Print all the classes.
dataset.target_names

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [41]:
# Get corpus of documents and their corresponding labels.
corpus, labels = dataset.data, dataset.target
corpus, labels = remove_empty_docs(corpus, labels)

In [43]:
# See sample document and its label index, name.
print(f'Sample document: {corpus[10]}')
print(f'Class label: {labels[10]}')
print(f'Actual class label: {dataset.target_names[labels[10]]}')

Sample document: the blood of the lamb.

This will be a hard task, because most cultures used most animals
for blood sacrifices. It has to be something related to our current
post-modernism state. Hmm, what about used computers?

Cheers,
Kent
Class label: 19
Actual class label: talk.religion.misc


In [45]:
# Prepare and train datasets.
train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(corpus, labels, 0.3)

In [48]:
from module.normalization import normalize_corpus

norm_train_corpus = normalize_corpus(train_corpus)
norm_test_corpus = normalize_corpus(test_corpus)

In [49]:
from module.feature_extractors import bow_extractor, tfidf_extractor
from module.feature_extractors import averaged_word_vectorizer
from module.feature_extractors import tfidf_weighted_averaged_word_vectorizer

import nltk
import gensim

In [None]:
# Bag of words features.
bow_vectorizer, bow_train_features = bow_extractor(norm_train_corpus)
bow_test_features = bow_vectorizer.transform(norm_test_corpus)

In [None]:
# Tfidf features.
tfidf_vectorizer, tfidf_train_features = tfidf_extractor(norm_train_corpus)
tfidf_test_features = tfidf_vectorizer.transform(norm_test_corpus)

In [None]:
# Tokenized documents.
tokenized_train = [nltk.word_tokenize(text)
                   for text in norm_train_corpus]
tokenized_test = [nltk.word_tokenize(text)
                  for text in norm_test_corpus]

In [None]:
# Build word2vec model.
model = gensim.models.Word2Vec(tokenized_train,
                               size=500,
                               window=100,
                               min_count=30,
                               sample=1e-3)

# Averaged word vector features.
avg_wv_train_features = averaged_word_vectorizer(corpus=tokenized_train,
                                                 model=model,
                                                 num_features=500)
avg_wv_test_features = averaged_word_vectorizer(corpus=tokenized_test,
                                                model=model,
                                                num_features=500)

In [67]:
# Tfidf weighted average word vector features.
vocab = tfidf_vectorizer.vocabulary_
tfidf_wv_train_features = tfidf_weighted_averaged_word_vectorizer(corpus=tokenized_train,
                                                                  tfidf_vectors=tfidf_train_features,
                                                                  tfidf_vocabulary=vocab,
                                                                  model=model,
                                                                  num_features=500)
tfidf_wv_test_features = tfidf_weighted_averaged_word_vectorizerraged_word_vectorizereraged_word_vectorizererage_word_vectorizer(corpus=tokenized_test,
                                                                tfidf_vectors=tfidf_test_features,
                                                                tfidf_vocabulary=vocab,
                                                                model=model,
                                                                num_features=500)

NameError: name 'tfidf_weighted_averaged_word_vectorizerraged_word_vectorizereraged_word_vectorizererage_word_vectorizer' is not defined

In [54]:
from sklearn import metrics
import numpy as np

def get_metrics(true_labels, predicted_labels):
    print('Accuracy:', np.round(metrics.accuracy_score(true_labels, predicted_labels), 2))
    print('Precision:', np.round(metrics.precision_score(true_labels, predicted_labels, average='weighted'), 2))
    print('Recall:', np.round(metrics.recall_score(true_labels, predicted_labels, average='weighted'), 2))
    print('F1 Score:', np.round(metrics.f1_score(true_labels, predicted_labels, average='weighted'), 2))

In [55]:
def train_predict_evaluate_model(classifier, 
                                 train_features, train_labels,
                                 test_features, test_labels):
    # Build model.
    classifier.fit(train_features, train_labels)
    
    # Predict using model.
    predictions = classifier.predict(test_features)
    
    # Evaluate model prediction performance.
    get_metrics(true_labels=test_labels, 
                predicted_labels=predictions)
    return predictions

In [57]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

mnb = MultinomialNB()
svm = SGDClassifier(loss='hinge', max_iter=100)

In [58]:
# Multinomial Niave Bayes with bag of word features.
mnb_bow_predictions = train_predict_evaluate_model(classifier=mnb,
                                                   train_features=bow_train_features,
                                                   train_labels=train_labels,
                                                   test_features=bow_test_features,
                                                   test_labels=test_labels)

Accuracy: 0.67
Precision: 0.72
Recall: 0.67
F1 Score: 0.65


In [60]:
# Support Vector Machines with bag of word features.
svm_bow_predictions = train_predict_evaluate_model(classifier=svm, 
                                                   train_features=bow_train_features,
                                                   train_labels=train_labels,
                                                   test_features=bow_test_features,
                                                   test_labels=test_labels)

Accuracy: 0.63
Precision: 0.68
Recall: 0.63
F1 Score: 0.64


In [62]:
# Multinomial Naive Bayes with tfidf features.
mnb_tfidf_predictions = train_predict_evaluate_model(classifier=mnb,
                                                     train_features=tfidf_train_features,
                                                     train_labels=train_labels,
                                                     test_features=tfidf_test_features,
                                                     test_labels=test_labels)

Accuracy: 0.72
Precision: 0.78
Recall: 0.72
F1 Score: 0.7


In [64]:
# Support Vector Machines with tfidf features.
svm_tfidf_predictions = train_predict_evaluate_model(classifier=svm,
                                                     train_features=tfidf_train_features,
                                                     train_labels=train_labels,
                                                     test_features=tfidf_test_features,
                                                     test_labels=test_labels)

Accuracy: 0.77
Precision: 0.77
Recall: 0.77
F1 Score: 0.76


In [65]:
# Support Vector Machines with averaged word vector features.
svm_avgwv_predictions = train_predict_evaluate_model(classifier=svm,
                                                     train_features=avg_wv_train_features,
                                                     train_labels=train_labels,
                                                     test_features=avg_wv_test_features,
                                                     test_labels=test_labels)

Accuracy: 0.53
Precision: 0.55
Recall: 0.53
F1 Score: 0.51


In [None]:
# Support Vector Machines with tfidf averaged word vector features.
svm_avgwv_predictions = train_predict_evaluate_model(classifier=svm,
                                                     train_features=tfidf_wv_train_features,
                                                     train_labels=train_labels,
                                                     test_features=tfidf_wv_test_features,
                                                     test_labels=test_labels)

In [68]:
import pandas as pd
cm = metrics.confusion_matrix(test_labels, svm_tfidf_predictions)
pd.DataFrame(cm, index=range(0, 20), columns=range(0, 20))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,157,3,0,1,1,0,2,3,4,1,5,4,1,4,5,35,3,7,7,20
1,1,225,9,7,8,14,8,1,2,1,0,2,5,2,6,1,3,0,2,0
2,1,21,220,18,10,18,7,1,0,0,0,2,7,1,2,2,1,1,2,0
3,1,12,25,220,11,4,8,3,1,1,1,2,6,2,1,0,1,1,0,0
4,1,4,6,14,229,6,4,2,3,1,0,3,9,3,4,1,1,0,1,0
5,0,24,18,1,1,269,0,1,1,0,0,0,4,3,1,1,0,1,0,0
6,0,2,7,11,12,2,268,11,4,2,1,1,10,1,3,0,2,1,1,0
7,1,5,2,2,1,3,4,247,20,1,2,2,10,4,2,0,3,3,4,0
8,4,1,0,4,2,2,4,25,255,4,5,2,1,4,1,3,1,1,3,0
9,1,1,1,0,2,2,5,3,6,279,11,2,1,1,2,4,2,0,1,1


In [70]:
class_names = dataset.target_names
print(class_names[0], '->', class_names[19])
print(class_names[18], '->', class_names[16])
print(class_names[19], '->', class_names[15])

alt.atheism -> talk.religion.misc
talk.politics.misc -> talk.politics.guns
talk.religion.misc -> soc.religion.christian


In [72]:
import re
num = 0
for document, label, predicted_label in zip(test_corpus, test_labels, svm_tfidf_predictions):
    if label == 0 and predicted_label == 19:
        print('Actual label:', class_names[label])
        print('Predicted label:', class_names[predicted_label])
        print('Document:-')
        print(re.sub('\n', ' ', document))
        print()
        num += 1
        if num == 4:
            break

Actual label: alt.atheism
Predicted label: talk.religion.misc
Document:-
 Yup, I think you're right.  My mistake.  Now, how do I make an "R" backwards using a computer keyboard?  I'll bet the gods know how (this is alt.atheism, after all).  Tell you what, if all my "R"s start coming out backwards when I type from now on, I'll become a believer.  (And that's not asking for miracles.  If I asked for a miracle, I'd ask for a real miracle, like for Pat Buchanan to become an out-of-the-closet drag queen - well...maybe that wouldn't be so miraculous, but I think he'd look fabulous in a feather boa and a sequined hat like Mia Farrow wore in Gatsby.)

Actual label: alt.atheism
Predicted label: talk.religion.misc
Document:-
 Hehehe, so you say, but this objective morality somehere tells you  that this is not the case, and you don't know all the rules of such transcendental game systems...  Cheers, Kent

Actual label: alt.atheism
Predicted label: talk.religion.misc
Document:-
-*----   I believe 