# Document Classification model training
- We are creating a small REST service that can make class predictions on provided documents.
- The service is hosted in Flask elsewhere, using a model we will train here.
- The data provided by Trellis is a collection of about 1000 sample documents with classes already provided.
- There is also a small number of documents outside the set of classes, so a response of "other" may be expected.

In [1]:
import os
import sys
import json
import re
import pandas as pd
import numpy as np
import spacy

print(sys.version)
print(pd.__version__)
print(np.__version__)

3.9.19 (main, Mar 21 2024, 12:08:14) 
[Clang 14.0.6 ]
1.4.3
1.21.5


In [2]:
# Load dataset provided by Trellis
# Structure as given: subdirectories of text files.  Each subdirectory name also serves as a class label
# There is a small collection of documents with the class 'other': the model should also make a separate prediction
# if a document doesn't readily fit into any of the given labels.  It's not part of the training data, just a very small
# sample for testing purposes, and obviously not representative of every other document out there.

data = []   # array of (text, label) tuples

root_path = "Data"
labels = os.listdir(root_path)
for label in labels:
    subdir_path = os.path.join(root_path, label)
    files = os.listdir(subdir_path)
    for f in files:
        full_path = os.path.join(subdir_path, f)
        with open( full_path, 'r', encoding='utf-8' ) as f_in:
            text = f_in.read()
        row = ( text, label )
        data.append(row)

# convert data to dataframe
df = pd.DataFrame(data, columns=['text', 'label'])
df.head()

Unnamed: 0,text,label
0,Games firms 'face tough future'\n\nUK video ga...,technologie
1,California sets fines for spyware\n\nThe maker...,technologie
2,T-Mobile bets on 'pocket office'\n\nT-Mobile h...,technologie
3,OnePlus 8 full specs comparison chart: 8 vs. 8...,technologie
4,'Friends fear' with lost mobiles\n\nPeople are...,technologie


In [3]:
# prep for NLP work
nlp = spacy.load('en_core_web_sm')

In [4]:
def clean_text(doc:str) -> str:
    """ Prep text for feature building
        remove numbers, punctuation
        remove stop words
        lemmatization
    """
    doc = re.sub(r'\d+', '', doc)  # remove numbers
    doc = re.sub(r'[^\s\w]', '', doc)   # keep only non-punctuation, whitespace
    doc = doc.replace('\n', '')  # some newlines accidentally escaped as a literal '\n'
    
    # let spaCy work on stop words, lemmatization
    doc_spacy = nlp(doc)

    non_stop = [token.lemma_ for token in doc_spacy if not token.is_stop]
    doc = ' '.join(non_stop)
    return doc

In [5]:
# Clean up text into a simple collection of words
df['words'] = df['text'].apply(clean_text)
df = df.drop(['text'], axis=1)
df.head()

Unnamed: 0,label,words
0,technologie,game firm face tough futureUK video game firm ...
1,technologie,California set fine spywarethe maker computer ...
2,technologie,tmobile bet pocket officetmobile launch late p...
3,technologie,oneplus spec comparison chart vs Pro vs ...
4,technologie,friend fear lose mobilespeople dependent mobil...


In [6]:
# prep for model building
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [7]:
# Build the training set.  Per instructions, documents marked 'other' are excluded for training,
# but we'll include them all in the test set
df_train, df_test = train_test_split(df[df['label'] != 'other'], test_size=0.2, random_state=20050723+20080710)

df_test = df_test.append( df[df['label'] == 'other'] )

X_train, y_train = df_train['words'], df_train['label']
X_test, y_test = df_test['words'], df_test['label']

df_other = df[df['label'] == 'other']
X_other, y_other = df_other['words'], df_other['label']

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(800,) (800,) (206,) (206,)


  df_test = df_test.append( df[df['label'] == 'other'] )


In [8]:
# We will use a Naive Bayes classifier to make predictions.  The inputs to the model itself will
# come from applying TF-IDF on the data: identifying how relevant specific words are to the various
# document classes.  
# As part of cleaning up the input text we explicitly removed stop words.  TF-IDF would de-emphasize
# these words for the training data automatically, but removing them also creates a minor speed boost,
# perhaps important if we later trained with more data.

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)

In [9]:
# first show performance without considering 'other' class
# We'll get warnings about not being able to compute F-1 score etc, since we're providing
# examples with the 'other' class but not having trained with it.
X_test_tfidf = tfidf_vectorizer.transform(X_test)

y_pred = nb_classifier.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9368932038834952
               precision    recall  f1-score   support

     business       1.00      0.91      0.95        22
entertainment       0.96      0.96      0.96        24
         food       1.00      1.00      1.00        14
     graphics       0.96      1.00      0.98        24
   historical       0.90      1.00      0.95        19
      medical       0.95      1.00      0.97        18
        other       0.00      0.00      0.00         6
     politics       0.71      1.00      0.83        15
        space       1.00      0.86      0.93        22
        sport       0.95      1.00      0.97        19
  technologie       0.96      0.96      0.96        23

     accuracy                           0.94       206
    macro avg       0.85      0.88      0.86       206
 weighted avg       0.92      0.94      0.92       206

[[20  0  0  0  0  0  0  2  0  0  0]
 [ 0 23  0  0  0  0  0  0  0  0  1]
 [ 0  0 14  0  0  0  0  0  0  0  0]
 [ 0  0  0 24  0  0  0  0  0  0  0]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
# This model needs to account for when it can't confidently assign a class to a given document, so we can't
# just call MultinomialNB.predict(): it will always make a class prediction.  The softmax inside the prediction
# step will sum everything to 1, so there'll always be a maximum value for a specific class, but we assume that
# if it's a confident prediction the other probabilities will be *much* smaller.  Conversely, we believe that if
# it's not a confident prediction then all probabilities will be fairly close to each other: it's not strongly
# opinionated about any class.

# So, let's quantify a threshold for that.  For some of these 'other' samples, let's find the maximum delta between the
# maximum class probability and the average of all class probabilities.  The threshold for making a class prediction
# would be somewhere above that.

# In a more robust environment, It'd be nice to have access to more examples of unrecognized documents that we'd expect to see.

X_other_tfidf = tfidf_vectorizer.transform(X_other)
y_pred_proba = nb_classifier.predict_proba(X_other_tfidf)

max_mean_probas = []
for probas in y_pred_proba:
    max_mean_probas.append((np.max(probas), np.mean(probas)))

other_threshold = np.max(list(map(lambda x: x[0]-x[1], max_mean_probas)))
print(f"Maximum difference between max probability and mean of all: {other_threshold}")

Maximum difference between max probability and mean of all: 0.08609888182706918


In [11]:
# However, when running this model, we do manage to identify all the uncategorizable 'other'
# documents, but at a penalty: a handful of classes (entertainment, graphics, historical, space)
# have nontrivial drops in recall, as several of those documents get misclassified as 'other'.
# In the end, overall accuracy drops almost 2%, with perfect recall for the 'other' class, but much
# worse precision.

X_test_tfidf = tfidf_vectorizer.transform(X_test)
y_pred_proba = nb_classifier.predict_proba(X_test_tfidf)
y_pred = []

for probas in y_pred_proba:
    if np.max(probas) - np.mean(probas) > other_threshold:
        max_label = max(zip(probas, nb_classifier.classes_))[1]
        y_pred.append(max_label)
    else:
        y_pred.append('other')

accuracy_other = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy_other}')
print(f'Difference from previous model: {accuracy_other - accuracy:0.04f}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9174757281553398
Difference from previous model: -0.0194
               precision    recall  f1-score   support

     business       1.00      0.91      0.95        22
entertainment       1.00      0.92      0.96        24
         food       1.00      1.00      1.00        14
     graphics       1.00      0.75      0.86        24
   historical       1.00      0.95      0.97        19
      medical       1.00      1.00      1.00        18
        other       0.32      1.00      0.48         6
     politics       0.83      1.00      0.91        15
        space       1.00      0.77      0.87        22
        sport       1.00      1.00      1.00        19
  technologie       0.96      0.96      0.96        23

     accuracy                           0.92       206
    macro avg       0.92      0.93      0.91       206
 weighted avg       0.96      0.92      0.93       206

[[20  0  0  0  0  0  0  2  0  0  0]
 [ 0 22  0  0  0  0  1  0  0  0  1]
 [ 0  0 14  0  0  0  0  0  0  0

In [12]:
# In advance of training a final model on the entire dataset, again excluding the 'other' documents,
# let's explore how well the dataset generalizes using cross-validation.  The standard deviation
# between the various folds is less than 0.02, which is pretty stable.

df_final = df[df['label'] != 'other']
X, y = df_final['words'], df_final['label']
X_tfidf = tfidf_vectorizer.transform(X)

nb_classifier = MultinomialNB()
cv_scores = cross_val_score(nb_classifier, X_tfidf, y, cv=5, scoring='accuracy')

# Print the cross-validation scores
print(f'Cross-Validation Accuracy Scores: {cv_scores}')
print(f'Mean CV Accuracy: {cv_scores.mean()}')
print(f'Standard Deviation of CV Accuracy: {cv_scores.std()}')



Cross-Validation Accuracy Scores: [0.935 0.955 0.93  0.98  0.97 ]
Mean CV Accuracy: 0.9540000000000001
Standard Deviation of CV Accuracy: 0.019339079605813683


In [13]:
# train a final model on the full dataset
# Future steps we'd want to explore:
# - Training on a larger dataset: 1000 examples across ten classes is a small training set for this kind of problem
# - Using a larger example set of 'other' documents to explore other ways to identify entities entirely out of the training set distribution
# - Testing with not just a train/test set, but a third validation set: after any other hyperparameters are adjusted (for example,
#   the threshold for 'other' class predictions), train a final model on all train/test, and check the score on validation.

nb_classifier = MultinomialNB()
nb_classifier.fit(X_tfidf, y)

In [14]:
# export trained model for use by the service
# We are going to use the other_threshold we computed before, however this value has been computed for the
# training set, 80% of this full set.  In practice we would want to either:
# - Keep this threshold associated with the model trained only on the training set, not this full dataset
# - Perform holdout studies to see how much this threshold changes on various training set sizes, if at all.
#   It would be nice to see if this threshold doesn't vary much as training data sizes grow.

import joblib
out_dir = "output"
model_file = "tfidf_model.pkl"
joblib.dump(nb_classifier, os.path.join(out_dir, model_file))

vectorizer_file = "tfidf_vectorizer.pkl"
joblib.dump(tfidf_vectorizer, os.path.join(out_dir, vectorizer_file))

configuration = {
    "model_file": model_file,
    "vectorizer_file": vectorizer_file,
    "other_threshold": other_threshold
}

config_file = "model_config.json"
with open(os.path.join(out_dir, config_file), 'w', encoding='utf-8') as f:
    print(json.dumps(configuration, indent=4), file=f)
    