# Text Classification - Model Training

**Author:** Airc Miao
**Date:** 2024-02-17

In [1]:
#!pip install pyarrow
#!python -m spacy download en_core_web_trf

In [None]:
import pandas as pd
import re
import tensorflow as tf
from spacy.util import minibatch, compounding
import spacy
from spacy.matcher import Matcher
import multiprocessing
from pandarallel import pandarallel

from sklearn.model_selection import train_test_split

num_processors = multiprocessing.cpu_count()
print(f'Available CPUs: {num_processors}')

pandarallel.initialize(nb_workers=num_processors-1, use_memory_fs=False)

seed_value = 42

## 1. Data Cleaning

In [3]:
# Load csv data
df = pd.read_csv('data/df_new.csv')
df.head()

Unnamed: 0,complaint_id,date_received,district_occurrence,general_cap_classification,summary,incident_date_extract,incident_time_extract,objectid,label
0,22-0529,2022-11-09 05:00:00+00,1200.0,CIVIL RIGHTS COMPLAINT,"According to the complainant, on 11-6-22 at 8:...",11/06/2022,8:40 PM,7015,low
1,22-0530,2022-11-10 05:00:00+00,2400.0,UNPROFESSIONAL CONDUCT,"According to the complainant, on 11-6-22 at 8:...",11/06/2022,8:30 PM,7016,medium
2,22-0531,2022-11-10 05:00:00+00,3900.0,DEPARTMENTAL VIOLATIONS,"According to the complainant, on 11-2-22 at 9:...",11/02/2022,9:57 PM,7017,medium
3,22-0532,2022-11-09 05:00:00+00,2200.0,HARASSMENT,"According to the complainant, on 10-21-22, the...",10/21/2022,,7018,high
4,22-0533,2022-11-10 05:00:00+00,2200.0,LACK OF SERVICE,"According to the complainant, on 10-30-22, whi...",10/30/2022,,7019,low


Only keep the columns we need: general_cap_classification, summary, label

In [4]:
df_cleaned = df[['general_cap_classification', 'summary', 'label']]
df_cleaned.head()

Unnamed: 0,general_cap_classification,summary,label
0,CIVIL RIGHTS COMPLAINT,"According to the complainant, on 11-6-22 at 8:...",low
1,UNPROFESSIONAL CONDUCT,"According to the complainant, on 11-6-22 at 8:...",medium
2,DEPARTMENTAL VIOLATIONS,"According to the complainant, on 11-2-22 at 9:...",medium
3,HARASSMENT,"According to the complainant, on 10-21-22, the...",high
4,LACK OF SERVICE,"According to the complainant, on 10-30-22, whi...",low


In [5]:
df_cleaned = df_cleaned.map(lambda x: x.strip() if isinstance(x, str) else x)

In [6]:
df_cleaned[['general_cap_classification']].value_counts()

general_cap_classification
DEPARTMENTAL VIOLATIONS       1215
LACK OF SERVICE               1063
PHYSICAL ABUSE                 654
VERBAL ABUSE                   375
UNPROFESSIONAL CONDUCT         366
HARASSMENT                     182
CIVIL RIGHTS COMPLAINT         166
CRIMINAL ALLEGATION            161
NON-INVESTIGATORY INCIDENT     110
DOMESTIC                        52
FALSIFICATION                   29
SEXUAL CRIME/MISCONDUCT         19
DRUGS                            4
FORCE-TASER/CED/ECW              1
OTHER MISCONDUCT                 1
USE OF FORCE INTERNAL            1
Name: count, dtype: int64

We can see this column data is not balanced. Some of them only have 1 record.

In [7]:
df_cleaned[['label']].value_counts()

label 
medium    1859
low       1441
high      1159
Name: count, dtype: int64

In [8]:
df_cleaned.isnull().sum()

general_cap_classification     60
summary                       195
label                           0
dtype: int64

Drop any row with empty value

In [9]:
df_cleaned = df_cleaned.dropna()
df_cleaned.isnull().sum()

general_cap_classification    0
summary                       0
label                         0
dtype: int64

In [10]:
# Combine the general_cap_classification and summary into one column
df_cleaned['text'] = df_cleaned['general_cap_classification'] + '. ' + df_cleaned['summary']

We will clean the following words from the summary column:
- "According to the complainant,"
- Dates
- Time
- Locations

In [11]:
import nltk
from nltk.corpus import stopwords

def cleanAndTokenizeText(txt):
    # Remove "According to the complainant,"
    txt = txt.replace('According to the complainant,', '')

    # Remove dates
    date_pattern= r'(?:\,\s)*(?:on\s)*\d+-\d+-\d+[\s]*[\,]*'
    txt = re.sub(date_pattern, '', txt, flags=re.IGNORECASE)

    # Remove time
    time_pattern= r'(at)?\s?\d+:\d+\s?(AM|PM)?\,?\s?'
    txt = re.sub(time_pattern, '', txt, flags=re.IGNORECASE)

    # Remove locations of patter of (the\s)?\d+\w+\s?District
    location_pattern= r'(the\s)?\d+\w+\s?District'
    txt = re.sub(location_pattern, '', txt, flags=re.IGNORECASE)

    # Remove "While in the confines of ,"
    txt = txt.replace("While in the confines of ,", '')

    words = nltk.tokenize.word_tokenize(txt)
    
    # make all lower case
    words = [word.lower() for word in words]

    #stopwords
    stopwords = set(nltk.corpus.stopwords.words('english'))
    
    # Remove stopwords
    words = [word for word in words if word.lower() not in stopwords]
    
    # Remove punctuation
    words = [word for word in words if word.isalnum()]
    
    # Stemming
    stemmer = nltk.stem.PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    
    # Remove numbers
    words = [word for word in words if not word.isdigit()]
    
    # remove only one-letter words
    words = [word for word in words if len(word) > 1]
    
    # return text of words
    return ' '.join(words)

In [12]:
df_cleaned['text']  = df_cleaned['text'] .parallel_apply(cleanAndTokenizeText)
df_cleaned = df_cleaned.dropna()

df_cleaned.head(5)

Unnamed: 0,general_cap_classification,summary,label,text
0,CIVIL RIGHTS COMPLAINT,"According to the complainant, on 11-6-22 at 8:...",low,civil right complaint confin stop polic tag co...
1,UNPROFESSIONAL CONDUCT,"According to the complainant, on 11-6-22 at 8:...",medium,unprofession conduct confin treat unprofession...
2,DEPARTMENTAL VIOLATIONS,"According to the complainant, on 11-2-22 at 9:...",medium,department violat confin department polici vio...
3,HARASSMENT,"According to the complainant, on 10-21-22, the...",high,harass harass offic offic threaten catch face ...
4,LACK OF SERVICE,"According to the complainant, on 10-30-22, whi...",low,lack servic confin receiv proper servic offic ...


In [13]:
# Total number of rows
df_cleaned.shape

(4234, 4)

# 2. Train and Test Split

Split the data into training and testing data, label as y and text as X

In [14]:
X_train, X_test, y_train, y_test = train_test_split(df_cleaned['text'], df_cleaned['label'], test_size=0.2, random_state=seed_value, stratify=df_cleaned['label'])

# print the shape of the train and test data
print(f'Training data shape: {X_train.shape}')
print(f'Testing data shape: {X_test.shape}')

Training data shape: (3387,)
Testing data shape: (847,)


In [15]:
X_train.head()

3687    lack servic lack servic complaint receiv unkno...
2627    department violat receiv proper servic departm...
355     department violat confin famili member stop po...
729                           harass confin intimid offic
4047    unprofession conduct treat rude manner receiv ...
Name: text, dtype: object

In [16]:
y_train.head()


3687       low
2627    medium
355       high
729       high
4047    medium
Name: label, dtype: object

# 3. Text Preprocessing

## 3.1. CountVectorizer

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

# Create a CountVectorizer object
cv_obj = CountVectorizer(ngram_range=(1, 3))

# Fit and transform the training data
X_train_cv = cv_obj.fit_transform(X_train)

# Transform the testing data
X_test_cv = cv_obj.transform(X_test)


In [18]:
X_train_cv.shape

(3387, 57605)

## 3.2. TfidfVectorizer

In [19]:
# Create a TfidfVectorizer object
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_obj = TfidfVectorizer(ngram_range=(1, 3))

# Fit and transform the training data
X_train_tfidf = tfidf_obj.fit_transform(X_train)

# Transform the testing data
X_test_tfidf = tfidf_obj.transform(X_test)

In [20]:
X_train_tfidf.shape

(3387, 57605)

# 4. Model Training

In [24]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


def get_report(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    return accuracy_score(y_test, y_pred), confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred, zero_division=1)


def get_top_bom_features(model, vectorizer_obj, n=10):
    if not hasattr(model, 'coef_'):
        cv_feature_w_freq = pd.DataFrame({
            'feature': vectorizer_obj.get_feature_names_out(),
            'coefficient': model.feature_log_prob_[0]})
    else:
        cv_feature_w_freq = pd.DataFrame({
            'feature': vectorizer_obj.get_feature_names_out(),
            'coefficient': model.coef_[0]})

    cv_feature_w_freq.sort_values(by='coefficient', ascending=True, inplace=True)

    return cv_feature_w_freq.head(10), cv_feature_w_freq.tail(10)

## 4.1. Logistic Regression with CountVectorizer

In [26]:
from sklearn.linear_model import LogisticRegression

# Create a LogisticRegression object
model_lr_cv = LogisticRegression(random_state=seed_value, max_iter=20000)

# Get the accuracy score, confusion matrix, and classification report
lr_cv_accuracy_score, lr_cv_confusion_matrix, lr_cv_classification_report = get_report(model_lr_cv, X_train_cv, y_train, X_test_cv, y_test)

# Print the accuracy score
print(f'Logistic Regression CountVectorizer Accuracy Score: {lr_cv_accuracy_score}')

# Print the confusion matrix
print(f'Logistic Regression CountVectorizer Confusion Matrix: \n{lr_cv_confusion_matrix}')

# Print the classification report
print(f'Logistic Regression CountVectorizer Classification Report: \n{lr_cv_classification_report}')


Logistic Regression CountVectorizer Accuracy Score: 0.974025974025974
Logistic Regression CountVectorizer Confusion Matrix: 
[[212  10  10]
 [  1 241   1]
 [  0   0 372]]
Logistic Regression CountVectorizer Classification Report: 
              precision    recall  f1-score   support

        high       1.00      0.91      0.95       232
         low       0.96      0.99      0.98       243
      medium       0.97      1.00      0.99       372

    accuracy                           0.97       847
   macro avg       0.98      0.97      0.97       847
weighted avg       0.97      0.97      0.97       847


## 4.2. Logistic Regression with TfidfVectorizer

In [27]:
# Create a LogisticRegression object
model_lr_tfidf = LogisticRegression(random_state=seed_value, max_iter=20000)

# Get the accuracy score, confusion matrix, and classification report
lr_tfidf_accuracy_score, lr_tfidf_confusion_matrix, lr_tfidf_classification_report = get_report(model_lr_tfidf, X_train_tfidf, y_train, X_test_tfidf, y_test)

# Print the accuracy score
print(f'Logistic Regression TfidfVectorizer Accuracy Score: {lr_tfidf_accuracy_score}')

# Print the confusion matrix
print(f'Logistic Regression TfidfVectorizer Confusion Matrix: \n{lr_tfidf_confusion_matrix}')

# Print the classification report
print(f'Logistic Regression TfidfVectorizer Classification Report: \n{lr_tfidf_classification_report}')

Logistic Regression TfidfVectorizer Accuracy Score: 0.9598583234946871
Logistic Regression TfidfVectorizer Confusion Matrix: 
[[202  12  18]
 [  1 239   3]
 [  0   0 372]]
Logistic Regression TfidfVectorizer Classification Report: 
              precision    recall  f1-score   support

        high       1.00      0.87      0.93       232
         low       0.95      0.98      0.97       243
      medium       0.95      1.00      0.97       372

    accuracy                           0.96       847
   macro avg       0.96      0.95      0.96       847
weighted avg       0.96      0.96      0.96       847


## 4.3 Naive Bayes with CountVectorizer

In [28]:
from sklearn.naive_bayes import MultinomialNB

# Create a MultinomialNB object
model_nb_cv = MultinomialNB()

# Get the accuracy score, confusion matrix, and classification report
nb_cv_accuracy_score, nb_cv_confusion_matrix, nb_cv_classification_report = get_report(model_nb_cv, X_train_cv, y_train, X_test_cv, y_test)

# Print the accuracy score
print(f'Naive Bayes CountVectorizer Accuracy Score: {nb_cv_accuracy_score}')

# Print the confusion matrix
print(f'Naive Bayes CountVectorizer Confusion Matrix: \n{nb_cv_confusion_matrix}')

# Print the classification report
print(f'Naive Bayes CountVectorizer Classification Report: \n{nb_cv_classification_report}')



Naive Bayes CountVectorizer Accuracy Score: 0.9185360094451004
Naive Bayes CountVectorizer Confusion Matrix: 
[[187  16  29]
 [  4 226  13]
 [  3   4 365]]
Naive Bayes CountVectorizer Classification Report: 
              precision    recall  f1-score   support

        high       0.96      0.81      0.88       232
         low       0.92      0.93      0.92       243
      medium       0.90      0.98      0.94       372

    accuracy                           0.92       847
   macro avg       0.93      0.91      0.91       847
weighted avg       0.92      0.92      0.92       847


## 4.4 Naive Bayes with TfidfVectorizer

In [29]:
# Create a MultinomialNB object
model_nb_tfidf = MultinomialNB()

# Get the accuracy score, confusion matrix, and classification report
nb_tfidf_accuracy_score, nb_tfidf_confusion_matrix, nb_tfidf_classification_report = get_report(model_nb_tfidf, X_train_tfidf, y_train, X_test_tfidf, y_test)

# Print the accuracy score
print(f'Naive Bayes TfidfVectorizer Accuracy Score: {nb_tfidf_accuracy_score}')

# Print the confusion matrix
print(f'Naive Bayes TfidfVectorizer Confusion Matrix: \n{nb_tfidf_confusion_matrix}')

# Print the classification report
print(f'Naive Bayes TfidfVectorizer Classification Report: \n{nb_tfidf_classification_report}')


Naive Bayes TfidfVectorizer Accuracy Score: 0.8689492325855962
Naive Bayes TfidfVectorizer Confusion Matrix: 
[[149  13  70]
 [  0 216  27]
 [  1   0 371]]
Naive Bayes TfidfVectorizer Classification Report: 
              precision    recall  f1-score   support

        high       0.99      0.64      0.78       232
         low       0.94      0.89      0.92       243
      medium       0.79      1.00      0.88       372

    accuracy                           0.87       847
   macro avg       0.91      0.84      0.86       847
weighted avg       0.89      0.87      0.86       847


# 5. Model Evaluation

We will choose Logistic Regression with CountVectorizer as the best model. 

A deep investigation of the model will be conducted to understand the top features and the worst features.

An **accuracy** score of **0.97** means 97% of cases classified correctly by your model.

An average **precision** is 0.98 means 98% of the positive cases are correctly classified.

An average **recall** is 0.97 means out of all the actual positive instances in your dataset, the classifier correctly identifies 97% of them on average across all classes.

In [30]:
def get_top_bom_features(model, vectorizer_obj, n=10):
    if not hasattr(model, 'coef_'):
        cv_feature_w_freq = pd.DataFrame({
            'feature': vectorizer_obj.get_feature_names_out(),
            'coefficient': model.feature_log_prob_[0]})
    else:
        cv_feature_w_freq = pd.DataFrame({
            'feature': vectorizer_obj.get_feature_names_out(),
            'coefficient': model.coef_[0]})

    cv_feature_w_freq.sort_values(by='coefficient', ascending=True, inplace=True)

    return cv_feature_w_freq.head(10), cv_feature_w_freq.tail(10)

In [32]:
# Get the top and bottom features
lr_cv_top_features, lr_cv_bottom_features = get_top_bom_features(model_lr_cv, cv_obj)

lr_cv_top_features

Unnamed: 0,feature,coefficient
14437,department violat,-1.160194
23643,incid,-1.097154
18361,falsif,-0.913208
54578,verbal,-0.752779
14386,department,-0.738023
54579,verbal abus,-0.700062
53407,unprofession conduct,-0.693119
25960,lack,-0.648124
25965,lack servic,-0.644478
43793,servic,-0.551012


In [33]:
lr_cv_bottom_features

Unnamed: 0,feature,coefficient
16727,drug,0.504843
1492,alleg,0.768128
16073,domest,0.924407
13545,crimin,0.951499
44360,sexual,1.031545
13552,crimin alleg,1.076995
36194,physic abus,1.107055
36193,physic,1.297644
21707,harass,1.424085
21273,gun,1.55144


# 6. Serialize the Model

In [34]:
# Serialize the model

import joblib

joblib.dump(model_lr_cv, 'model_lr_cv.pkl')


['model_lr_cv.pkl']

In [35]:
# Serialize the vectorizer cv_obj
joblib.dump(cv_obj, 'vectorizer.pkl')

['vectorizer.pkl']