# Email Classification

This example shows the classification of email using deep neural networks after generating tf-idf.

We will use deep neural networks to classify email into one of the 20 pre-trained categories based on the words present in each email.

In [4]:
!pip3 install sklearn



In [19]:
from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

x_train, y_train = newsgroups_train.data, newsgroups_train.target
x_test, y_test = newsgroups_test.data, newsgroups_test.target

In [20]:
print('List of all 20 categories:')
newsgroups_train.target_names

List of all 20 categories:


['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [22]:
print("Sample email:")
x_train[0]

Sample email:


"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

In [23]:
print("Sample target category:")
y_train[0]

Sample target category:


7

In [24]:
newsgroups_train.target_names[y_train[0]]

'rec.autos'

In [26]:
# Preprocessing
!pip3 install pandas
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import pandas as pd
from nltk import pos_tag
from nltk.stem import PorterStemmer

Collecting pandas
  Downloading pandas-1.1.0-cp38-cp38-macosx_10_9_x86_64.whl (10.6 MB)
[K     |████████████████████████████████| 10.6 MB 3.7 MB/s eta 0:00:01    |████▎                           | 1.4 MB 3.7 MB/s eta 0:00:03
Collecting pytz>=2017.2
  Downloading pytz-2020.1-py2.py3-none-any.whl (510 kB)
[K     |████████████████████████████████| 510 kB 25.0 MB/s eta 0:00:01
Installing collected packages: pytz, pandas
Successfully installed pandas-1.1.0 pytz-2020.1


In [39]:
stopwds = set(stopwords.words('english'))

def preprocessing(text):
    text = text.lower()
    text = ' '.join(''.join([' ' if ch in string.punctuation else ch for ch in text]).split())
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    tokens = [word for word in tokens if len(word) >= 3]
    tokens = [token for token in tokens if token not in stopwds]
    
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    tagged_corpus = pos_tag(tokens)
    noun_tags = ['NN', 'NNP', 'NNPS', 'NNS']
    verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

    lemmatizer = WordNetLemmatizer()
    
    def prat_lemmatize(token, tag):
        """resolve mismatch between pos_tag function and intake value of lemmatize function."""
        if tag in noun_tags:
            return lemmatizer.lemmatize(token, 'n')
        elif tag in verb_tags:
            return lemmatizer.lemmatize(token, 'v')
        else:
            return lemmatizer.lemmatize(token, 'n')
    
    pre_proc_text = ' '.join([prat_lemmatize(token, tag) for token, tag in tagged_corpus])
    return pre_proc_text

In [42]:
print('processing {} training data'.format(len(x_train)))
x_train_preprocessed = []
for i, doc in enumerate(x_train):
    x_train_preprocessed.append(preprocessing(doc))
    if i % len(x_train)//10 == 0:
        print('completed: {}'.format(i))
print('completed preprocessing training data')
        
print('processing {} testing data'.format(len(x_test)))
x_test_preprocessed = []
for i, doc in enumerate(x_test):
    x_test_preprocessed.append(preprocessing(doc))
    if i % len(x_test)//10 == 0:
        print('completed: {}'.format(i))
print('completed preprocessing testing data')

processing 11314 training data
completed: 0
completed: 100
completed: 200
completed: 300
completed: 400
completed: 500
completed: 600
completed: 700
completed: 800
completed: 900
completed: 1000
completed: 1100
completed: 1200
completed: 1300
completed: 1400
completed: 1500
completed: 1600
completed: 1700
completed: 1800
completed: 1900
completed: 2000
completed: 2100
completed: 2200
completed: 2300
completed: 2400
completed: 2500
completed: 2600
completed: 2700
completed: 2800
completed: 2900
completed: 3000
completed: 3100
completed: 3200
completed: 3300
completed: 3400
completed: 3500
completed: 3600
completed: 3700
completed: 3800
completed: 3900
completed: 4000
completed: 4100
completed: 4200
completed: 4300
completed: 4400
completed: 4500
completed: 4600
completed: 4700
completed: 4800
completed: 4900
completed: 5000
completed: 5100
completed: 5200
completed: 5300
completed: 5400
completed: 5500
completed: 5600
completed: 5700
completed: 5800
completed: 5900
completed: 6000
compl

In [44]:
# Building TFIDF vectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1,2), 
                            stop_words='english',
                            max_features=10_000,
                            strip_accents='unicode',
                            norm='l2')

In [45]:
x_train_2 = vectorizer.fit_transform(x_train_preprocessed).todense()
x_test_2 = vectorizer.transform(x_test_preprocessed).todense()

In [48]:
# Deep learning module.
!pip3 install numpy keras tensorflow
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import Adadelta, Adam, RMSprop
from keras.utils import np_utils

Collecting tensorflow
  Downloading tensorflow-2.3.0-cp38-cp38-macosx_10_11_x86_64.whl (165.2 MB)
[K     |████████████████████████████████| 165.2 MB 66 kB/s  eta 0:00:011  |█▉                              | 9.3 MB 3.5 MB/s eta 0:00:45     |████                            | 21.1 MB 3.0 MB/s eta 0:00:49     |█████▌                          | 28.5 MB 3.0 MB/s eta 0:00:47     |█████▉                          | 30.1 MB 9.6 MB/s eta 0:00:15     |██████████▌                     | 54.1 MB 48.9 MB/s eta 0:00:03     |████████████████████████▏       | 124.9 MB 8.6 MB/s eta 0:00:05     |████████████████████████▋       | 127.3 MB 8.6 MB/s eta 0:00:05     |████████████████████████████    | 144.6 MB 9.6 MB/s eta 0:00:03     |█████████████████████████████▊  | 153.6 MB 25.3 MB/s eta 0:00:01     |██████████████████████████████▉ | 159.4 MB 40.9 MB/s eta 0:00:01
Collecting astunparse==1.6.3
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting tensorflow-estimator<2.4.0,>=2.3.0
  Downloa

In [50]:
# Definition hyper parameters
np.random.seed(1337)
nb_classes = 20
batch_size = 64
nb_epochs = 20

In [51]:
# Converts the 20 categories into one-hot encoding vectors in which 20 columns are created and the values
# against the respective classes are given as 1. All other classes are given as 0.
y_train = np_utils.to_categorical(y_train, nb_classes)

In [52]:
# Deep layer model building in keras.
model = Sequential()
model.add(Dense(1_000, input_shape=(10_000,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))

model.add(Dense(500))
model.add(Activation('relu'))

model.add(Dense(50))
model.add(Activation('relu'))
model.add(Dropout(0.5))

model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1000)              10001000  
_________________________________________________________________
activation (Activation)      (None, 1000)              0         
_________________________________________________________________
dropout (Dropout)            (None, 1000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 500)               500500    
_________________________________________________________________
activation_1 (Activation)    (None, 500)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 50)                25050     
_________________________________________________________________
activation_2 (Activation)    (None, 50)                0

In [53]:
# Model training.
model.fit(x_train_2, y_train, batch_size=batch_size, epochs=nb_epochs, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x14dae5d90>

In [105]:
# Model prediction.
y_train_predclass = model.predict(x_train_2, batch_size=batch_size)

In [106]:
y_test_predclass = model.predict(x_test_2, batch_size=batch_size)

In [110]:
from sklearn.metrics import accuracy_score, classification_report

In [121]:
y_train_pred_result = np_utils.to_categorical(np.argmax(y_train_predclass, axis=-1), nb_classes)
print('Deep neural network - train accuracy: {}'.format(round(accuracy_score(y_train, y_train_pred_result), 3)))

Deep neural network - train accuracy: 0.999


In [122]:
y_test_pred_result = np.argmax(y_test_predclass, axis=-1)
print('Deep neural network - test accuracy: {}'.format(round(accuracy_score(y_test, y_test_pred_result), 3)))

Deep neural network - test accuracy: 0.815


In [123]:
print('Deep neural network - train classification report')
print(classification_report(y_train, y_train_pred_result))

Deep neural network - train classification report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       480
           1       1.00      1.00      1.00       584
           2       1.00      1.00      1.00       591
           3       1.00      1.00      1.00       590
           4       1.00      1.00      1.00       578
           5       1.00      1.00      1.00       593
           6       1.00      1.00      1.00       585
           7       1.00      1.00      1.00       594
           8       1.00      1.00      1.00       598
           9       1.00      1.00      1.00       597
          10       1.00      1.00      1.00       600
          11       1.00      1.00      1.00       595
          12       1.00      1.00      1.00       591
          13       1.00      1.00      1.00       594
          14       1.00      1.00      1.00       593
          15       1.00      1.00      1.00       599
          16       1.00      1.

In [124]:
print('Deep neural network - test classification report')
print(classification_report(y_test, y_test_pred_result))

Deep neural network - test classification report
              precision    recall  f1-score   support

           0       0.84      0.76      0.80       319
           1       0.68      0.75      0.71       389
           2       0.77      0.65      0.71       394
           3       0.63      0.73      0.68       392
           4       0.73      0.81      0.77       385
           5       0.84      0.76      0.80       395
           6       0.81      0.83      0.82       390
           7       0.87      0.86      0.87       396
           8       0.91      0.93      0.92       398
           9       0.86      0.93      0.89       397
          10       0.94      0.96      0.95       399
          11       0.94      0.88      0.91       396
          12       0.75      0.69      0.72       393
          13       0.89      0.83      0.86       396
          14       0.87      0.91      0.89       394
          15       0.86      0.89      0.87       398
          16       0.72      0.8