In [1]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
from collections import Counter, OrderedDict

import torch
from torch.utils.data import DataLoader
from torchtext.vocab import vocab
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split

from bounce_text_dataset import BounceTextDataset
import encoding
import models
import evaluation
from train import fit

In [2]:
# Pin the global PyTorch and NumPy random generators to one fixed seed so that
# repeated runs of the notebook draw the same random numbers.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Load Data

In [3]:
# Load the labeled bounce messages and keep one row per distinct formatted error text.
# NOTE(security): pd.read_pickle can execute arbitrary code while unpickling;
# only load pickle files that come from a trusted source.
data_path = Path() / 'data' / 'labeled_data_25_3.pickle'
df = pd.read_pickle(data_path)
print(f'loaded {df.shape[0]} error texts in total')

# De-duplicate on the formatted text only (pandas keeps the first occurrence by default).
unique_df = df.drop_duplicates('errorTextFormatted')
print(f'loaded {unique_df.shape[0]} unique (formatted) error texts')

loaded 3482897 error texts in total
loaded 1590 unique (fomatted) error texts


In [20]:
# Show only the columns relevant for labelling; the row display is truncated by pandas.
preview_columns = ["Email", "errorTextFull", "errorTextFormatted", "label"]
unique_df[preview_columns]

Unnamed: 0,Email,errorTextFull,errorTextFormatted,label
0,b9000cd871972b45bcfaffca7339420ecc0b9e728c3253...,Message queued (session with SMTP Server: #HOS...,message queued session with smtp server #hostn...,NoTransport
5,bc38b6679d445db7317a0049cd232b9899b6e89e4ba67a...,452 4.2.2 https://support.google.com/mail/?p=O...,452 4.2.2 https://support.google.com/mail/?p=o...,Mailbox full
16,c11a14931eb54d4f7c789026e774ee5f61a5cb48170492...,"End body tag detection recovery, opening track...",end body tag detection recovery opening tracki...,NoTransport
31,d4a214d875729f5ec0fc9f196e86218144ffd4f502a8a7...,Messages queued (connection quotas met).,messages queued connection quotas met,NoTransport
42,6279a031901d9b512ac6e8b5f338ed40a22c1f1b8d0239...,491 Proxy connection to MX #IP# failed,491 proxy connection to mx #ip# failed,NoTransport
...,...,...,...,...
3310501,34479b410946f40624e09761c1b81f982e5795244ee917...,530 Must issue a STARTTLS command first.,530 must issue a starttls command first,NoTransport
3310840,c9eff36e40c18dac473180f11d785e3a2ba88447a68e80...,530 Must send STARTTLS before DATA,530 must send starttls before data,NoTransport
3356451,2a781496d816c4c1c8a65aced5f8eceb41b58a26e6c8cd...,"421 Unexpected failure, please try later",421 unexpected failure please try later,ServerTempError
3370579,6911c04fe9bd1b75f2fe06e13d06536efcace7b39d2f7b...,451 4.1.1 PK6glfq04eypnPK6glGR2O <*> Mailbox b...,451 4.1.1 pk6glfq04eypnpk6glgr2o #id# mailbox ...,ServerTempError


# Split data (train and test only; the test split is reused as the validation set)

In [4]:
# Inputs are the formatted error texts; targets are the string labels passed
# one by one through the project's combined-label encoder.
X = np.array(unique_df['errorTextFormatted'].values)
raw_labels = unique_df['label'].values
Y = np.array([encoding.encode_combined_label(raw) for raw in raw_labels])

In [5]:
# Stratified split that holds out 25% of the unique texts; random_state is fixed
# so the partition is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, random_state=30, stratify=Y, test_size=0.25
)

# There is no separate validation split: the validation arrays are copies of the
# test split, so validation metrics are NOT an independent estimate of test
# performance.
x_val = x_test.copy()
y_val = y_test.copy()

print(f'train    items: {x_train.shape[0]}')
print(f'val/test items: {x_val.shape[0]}')

train    items: 1192
val/test items: 398


In [6]:
# Count tokens over the training texts only (validation/test texts are not used
# to build the vocabulary). Tokens are produced by splitting on single spaces.
error_texts = list(x_train)
counter = Counter(token for text in error_texts for token in text.split(' '))

# most_common() yields (token, count) pairs by descending count, with ties kept
# in first-seen order.
sorted_by_freq_tuples = counter.most_common()
ordered_dict = OrderedDict(sorted_by_freq_tuples)

# Two special tokens are registered alongside the counted tokens; any token that
# is not in the vocabulary resolves to the default index 0.
bounce_vocab = vocab(ordered_dict, specials=['<unk>', '<emp>'])
bounce_vocab.set_default_index(0)
print(f' loaded #{len(bounce_vocab)} unique tokens')

 loaded #1879 unique tokens


In [7]:
# Sanity check: look up a token that is not in the vocabulary and show the
# first three entries of the index-to-token list.
itos_map = bounce_vocab.get_itos()
oov_index = bounce_vocab['dlkngklngfkn']
(oov_index, itos_map[0], itos_map[1], itos_map[2])

(0, '<unk>', '<emp>', '550')

In [8]:
#for token in v1.get_stoi():
#    print(token)

# Train

In [9]:
###
### Parameters & Model
###

num_classes = encoding.num_combined_classes()
vocab_size = len(bounce_vocab)
embedding_dim = 16
# Number of tokens per sample. The same value is passed to the model and to the
# datasets below so that both agree on the sequence length.
input_len = 40

evaluator = evaluation.EpochEvaluator()
model = models.EmbeddingModel(vocab_size, embedding_dim, num_classes, input_len=input_len)
print(model)


###
### Hyperparameters
###
learning_rate = 0.01
epochs = 200
batch_size = 8

optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)
criterion = torch.nn.CrossEntropyLoss()

###
### Data
###
# text_len uses input_len: the previously referenced `padded_len` is not defined
# anywhere in this notebook and raised NameError on a fresh kernel.
train_ds = BounceTextDataset(x_train, y_train, bounce_vocab, text_len=input_len)
val_ds = BounceTextDataset(x_val, y_val, bounce_vocab, text_len=input_len)
test_ds = BounceTextDataset(x_test, y_test, bounce_vocab, text_len=input_len)

# Only the training loader shuffles; validation and test are read one sample at a time.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_ds, batch_size=1)
test_loader = DataLoader(test_ds, batch_size=1)

# fit() receives the train and validation loaders; test_loader is kept for the
# evaluation section.
dataloaders = {'train': train_loader, 'val': val_loader}

model = fit(model, dataloaders, epochs, optimizer, criterion, evaluator)

EmbeddingModel(
  (embedding): Embedding(1879, 16)
  (fc): Linear(in_features=640, out_features=128, bias=True)
  (relu1): ReLU()
  (dropout): Dropout(p=0.3, inplace=False)
  (fc2): Linear(in_features=128, out_features=8, bias=True)
)
train Epoch 1 loss: 3.386544 time: 0.391579ms                    pre: 0.17 rec: 0.17 f1: 0.17 acc: 0.24 items: 1192
val   Epoch 1 loss: 1.996802 time: 0.827716ms                    pre: 0.28 rec: 0.28 f1: 0.28 acc: 0.41 items: 398
train Epoch 2 loss: 1.818796 time: 0.181366ms                    pre: 0.34 rec: 0.33 f1: 0.33 acc: 0.45 items: 1192
val   Epoch 2 loss: 1.016108 time: 0.520382ms                    pre: 0.59 rec: 0.55 f1: 0.56 acc: 0.67 items: 398
train Epoch 3 loss: 1.556477 time: 0.198436ms                    pre: 0.42 rec: 0.40 f1: 0.41 acc: 0.52 items: 1192
val   Epoch 3 loss: 0.677084 time: 0.522126ms                    pre: 0.76 rec: 0.70 f1: 0.71 acc: 0.79 items: 398
train Epoch 4 loss: 1.442157 time: 0.170531ms                    pre: 0.

val   Epoch 35 loss: 0.034341 time: 0.529382ms                    pre: 1.00 rec: 1.00 f1: 1.00 acc: 0.99 items: 398
train Epoch 36 loss: 0.393433 time: 0.202159ms                    pre: 0.83 rec: 0.81 f1: 0.82 acc: 0.86 items: 1192
val   Epoch 36 loss: 0.033733 time: 0.537979ms                    pre: 1.00 rec: 1.00 f1: 1.00 acc: 0.99 items: 398
train Epoch 37 loss: 0.401520 time: 0.218249ms                    pre: 0.82 rec: 0.80 f1: 0.81 acc: 0.86 items: 1192
val   Epoch 37 loss: 0.031786 time: 0.604982ms                    pre: 1.00 rec: 1.00 f1: 1.00 acc: 0.99 items: 398
train Epoch 38 loss: 0.407774 time: 0.207772ms                    pre: 0.81 rec: 0.80 f1: 0.80 acc: 0.86 items: 1192
val   Epoch 38 loss: 0.031476 time: 0.541352ms                    pre: 1.00 rec: 1.00 f1: 1.00 acc: 0.99 items: 398
train Epoch 39 loss: 0.371361 time: 0.224959ms                    pre: 0.82 rec: 0.79 f1: 0.80 acc: 0.87 items: 1192
val   Epoch 39 loss: 0.030658 time: 0.542775ms                    pr

val   Epoch 71 loss: 0.017693 time: 0.385487ms                    pre: 1.00 rec: 1.00 f1: 1.00 acc: 0.99 items: 398
train Epoch 72 loss: 0.183146 time: 0.137914ms                    pre: 0.92 rec: 0.90 f1: 0.91 acc: 0.94 items: 1192
val   Epoch 72 loss: 0.017578 time: 0.375729ms                    pre: 1.00 rec: 1.00 f1: 1.00 acc: 0.99 items: 398
train Epoch 73 loss: 0.164809 time: 0.136660ms                    pre: 0.93 rec: 0.93 f1: 0.93 acc: 0.95 items: 1192
val   Epoch 73 loss: 0.018173 time: 0.382876ms                    pre: 1.00 rec: 1.00 f1: 1.00 acc: 0.99 items: 398
train Epoch 74 loss: 0.168218 time: 0.139347ms                    pre: 0.93 rec: 0.92 f1: 0.93 acc: 0.94 items: 1192
val   Epoch 74 loss: 0.017402 time: 0.373820ms                    pre: 1.00 rec: 1.00 f1: 1.00 acc: 0.99 items: 398
train Epoch 75 loss: 0.146101 time: 0.145219ms                    pre: 0.95 rec: 0.92 f1: 0.93 acc: 0.95 items: 1192
val   Epoch 75 loss: 0.017393 time: 0.404416ms                    pr

val   Epoch 107 loss: 0.013744 time: 0.373911ms                    pre: 1.00 rec: 1.00 f1: 1.00 acc: 0.99 items: 398
train Epoch 108 loss: 0.089898 time: 0.139690ms                    pre: 0.96 rec: 0.96 f1: 0.96 acc: 0.97 items: 1192
val   Epoch 108 loss: 0.013308 time: 0.386144ms                    pre: 1.00 rec: 1.00 f1: 1.00 acc: 0.99 items: 398
train Epoch 109 loss: 0.116258 time: 0.184064ms                    pre: 0.95 rec: 0.94 f1: 0.94 acc: 0.96 items: 1192
val   Epoch 109 loss: 0.013550 time: 0.478471ms                    pre: 1.00 rec: 1.00 f1: 1.00 acc: 0.99 items: 398
train Epoch 110 loss: 0.084976 time: 0.153650ms                    pre: 0.96 rec: 0.96 f1: 0.96 acc: 0.98 items: 1192
val   Epoch 110 loss: 0.013506 time: 0.393956ms                    pre: 1.00 rec: 1.00 f1: 1.00 acc: 0.99 items: 398
train Epoch 111 loss: 0.094431 time: 0.136730ms                    pre: 0.97 rec: 0.97 f1: 0.97 acc: 0.97 items: 1192
val   Epoch 111 loss: 0.013436 time: 0.417725ms             

val   Epoch 142 loss: 0.012632 time: 0.391943ms                    pre: 1.00 rec: 1.00 f1: 1.00 acc: 0.99 items: 398
train Epoch 143 loss: 0.055642 time: 0.138203ms                    pre: 0.98 rec: 0.98 f1: 0.98 acc: 0.98 items: 1192
val   Epoch 143 loss: 0.011648 time: 0.383697ms                    pre: 1.00 rec: 1.00 f1: 1.00 acc: 0.99 items: 398
train Epoch 144 loss: 0.058168 time: 0.138120ms                    pre: 0.97 rec: 0.98 f1: 0.98 acc: 0.98 items: 1192
val   Epoch 144 loss: 0.011534 time: 0.363893ms                    pre: 1.00 rec: 1.00 f1: 1.00 acc: 1.00 items: 398
train Epoch 145 loss: 0.058116 time: 0.139781ms                    pre: 0.98 rec: 0.98 f1: 0.98 acc: 0.98 items: 1192
val   Epoch 145 loss: 0.011598 time: 0.371905ms                    pre: 1.00 rec: 1.00 f1: 1.00 acc: 0.99 items: 398
train Epoch 146 loss: 0.060794 time: 0.148755ms                    pre: 0.98 rec: 0.97 f1: 0.98 acc: 0.99 items: 1192
val   Epoch 146 loss: 0.011413 time: 0.388809ms             

val   Epoch 177 loss: 0.010033 time: 0.377951ms                    pre: 1.00 rec: 1.00 f1: 1.00 acc: 1.00 items: 398
train Epoch 178 loss: 0.043588 time: 0.139741ms                    pre: 0.99 rec: 0.98 f1: 0.98 acc: 0.99 items: 1192
val   Epoch 178 loss: 0.009815 time: 0.366613ms                    pre: 1.00 rec: 1.00 f1: 1.00 acc: 1.00 items: 398
train Epoch 179 loss: 0.043811 time: 0.139283ms                    pre: 0.99 rec: 0.99 f1: 0.99 acc: 0.99 items: 1192
val   Epoch 179 loss: 0.010386 time: 0.375805ms                    pre: 1.00 rec: 1.00 f1: 1.00 acc: 1.00 items: 398
train Epoch 180 loss: 0.033814 time: 0.141334ms                    pre: 0.99 rec: 0.99 f1: 0.99 acc: 0.99 items: 1192
val   Epoch 180 loss: 0.010069 time: 0.374133ms                    pre: 1.00 rec: 1.00 f1: 1.00 acc: 1.00 items: 398
train Epoch 181 loss: 0.038625 time: 0.139715ms                    pre: 0.99 rec: 0.97 f1: 0.98 acc: 0.99 items: 1192
val   Epoch 181 loss: 0.010009 time: 0.372738ms             

# Evaluation

## Train data, just for curiosity

In [10]:
# Evaluate on the training loader and print a per-class report.
labels, predicted = evaluation.eval_model(
    model, train_loader, bounce_vocab, encoding.decode_combined_label
)
train_report = sklearn.metrics.classification_report(
    labels, predicted, target_names=encoding.combined_target_names()
)
print(train_report)

0it [00:00, ?it/s]

                      precision    recall  f1-score   support

        User unknown       1.00      1.00      1.00       309
TemporaryUserProblem       1.00      1.00      1.00        24
        Mailbox full       1.00      1.00      1.00        77
                Spam       1.00      1.00      1.00       225
              Policy       1.00      1.00      1.00        73
    TransportProblem       1.00      1.00      1.00       242
        Unclassified       1.00      0.99      1.00       175
          Greylisted       0.99      1.00      0.99        67

            accuracy                           1.00      1192
           macro avg       1.00      1.00      1.00      1192
        weighted avg       1.00      1.00      1.00      1192



## Test

In [11]:
# Evaluate on the held-out test loader and print a per-class report.
labels, predicted = evaluation.eval_model(
    model, test_loader, bounce_vocab, encoding.decode_combined_label
)
test_report = sklearn.metrics.classification_report(
    labels, predicted, target_names=encoding.combined_target_names()
)
print(test_report)

0it [00:00, ?it/s]

Wrong prediction
text: inbound email bounce rule <unk> has matched this bounce <emp> <emp> <emp> <emp> <emp> <emp> <emp> <emp> <emp> <emp> <emp> <emp> <emp> <emp> <emp> <emp> <emp> <emp> <emp> <emp> <emp> <emp> <emp> <emp> <emp> <emp> <emp> <emp> <emp> <emp> <emp>
real class: Unclassified
predicted class: User unknown
                      precision    recall  f1-score   support

        User unknown       0.99      1.00      1.00       104
TemporaryUserProblem       1.00      1.00      1.00         8
        Mailbox full       1.00      1.00      1.00        26
                Spam       1.00      1.00      1.00        75
              Policy       1.00      1.00      1.00        24
    TransportProblem       1.00      1.00      1.00        81
        Unclassified       1.00      0.98      0.99        58
          Greylisted       1.00      1.00      1.00        22

            accuracy                           1.00       398
           macro avg       1.00      1.00      1.00       

# Playground

In [30]:
import torch.nn as nn
from torch import tensor

In [31]:
# Toy embedding table: 5 rows (indices 0-4), each a 3-dimensional vector.
embedding = nn.Embedding(num_embeddings=5, embedding_dim=3)

In [32]:
# Fetch the vectors stored for indices 0 and 4 of the toy embedding.
lookup_ids = tensor([0, 4])
embedding(lookup_ids)

tensor([[-0.4112,  0.2312,  0.3912],
        [ 0.0129, -0.3718,  0.9632]], grad_fn=<EmbeddingBackward0>)

In [28]:
# List the trainable parameters of the toy embedding.
# NOTE(review): the saved output below carries execution count In [28], i.e. it
# appears to predate the embedding created at In [31], which would explain why
# its values differ from the lookup output; re-run the cell to refresh it.
list(embedding.parameters())

[Parameter containing:
 tensor([[ 0.0126,  1.5022, -0.9710],
         [ 1.3640,  0.2301, -1.5133],
         [ 1.5845, -0.9093,  0.0115],
         [-0.7000, -0.4967, -0.4074],
         [-1.2437,  0.8110,  1.3576]], requires_grad=True)]