In [2]:
import yaml
import os
import random
import numpy as np
import pathos.multiprocessing as mp

from simple_state_recurrent_model.model import SimpleRecurrentModel
from simple_state_recurrent_model.evaluation import Evaluator

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Load Data

In [3]:
# Root of the data/model repository. Defaults to the original author's
# checkout; set the DATA_MODEL_REPO environment variable to run the notebook
# on another machine without editing this cell.
DATA_DIR = os.environ.get('DATA_MODEL_REPO', '/home/fwang/projects/project-borrowbot/data_model_repo')

In [4]:
# Read every YAML file in the money-detection dataset folder and merge the
# records into one list; each record carries an 'input' string and its 'labels'.
dataset_dir = os.path.join(DATA_DIR, 'datasets/money_detection/')
data = []
# os.listdir returns entries in arbitrary order; sorting keeps the record order
# (and therefore every leave-one-out index) identical from run to run.
for f in sorted(os.listdir(dataset_dir)):
    with open(os.path.join(dataset_dir, f), 'r') as fopen:
        # yaml.safe_load replaces the bare yaml.load(fopen): the bare form can
        # construct arbitrary Python objects from the file and is rejected by
        # PyYAML >= 6 (Loader is mandatory there).
        # NOTE(review): if a dataset file relies on Python-specific YAML tags,
        # safe_load will refuse it -- confirm the files are plain data.
        # An empty file parses to None, hence the `or []`.
        data += yaml.safe_load(fopen) or []
input_data = [d['input'] for d in data]
label_data = [d['labels'] for d in data]

# Setup Augmentor

In [7]:
class Augmentor(object):
    """Create extra training strings by swapping labelled spans between examples.

    Every labelled substring found in ``input_data`` goes into a replacement
    pool (``self.labels``). ``generate_aug_data`` rewrites each example with
    randomly drawn pool entries and stores the rewritten strings and their
    recomputed spans in ``aug_input_data`` / ``aug_label_data``.
    """

    def __init__(self, input_data, label_data):
        """Keep references to the examples and build the replacement pool.

        input_data: sequence of strings.
        label_data: for each string, a sequence of ``(start, end)`` pairs used
            as the slice ``s[start:end]`` (so ``end`` is exclusive).

        The two sequences are stored by reference, not copied.
        """
        self.input_data = input_data
        self.label_data = label_data

        # Pool of replacement texts: every labelled substring, in input order.
        self.labels = []
        for i, s in enumerate(self.input_data):
            for label in self.label_data[i]:
                self.labels.append(s[label[0]:label[1]])

        self.aug_input_data = []
        self.aug_label_data = []

    def substitute(self, input_string, labels):
        """Replace each labelled span of ``input_string`` with a random pool entry.

        Returns ``(new_string, new_labels)`` where ``new_labels[i]`` is the
        ``[start, end]`` span, in ``new_string``, of the text that replaced
        ``labels[i]``.

        Spans are processed in order of their start index regardless of the
        order in which they are listed, so an unsorted ``labels`` list no
        longer produces a string with dropped or duplicated text. Overlapping
        spans are not supported.

        Raises IndexError (from ``random.choices``) if ``labels`` is non-empty
        while the replacement pool is empty.
        """
        # One replacement per span, drawn with replacement; new_entries[i]
        # belongs to labels[i].
        new_entries = random.choices(self.labels, k=len(labels))

        # sorted() is stable, so already-sorted input is visited in its
        # original order.
        positional_order = sorted(range(len(labels)), key=lambda i: labels[i][0])

        pieces = []
        ret_l = [None] * len(labels)
        out_len = 0   # length of the output built so far
        end_idx = 0   # first character of input_string not yet consumed

        for i in positional_order:
            untouched = input_string[end_idx:labels[i][0]]
            replacement = new_entries[i]
            pieces.append(untouched)
            pieces.append(replacement)
            out_len += len(untouched)
            ret_l[i] = [out_len, out_len + len(replacement)]
            out_len += len(replacement)
            end_idx = labels[i][1]

        pieces.append(input_string[end_idx:])
        return ''.join(pieces), ret_l

    def generate_aug_data(self, n):
        """Append ``n`` substituted variants of every example to the aug_* lists.

        Results accumulate: calling this twice doubles the stored variants.
        """
        for inp, lab in zip(self.input_data, self.label_data):
            for _ in range(n):
                new_inp, new_lab = self.substitute(inp, lab)
                self.aug_input_data.append(new_inp)
                self.aug_label_data.append(new_lab)

In [8]:
# Digits that may be swapped for one another during augmentation. '0' is left
# out, so zeros inside a labelled span are never altered or introduced.
non_zero_num_chars = '123456789'

def count_chars(s, cs):
    """Return how many items of ``s`` are members of ``cs``."""
    return sum(1 for ch in s if ch in cs)

def augment_for_money_detection_string(input_string, label):
    """Return a copy of ``input_string`` with labelled non-zero digits randomised.

    A character is replaced by a random member of ``non_zero_num_chars`` only
    when its index lies inside one of the ``(start, end)`` spans in ``label``
    (end exclusive) and the character itself is one of those digits. Every
    other character is copied unchanged, so the string length and the spans
    stay valid.
    """
    def inside_label(pos):
        # True when pos falls in at least one labelled span.
        return any([span[0] <= pos and pos < span[1] for span in label])

    pieces = []
    for pos, ch in enumerate(input_string):
        if inside_label(pos) and ch in non_zero_num_chars:
            pieces.append(random.choice(non_zero_num_chars))
        else:
            pieces.append(ch)
    return ''.join(pieces)

def augment_for_money_detection(input_data, label_data, n_vars):
    """Extend ``input_data`` / ``label_data`` in place with digit-randomised variants.

    For every (string, spans) pair, ``n_vars`` variants are produced with
    ``augment_for_money_detection_string``. Each variant reuses the original
    spans object as its label. Nothing is returned; both lists grow by
    ``n_vars`` entries per original example.
    """
    extra_inputs = []
    extra_labels = []

    for text, spans in zip(input_data, label_data):
        extra_inputs.extend(
            [augment_for_money_detection_string(text, spans) for _ in range(n_vars)]
        )
        extra_labels.extend([spans] * n_vars)

    # Appended only after the loop so the iteration above never sees new items.
    input_data += extra_inputs
    label_data += extra_labels

# Setup Model

In [6]:
# Overwriting the loo_cross_validation to do data augmentation as part of training

def loo_cross_validation(self, batch_size=32, train_rate=0.1, steps=1, epochs_per_step=100000, threads=4):
    """Leave-one-out cross-validation with augmentation performed inside each fold.

    For every index in ``self.input_data`` a fresh ``SimpleRecurrentModel`` is
    trained on augmented versions of all other examples, and inference is run
    on the held-out example after each training step.

    batch_size, train_rate, epochs_per_step: forwarded to ``model._raw_train``.
    steps: number of ``_raw_train`` calls (and inference snapshots) per fold.
    threads: size of the worker pool the folds are mapped over.

    Returns (and stores in ``self.loo_cross_validation_results``) a dict keyed
    by ``s * epochs_per_step`` for ``s`` in ``1..steps``; each value is the
    list of held-out inference results, one per fold in input order. The dict
    is empty when there is no input data.
    """
    def loo_x_validate(loo_cand):
        # Train one fold with example `loo_cand` held out.
        model = SimpleRecurrentModel(**self.model_args)
        filtered_inputs = [self.input_data[i] for i in range(len(self.input_data)) if i != loo_cand]
        filtered_targets = [self.target_data[i] for i in range(len(self.target_data)) if i != loo_cand]

        a = Augmentor(filtered_inputs, filtered_targets)
        a.generate_aug_data(10)
        # NOTE(review): this extends a.input_data / a.label_data, but the model
        # below is assembled from a.aug_input_data / a.aug_label_data only, so
        # the digit-randomised variants never reach training. Confirm whether
        # the aug_* lists were the intended target.
        augment_for_money_detection(a.input_data, a.label_data, 5)
        processed_inputs, processed_targets = model.assemble_data(a.aug_input_data, a.aug_label_data)

        training_results = {}
        for s in range(1, steps + 1):
            # The same model object is trained again on every step; the key
            # records the cumulative epoch count at the snapshot.
            model._raw_train(processed_inputs, processed_targets, batch_size, train_rate, epochs_per_step)
            training_results[s * epochs_per_step] = model.compute_inference(self.input_data[loo_cand])
        return training_results

    loo_candidates = range(len(self.input_data))
    p = mp.Pool(threads)
    try:
        results = p.map(loo_x_validate, loo_candidates)
    finally:
        # Release the workers even when a fold raises, and wait for them to
        # exit so no processes are left behind in the kernel.
        p.close()
        p.join()

    # Regroup from one dict per fold to one list of folds per snapshot key.
    processed_results = {}
    if results:
        for k in results[0].keys():
            processed_results[k] = [r[k] for r in results]
    self.loo_cross_validation_results = processed_results
    return self.loo_cross_validation_results
    
# Monkey-patch: replace the method on the Evaluator class itself, so every
# Evaluator instance uses the augmenting version defined above.
Evaluator.loo_cross_validation = loo_cross_validation

In [1]:
# Character set handed to the model as its 'alphabet' (see model_args): digits,
# lower-case ASCII letters, ASCII punctuation, space, and the euro and pound signs.
alphabet = '0123456789abcdefghijklmnopqrstuvwxyz!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ €£'

def preprocess_fn(s):
    """Lower-case ``s`` and replace every character outside the alphabet with '#'.

    The alphabet is defined locally (shadowing the module-level name), so the
    function does not depend on any notebook global.
    """
    alphabet = '0123456789abcdefghijklmnopqrstuvwxyz!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ €£'
    return ''.join(ch if ch in alphabet else '#' for ch in s.lower())

In [5]:
# Keyword arguments used for every SimpleRecurrentModel built in this notebook
# (unpacked as SimpleRecurrentModel(**model_args)).
model_args = dict(
    alphabet=alphabet,
    window_size=5,
    preprocess=preprocess_fn,
    window_shift=1,
)

In [9]:
# Evaluator over the full (un-augmented) data set; augmentation happens inside
# each fold of the overridden loo_cross_validation.
evaluator = Evaluator(model_args, input_data, label_data)

In [10]:
# Leave-one-out cross-validation: 500 snapshots per fold, 2000 epochs apart,
# mapped over a pool of 6 workers. The return values are bound to `_` only to
# keep the cell output empty; the plots below read the results from
# attributes on `evaluator`.
_ = evaluator.loo_cross_validation(steps=500, epochs_per_step=2000, threads=6)
# NOTE(review): 100 is presumably the number of threshold steps -- confirm
# against Evaluator.compute_accuracy_curve.
_ = evaluator.compute_accuracy_curve(100)

# Plots

In [8]:
import plotly.offline as plotly
import plotly.graph_objs as go
# Enable inline plotly rendering for iplot(); connected=True loads plotly.js
# from the CDN instead of embedding it in the notebook.
plotly.init_notebook_mode(connected=True)

In [23]:
# Per-threshold summary statistics across all stored accuracy curves.
accuracy_curves = list(evaluator.accuracy_curve_results.values())
# Every curve is indexed over the same range, so the first curve fixes the
# number of points. This replaces the hard-coded key 2000, which only existed
# when epochs_per_step was 2000.
n_curve_points = len(accuracy_curves[0])
# One 1-D array per threshold index, holding that point from every curve;
# built once and shared by the three statistics below.
curve_columns = [
    np.array([curve[i] for curve in accuracy_curves])
    for i in range(n_curve_points)
]
mean_data = [column.mean() for column in curve_columns]
stddev_data = [column.std() for column in curve_columns]
max_data = [column.max() for column in curve_columns]

In [24]:
# Accuracy versus decision threshold, summarised over all training snapshots.
# All three traces share one x axis: one point per threshold index. Its length
# is taken from mean_data, which has one entry per index.
threshold_axis = list(range(len(mean_data)))

frames = [
    go.Scatter(
        # Cubed and rescaled standard deviation: a visual indicator of spread,
        # not an accuracy value on the same scale as the other traces.
        y=800*np.array(stddev_data)**3,
        x=threshold_axis,
        mode='lines',
        name='spread',
        line={'smoothing': 0.5, 'shape': 'spline','color':'rgba(26,150,65,0.1)'},
        fillcolor='rgba(26,150,65,0.05)',
        fill='tonexty',
    ),
    go.Scatter(
        y=mean_data,
        x=threshold_axis,
        mode='lines',
        line={'smoothing': 0.5, 'shape': 'spline', 'color':'rgb(35,159,255)'},
        name='mean',
    ),
    go.Scatter(
        y=max_data,
        x=threshold_axis,
        mode='lines',
        line={'smoothing': 0.5, 'shape': 'spline', 'color':'rgba(255,89,12, 0.3)'},
        name='max',
    ),
]

# Fonts are nested under title=dict(font=...) because the flat `titlefont`
# attribute is deprecated in plotly; the rendered figure is unchanged.
axis_title_font = dict(family='sans serif', size=10, color='#727272')
layout = go.Layout(
    title=dict(
        text='Accuracy During Training at Different Thresholds', x=0.1,
        font=dict(family='sans serif', size=14, color='#727272')),
    xaxis=dict(
        range=[-1, 101], zeroline=False,
        title=dict(text="Threshold", font=axis_title_font)),
    yaxis=dict(
        zeroline=False,
        title=dict(text="Accuracy", font=axis_title_font)),
    legend=dict(
        x=0.05, y=0.15,
        font=dict(family='sans-serif', size=10, color='#727272'),
        bgcolor='rgba(0,0,0,0)'
    ),
)
fig = go.Figure(data=frames, layout=layout)
plotly.iplot(fig)

In [40]:
# Write the current figure to a standalone HTML file (the path is echoed as
# the cell output). image='svg' asks plotly to also trigger an SVG download
# named after image_filename when that HTML page is opened.
plotly.plot(fig, image_filename='threshold_selection', image='svg')

'file:///Users/frankwang/projects/data_model_repo/temp-plot.html'

In [14]:
# Test accuracy over the course of training, one trace per decision threshold.
# Each trace has one y value per key of accuracy_curve_results, so the x axis
# must be those keys. The previous x axis was built from the number of
# thresholds (range(len(curve)) * 2000), which had the wrong length and
# started at 0.
# NOTE(review): assumes the keys are cumulative epoch counts (2000, 4000, ...),
# as the key 2000 and the "Epochs" axis title suggest -- confirm against
# Evaluator.compute_accuracy_curve.
checkpoints = list(evaluator.accuracy_curve_results.keys())

frames = [
    go.Scatter(
        # Index i * 20 selects threshold indices 20, 40, 60 and 80.
        y=[evaluator.accuracy_curve_results[k][i * 20] for k in checkpoints],
        x=checkpoints,
        mode='lines',
        opacity=0.4,
        line={'smoothing': 0.8, 'shape': 'spline'},
        name="{} threshhold".format(i / 5)
    )
    for i in range(1, 5)
]

# Fonts are nested under title=dict(font=...) because the flat `titlefont`
# attribute is deprecated in plotly; the rendered text is unchanged.
axis_title_font = dict(family='sans serif', size=10, color='#727272')
layout = go.Layout(
    title=dict(
        text='Accuracy During Training at Different Thresholds', x=0.1,
        font=dict(family='sans serif', size=14, color='#727272')),
    xaxis=dict(
        zeroline=False,
        title=dict(text="Epochs", font=axis_title_font)),
    yaxis=dict(
        zeroline=False,
        title=dict(text="Test Accuracy", font=axis_title_font)),
    legend=dict(
        x=0.05, y=0.15,
        font=dict(family='sans-serif', size=10, color='#727272'),
        bgcolor='rgba(0,0,0,0)'),
)
fig = go.Figure(data=frames, layout=layout)
plotly.iplot(fig)

In [None]:
# Export the accuracy-vs-epochs figure. The module is imported as `plotly`
# (the previous `plotly_` was an undefined name), and image_filename is given
# without an extension because plotly appends the one selected by `image`.
plotly.plot(fig, image_filename='accuracy_epochs', image='svg')

# Training Permanent Model

In [9]:
%%time
model = SimpleRecurrentModel(**model_args)

# Augmentor keeps references to the lists it is given and
# augment_for_money_detection extends them in place. Passing shallow copies
# keeps the notebook-level input_data / label_data unchanged, so re-running
# this cell does not keep growing them.
a = Augmentor(list(input_data), list(label_data))
a.generate_aug_data(10)
# NOTE(review): this extends a.input_data / a.label_data, but training below
# uses a.aug_input_data / a.aug_label_data, which were filled before this call.
# The digit-randomised variants therefore never reach the model -- confirm
# whether the aug_* lists were the intended target.
augment_for_money_detection(a.input_data, a.label_data, 5)

CPU times: user 56.1 ms, sys: 0 ns, total: 56.1 ms
Wall time: 54.6 ms


In [10]:
%%time
# Train the final model on the span-substituted examples only
# (aug_input_data / aug_label_data); the original strings are not included.
model.train(a.aug_input_data, a.aug_label_data, batch_size=32, learning_rate=0.1, steps=19000)

CPU times: user 2min 44s, sys: 10min, total: 12min 45s
Wall time: 56.5 s


In [11]:
# Save under the same repository root the data was loaded from, instead of
# repeating the absolute path; with the default DATA_DIR the target is unchanged.
model.save(os.path.join(DATA_DIR, 'models/money_detection/money_detection_model_2019-06-23'))