In [263]:
import os
import yaml
import string
import numpy as np
import multiprocessing as mp

In [264]:
import plotly.graph_objs as go
import plotly.offline as plotly
plotly.init_notebook_mode(connected=True)

In [3]:
from simple_state_recurrent_model.model import load_saved_model, SimpleRecurrentModel


Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.

Using TensorFlow backend.


In [210]:
# Root of the data/model repository. Set the BORROWBOT_REPO_PATH environment
# variable to point somewhere else; the original location is the default.
REPO_PATH = os.environ.get(
    'BORROWBOT_REPO_PATH',
    '/home/fwang/projects/project-borrowbot/data_model_repo',
)
# Directory whose files get_data() reads.
DATA_PATH = os.path.join(REPO_PATH, 'datasets/money_detection')
# Location the model is saved to and loaded from.
MODEL_PATH = os.path.join(REPO_PATH, 'models/money_detection/money_detection_model_2019-05-02')

In [211]:
# Window length. The weight-inspection cells treat model.weights as
# window_size consecutive blocks of len(alphabet) entries.
window_size = 6

# Characters known to the model, written as adjacent literals that Python
# joins into one string.
alphabet = (
    '0123456789'                             # digits
    'abcdefghijklmnopqrstuvwxyz'             # lower-case letters
    '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'     # ASCII punctuation
    ' €£'                                    # space, euro and pound signs
)

# Get Working Model

## Pulling Data

In [6]:
def get_data(data_path=DATA_PATH):
    """Load every file in ``data_path`` as YAML and collect inputs and labels.

    Each file is expected to parse to a sequence of mappings that each carry
    an ``'input'`` and a ``'labels'`` entry.

    Args:
        data_path: Directory whose files are all read as YAML.

    Returns:
        ``(inputs, labels)``: two parallel lists, ordered by file name and then
        by position within each file.
    """
    inputs = []
    labels = []

    # os.listdir order is arbitrary; sorting keeps sample indices (and so every
    # index-based cross-validation result) stable between runs and machines.
    for file_name in sorted(os.listdir(data_path)):
        with open(os.path.join(data_path, file_name), 'r') as f:
            # yaml.load without an explicit Loader is deprecated and can build
            # arbitrary Python objects; safe_load only builds plain data.
            # An empty file parses to None, which is treated as "no records".
            records = yaml.safe_load(f) or []

        for record in records:
            inputs.append(record['input'])
            labels.append(record['labels'])

    return inputs, labels

In [7]:
# Read the dataset from the default DATA_PATH; later cells index into these two parallel lists.
inputs, labels = get_data()

## Training Model

In [212]:
# Train a model on the whole dataset and save it to MODEL_PATH.
# window_size replaces the literal 6 so the model stays consistent with the
# weight-inspection cells, which index model.weights using window_size.
# The preprocess hook lower-cases text; `alphabet` has no upper-case letters.
model = SimpleRecurrentModel(window_size, alphabet, window_shift=1, preprocess=lambda x: x.lower())
# NOTE(review): the positional arguments 32, 0.1, 400000 are passed unchanged;
# their meaning is defined by SimpleRecurrentModel.train — confirm there.
model.train(inputs, labels, 32, 0.1, 400000)
model.save(MODEL_PATH)

## Load a Model

In [6]:
# Load the model stored at MODEL_PATH (an alternative to running the training cell).
model = load_saved_model(MODEL_PATH)

# Performance Metrics

In [8]:
def loo_x_validate(loo_cand):
    """Leave-one-out step: train without sample ``loo_cand``, then score it.

    Reads the notebook-level ``inputs``, ``labels`` and ``alphabet``.

    Args:
        loo_cand: Index into ``inputs``/``labels`` of the held-out sample.

    Returns:
        Whatever ``compute_inference`` returns for the held-out input.
    """
    held_out_model = SimpleRecurrentModel(6, alphabet, window_shift=1)

    # Every sample except the held-out one.
    train_inputs = [text for idx, text in enumerate(inputs) if idx != loo_cand]
    train_labels = [spans for idx, spans in enumerate(labels) if idx != loo_cand]
    held_out_model.train(train_inputs, train_labels, 32, 0.2, 500000)

    return held_out_model.compute_inference(inputs[loo_cand])

In [10]:
# Run one leave-one-out fit per sample index across 6 worker processes.
loo_candidates = range(len(inputs))
# The context manager shuts the pool down even if a worker raises. The bare
# close() call was skipped on error and never waited for the workers.
with mp.Pool(6) as p:
    results = p.map(loo_x_validate, loo_candidates)

In [11]:
# Exact-match accuracy of the leave-one-out predictions at 151 evenly spaced
# thresholds in [0, 1]. A sample is correct only when the set of positions
# scoring strictly above the threshold equals the set of labelled positions.

# Labelled positions per sample: every index covered by a [start, end) span.
# They do not depend on the threshold, so build them once rather than once
# per threshold.
target_sets = [
    {pos for pos in range(len(text)) if any(span[0] <= pos < span[1] for span in spans)}
    for text, spans in zip(inputs, labels)
]
score_arrays = [np.array(scores) for scores in results]

accuracy_curve = []
for step in range(151):
    thresh = step / 150.0
    d = [
        set(np.where(scores > thresh)[0]) == target_sets[r]
        for r, scores in enumerate(score_arrays)
    ]
    accuracy_curve.append(sum(d) / len(d))

In [12]:
# Line plot of accuracy_curve; the x values are the point indices rescaled to [0, 1].
n_points = len(accuracy_curve)
accuracy_trace = go.Scatter(
    x=np.arange(n_points) / (n_points - 1),
    y=accuracy_curve,
    mode='lines',
)
plotly.iplot([accuracy_trace])

# Output Spot Checks

In [213]:
# Spot check: keep the characters of s1 whose score at the same index exceeds 0.5.
s1 = "Hey Angela, I just wanted to check the status of the $635 invoice we sent yesterday."
s1_output = model.compute_inference(s1.lower())
''.join(ch for pos, ch in enumerate(s1) if s1_output[pos] > 0.5)

'635'

In [214]:
# Spot check: keep the characters of s2 whose score at the same index exceeds 0.5.
s2 = "The first 200 was sent yesterday. I think the remaining $435 will be paid on 06/17/2019."
s2_output = model.compute_inference(s2.lower())
''.join(ch for pos, ch in enumerate(s2) if s2_output[pos] > 0.5)

'200435'

# Interpreting Model Weights

In [15]:
# Select the entry at offset mod_idx inside each of the window_size consecutive
# len(alphabet)-sized blocks of model.weights.
# NOTE(review): presumably one block per window position — confirm against the model.
c = '0'
# index() raises ValueError when the character is not in the alphabet. find()
# would return -1 and silently pick the weights of a different character.
mod_idx = alphabet.index(c)
model.weights[[len(alphabet) * i + mod_idx for i in range(window_size)]]

array([-6.67435867, 10.09942135,  6.04784578,  5.98293817,  2.2630126 ,
        0.1991997 ])

In [167]:
import colorlover as cl

In [169]:
# Red -> white -> green colour scale as [position, colour] pairs, with the
# positions evenly spaced on [0, 1].
reds = cl.scales['3']['seq']['Reds']
# Reverse a copy. list.reverse() flipped colorlover's own palette list in
# place, so re-running this cell produced the greens in the opposite order.
greens = cl.scales['3']['seq']['Greens'][::-1]
color_scale = [
    [position, color]
    for position, color in zip(
        [i / 6.0 for i in range(7)],
        reds + ['rgb(255, 255, 255)'] + greens,
    )
]

In [16]:
# Heatmap of the leading weights reshaped to (window_size, len(alphabet)),
# showing only the first 10 columns (the digit characters of `alphabet`).
# The slice length is derived rather than written as the literal 426 (6 * 71):
# the reshape only succeeds when it equals window_size * len(alphabet).
n_char_weights = window_size * len(alphabet)
plotly.iplot([go.Heatmap(
    z=model.weights[:n_char_weights].reshape((window_size, len(alphabet)))[:, :10],
    colorscale='Portland'
)])

In [23]:
# For each dataset input, print the text and then the characters whose score
# at the same index exceeds 0.5.
for inp in inputs:
    x = np.array(model.compute_inference(inp.lower())) > 0.5
    print(inp)
    # x is already a boolean mask, so it is used directly instead of being
    # compared with 0.5 a second time.
    print(''.join([inp[i] for i in range(len(inp)) if x[i]]))

[paid] (/u/riveted) - (100 euros), (on time)
100
[req] (1000€) (#illle-sur-tet, languedoc-rossillon, france) (1196€ repayment) (paypal)
10001196
[req] (80) - (largo, fl, usa), (repay $100 on 10/12/2018), (paypal)
80100
[paid] (u/verydisappointing) ($180 gbp + int.) (early)
180
[req] (100$) (#copenhagen, hovedstaden, denmark) (150$) (1/1/19) (paypal)
100150
[paid] (/u/swellymm) - (25) (on time)
25
[req] (200$) - (#va beach, va, usa), (november 9th, 2018), (paypal)
200
[req] (400) - (portland, or, us), (repay 475 11/30), (paypal, google)
400475
[req] (euro900 ) - (#kilrickle loughrea co galway ireland), (nov 9 2018), (paypal )
900
[req] (2500 usd ) - (#millville, nj, usa), (12/21/2018), (paypal)
2500
(/u/throwapair) - ($400)(66 days late)
400
[req] (£150) - (#belfast, ni, uk), (£275 on 02/11/18), (paypal) 
150275
[req] 500 - (#kennesaw, ga, united states), (11/9/2018), paypal/venmo
500
[req] ($750.00) - (#houston, tx, usa), (repay $1,000 as 10/24 $200, 11/24 $200, 12/24 $200, 1/24 $200, 

# Evaluation

In [8]:
from simple_state_recurrent_model.evaluation import Evaluator

In [9]:
# Build the Evaluator from a dict of model settings plus the full dataset.
evaluator = Evaluator(
    dict(
        window_size=6,
        alphabet=alphabet,
        window_shift=1,
        preprocess=lambda x: x.lower(),
    ),
    inputs,
    labels,
)

In [10]:
%%time
# Leave-one-out cross-validation with threads=1. The return value is discarded;
# later cells read evaluator.loo_cross_validation_results. The recorded run
# below took about 17 hours of wall time.
_ = evaluator.loo_cross_validation(steps=200, epochs_per_step=2000, threads=1)

CPU times: user 31.4 s, sys: 8.67 s, total: 40 s
Wall time: 17h 3min 46s


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


## Timing Comparison

In [281]:
%time
def compute_accuracy_curve(self, resolution):
    """Exact-match accuracy per threshold for every set of cross-validation results.

    For each key of ``self.loo_cross_validation_results`` and each threshold
    ``i / resolution`` (``i = 0..resolution``), a sample counts as correct when
    the set of positions scoring strictly above the threshold equals the set
    of positions covered by that sample's ``[start, end)`` target spans.

    Args:
        self: Object providing ``loo_cross_validation_results`` (mapping of
            key to a list of per-sample score sequences), ``input_data`` and
            ``target_data``. The result is also stored on it.
        resolution: Number of threshold intervals; ``resolution + 1``
            thresholds are evaluated.

    Returns:
        Dict mapping each key to its list of ``resolution + 1`` accuracies.
        The same dict is assigned to ``self.accuracy_curve_results``.

    Raises:
        Exception: If ``self.loo_cross_validation_results`` is ``None``.
    """
    if self.loo_cross_validation_results is None:
        raise Exception("loo_cross_validation results not yet computed")

    thresholds = [step / float(resolution) for step in range(resolution + 1)]

    # Target positions depend only on the sample, not on the key or the
    # threshold, so build them once instead of inside the innermost loop.
    target_sets = [
        {pos for pos in range(len(text)) if any(span[0] <= pos < span[1] for span in spans)}
        for text, spans in zip(self.input_data, self.target_data)
    ]

    accuracy_curve = {}
    for k, per_sample_scores in self.loo_cross_validation_results.items():
        # Convert each score sequence to an array once per key.
        score_arrays = [np.array(scores) for scores in per_sample_scores]
        curve = []
        for thresh in thresholds:
            d = [
                set(np.where(scores > thresh)[0]) == target_sets[r]
                for r, scores in enumerate(score_arrays)
            ]
            curve.append(sum(d) / len(d))
        accuracy_curve[k] = curve

    self.accuracy_curve_results = accuracy_curve
    return self.accuracy_curve_results

# Run the single-process version with resolution=10 (11 thresholds); the result is stored on evaluator.
_ = compute_accuracy_curve(evaluator, 10)

In [None]:
%time
from simple_state_recurrent_model.model import SimpleRecurrentModel

def compute_accuracy_curve(self, resolution, threads=4):
    if self.loo_cross_validation_results is None:
        raise Exception("loo_cross_validation results not yet computed")

    def compute_accuracy_curve_one_epcoh(k):
        accuracy_curve = []
        for i in range(resolution + 1):
            thresh = i / float(resolution)
            d = []
            for r in range(len(self.loo_cross_validation_results[k])):
                model_pos = np.where(np.array(self.loo_cross_validation_results[k][r]) > thresh)[0]
                target_pos = [
                    i for i in range(len(self.input_data[r]))
                    if any([i >= j[0] and i < j[1] for j in self.target_data[r]])
                ]
                d.append(set(model_pos) == set(target_pos))
            accuracy_curve.append(sum(d) / len(d))
        return k, accuracy_curve

    keys = self.loo_cross_validation_results.keys()
    p = mp.Pool(threads)
    results = p.map(compute_accuracy_curve_one_epcoh, keys)
    p.close()

    accuracy_curves = {k: [] for k in keys}
    for k, acc_curve in results:
        accuracy_curves[k] = acc_curve
    self.accuracy_curve_results = accuracy_curves
    return accuracy_curves
    
# Run the pooled version with resolution=10 and the default threads=4; the result is stored on evaluator.
_ = compute_accuracy_curve(evaluator, 10)

## Plots 

In [90]:
# Mean, standard deviation and maximum accuracy at each threshold, taken
# across all keys of evaluator.accuracy_curve_results.
# The curves are stacked once (rows = keys, columns = thresholds). This
# replaces three copies of the same per-column rebuild and no longer needs the
# key 2000 to exist just to read the curve length.
accuracy_matrix = np.array(list(evaluator.accuracy_curve_results.values()))
mean_data = accuracy_matrix.mean(axis=0).tolist()
stddev_data = accuracy_matrix.std(axis=0).tolist()
max_data = accuracy_matrix.max(axis=0).tolist()

In [269]:
import plotly.offline as plotly_
import plotly.graph_objs as go_

In [277]:
# Accuracy-versus-threshold figure: a scaled spread band plus the mean and
# maximum accuracy across the keys of evaluator.accuracy_curve_results.
n_thresholds = len(evaluator.accuracy_curve_results[2000])

spread_trace = go_.Scatter(
    # The standard deviation is cubed and scaled by 800 before plotting.
    y=800 * np.array(stddev_data) ** 3,
    x=list(range(n_thresholds)),
    mode='lines',
    name='spread',
    line=dict(smoothing=1000, shape='spline', color='rgba(26,150,65,0.1)'),
    fillcolor='rgba(26,150,65,0.05)',
    fill='tonexty',
)
mean_trace = go_.Scatter(
    y=mean_data,
    x=list(range(n_thresholds)),
    mode='lines',
    line=dict(smoothing=500, shape='spline', color='rgb(35,159,255)'),
    name='mean',
)
max_trace = go_.Scatter(
    y=max_data,
    x=list(range(n_thresholds)),
    mode='lines',
    line=dict(smoothing=500, shape='spline', color='rgba(255,89,12, 0.3)'),
    name='max',
)
frames = [spread_trace, mean_trace, max_trace]

layout = go_.Layout(
    title={'text': 'Accuracy During Training at Different Thresholds', 'x': 0.1},
    xaxis={
        'range': [-1, 101],
        'zeroline': False,
        'title': "Threshold",
        'titlefont': {'family': 'sans serif', 'size': 10, 'color': '#727272'},
    },
    yaxis={
        'zeroline': False,
        'title': "Accuracy",
        'titlefont': {'family': 'sans serif', 'size': 10, 'color': '#727272'},
    },
    legend={
        'x': 0.05,
        'y': 0.15,
        'font': {'family': 'sans-serif', 'size': 10, 'color': '#727272'},
        'bgcolor': 'rgba(0,0,0,0)',
    },
    titlefont={'family': 'sans serif', 'size': 14, 'color': '#727272'},
)
fig = go_.Figure(data=frames, layout=layout)
plotly_.iplot(fig)

In [278]:
# Write the figure to plotly's default HTML file (temp-plot.html, per the
# output below) and request an SVG image named 'threshold_selection'.
plotly_.plot(fig, image_filename='threshold_selection', image='svg')

'file:///home/fwang/projects/project-borrowbot/research/2019Q2/temp-plot.html'

In [273]:
# Accuracy per key of evaluator.accuracy_curve_results, one line for each of
# the thresholds 0.2, 0.4, 0.6 and 0.8.
# The y values hold one point per key, so x must be the keys themselves. The
# previous x was built from the number of thresholds in one curve, which is a
# different length and left the two axes misaligned.
# NOTE(review): assumes the keys are the cumulative epoch counts (2000, 4000,
# ...) that the "Epochs" axis title implies — confirm against Evaluator.
epoch_keys = list(evaluator.accuracy_curve_results.keys())
frames = [
    go_.Scatter(
        # Index i * 20 picks threshold i / 5 only for curves computed at resolution 100.
        y=[evaluator.accuracy_curve_results[k][i * 20] for k in epoch_keys],
        x=epoch_keys,
        mode='lines',
        opacity=0.4,
        line={'smoothing': 1000, 'shape': 'spline'},
        name="{} threshhold".format(i / 5)
    )
    for i in range(1, 5)
]
layout = go_.Layout(
    title=dict(text='Accuracy During Training at Different Thresholds', x=0.1),
    xaxis=dict(
        zeroline=False, title="Epochs",
        titlefont=dict(family='sans serif', size=10, color='#727272')),
    yaxis=dict(
        zeroline=False, title="Test Accuracy",
        titlefont=dict(family='sans serif', size=10, color='#727272')),
    legend=dict(
        x=0.05, y=0.15,
        font=dict(family='sans-serif', size=10, color='#727272'),
        bgcolor='rgba(0,0,0,0)'),
    titlefont=dict(family='sans serif', size=14, color='#727272')
)
fig = go_.Figure(data=frames, layout=layout)
plotly_.iplot(fig)

In [272]:
# Write the figure to plotly's default HTML file and request an SVG image.
# image_filename is given without an extension: plotly adds it from `image`,
# so 'accuracy_epochs.svg' would have produced a doubled '.svg.svg' suffix.
plotly_.plot(fig, image_filename='accuracy_epochs', image='svg')

'file:///home/fwang/projects/project-borrowbot/research/2019Q2/temp-plot.html'