In [254]:
import plotly.offline as plotly
import plotly.graph_objs as go

In [447]:
import autograd.numpy as np
from autograd import grad  
import string
import math
import os
import json

# Character set the model one-hot encodes over: digits, ASCII letters,
# punctuation and the space character (the first 95 entries of
# string.printable), plus the euro and pound signs.
alphabet = string.digits + string.ascii_letters + string.punctuation + ' ' + '€£'
num_chars = len(alphabet)

# Number of consecutive characters fed to the model at each position.
window_size = 6

In [193]:
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Inject CSS so the notebook's `.container` element spans the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [255]:
# Initialise plotly's offline notebook integration (`plotly` here is the
# plotly.offline module imported at the top of the notebook) so figures can be
# rendered inline.
plotly.init_notebook_mode(connected=True)

In [1]:
from simple_state_recurrent_model.src.model import load_saved_model

In [2]:
# Load a model through the project's load_saved_model helper
# (presumably the counterpart of SimpleRecurrentModel.save -- TODO confirm).
# NOTE(review): the path is absolute and machine-specific, so this cell will not
# run on another machine as written. `m` is also rebound further down by
# `m = SimpleRecurrentModel(...)`, so on a top-to-bottom run the model loaded
# here is replaced.
m = load_saved_model('/home/fwang/projects/project-borrowbot/simple_state_recurrent_model/simple_state_recurrent_model/models/money_detector/money_detection_model_2019-04-07')

# Model

In [443]:
class SimpleRecurrentModel(object):
    """Character-level logistic model with one recurrent feature.

    At every character position the model takes a window of ``window_size``
    characters, one-hot encodes each of them over ``alphabet``, appends the
    activation produced at the previous position and a constant bias input,
    and squashes the dot product with ``weights`` through a sigmoid.
    """

    def __init__(self, num_chars, window_size, alphabet, window_shift=0):
        """Create a model with randomly initialised weights.

        Args:
            num_chars: number of one-hot slots per character; used to size the
                weight vector (expected to equal ``len(alphabet)``).
            window_size: number of characters in each input window.
            alphabet: sequence of characters that get a one-hot slot.
            window_shift: how many characters before the current position the
                window starts (0 means the window starts at the position).
        """
        self.num_chars = num_chars
        self.window_size = window_size
        self.alphabet = alphabet
        self.window_shift = window_shift
        # One weight per (window slot, character) plus two extra inputs: the
        # previous activation and the bias. Drawn from a normal with mean 1.
        self.weights = np.random.normal(1, size=(num_chars * window_size + 2))

    def train(self, train_inputs, train_labels, batch_size, learning_rate, steps):
        """Fit ``self.weights`` by minibatch gradient descent.

        Args:
            train_inputs: 2-D array, one feature row per training example
                (window one-hots, previous label, bias).
            train_labels: 1-D array of 0/1 targets aligned with ``train_inputs``.
            batch_size: number of examples sampled (with replacement) per step.
            learning_rate: step size applied to the gradient.
            steps: number of gradient steps to take.
        """
        def loss(params, sample_inputs, sample_labels):
            output = SimpleRecurrentModel._raw_inference(params, sample_inputs)
            # Negative expected agreement between prediction and label
            # (linear in the output, not a log-likelihood).
            return -np.sum(output * sample_labels + (1 - output) * (1 - sample_labels))
        loss_grad = grad(loss)

        # Sample indices from the data passed in, not from a notebook-level
        # global, so the method works on any dataset and on a fresh kernel.
        example_indices = np.arange(len(train_labels))

        for _ in range(steps):
            sample = np.random.choice(example_indices, batch_size)
            sample_inputs = train_inputs[sample]
            sample_labels = train_labels[sample]
            self.weights = self.weights - learning_rate * loss_grad(self.weights, sample_inputs, sample_labels)

    def compute_inference(self, input_str):
        """Run the model over ``input_str`` one character position at a time.

        Each position's activation is fed back in as the recurrent input of
        the next position; the first position sees a recurrent input of 0.

        Returns:
            A list with one activation in [0, 1] per character of ``input_str``.
        """
        last_activation = 0
        results = []

        for j in range(len(input_str)):
            start_idx, end_idx = self._get_index_ranges(j)
            pre_padding, post_padding = self._get_padding(start_idx, end_idx, len(input_str))
            vector = self._string_vectorizer(input_str[max(0, start_idx):end_idx])

            last_activation = SimpleRecurrentModel._raw_inference(
                self.weights,
                # Feature layout: window one-hots (zero-padded at the string
                # edges), previous activation, constant bias input.
                np.concatenate((pre_padding, vector, post_padding, [last_activation, 1]))
            )
            results.append(last_activation)

        return results

    def _string_vectorizer(self, string):
        """Return the flattened one-hot encoding of ``string`` over the alphabet.

        Characters that are not in the alphabet encode as all zeros.
        """
        vector = [
            [0 if char != letter else 1 for char in self.alphabet]
            for letter in string
        ]
        return np.array(vector).flatten()

    def _get_index_ranges(self, inference_index):
        """Return the (start, end) character indices of the window for a position.

        ``start`` may be negative and ``end`` may run past the string; callers
        clamp the slice and pad the difference with zeros.
        """
        return inference_index - self.window_shift, inference_index + self.window_size - self.window_shift

    def _get_padding(self, start_idx, end_idx, len_input_str):
        """Return zero vectors covering window slots that fall outside the string."""
        pre_padding = np.zeros(-min(0, start_idx) * len(self.alphabet))
        post_padding = np.zeros(max(0, end_idx - len_input_str) * len(self.alphabet))
        return pre_padding, post_padding

    def save(self, file_dir):
        """Write the weights and hyper-parameters into ``file_dir``.

        Creates ``file_dir`` (including missing parent directories) if needed,
        then writes ``weights.npy`` and a JSON ``metadata`` file.
        """
        # makedirs handles nested targets and an already-existing directory.
        os.makedirs(file_dir, exist_ok=True)

        np.save(os.path.join(file_dir, 'weights.npy'), self.weights)

        with open(os.path.join(file_dir, 'metadata'), 'w') as f:
            json.dump({
                'num_chars': self.num_chars,
                'window_size': self.window_size,
                'alphabet': self.alphabet,
                'window_shift': self.window_shift
            }, f)

    @staticmethod
    def _raw_inference(params, inputs):
        """Return sigmoid(inputs @ params).

        ``inputs`` may be a single feature vector or a 2-D batch of them.
        """
        output = np.matmul(inputs, params)
        # tanh form of the logistic function: exp(x) / (1 + exp(x)) overflows
        # to inf / inf = nan for large x, whereas tanh saturates cleanly.
        return 0.5 * (np.tanh(output / 2.0) + 1)

In [249]:
# Build a fresh model with randomly initialised weights from the notebook-level
# alphabet and window settings.
# NOTE(review): this rebinds `m`, replacing the model assigned from
# load_saved_model earlier in the notebook when cells run top to bottom.
m = SimpleRecurrentModel(num_chars, window_size, alphabet)

# Data Assembly

In [397]:
# Hand-labelled training examples: raw request titles.
inputs = [
    '[REQ] ($1000 over 2 people?) - (#Gilbert, AZ, USA), (Monthly payments of 125), (PayPal or present post dated checks beginning July 2) - Repay $1250/loan',
    '[REQ] (£20) - (#Essex, UK), (£26 on 16th july 2017), (Paypal)',
    '[REQ] (1000.00) - (#Saco, Maine, USA), (1250.00 by 9/30/17), (Paypal)',
    '[REQ] (3,000) - (#Fort Dodge, Iowa, U.S.), (03/01/18), (PayPal)',
    '[REQ] (£20) (MCR uk) , (payback £21 next wednesday) (Paypal)',
    '[REQ] ($30) - (#New Orleans, Louisiana, United States), (Later tonight or tomorrow), (Bitcoin)',
    '[REQ] (40 cad to get home!) - (Ontario, Canada), (50 cad on July 7th), (Bank Transfer)',
    '[REQ] ($25) #kyle, Texas, USA (pay back 6/30/17) (square cash)',
    '[REQ] ($1850) (July 10th) (#Homestead, Florida, USA)',
    '[REQ] ($11.00 ) - (#Charlotte, Nc, Usa), (repaid by 6/29/17), (paypal)',
    '[REQ] (15USD ) - (Sydney, NSW, Australia), (June 8), (Paypal)',
    '[REQ] (£6,000 ) - (Brighton, UK), (June 2018), (PayPal)',
    '[REQ] (US $2,000 ) - (#Buffalo, New York, USA), (August 31, 2017), (Paypal)',
    '[REQ] ($155) - (#Lexington, KY US), ($170.50 on 06/30/17), (PayPal)',
    '[REQ] (£160) - (#Manchester, UK), (18/06/16), (Pre-Arranged - paying back £200)',
    '[REQ] ($200) - (Chattanooga, TN, US), (7/17/17), (1 payment of $100 and 2 payments of $50)',
    '[REQ] ($25), (#panama city, Fl, Usa), (repay $35 on or before Sunday June 11th), (PayPal)',
    '[REQ] (#Coventry,Uk) (Paypal or BT) (£100)/$125 payback £125/$150)(01/07/17)',
    '[REQ] ($4891) - (#Columbus, GA, USA), (NOV 15, 2019), (Paypal)',
    '[REQ] ($550) - (#Macon, GA, USA), ($600 total June 26th $300 and July 9th $300), (Paypal)',
    '[REQ] (120.00 ) - (#Tucson, Arizona), (6/16), ($135.90 Pre-Arranged)',
    '[REQ] ($1260) - (#Indianapolis, IN, USA), (8/21/17), (Paypal)',
    '[REQ]-($700.00) (Queen Creek, Arizona, USA) (payback 1000.00 by August 11th 2017) (Verified PayPal)'
]
# labels[i] lists the [start, end) character index pairs (end exclusive) of
# each labelled amount in inputs[i]; the currency symbol is not part of a span.
labels =[
    [[8, 12], [73, 76], [143, 147]],
    [[8, 10], [30, 32]],
    [[7, 14], [40, 47]],
    [[7, 12]],
    [[8, 10], [33, 35]],
    [[8, 10]],
    [[7, 9], [50, 52]],
    [[8, 10]],
    [[8, 12]],
    [[8, 13]], 
    [[7, 9]],
    [[8, 13]],
    [[11, 16]],
    [[8, 11], [38, 44]],
    [[8, 11], [75, 78]],
    [[8, 11], [64, 67], [87, 89]],
    [[8, 10], [46, 48]],
    [[38, 41], [44, 47], [57, 60], [62, 65]],
    [[8, 12]],
    [[8, 11], [36, 39], [57, 60], [75, 78]],
    [[7, 13], [48, 54]],
    [[8, 12]],
    [[8, 14], [53, 60]]
]

In [289]:
def assemble_data(model, inputs, labels, output_markers=False):
    """Turn labelled strings into per-character training rows.

    For every character position of every input string, one feature row is
    built from the model's window encoding (zero-padded at the string edges),
    followed by the label of the previous position and a constant 1. The
    target for a position is 1 when it lies inside any ``[start, end)``
    interval of the matching ``labels`` entry, otherwise 0.

    Args:
        model: object providing ``_get_index_ranges``, ``_get_padding`` and
            ``_string_vectorizer``.
        inputs: list of strings.
        labels: list (same length as ``inputs``) of lists of ``[start, end)``
            index pairs.
        output_markers: when true, print each window and its label.

    Returns:
        Tuple ``(feature_matrix, label_vector)`` as arrays.
    """
    assert len(inputs) == len(labels)

    feature_rows = []
    targets = []

    for sample_idx, text in enumerate(inputs):
        spans = labels[sample_idx]
        # The first position of every string has no predecessor label.
        previous_label = False

        for position in range(len(text)):
            window_start, window_end = model._get_index_ranges(position)
            left_pad, right_pad = model._get_padding(window_start, window_end, len(text))
            window_text = text[max(0, window_start):window_end]
            encoded = model._string_vectorizer(window_text)

            row = np.concatenate((left_pad, encoded, right_pad, [previous_label, 1]))
            feature_rows.append(row)

            # This position's label becomes the recurrent input of the next one.
            previous_label = any([span[0] <= position < span[1] for span in spans])
            targets.append(int(previous_label))

            if output_markers:
                print(window_text)
                print(int(previous_label))

    return np.array(feature_rows), np.array(targets)

In [398]:
# Sanity check: print the substring each label interval selects, so the
# labelled offsets can be eyeballed against the raw titles.
for sample_idx, text in enumerate(inputs):
    for interval in labels[sample_idx]:
        print(text[interval[0]:interval[1]])

1000
125
1250
20
26
1000.00
1250.00
3,000
20
21
30
40
50
25
1850
11.00
15
6,000
2,000
155
170.50
160
200
200
100
50
25
35
100
125
125
150
4891
550
600
300
300
120.00
135.90
1260
700.00
1000.00


In [399]:
# Expand the labelled titles into one feature row and one 0/1 target per
# character position, using the model's windowing and one-hot helpers.
formatted_inputs, formatted_labels = assemble_data(m, inputs, labels)

# Train Model

In [427]:
# Minibatch gradient descent over the assembled per-character dataset.
m.train(
    formatted_inputs,
    formatted_labels,
    batch_size=20,
    learning_rate=0.1,
    steps=300000,
)

In [8]:
# Spot test: print every character of a sample sentence whose activation
# exceeds 0.1.
s = 'Hey man, do you have the 2000 you owe me from June 4th? I think thats all thats left from 2017'
output = m.compute_inference(s)

for char, activation in zip(s, output):
    if activation > 0.1:
        print(char)

2
0
0
0
