In [9]:
import autograd.numpy as np
from autograd import grad  
import string
import math

# Symbols the model can one-hot encode: digits, ASCII letters, punctuation and
# the space character (the first 95 printable characters), followed by the
# euro and pound signs.
alphabet = string.digits + string.ascii_letters + string.punctuation + ' ' + '€£'
# Width of a single one-hot slot.
num_chars = len(alphabet)
# Number of characters in each sliding window.
window_size = 6

In [193]:
# Inject CSS that sets the `.container` element to 100% width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Model

In [205]:
class SimpleRecurrentModel(object):
    """Character-level logistic model with a one-step feedback input.

    For every character position the model takes a window of ``window_size``
    characters that starts one character before the position, one-hot encodes
    it against ``alphabet`` (zero-padded where the window runs off either end
    of the string), appends the previous position's activation and a constant
    bias input of 1, and passes the dot product with ``weights`` through a
    sigmoid.
    """

    def __init__(self, num_chars, window_size, alphabet):
        """Store the configuration and draw random initial weights.

        Args:
            num_chars: Width of one one-hot slot. The padding and vectorising
                helpers use ``len(alphabet)`` for the same quantity, so the
                two are expected to be equal.
            window_size: Number of characters in each window.
            alphabet: String of symbols that can be encoded; a character that
                is not in it encodes as an all-zero slot.
        """
        self.num_chars = num_chars
        self.window_size = window_size
        self.alphabet = alphabet
        # One weight per (window slot, symbol) pair, plus one for the fed-back
        # activation and one for the bias input.
        # NOTE(review): the positional 1 is the mean (``loc``), so the weights
        # are centred on 1 rather than 0 -- confirm this is intended.
        self.weights = np.random.normal(1, size=(num_chars * window_size + 2))

    def train(self, train_inputs, train_labels, batch_size, learning_rate, steps):
        """Update ``weights`` in place by minibatch gradient descent.

        Uses ``grad`` (imported from autograd at the top of the file) to
        differentiate the loss with respect to the weights.

        Args:
            train_inputs: Array with one feature row per sample along axis 0.
            train_labels: Array of per-sample targets aligned with
                ``train_inputs`` (expected to be 0/1 values).
            batch_size: Number of rows drawn, with replacement, per step.
            learning_rate: Step size applied to the gradient.
            steps: Number of gradient steps to take.
        """
        def loss(params, sample_inputs, sample_labels):
            # Negative sum of the probability assigned to the correct class,
            # so minimising it rewards agreement with the targets.
            output = SimpleRecurrentModel._raw_inference(params, sample_inputs)
            return -np.sum(output * sample_labels + (1 - output) * (1 - sample_labels))
        loss_grad = grad(loss)

        # Size the sampling range from the labels passed in, not from a
        # notebook-level global, so training works on any data set.
        num_samples = train_labels.shape[0]

        for _ in range(steps):
            sample = np.random.choice(np.arange(num_samples), batch_size)
            sample_inputs = train_inputs[sample]
            sample_labels = train_labels[sample]
            self.weights = self.weights - learning_rate * loss_grad(self.weights, sample_inputs, sample_labels)

    def compute_inference(self, input_str):
        """Score every character position of ``input_str``.

        Positions are processed left to right; each activation is fed back as
        an input feature for the next position (the first position sees 0).

        Args:
            input_str: Text to score.

        Returns:
            A list with one activation per character of ``input_str``.
        """
        last_activation = 0
        results = []

        for j in range(len(input_str)):
            start_idx, end_idx = self._get_index_ranges(j)
            pre_padding, post_padding = self._get_padding(start_idx, end_idx, len(input_str))
            # start_idx is -1 for the first position; clamp it for slicing,
            # the missing slot is covered by pre_padding.
            vector = self._string_vectorizer(input_str[max(0, start_idx):end_idx])

            last_activation = SimpleRecurrentModel._raw_inference(
                self.weights,
                np.concatenate((pre_padding, vector, post_padding, [last_activation, 1]))
            )
            results.append(last_activation)

        return results

    def _string_vectorizer(self, string):
        """Return the flattened one-hot encoding of ``string``.

        Each character becomes ``len(self.alphabet)`` entries with a 1 at the
        character's position in the alphabet (all zeros if it is absent).
        """
        vector = [
            [0 if char != letter else 1 for char in self.alphabet]
            for letter in string
        ]
        return np.array(vector).flatten()

    def _get_index_ranges(self, inference_index):
        """Return the ``(start, end)`` slice bounds of the window for a position.

        The window begins one character before ``inference_index`` and spans
        ``window_size`` characters; ``start`` may be -1 and ``end`` may exceed
        the string length.
        """
        return inference_index - 1, inference_index + self.window_size - 1

    def _get_padding(self, start_idx, end_idx, len_input_str):
        """Return zero vectors covering window slots outside the string.

        Returns:
            ``(pre_padding, post_padding)``: one alphabet-sized run of zeros
            for every slot before index 0 and after the last character.
        """
        pre_padding = np.zeros(-min(0, start_idx) * len(self.alphabet))
        post_padding = np.zeros(max(0, end_idx - len_input_str) * len(self.alphabet))
        return pre_padding, post_padding

    @staticmethod
    def _raw_inference(params, inputs):
        """Return the sigmoid of ``inputs @ params``.

        ``inputs`` is a single feature vector (as in ``compute_inference``) or
        a batch with one row per sample (as in ``train``).
        """
        output = np.matmul(inputs, params)
        # Same function as exp(x) / (1 + exp(x)), written via tanh so that a
        # large |x| saturates to 0 or 1 instead of overflowing to inf / inf.
        return 0.5 * (np.tanh(0.5 * output) + 1.0)

In [206]:
# Build a model with randomly drawn weights over the notebook's alphabet.
m = SimpleRecurrentModel(
    num_chars=num_chars,
    window_size=window_size,
    alphabet=alphabet,
)

# Data Assembly

In [93]:
# Each entry pairs a post title with the half-open [start, end) character
# ranges that are labelled positive in it. The pairs are then split into the
# two parallel lists `inputs` and `labels`.
inputs, labels = (
    list(column)
    for column in zip(*[
        (
            '[REQ] ($1000 over 2 people?) - (#Gilbert, AZ, USA), (Monthly payments of 125), (PayPal or present post dated checks beginning July 2) - Repay $1250/loan',
            [[8, 12], [73, 76], [143, 147]],
        ),
        (
            '[REQ] (£20) - (#Essex, UK), (£26 on 16th july 2017), (Paypal)',
            [[8, 10], [30, 32]],
        ),
        (
            '[REQ] (1000.00) - (#Saco, Maine, USA), (1250.00 by 9/30/17), (Paypal)',
            [[7, 14], [40, 47]],
        ),
        (
            '[REQ] (3,000) - (#Fort Dodge, Iowa, U.S.), (03/01/18), (PayPal)',
            [[7, 12]],
        ),
        (
            '[REQ] (£20) (MCR uk) , (payback £21 next wednesday) (Paypal)',
            [[8, 10], [33, 35]],
        ),
        (
            '[REQ] ($30) - (#New Orleans, Louisiana, United States), (Later tonight or tomorrow), (Bitcoin)',
            [[8, 10]],
        ),
        (
            '[REQ] (40 cad to get home!) - (Ontario, Canada), (50 cad on July 7th), (Bank Transfer)',
            [[7, 9], [50, 52]],
        ),
        (
            '[REQ] ($25) #kyle, Texas, USA (pay back 6/30/17) (square cash)',
            [[8, 10]],
        ),
    ])
)

In [194]:
def assemble_data(model, inputs, labels, output_markers=False):
    """Build per-character training rows and targets from labelled strings.

    For every character position of every input string, one feature row is
    produced from the model's window helpers: left padding, the one-hot
    window, right padding, the previous position's target (False at the start
    of each string) and a constant 1. The target for a position is 1 when the
    position lies inside any half-open ``[start, end)`` interval listed for
    that string, else 0.

    Args:
        model: Object providing ``_get_index_ranges``, ``_get_padding`` and
            ``_string_vectorizer``.
        inputs: Sequence of strings.
        labels: Sequence, parallel to ``inputs``, of lists of
            ``[start, end]`` index pairs.
        output_markers: When true, print each window's text and its target.

    Returns:
        ``(rows, targets)`` as two arrays with one entry per character.
    """
    assert len(inputs) == len(labels)

    feature_rows, targets = [], []

    for sample_no, text in enumerate(inputs):
        text_len = len(text)
        previous_target = False

        for pos in range(text_len):
            lo, hi = model._get_index_ranges(pos)
            left_pad, right_pad = model._get_padding(lo, hi, text_len)
            window_text = text[max(0, lo):hi]
            encoded = model._string_vectorizer(window_text)

            # The row carries the target of the previous position, so it must
            # be assembled before the target for this position is computed.
            feedback_and_bias = [previous_target, 1]
            feature_rows.append(
                np.concatenate((left_pad, encoded, right_pad, feedback_and_bias))
            )

            hits = [span[0] <= pos < span[1] for span in labels[sample_no]]
            previous_target = any(hits)
            targets.append(int(previous_target))

            if output_markers:
                print(window_text)
                print(int(previous_target))

    return np.array(feature_rows), np.array(targets)

In [161]:
# Sanity check: print the substring selected by every labelled span.
for idx, text in enumerate(inputs):
    for span in labels[idx]:
        begin, end = span[0], span[1]
        print(text[begin:end])

1000
125
1250
20
26
1000.00
1250.00
3,000
20
21
30
40
50
25


In [195]:
# Turn the labelled titles into per-character feature rows and targets.
formatted_inputs, formatted_labels = assemble_data(
    model=m,
    inputs=inputs,
    labels=labels,
)

# Train Model

In [210]:
# Fit the model on the assembled per-character data.
m.train(
    train_inputs=formatted_inputs,
    train_labels=formatted_labels,
    batch_size=20,
    learning_rate=0.01,
    steps=5000,
)

In [211]:
# Learned weight of the '/' symbol in each of the six window slots.
slot_width = len(m.alphabet)
slash_offset = m.alphabet.find('/')
m.weights[[slot * slot_width + slash_offset for slot in range(6)]]

array([ 0.21565328, -0.67261704,  1.53750929,  0.17805479,  0.79739138,
        0.14884647])

In [213]:
# Titles available for probing the trained model. Only the one selected below
# is scored (previously the first title was assigned to `s` and immediately
# overwritten by the second).
sample_titles = [
    '[REQ] ($125) - (Toms River, NJ, USA), (06/09/2017 $160), Paypal',
    '[REQ] (3,000) - (#Fort Dodge, Iowa, U.S.), (03/01/18), (PayPal)',
]
s = sample_titles[1]
output = m.compute_inference(s)

# Print every character whose activation exceeds the threshold.
activation_threshold = 0.1
for char, score in zip(s, output):
    if score > activation_threshold:
        print(char)

3
,
0
0
0
0
