In [20]:
import tensorflow as tf
#import cv2
import numpy as np
import pandas as pd
import os
from sklearn import preprocessing, cross_validation
import numpy as np



In [35]:
# Location of the features CSV that raw_data_fn reads (relative path).
data_path = '../../data/card-detection/features/features.csv'
# Printed so the cell output records which TensorFlow build was used.
print("TensorFlow version: {}".format(tf.VERSION))

# Column names of the features CSV; the same four names form the subset that
# raw_data_fn uses when dropping rows with missing values.
COLUMN_NAMES = ['data','hasFace','image_contour', 'class']

# Class names: the label encoder in raw_data_fn is fitted on this list and
# predict_fn maps the model's highest-scoring index back to one of these names.
CLASSES = ['driving license', 'financial card', 'text']

# Width of the softmax output layer built in model_fn.
NUM_CLASSES = len(CLASSES)
# Used both as the Tokenizer's num_words and as the model's input width.
MAX_VOCABULARY_SIZE = 1000
# Mini-batch size hyperparameter.
BATCH_SIZE = 10
# Number of training epochs passed to model.fit.
EPOCHS = 100
# Optimizer learning-rate hyperparameter.
LEARNING_RATE = 0.001


def raw_data_fn(y_name='class'):
    """Load the features CSV and turn it into model-ready arrays.

    Rows with a missing value in any required column are dropped. The
    ``data`` column is cast to ``str`` and vectorised with a Keras
    ``Tokenizer`` (``mode='tfidf'``, vocabulary capped at
    ``MAX_VOCABULARY_SIZE``). The label column is encoded with a
    ``LabelBinarizer`` fitted on ``CLASSES``.

    Args:
        y_name: Name of the CSV column holding the class label. It is used
            both when dropping incomplete rows and when reading the labels.

    Returns:
        A ``(features, labels, tokenizer)`` tuple: the TF-IDF matrix, the
        binarised labels, and the fitted tokenizer (needed later to
        vectorise unseen text the same way).
    """
    dataset = pd.read_csv(data_path, encoding='utf-8')

    # Drop every row that has a NaN in a column the pipeline relies on.
    dataset = dataset.dropna(subset=['data', 'hasFace', 'image_contour', y_name])

    # Free-text field the classifier is trained on; force str so that
    # purely numeric cells are still tokenised as text.
    description = dataset['data'].astype(str)

    # Labels come from the caller-selected column rather than a fixed name.
    classes = dataset[y_name]

    # Fit the tokenizer, keeping only the most frequent words.
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=MAX_VOCABULARY_SIZE)
    tokenizer.fit_on_texts(description)

    # Fit the label encoder on the known class names, not on the data, so
    # the encoding does not depend on which classes appear in the CSV.
    encoder = preprocessing.LabelBinarizer()
    encoder.fit(CLASSES)

    labels = encoder.transform(classes)
    features = tokenizer.texts_to_matrix(description, mode='tfidf')

    return (features, labels, tokenizer)
    
# Build the dataset once at module level; the training and prediction cells
# below reuse these three names.
features, labels, tokenizer = raw_data_fn()
#print(features[0])
#print(labels)

def load_data_fn(features, labels, test_size=0.3):
    """Split features and labels into train and test partitions.

    Args:
        features: Feature rows to split.
        labels: Labels aligned row-for-row with ``features``.
        test_size: Share of the data held out for testing.

    Returns:
        The result of ``train_test_split``, which the caller unpacks as
        ``train_features, test_features, train_labels, test_labels``.
    """
    # A fixed random_state keeps the partition identical across runs.
    partitions = cross_validation.train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=42,
    )
    return partitions

def model_fn():
    """Build and compile the text classifier.

    The network is a single 512-unit ReLU hidden layer over an input of
    width ``MAX_VOCABULARY_SIZE``, followed by a softmax layer with
    ``NUM_CLASSES`` outputs. It is compiled with categorical cross-entropy
    loss and an accuracy metric, and its summary is printed.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(512, activation=tf.nn.relu, input_shape=(MAX_VOCABULARY_SIZE,)))
    model.add(tf.keras.layers.Dense(NUM_CLASSES, activation=tf.nn.softmax))

    # Use a native TensorFlow optimizer rather than the Keras one (the
    # original author noted this is needed when working in eager mode).
    # The step size comes from the LEARNING_RATE constant so that tuning
    # the configuration actually changes training.
    optimizer = tf.train.RMSPropOptimizer(learning_rate=LEARNING_RATE)

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])

    model.summary()
    return model


# Split the vectorised data using load_data_fn's default test share.
train_features, test_features, train_labels, test_labels = load_data_fn(features, labels)
print("Train count {features/labels}: %s/%s" % (len(train_features), len(train_labels)))
print("Test count {features/labels}: %s/%s" % (len(test_features), len(test_labels)))

# Create a model for training.
model = model_fn()

# Train on the training partition. BATCH_SIZE is passed explicitly so the
# configured value is used; otherwise Keras falls back to its own default.
model.fit(train_features, train_labels, batch_size=BATCH_SIZE, epochs=EPOCHS)

# Evaluate the trained model on the held-out test partition.
loss, accuracy = model.evaluate(test_features, test_labels)
print("Loss : %s   Accuracy: %s" % (loss, accuracy))

TensorFlow version: 1.8.0
Count for data: 16
Train count {features/labels}: 11/11
Test count {features/labels}: 5/5
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 512)               512512    
_________________________________________________________________
dense_6 (Dense)              (None, 3)                 1539      
Total params: 514,051
Trainable params: 514,051
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/

Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Loss : 0.8823240995407104   Accuracy: 0.6000000238418579


In [40]:
# gImg = cv2.imread(info['image_contour']) 
# temp_t = tf.image.convert_image_dtype(
#     gImg,
#     tf.float32
# )

def predict_fn(model, labels, tokenizer, description):
    """Predict the class name for a single piece of text.

    Args:
        model: Trained model exposing ``predict(matrix)``, which returns one
            row of per-class scores for each input row.
        labels: Unused; kept so existing call sites keep working.
        tokenizer: Fitted tokenizer exposing ``texts_to_matrix``.
        description: The text to classify.

    Returns:
        The class name whose score is highest for ``description``.
    """
    # Vectorise the text with the same TF-IDF mode used for training.
    feature = tokenizer.texts_to_matrix(np.array([description]), mode='tfidf')
    prediction = model.predict(feature)

    # The training labels were one-hot encoded by a LabelBinarizer, which
    # orders its columns by the sorted unique class labels rather than by
    # the declaration order of CLASSES. Decode against that same order so
    # the mapping stays correct even if CLASSES is not listed alphabetically.
    class_names = sorted(set(CLASSES))
    predicted_label = class_names[np.argmax(prediction[0])]
    return predicted_label

In [55]:
# Three sample inputs for manual spot checks. Only description3 is classified
# below; swap the argument to try the others.
description1 = "Text messaging, or texting, is the act of composing and sending electronic messages, typically consisting of alphabetic and numeric characters, between two or more users of mobile phones, tablets, desktops/laptops, or other devices."
description2 = 'LICENCE 1 STANLEY 2 JOE OLIVIA 3 11 03 1976 UNITED KINGDOM 4a  MORGA7531163M9 BURNS CRESCENT EDINBURGH EH1 QGP 9 k l n p q k 7mm rl DVLA INTERNAL USAGE'
description3 = 'random gibberish text which is neither a driving license or a financial card 347437493 290422 BHHDS *()&^'

# Classify one sample with the trained model and the tokenizer fitted during
# data loading; `labels` is passed through but predict_fn does not read it.
predicted_label = predict_fn(model, labels, tokenizer, description3)
print("\ndescription: %s \npredicted class: %s" % (description3, predicted_label))


description: random gibberish text which is neither a driving license or a financial card 347437493 290422 BHHDS *()&^ 
predicted class: text
