In [None]:
import os
import re
import sys
import nltk
import random
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize

In [None]:
#Indonesian SMS Preprocessing
def convertTackyText(text):
    words = word_tokenize(text)
    new_string = ''
    for msg in words:
        new_word = ''
        alpha_flag = False
        digit_flag = False
        for c in msg:
            if c.isalpha():
                alpha_flag = True
            elif c.isdigit():
                digit_flag = True
        
        if alpha_flag and digit_flag:
            msg = msg.lower()
            if msg[-4:] != 'ribu' and msg[-3:] != 'rbu' and msg[-2:] != 'rb':
                for c in msg:
                    if c == '1':
                        c = 'i'
                    elif c == '2':
                        c = 's'
                    elif c == '3':
                        c = 'e'
                    elif c == '4':
                        c = 'a'
                    elif c == '5':
                        c = 's'
                    elif c == '6':
                        c = 'g'
                    elif c == '7':
                        c = 't'
                    elif c == '8':
                        c = 'b'
                    elif c == '9':
                        c = 'g'
                    elif c == '0':
                        c = 'o'
                    new_word = new_word + c
        
        if new_word != '':
            new_string = new_string + new_word + ' '
        else:
            new_string = new_string + msg + ' '

    return new_string

def preproccess_text(text_messages):
    # change words to lower case
    processed = text_messages.lower()

    # Replace email addresses with 'emailaddress'
    processed = re.sub(r'^.+@[^\.].*\.[a-z]{2,}$', ' emailaddress ', processed)
        
    # Replace phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
    processed = re.sub(r'(\()?(\+62|62|0)(\d{2,3})?\)?[ .-]?\d{2,4}[ .-]?\d{2,4}[ .-]?\d{2,4}', ' phonenumber ', processed)

    # Replace URLs with 'webaddress'
    processed = re.sub(r'[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)', ' webaddress ', processed)
    processed = processed.replace('http', '')
    processed = processed.replace('https', '')
    
    # Replace money symbols with 'moneysymbol' (£ can by typed with ALT key + 156)
    processed = re.sub(r'£|\$', 'moneysymbol ', processed)
    processed = processed.replace(' rp.', ' moneysymbol ')
    processed = processed.replace(' rp', ' moneysymbol ')
        
    # Replace numbers with 'number'
    processed = re.sub(r'\d+(\.\d+)?', ' number ', processed)

    # Remove punctuation
    processed = re.sub(r'[.,\/#!%\^&\*;:+{}=\-_`~()?]', ' ', processed)

    # Replace whitespace between terms with a single space
    processed = re.sub(r'\s+', ' ', processed)

    # Remove leading and trailing whitespace
    processed = re.sub(r'^\s+|\s+?$', '', processed)
    return processed

def preproccess_df(text_messages):
    # change words to lower case
    processed = text_messages.str.lower()

    # Replace email addresses with 'emailaddress'
    processed = processed.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', ' emailaddress ')
        
    # Replace phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
    processed = processed.str.replace(r'(\()?(\+62|62|0)(\d{2,3})?\)?[ .-]?\d{2,4}[ .-]?\d{2,4}[ .-]?\d{2,4}', ' phonenumber' )

    # Replace URLs with 'webaddress'
    processed = processed.str.replace(r'[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)', ' webaddress ')
    processed = processed.str.replace('http', '')
    processed = processed.str.replace('https', '')
    
    # Replace money symbols with 'moneysymbol' (£ can by typed with ALT key + 156)
    processed = processed.str.replace(r'£|\$', ' moneysymbol ')
    processed = processed.str.replace(' rp.', ' moneysymbol ')
    processed = processed.str.replace(' rp', ' moneysymbol ')
        
    # Replace numbers with 'number'
    processed = processed.str.replace(r'\d+(\.\d+)?', ' number ')

    # Remove punctuation
    processed = processed.str.replace(r'[.,\/#!%\^&\*;:{}=\-_`~()?]', ' ')

    # Replace whitespace between terms with a single space
    processed = processed.str.replace(r'\s+', ' ')

    # Remove leading and trailing whitespace
    processed = processed.str.replace(r'^\s+|\s+?$', '')
    
    return processed

In [235]:
word_features_f = open("word_features.pickle", "rb")
word_features = pickle.load(word_features_f)
word_features_f.close()
print("Created bag of word features!")

ordenc = OrdinalEncoder()
encoded_word_features = encoder.fit_transform(word_features)
for i in range(0, len(encoded_word_features)):
    encoded_word_features[i]+=4

# Filler Grammar to mark Start of a sentence and to add empty fillers to make equal shapes of data
word_features.append("<PAD>")
word_features.append("<START>")
word_features.append("<UNK>")
word_features.append("<UNUSED>")

encoded_word_features=np.append(encoded_word_features, [0,1,2,3])

print(word_features)
print(encoded_word_features)

Created bag of word features!


In [325]:
def find_features(message):
    words = word_tokenize(message)
    features = []
    features.append(1)
    for idx, word in enumerate(word_features):
        if word in words:
            features.append(encoded_word_features[idx])

    return features

In [306]:
#Load Dataset

print("Reading data...")
df = pd.read_csv('corpus/sms_corpus/data.txt', engine='python', sep="<%>", header=None)
print("Data loaded")

classes = df[[0]]
sms_data = preproccess_df(df[1])

oneenc = OneHotEncoder(sparse=False)
Y = oneenc.fit_transform(classes)

# Now lets do it for all the messages
messages = list(zip(sms_data, Y))

# Call find_features function for each SMS message
featuresets = [(find_features(text), label) for (text, label) in messages]

preferred_range=25
for x in featuresets:
    arr_length=len(x[0])
    if arr_length < preferred_range:
        for i in range(arr_length, preferred_range):
            x[0].append(0)
    elif arr_length > preferred_range:
        diff=arr_length-preferred_range
        for i in range(0, diff):
            del x[0][i]

x_data, y_data = zip(*featuresets)
train_input, test_input, train_output, test_output = train_test_split(x_data, y_data, test_size = 0.2)

train_input=np.array(train_input)
test_input=np.array(test_input)
train_output=np.array(train_output)
test_output=np.array(test_output)

print(test_input)
print(train_output)

Reading data...
Data loaded
[[   1   17  556 ...    0    0    0]
 [   1   17  333 ...    0    0    0]
 [   1  570 1494 ...    0    0    0]
 ...
 [ 333 1126  332 ...  417  960  230]
 [   1  333  687 ...    0    0    0]
 [   1  795  647 ...    0    0    0]]
[[0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 ...
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]]


In [322]:
vocab_size=2000
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, 16))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation=tf.nn.relu))
model.add(keras.layers.Dense(3, activation="softmax"))

model.summary()

Model: "sequential_32"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_32 (Embedding)     (None, None, 16)          32000     
_________________________________________________________________
global_average_pooling1d_32  (None, 16)                0         
_________________________________________________________________
dense_63 (Dense)             (None, 16)                272       
_________________________________________________________________
dense_64 (Dense)             (None, 3)                 51        
Total params: 32,323
Trainable params: 32,323
Non-trainable params: 0
_________________________________________________________________


In [323]:
model.compile(optimizer='adam',
             loss='categorical_crossentropy',
             metrics=['accuracy'])

In [327]:
history = model.fit(train_input,
                   train_output,
                   epochs=40,
                   batch_size=50,
                   validation_data=(test_input, test_output),
                   verbose=1)
model.save('sms_classifier_tf_model.h5')

Train on 600 samples, validate on 150 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
