In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

### Load dataset

In [None]:
all_word_vietnamese = pd.read_csv('all_words.csv', index_col=[0])
all_word_english = pd.read_csv('all_words_english.txt')

In [None]:
all_word_vietnamese.head(5)

Unnamed: 0,Word
0,a
1,A
2,à
3,À
4,ả


In [None]:
all_word_english.head(10)

Unnamed: 0,Word
0,aardvark
1,aardwolf
2,aaron
3,aback
4,abacus
5,abaft
6,abalone
7,abandon
8,abandoned
9,abandonment


In [None]:
all_word_english.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58110 entries, 0 to 58109
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Word    58109 non-null  object
dtypes: object(1)
memory usage: 454.1+ KB


In [None]:
all_word_vietnamese.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35956 entries, 0 to 35955
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Word    35955 non-null  object
dtypes: object(1)
memory usage: 561.8+ KB


In [None]:
all_word_vietnamese['Word'] = all_word_vietnamese['Word'].apply(lambda word: str(word).lower())
all_word_english['Word'] = all_word_english['Word'].apply(lambda word: str(word).lower())

In [None]:
all_word_vietnamese.head(100)

Unnamed: 0,Word
0,a
1,a
2,à
3,à
4,ả
...,...
95,ác là
96,ác liệt
97,ác long
98,ác man


In [None]:
len_df_eng = all_word_english.shape[0]
len_df_vn = all_word_vietnamese.shape[0]
len_df_eng, len_df_vn

(58110, 35956)

### Labeling 
- 0 is vietnamese
- 1 is english

In [None]:
df_label_vn = pd.DataFrame([0 for _ in range(len_df_vn)], columns=['Label'])
df_label_eng = pd.DataFrame([1 for _ in range(len_df_eng)], columns=['Label'])

df_label_vn.shape, df_label_eng.shape

((35956, 1), (58110, 1))

### Concatenate dataset

In [None]:
df_vn = pd.concat([all_word_vietnamese, df_label_vn], axis=1)
df_eng = pd.concat([all_word_english, df_label_eng], axis=1)

df = pd.concat([df_vn, df_eng], axis=0)
df

Unnamed: 0,Word,Label
0,a,0
1,a,0
2,à,0
3,à,0
4,ả,0
...,...,...
58105,zooms,1
58106,zooplankton,1
58107,zoos,1
58108,zulu,1


In [None]:
df['Word']

0                  a
1                  A
2                  à
3                  À
4                  ả
            ...     
58105          zooms
58106    zooplankton
58107           zoos
58108           zulu
58109          zulus
Name: Word, Length: 94066, dtype: object

### Random sample dataset

In [None]:
df = df.sample(frac=1)
df.head(10)

Unnamed: 0,Word,Label
33160,mutable,1
17433,năm xuân phân,0
21668,ghosting,1
54295,unequal,1
7730,cessations,1
34705,opportune,1
30962,masqueraded,1
851,bác lãm cổ kim,0
35251,xe dây,0
29505,limited,1


### Utility

In [None]:
from tensorflow.train import Example, Feature, Features, BytesList, Int64List

def create_example(word, label):
  word_bytes = tf.io.serialize_tensor(word)
  return Example(
      features=Features(
          feature={
              'word': Feature(bytes_list=BytesList(value=[word_bytes.numpy()])), # This for all word save by bytes string
              # 'word': Feature(int64_list=Int64List(value=[word.numpy()])), # This for all word save by int
              'label': Feature(int64_list=Int64List(value=[label]))
          }
      )
  )

def parse_example(tfrecord):
  feature_descriptions = {
      'word': tf.io.FixedLenFeature([], tf.string, default_value=''),
      # 'word': tf.io.FixedLenFeature([], tf.int64, default_value=''),
      'label': tf.io.FixedLenFeature([], tf.int64, default_value=-1)
  }
  example = tf.io.parse_single_example(tfrecord, feature_descriptions)

  # Use this is word is a scalar
  # return example['word'], example['label']
  # Use this if word is a list of number
  word = tf.io.parse_tensor(example['word'], out_type=tf.int32) # parse bytes to int32
  return word, example['label']


def write_tfrecords(dataset, filename):
  with tf.io.TFRecordWriter(f'{filename}.tfrecord') as f:
    for word, label in dataset:
      example = create_example(word, label)
      example_string = example.SerializeToString()
      f.write(example_string)

def read_tfrecord(filepaths, shuffle_buffer_size=None, n_parse_threads=5,
                  n_read_threads=5, batch_size=None, cache=None):
  dataset = tf.data.TFRecordDataset(filepaths, num_parallel_reads=n_read_threads)
  dataset = dataset.map(parse_example)
  if cache:
    dataset = dataset.cache()
  if shuffle_buffer_size:
    dataset = dataset.shuffle(shuffle_buffer_size)
  if batch_size:
    dataset = dataset.batch(batch_size)
  return dataset 

In [None]:
dataset = df['Word'].values
labelset = df['Label'].values

dataset.shape, labelset.shape

((94066,), (94066,))

In [None]:
dataset[:10]

array(['moneyed', 'thơm phức', 'xu thế', 'refinement', 'lifethreatening',
       'reinitialised', 'simplex', 'tastier', 'sponge', 'nợ miệng'],
      dtype=object)

### Word Tokenizer

In [None]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)
for word in dataset:
  tokenizer.fit_on_texts(str(word))

In [None]:
tokenizer.document_count

771033

In [None]:
tokenizer.word_index

In [None]:
import pickle

# Save tokenizer
with open('tokenizer.pickle', 'wb') as handle:
  pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
def preprocess_tensor(data, label, tokenizer, to_tensor=True):
  list_word = []
  list_label = []
  for word, label in zip(data, label):
    list_word.append(tokenizer.texts_to_sequences(str(word)))
    list_label.append(label)

  ragged_tensor = tf.ragged.constant(list_word)
  sparse_tensor = ragged_tensor.to_tensor()
  label_tensor = tf.constant(list_label)

  # If use format tensor
  if to_tensor:
    dataset = tf.data.Dataset.from_tensor_slices((sparse_tensor, label_tensor))
    return dataset
  else: # If use format numpy
    dataset = sparse_tensor.numpy()
    dataset = dataset.reshape(len(dataset), -1)
    labelset = label_tensor.numpy().reshape(-1, 1)
    dataset_final = np.c_[dataset, labelset]
    train_size = int(len(dataset) * 0.8)
    test_size = int((len(dataset) - train_size) * 0.5)
    return dataset_final[:train_size], dataset_final[train_size:-test_size], dataset_final[-test_size:]

def convert_sequences_to_texts(sequences, tokenizer):
  # sequences is a tensor
  list_char = tokenizer.sequences_to_texts(sequences.numpy())
  return ''.join(list_char)

def convert_texts_to_sequences(texts, tokenizer):
  return tokenizer.texts_to_sequences(texts)

In [None]:
convert_texts_to_sequences('Việt Nam', tokenizer)

[[20], [3], [37], [4], [8], [1], [6], [16]]

In [None]:
convert_sequences_to_texts(tf.constant([[20], [3], [37], [4], [8], [1], [6], [16]]), tokenizer)

'việt nam'

In [None]:
# train_set, label_train = dataset[:85000], labelset[:85000]
# valid_set, label_valid = dataset[85000:90000], labelset[85000:90000]
# test_set, label_test = dataset[90000:], labelset[90000:]

## Use if data format numpy

In [None]:
# train_set = preprocess_tensor(train_set, label_train, tokenizer, to_tensor=False)
# valid_set = preprocess_tensor(valid_set, label_valid, tokenizer, to_tensor=False)
# test_set = preprocess_tensor(test_set, label_test, tokenizer, to_tensor=False)
train_set, valid_set, test_set = preprocess_tensor(dataset, labelset, tokenizer=tokenizer, to_tensor=False)

In [None]:
train_set[:2]

array([[11,  9, 46, 16,  8, 11,  9, 41, 16,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [16,  3,  1,  3, 16,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]],
      dtype=int32)

In [None]:
train_set.shape, valid_set.shape, test_set.shape

((75252, 49), (9407, 49), (9407, 49))

In [None]:
train_set, label_train = train_set[:, :-1], train_set[:, -1:]
valid_set, label_valid = valid_set[:, :-1], valid_set[:, -1:]
test_set, label_test = test_set[:, :-1], test_set[:, -1:]

In [None]:
train_set.shape, valid_set.shape, test_set.shape

((75252, 48), (9407, 48), (9407, 48))

### Use logistic regression

In [None]:
# Use logistic regression
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=100000)
clf.fit(train_set, label_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(max_iter=100000)

In [None]:
y_pred = clf.predict(valid_set)

from sklearn.metrics import accuracy_score
accuracy_score(label_valid, y_pred)

0.8939087913256086

In [None]:
accuracy_score(label_test, clf.predict(test_set))

0.8945466142234506

In [None]:
from sklearn.metrics import confusion_matrix

conf = confusion_matrix(clf.predict(train_set), label_train)
conf

array([[23017,  2172],
       [ 5633, 44430]])

In [None]:
input = 'xin'
arr = tokenizer.texts_to_sequences(input)
arr_pad = np.pad(arr, ((0, train_set.shape[1] - len(arr)), (0, 0)), constant_values=0) # Top, bottom, left, right

[[28], [3], [1]]


In [None]:
clf.predict(arr_pad.reshape(1, -1))

array([1], dtype=int32)

In [None]:
np.unique(label_train, return_counts=True)

(array([0, 1], dtype=int32), array([28863, 46389]))

### Use neural network

In [None]:
# Learning rate schedular
def exponential_decay(lr0, s):
  def exponential_decay_fn(epoch):
    return lr0 * 0.1 ** (epoch / s)
  return exponential_decay_fn

exponential_decay_fn = exponential_decay(lr0=0.01, s=100)

In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(10, activation='relu', kernel_initializer='he_normal'),
    tf.keras.layers.Dense(10, activation='relu', kernel_initializer='he_normal'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [None]:
model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9), 
              metrics=['accuracy'])
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(exponential_decay_fn)

In [None]:
earlystopping = tf.keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True)

In [None]:
model.fit(train_set, label_train, epochs=100, validation_data=(valid_set, label_valid),
          callbacks=[earlystopping])

### Evaluate

In [None]:
model.evaluate(test_set, label_test)



[0.15200401842594147, 0.9553524255752563]

### Save model

In [None]:
model.save('model_recognition_language.h5')

### Preproces input and predict

In [None]:
# Predict 
input = 'dịch vụ'
arr = tokenizer.texts_to_sequences(input)

# Padding zero
arr_pad = np.pad(arr, ((0, train_set.shape[1] - len(arr)), (0, 0)), constant_values=0).reshape(1, -1) # Top, bottom, left, right
arr_pad

array([[15, 47, 11,  9,  8, 20, 53,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])

In [None]:
model.predict(arr_pad)



array([[5.549451e-08]], dtype=float32)

-------------------------------------------------

## Use if data format tensor

In [None]:
trainset_tensor = preprocess_tensor(train_set, label_train, tokenizer)
validset_tensor = preprocess_tensor(valid_set, label_valid, tokenizer)
testset_tensor = preprocess_tensor(test_set, label_test, tokenizer)

In [None]:
for X, y in trainset_tensor.take(1):
  print(X)
  print(y)

------------------------------------

In [None]:
# train_set, label_train = dataset[:85000], labelset[:85000]
# valid_set, label_valid = dataset[85000:90000], labelset[85000:90000]
# test_set, label_test = dataset[90000:], labelset[90000:]

In [None]:
# trainset_tensor = tf.data.Dataset.from_tensor_slices((train_set, label_train))
# validset_tensor = tf.data.Dataset.from_tensor_slices((valid_set, label_valid))
# testset_tensor = tf.data.Dataset.from_tensor_slices((test_set, label_test))

In [None]:
write_tfrecords(trainset_tensor, 'all_word_trainset')
write_tfrecords(validset_tensor, 'all_word_validset')
write_tfrecords(testset_tensor, 'all_word_testset')

In [None]:
filepaths = [f'/content/all_word_{filename}.tfrecord' for filename in ('trainset', 'validset', 'testset')]

trainset = read_tfrecord(filepaths[0])
validset = read_tfrecord(filepaths[1])
testset = read_tfrecord(filepaths[2])

In [None]:
for X, y in trainset.take(1):
  print(tf.reshape(X, shape=[-1]))
  print(y)

tf.Tensor(
[12 30 16  8  1 13 14 19  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0], shape=(48,), dtype=int32)
tf.Tensor(0, shape=(), dtype=int64)


In [None]:
convert_sequences_to_texts(X, tokenizer)

'lâm nguy'

In [None]:
def reshape_tensor(X, y):
  return (tf.reshape(X, shape=[-1]), y)

In [None]:
# trainset = trainset.map(lambda X, y: (tf.reshape(trainset, shape=[-1]), y))
# validset = validset.map(lambda X, y: (tf.reshape(validset, shape=[-1]), y))
# testset = testset.map(lambda X, y: (tf.reshape(testset, shape=[-1]), y))

trainset = trainset.map(reshape_tensor)
validset = validset.map(reshape_tensor)
testset = validset.map(reshape_tensor)

In [None]:
for X, y in trainset.take(1):
  print(X),
  print(y)

tf.Tensor(
[12 30 16  8  1 13 14 19  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0], shape=(48,), dtype=int32)
tf.Tensor(0, shape=(), dtype=int64)


In [None]:
trainset = trainset.batch(32)
testset = testset.batch(32)
validset = validset.batch(32)

In [None]:
for X_batch, y_batch in trainset.take(1):
  print(X_batch, y_batch)

tf.Tensor(
[[12 30 16 ...  0  0  0]
 [ 7  3 20 ...  0  0  0]
 [11 10  1 ...  0  0  0]
 ...
 [ 1  9 46 ...  0  0  0]
 [20  3  6 ...  0  0  0]
 [23  9 10 ...  0  0  0]], shape=(32, 48), dtype=int32) tf.Tensor([0 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 0 1 1 0 1 0 1 1 0 1 1 1 1 0 1 1], shape=(32,), dtype=int64)


In [None]:
# trainset = trainset.map(lambda X_batch, y_batch: (tf.expand_dims(X_batch, axis=0), y_batch))
# validset = validset.map(lambda X_batch, y_batch: (tf.expand_dims(X_batch, axis=0), y_batch))
# testset = testset.map(lambda X_batch, y_batch: (tf.expand_dims(X_batch, axis=0), y_batch))

In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(tokenizer.document_count, 128, input_shape=[None]),
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [None]:
model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9),
              metrics=['accuracy'])

In [None]:
history = model.fit(trainset, epochs=10, validation_data=validset)