In [None]:
import tensorflow as tf
from matplotlib import pyplot as plt
import pickle
import random
import math

In [None]:
# Show which TensorFlow release this notebook is running against.
tf.__version__

In [None]:
# List the GPUs TensorFlow can see (an empty list means no GPU was detected).
tf.config.list_physical_devices(device_type='GPU')

### Gather our data

Our dataset is a list of strings, labelled by language. We're using the textual value of translated [HealthLinkBC Files](https://www.healthlinkbc.ca/services-and-resources/healthlinkbc-files) for this purpose, and distinguishing between 8 different languages, including English.

In [None]:
# NOTE: unpickling can execute arbitrary code, so only load dataset files
# that come from a trusted source.
with open("dataset_numeric.pickle", mode="rb") as dataset_file:
    dataset = pickle.load(dataset_file)

# The pickle holds three items: the raw texts, their labels, and the label names.
X_text, y, labels = dataset

### Look at the data

In [None]:
# Pick one sample uniformly at random and print its raw text.
sample_index = random.randrange(len(X_text))
print(X_text[sample_index])

### Shuffle the data

Since the data is separated into two different arrays (text and labels), we first shuffle a list of indices, then apply that list of indices to both arrays using tensorflow's `gather()` function. `gather()` takes a list 'a' and a list of indices 'b', and returns the elements of 'a' arranged in the order given by 'b'. For example, `tf.gather(['a', 'b', 'c'], [2, 1, 0])` will return `['c', 'b', 'a']`, since 'c' is at index 2, 'b' is at index 1, and 'a' is at index 0.

In [None]:
# Draw a single random permutation of the row indices and apply it to both
# arrays, so that every text stays aligned with its label.
shuffled_indices = tf.random.shuffle(tf.range(len(X_text)))
X_text = tf.gather(X_text, shuffled_indices)
y = tf.gather(y, shuffled_indices)
# The feature tensor `X` is computed from the shuffled `X_text` in a later
# cell, so no separate gather of the raw strings into `X` is needed here.

### Define a function that converts a string into a byte distribution matrix

We use a whole bunch of tensorflow's built-in functions to come up with a glorified hash table of byte counts. Instead of a hash table, though, we return a 16x16 matrix (see `tf.reshape`) because that's easier to visualize.

In [None]:
@tf.function
def string_to_char_distribution(s):
    """Convert a string into a 16x16 matrix describing its byte distribution.

    Runs of whitespace are collapsed into a single space, the raw bytes of the
    result are counted into 256 bins (one per possible byte value), and the
    counts are add-one smoothed and normalized so the 256 entries sum to 1.

    Args:
        s: The string (or scalar string tensor) to analyze.

    Returns:
        A float32 tensor of shape [16, 16]. The 256 bins are laid out in
        row-major order, so the entry at row r, column c belongs to byte
        value 16 * r + c.
    """
    # Raw string literal: `\s` must reach the regex engine verbatim. In a
    # plain string literal it is an invalid Python escape sequence.
    normalized = tf.strings.regex_replace(s, r"\s+", " ")

    # Reinterpret the string as its raw bytes, widened to int32 for binning.
    byte_values = tf.cast(
        tf.io.decode_raw(normalized, out_type=tf.uint8),
        tf.int32
    )

    # 256 equal-width bins over [0, 256): one bin per possible byte value.
    byte_counts = tf.histogram_fixed_width(byte_values, [0, 256], nbins=256)

    # Add one to every count so the logarithm below never sees a zero.
    smoothed_counts = tf.cast(byte_counts, tf.float32) + 1.

    # softmax(log(c)) equals c / sum(c), i.e. the smoothed counts rescaled
    # into a probability distribution.
    distribution = tf.nn.softmax(tf.math.log(smoothed_counts))

    return tf.reshape(distribution, [16, 16])

In [None]:
# Sanity check on a short string: the result should be a 16x16 matrix.
hello_distribution = string_to_char_distribution("hello world")
hello_distribution.shape

### Convert all of our data (strings) into these byte distributions

In [None]:
# Apply the conversion to every text sample. `fn_output_signature` replaces
# the deprecated `dtype` argument of `tf.map_fn` (requires TensorFlow >= 2.3).
X = tf.map_fn(
    string_to_char_distribution,
    X_text,
    fn_output_signature=tf.float32
)

### Create a couple of helper functions to aid visualization

In [None]:
def visualize_char_distribution(dist):
    """Draw a byte distribution matrix as an image with matplotlib."""
    plt.imshow(dist)


def visualize_char_distribution_from_string(s):
    """Compute the byte distribution of the string `s` and draw it as an image."""
    visualize_char_distribution(string_to_char_distribution(s))

### Visualize a random byte distribution matrix, and output what language it represents.

Try running this cell repeatedly.

In [None]:
# Choose a random sample, report its language, then draw its distribution.
random_index = random.randrange(len(X))
language = labels[y[random_index]]
print(language)
visualize_char_distribution(X[random_index])

### Organize our input data into separate buckets for each language

In [None]:
# Split the samples into one tensor per label value, using `y` as the
# partition index of each sample.
num_languages = len(labels)
X_in_buckets = tf.dynamic_partition(
    data=X,
    partitions=y,
    num_partitions=num_languages
)

# There should be exactly one bucket per language.
assert len(X_in_buckets) == num_languages

### Pick out some data to visualize


In [None]:
# First sample from the English bucket.
english_samples = X_in_buckets[labels.index(b"english")]
visualize_char_distribution(english_samples[0])

In [None]:
# First sample from the Chinese bucket.
chinese_samples = X_in_buckets[labels.index(b"chinese")]
visualize_char_distribution(chinese_samples[0])

### Create a grid of visualizations with a row for each language in our dataset, and each cell containing a different sample

We use this grid to challenge our assumption that byte distributions vary by language, and that we can visually distinguish them. If we can visually distinguish them ourselves, chances are good that the computer can do so too. We're hoping that cells in the same row look visually similar, and that there are consistent, differentiating markers that separate rows from one another.

In [None]:
grid_size = len(labels)
f, ax = plt.subplots(nrows=grid_size, ncols=grid_size, figsize=(30, 30))

# One row of axes per language bucket; each column shows a different sample
# taken from that bucket.
for row_index, axes_row in enumerate(ax):
    language_samples = X_in_buckets[row_index]
    for column_index, cell in enumerate(axes_row):
        cell.imshow(tf.reshape(language_samples[column_index], [16, 16]))

### Create an untrained model using keras

The code below creates a neural network with 3 "standard" layers (input, hidden, output), plus one dropout layer that randomly discards 20% of its input values during training, which goes a long way toward helping the model generalize. The output layer is a probability distribution (softmax) over all possible labels. Naturally, the label with the highest probability is taken to be the model's guess.

In [None]:
# Build the network one layer at a time.
model = tf.keras.Sequential()
# Unroll each 16x16 distribution matrix into a flat vector of 256 values.
model.add(tf.keras.layers.Flatten(input_shape=(16, 16)))
# Dropout with a rate of 0.2.
model.add(tf.keras.layers.Dropout(0.2))
# Hidden layer.
model.add(tf.keras.layers.Dense(32, activation='relu'))
# Output layer: one softmax probability per language.
model.add(tf.keras.layers.Dense(len(labels), activation='softmax'))

In [None]:
# Print the model's layer-by-layer summary.
model.summary()

### Make a prediction with the untrained model

We test out the model first by using it on a random item. Try running this cell a few times. After you've trained the model, come back to this.

In [None]:
random_index = random.randrange(len(X))

# Slice instead of indexing so the sample keeps its leading batch dimension,
# giving `predict` a proper (1, 16, 16) batch rather than a nested Python
# list wrapped around a single tensor.
sample_batch = X[random_index:random_index + 1]
probabilities = model.predict(sample_batch)

# The label with the highest predicted probability is the model's guess.
prediction = labels[tf.argmax(probabilities, axis=1)[0]]
actual = labels[y[random_index]]

print("Actual: {}\nPredicted: {}".format(actual.decode("utf-8"), prediction.decode("utf-8")))

if actual != prediction:
    print("***!!!WRONG!!!***")

### Split the data into test and training sets

Let's withhold 20% of the dataset and only let the model see it after it's already trained on the other 80%. This helps us ensure that the model is not overfitting.

In [None]:
# Samples before `dividing_line` (the first 80%) are used for training; the
# rest are held back for the final evaluation.
dividing_line = int(len(X) * 0.8)

X_train, X_test = X[:dividing_line], X[dividing_line:]
y_train, y_test = y[:dividing_line], y[dividing_line:]

### Compile the model and fit it to our data (train it)

We compile this model using a fairly standard set of hyperparameters for classification problems such as this. We pass through the training data (complete one "epoch") 32 times, holding out 33% of it as a validation set that is scored after each epoch. This doesn't cause as much overfitting as you might think, since the dropout layer randomly discards 20% of its input during training.

In [None]:
# Configure the optimizer, loss, and reported metric.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train, keeping the per-epoch history so the accuracy curves can be plotted.
hist = model.fit(
    x=X_train,
    y=y_train,
    validation_split=0.33,
    epochs=32,
    batch_size=10,
)

In [None]:
train_accuracy = hist.history['accuracy']
val_accuracy = hist.history['val_accuracy']

# Derive the x-axis from the recorded history instead of repeating the epoch
# count, so the plot stays correct if the number of epochs changes.
epochs_run = range(len(train_accuracy))

plt.plot(epochs_run, train_accuracy, 'k', label='training accuracy')
plt.plot(epochs_run, val_accuracy, 'g', label='validation accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()

plt.show()

In [None]:
# Score the trained model on the held-out test samples.
model.evaluate(x=X_test, y=y_test)