In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
from PIL import Image
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import csv
from sklearn.model_selection import train_test_split
import datetime

In [3]:
def parse_data_from_csv(file_path, reshape = (None, None)):

  with open(file_path) as file:
    ### START CODE HERE

    # Use csv.reader, passing in the appropriate delimiter
    # Remember that csv.reader can be iterated and returns one line in each iteration
    labels = []
    reviews = []

    csv_reader = csv.reader(file, delimiter='\n')
    next(csv_reader)
    for row in csv_reader:
        try:
            row = row[0].split(',')
            labels.append(int(row[-1]))
            reviews.append(row[-2])
        except:
            print(f' exception during: {row}')

    labels = np.array(labels, dtype = np.float32)
    reviews = np.array(reviews)

    ### END CODE HERE

    return reviews, labels

In [4]:
def split_dataset(images, labels, train_split, shuffle = True):
    train_images, validation_images, train_labels, validation_labels = train_test_split(images, labels , train_size= train_split, stratify=labels, shuffle=True)
    return train_images, validation_images, train_labels, validation_labels


In [5]:
file_path = './train.csv'
reviews, labels = parse_data_from_csv(file_path)

 exception during: ['61', 'ablaze', '', '"on the outside you\'re ablaze and alive']
 exception during: ['74', 'ablaze', 'India', '"Man wife get six years jail for setting ablaze niece']
 exception during: ['86', 'ablaze', 'Inang Pamantasan', '"Progressive greetings!']
 exception during: []
 exception during: ['117', 'accident', '', '"mom: \'we didn\'t get home as fast as we wished\' ']
 exception during: ["me: 'why is that?'"]
 exception during: ['149', 'aftershock', '304', '"\'The man who can drive himself further once the effort gets painful is the man who will win.\' ']
 exception during: ['178', 'aftershock', 'United States', '"&gt;&gt; $15 Aftershock : Protect Yourself and Profit in the Next Global Financial... ##book http://t.co/f6ntUc734Z']
 exception during: ['218', 'airplane%20accident', '', '"This is unbelievably insane.']
 exception during: ['232', 'airplane%20accident', 'Havenford', '"+ Nicole Fletcher one of a victim of crashed airplane few times ago. ']
 exception during:

In [6]:
train_x, val_x, train_y, val_y = split_dataset(images=reviews,
                                                labels=labels,
                                                train_split=0.85,
                                                shuffle=True)

In [7]:
numwords = 20000
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=numwords, oov_token='<OOV>')
pad = tf.keras.preprocessing.sequence.pad_sequences

tokenizer.fit_on_texts(train_x)

train_seq = tokenizer.texts_to_sequences(train_x)
test_seq = tokenizer.texts_to_sequences(val_x)

In [8]:
mx_len = 0
for i in train_seq:
    mx_len = max(mx_len, len(i))
print(mx_len)

32


In [9]:
train_seq_pad = pad(train_seq, padding='post', truncating='post', maxlen=mx_len)
test_seq_pad = pad(test_seq, padding='post', truncating='post', maxlen=mx_len)

In [15]:
print(f'train_seq_pad: {train_seq_pad.shape}')
print(f'test_seq_pad: {test_seq_pad.shape}')

train_seq_pad: (6463, 32)
test_seq_pad: (1141, 32)


In [16]:
class EarlyStoppingMonitor(tf.keras.callbacks.Callback):
    def __init__(self):
        super(EarlyStoppingMonitor, self).__init__()
        self.current_best = 0
        self.monitor = 'val_acc'

    def on_epoch_end(self, epoch, logs=None):
        # get parameter like below
        # current_train_precision = logs.get('precision')
        # self.model.stop_training = True

        if self.current_best < logs.get(self.monitor):
            self.current_best = logs.get(self.monitor)
            self.model.save(f'best_model_as_{self.monitor}_{self.current_best:.3f}.h5')
        # print('\n\n******* Stopping on Defined Threshold *******')

    def on_train_end(self, logs=None):
        if self.model.stop_training:
            print("\n\n\n****** Early Stopping *******")

In [24]:

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=numwords+1, output_dim=10, input_length=mx_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(8)),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

loss = tf.keras.losses.BinaryCrossentropy()
optim = tf.keras.optimizers.Adam(3*1e-5)

model.compile(
    loss = loss,
    optimizer=optim,
    metrics=['acc']
)

# LR = tf.keras.callbacks.LearningRateScheduler(lambda epoch: 1e-4 * 10**(epoch / 20), verbose=1)

log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

model.compile(
    loss = loss,
    optimizer=optim,
    metrics=['acc']
)

In [25]:
history = model.fit(
    train_seq_pad,
    train_y,
    epochs=30,
    batch_size=16,
    validation_data=(test_seq_pad, val_y),
    callbacks= [tensorboard_callback]
)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [14]:
import matplotlib.pyplot as plt
m2 = tf.keras.models.load_model('./best_model_as_val_acc_0.772.h5')


OSError: No file or directory found at ./best_model_as_val_acc_0.772.h5

In [99]:
y = m2.predict(
    test_seq_pad
)



In [100]:
p = y > 0.5

In [101]:
q = p.astype(int)

In [102]:
from sklearn.metrics import  accuracy_score
accuracy_score(q, val_y)

0.7721297107800176