In [12]:
# Resources
# https://keras.io/examples/nlp/pretrained_word_embeddings/
# https://nlp.stanford.edu/projects/glove/

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.layers import Embedding


In [8]:
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

# Some pre-processing as suggested by https://nlp.stanford.edu/projects/glove/preprocess-twitter.rb
# I suspect that the ruby file does not work as intended, hence this approach.
def preprocess(input_df):
    eyes = "[8:=;]"
    nose = "['`\-]?"
    input_df.replace(to_replace="((http[s]?|ftp):\/)?\/?([^:\/\s]+)((\/\w+)*\/)([\w\-\.]+[^#?\s]+)(.*)?(#[\w\-]+)?", value="<URL>", regex=True, inplace=True)
    input_df.replace(to_replace="@\w+", value="<USER>", regex=True, inplace=True)
    input_df.replace(to_replace=""+eyes+nose+"[)d]+|[)d]+"+eyes+nose+"", value="<SMILE>", regex=True, inplace=True)
    input_df.replace(to_replace=""+eyes+nose+"[pP]+", value="<LOLFACE>", regex=True, inplace=True) # improved, added [pP] instead of p. Could this have a negative effect?
    input_df.replace(to_replace=""+eyes+nose+"\(+|\)+"+nose+eyes, value="<SADFACE>", regex=True, inplace=True)
    input_df.replace(to_replace=eyes+nose+"[\/|l*]", value="<NEUTRALFACE>", regex=True, inplace=True)
    input_df.replace(to_replace="<3", value="<HEART>", regex=True, inplace=True)
    input_df.replace(to_replace="[-+]?[.\d]*[\d]+[:,.\d]*", value="<NUMBER>", regex=True, inplace=True)
    input_df.replace(to_replace="#", value="", regex=True, inplace=True) 
    input_df.replace(to_replace="\n", value=" ", regex=True, inplace=True)
    return

preprocess(train_df)
preprocess(test_df)

samples = train_df["text"].tolist()
labels = train_df["target"].tolist()
test_samples = test_df["text"].tolist()

In [9]:
# This boolean allow us to train only on a portion of the train set, while using part as a validation set. 
# This is turned off before submission in order to use the most data while training because the set size is small.
use_dev_set = False
if use_dev_set:
    # Shuffle the data
    seed = 1337
    rng = np.random.RandomState(seed)
    rng.shuffle(samples)
    rng = np.random.RandomState(seed)
    rng.shuffle(labels)

    # Extract a training & validation split
    validation_split = 0.2
    num_validation_samples = int(validation_split * len(samples))

    train_samples = samples[:-num_validation_samples]
    val_samples = samples[-num_validation_samples:]
    train_labels = labels[:-num_validation_samples]
    val_labels = labels[-num_validation_samples:]
else:
    train_samples = samples
    val_samples = []
    train_labels = labels
    val_labels = []
    

print("Train set size: " + str(len(train_samples)))
print("Dev set size: " + str(len(val_samples)))

Train set size: 7613
Dev set size: 0


In [10]:
# Get the maximum length of the input text in words
maxlen = len(max(samples, key=lambda x: len(x.split())).split())

print("Longest string contains " + str(maxlen) + " words")

Longest string contains 31 words


In [16]:
# We index the vocabulary found in the dataset. The output will be padded/truncated to exactly output_sequence_length values, resulting
# in a tensor with shape [batch_size, output_sequence_length]

vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=maxlen)
text_ds = tf.data.Dataset.from_tensor_slices(train_samples).batch(128)
vectorizer.adapt(text_ds)

# Get a dictionary mapping words to indices
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

In [17]:
# Load pre-trained word embeddings (trained on tweets!)
!wget -nc https://nlp.stanford.edu/data/glove.twitter.27B.zip
!unzip -n -q glove.twitter.27B.zip

File ‘glove.twitter.27B.zip’ already there; not retrieving.



In [18]:
# The zip offers multiple options (25, 50, 100, 200) choose which word vector dimensions we want.
zip_dim = 25
path_to_glove_file = "/kaggle/working/glove.twitter.27B." + str(zip_dim) + "d.txt"

In [20]:

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1) 
        coefs = np.fromstring(coefs, "f", sep=" ") # Get vector as 1-D array of floats
        embeddings_index[word] = coefs # Store each word vector in a dictionary (word:coefs)

print("Found %s word vectors." % len(embeddings_index))

Found 1193514 word vectors.


In [21]:
# Prepare embedding matrix
num_tokens = len(voc) + 2
embedding_dim = zip_dim
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1

print("Converted %d words (%d misses)" % (hits, misses))

# Some examples of misses:
# bioterror prebreak typhoondevastated bestnaijamade bioterrorism soudelor


Converted 11913 words (2649 misses)


In [22]:
# Create the embedding layer with trainable=False
embedding_layer = Embedding(
    num_tokens, # number of different words in the training set + 2
    embedding_dim, # dimension of the word vectors
    embeddings_initializer=keras.initializers.Constant(embedding_matrix), # the matrix we just created!
    trainable=False, # keep embeddings fixed
)

In [23]:
# Build the model
int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)
x = layers.LSTM(units=128, return_sequences=False)(embedded_sequences)
x = layers.Dropout(rate=0.5)(x)
preds = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs=int_sequences_input, outputs=preds)
model.summary()


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 25)          364100    
                                                                 
 lstm (LSTM)                 (None, 128)               78848     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 443,077
Trainable params: 78,977
Non-trainable params: 364,100
_________________________________________________________________


In [24]:
# Train the model

x_train = vectorizer(np.array([[s] for s in train_samples])).numpy()
y_train = np.array(train_labels)

if use_dev_set:
    x_val = vectorizer(np.array([[s] for s in val_samples])).numpy()
    y_val = np.array(val_labels)


model.compile(
    loss="binary_crossentropy", optimizer="Adam", metrics=['accuracy']
)

epochs = 20
batch_size = 128
if use_dev_set:
    model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(x_val, y_val))
else:
    model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [39]:
# Make the end-to-end model and test it on a custom string!
string_input = keras.Input(shape=(1,), dtype="string")
x = vectorizer(string_input)
preds = model(x)
end_to_end_model = keras.Model(string_input, preds)

probability = end_to_end_model.predict(
    [["there is a wildfire in california"]]
)

print(probability)

[[0.7955165]]


In [None]:
# Create submission
sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
output = []
for s in test_samples:
    probability = end_to_end_model.predict([[s]])
    output.append(1 if probability > 0.5 else 0)


In [42]:
sample_submission["target"] = output
sample_submission.head()
sample_submission.to_csv("submission.csv", index=False)