In [4]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the raw dataset and report how many rows it has before cleaning.
df = pd.read_csv("train_input_data_pl.csv")
print(len(df.index))

# Drop incomplete rows, then attach a whitespace-token count for every text.
df = df.dropna().assign(
    words_count=lambda frame: frame["text"].map(lambda text: len(text.split()))
)

# Wider alternative selections, kept for reference:
# x_columns = ["text", "words_count"]
# y_columns = ["label", "label_id"]
x_columns = "text"
y_columns = "label_id"
x = df[x_columns]
y = df[y_columns]

# Stratified split with a fixed seed so the partition is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=4,
    stratify=y,
)

# Re-attach features and targets side by side for inspection and later checks.
training = pd.concat([x_train, y_train], axis="columns")
testing = pd.concat([x_test, y_test], axis="columns")

FileNotFoundError: [Errno 2] No such file or directory: 'train_input_data_pl.csv'

In [None]:
# Display the training split (the "text" column next to its "label_id").
training

In [None]:
# Display the held-out split (the "text" column next to its "label_id").
testing

# Tokenization

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Number of target classes; used as the one-hot width and the softmax size.
LABEL_N = 21

# Earlier configuration, kept for reference:
# LEARNING_RATE = 0.01
# VOCAB_SIZE = 10_000
# EMBEDDING_DIM = 16
# DENSE_LAYER_COUNT = 128
# max_length = 600
# trunc_type='post'
# padding_type='post'

# Tuned configuration currently in use.
LEARNING_RATE = 0.005
VOCAB_SIZE = 2_000
EMBEDDING_DIM = 128  # the first approach used 16
DENSE_LAYER_COUNT = 128
max_length = 400
trunc_type = padding_type = 'post'


def pad(sequences):
    """Pad/truncate token-id sequences with the notebook-level settings.

    Reads `max_length`, `padding_type` and `trunc_type` from the notebook
    namespace at call time and forwards them to Keras' `pad_sequences`.
    """
    return pad_sequences(
        sequences,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )


# Build the vocabulary from the training texts only.
training_sentences = x_train.to_list()
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(training_sentences)
# print(tokenizer.word_index)

In [None]:
# Map each training text to its token-id sequence, then pad via `pad`.
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad(sequences=training_sequences)
# training_padded

In [None]:
# Encode the held-out texts with the tokenizer fitted on the training texts.
testing_sentences = x_test.tolist()
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad(sequences=testing_sequences)
# testing_padded

In [None]:
import numpy as np
def create_all_categories_columns(input_array: np.ndarray, num_classes=None):
    """One-hot encode a 1-D array of class ids.

    Args:
        input_array: 1-D array-like of class ids. Values must be whole
            numbers in ``[0, num_classes)``. Float arrays holding whole
            numbers are accepted (a pandas column stays float after
            ``dropna`` if it ever contained NaN), as is an empty array.
        num_classes: Number of output columns. When omitted, the
            notebook-level ``LABEL_N`` is used, as before.

    Returns:
        Integer array of shape ``(len(input_array), num_classes)`` holding a
        single 1 per row, in the column given by that row's class id.

    Raises:
        ValueError: If a label is not a whole number (e.g. 1.5 or NaN).
        IndexError: If a label lies outside ``[0, num_classes)``.
    """
    if num_classes is None:
        num_classes = LABEL_N

    raw_labels = np.asarray(input_array)
    # Fancy indexing needs an integer dtype; cast, then make sure nothing was
    # truncated by the cast.
    labels = raw_labels.astype(int)
    if not np.array_equal(labels, raw_labels):
        raise ValueError("class ids must be whole numbers")

    # Negative ids would otherwise wrap around and mark the wrong column.
    if labels.size and (labels.min() < 0 or labels.max() >= num_classes):
        raise IndexError(
            f"class ids must lie in [0, {num_classes}); "
            f"got range [{labels.min()}, {labels.max()}]"
        )

    row_count = labels.shape[0]
    result = np.zeros(shape=(row_count, num_classes), dtype=int)
    result[np.arange(row_count), labels] = 1
    return result

In [None]:
# Features: make sure both padded matrices are plain NumPy arrays.
training_padded = np.array(training_padded)
testing_padded = np.array(testing_padded)

# Targets: expand the label ids of each split into one-hot rows.
training_labels = create_all_categories_columns(y_train.to_numpy())
testing_labels = create_all_categories_columns(y_test.to_numpy())

In [None]:
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.models import Sequential

In [None]:
# model = Sequential()
# model.add(Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=training_padded.shape[1]))
# model.add(SpatialDropout1D(0.7))
# model.add(LSTM(64, dropout=0.7, recurrent_dropout=0.7))
# model.add(Dense(20, activation='softmax'))
# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# print(model.summary())
# training = model.fit(training_padded, training_labels, epochs=1_00, batch_size=256, validation_split=0.2)

In [None]:
# Classifier: embedding -> average over the sequence axis -> dense -> softmax.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(DENSE_LAYER_COUNT, activation="relu"))
model.add(tf.keras.layers.Dense(LABEL_N, activation='softmax'))

# Adam with the configured learning rate; the remaining arguments are spelled
# out explicitly.
adam = tf.keras.optimizers.Adam(
    learning_rate=LEARNING_RATE,
    beta_1=0.9,
    beta_2=0.999,
    amsgrad=False,
)
# Targets are one-hot rows, hence categorical (not sparse) cross-entropy.
model.compile(optimizer=adam, loss="categorical_crossentropy", metrics=['accuracy'])

model.summary()

In [None]:
# Train for a fixed number of epochs.
# NOTE(review): the held-out split doubles as validation data here.
num_epochs = 100
history = model.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=2,
)

In [None]:
def get_model_prediction(model, text):
    """Return the index of the highest-scoring class for a single text.

    The text is tokenized with the notebook-level `tokenizer`, padded with
    `pad`, and passed to `model.predict`. The score vector of that single
    sample is printed before its arg-max index is returned.

    Args:
        model: Object exposing `predict`, called with a one-row array.
        text: Raw input string.

    Returns:
        Position of the first maximum score in the prediction vector.
    """
    sequences = tokenizer.texts_to_sequences([text])
    text_representation = np.array(pad(sequences))

    # Batch of one: keep only the first (and only) row of scores.
    prediction = list(model.predict(text_representation)[0])
    print(prediction)

    best_score = max(prediction)
    return prediction.index(best_score)

# Smoke-check the prediction helper on an empty string.
get_model_prediction(model, "")

In [None]:
# Every distinct text seen during training, for the leakage check below.
training_set = set(training["text"])

for text, label_id in zip(testing["text"], testing["label_id"].to_list()):
    # A held-out text must never also appear in the training split.
    assert text not in training_set
    predicted_label = get_model_prediction(model, text)
    # Print whether the predicted class matches the true label id.
    print(predicted_label == label_id)