In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Toy corpus: each key is an example sentence, each value is its integer class id.
data = {
    'You are great': 1,
    'I hate you': 0,
    'I love you': 2,
    'I think it would be better if you did that. Anyway you are great!': 3,
    'You are a dick': 0,
    'You guys are amazing': 2,
}

# Class id -> human-readable label; ids are the positions in this list.
targets = dict(enumerate([
    'insult',
    'compliment',
    'love',
    'constructive comment',
]))

# keys() and values() iterate in the same insertion order, so x[k] pairs with y[k].
x = np.array(list(data.keys()))
y = np.array(list(data.values()))

## GloVe Embeddings

In [None]:
# Word -> embedding vector lookup; starts empty and is filled by the GloVe loading cell.
embedding_dict = dict()

In [None]:
# Parse the GloVe text file: on every line the first token is the word and the
# remaining tokens are the components of its vector.
# The encoding is given explicitly so the read does not depend on the platform's
# default codec.
with open('./glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # A blank line has no word token; skip it instead of failing on values[0].
            continue
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embedding_dict[word] = vector

## Tokenization

In [None]:
vocab_size = 10000
max_length = 300
oov_token = '< OOV >'

# Tokenize from `data` rather than from `x`: `x` is overwritten below with the padded
# integer sequences, so reading it here would make this cell fail when re-run.
texts = list(data.keys())

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(texts)
x = pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_length)

# Embedding matrix
# np.stack requires a real sequence (list/tuple); a dict view is not one.
all_embs = np.stack(list(embedding_dict.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index

# Every row starts as noise with the same mean/std as the pretrained vectors;
# rows whose word has a pretrained vector are overwritten in the loop below.
embedding_matrix = np.random.normal(emb_mean, emb_std, (vocab_size, embed_size))

for word, i in word_index.items():
    if i >= vocab_size:
        # The matrix has only vocab_size rows; indices at or beyond that have no slot.
        continue
    embedding_vector = embedding_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

## Model

In [None]:
# Width of one embedding vector, read from the pretrained matrix so the layer and
# the matrix cannot disagree.
embedding_dim = embedding_matrix.shape[1]

model = tf.keras.models.Sequential([
    # Initialise the embedding table from the pretrained matrix. The layer keeps its
    # default trainable setting, so the vectors are fine-tuned during fit.
    tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    ),
    # Average over the sequence axis so the network emits one prediction per sentence.
    # Without this the output keeps a per-token axis and cannot be matched against
    # the one-label-per-sentence targets by the sparse cross-entropy loss.
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(20, activation='relu'),
    # One output unit per class id (0..3).
    tf.keras.layers.Dense(4, activation='softmax')
])

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [None]:
# Train on the padded sequences (x) against the integer class ids (y) for 10 epochs.
model.fit(x=x, y=y, epochs=10)