In [22]:
import re
import numpy as np
import pandas as pd
import keras
import tensorflow as tf

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, Dropout, BatchNormalization, LSTM
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Word2Vec

# Shared NLP resources, created once and read by the preprocessing helpers
# (remove_stopwords uses eng_stop, lemmatize uses word_lemmatizer).
eng_stop = set(stopwords.words('english'))  # a set, for fast membership tests
word_lemmatizer = WordNetLemmatizer()

In [23]:
# Show which GPU device, if any, TensorFlow detects.
gpu_device = tf.test.gpu_device_name()
print(gpu_device)

/device:GPU:0


In [24]:
# Configuration: data locations, target columns and vocabulary size.
BASE_PATH = '../input/jigsaw-toxic-comment-classification-challenge/'
TRAIN_PATH = 'train.csv.zip'
TEST_PATH = 'test.csv.zip'

# One binary target column per toxicity category.
LABELS = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
NUM_CLASSES = len(LABELS)  # 6, kept in sync with LABELS
MAX_WORDS = 10_000  # vocabulary cap passed to the Keras Tokenizer as num_words

In [25]:
# Read the competition train/test CSV archives into DataFrames.
train = pd.read_csv(BASE_PATH + TRAIN_PATH)
test = pd.read_csv(BASE_PATH + TEST_PATH)

In [26]:
# Pull the raw comment strings and the label matrix out of the frames.
train_text = train['comment_text'].tolist()
test_text = test['comment_text'].tolist()
train_labels = train[LABELS].to_numpy()  # one column per entry of LABELS

In [27]:
def clean_text(text):
    """Normalize a raw comment into lowercase, space-separated word tokens.

    Steps, in order: lowercase; drop apostrophes (so "don't" becomes "dont");
    turn every run of non-word characters into a single space; delete digit
    runs; collapse whitespace.

    Args:
        text: The raw comment. Non-string values (for example the ``None``/NaN
            that pandas uses for a missing cell) are treated as an empty comment.

    Returns:
        The cleaned string with no leading or trailing whitespace; may be "".
    """
    if not isinstance(text, str):
        return ""
    text = text.lower().replace("'", "")
    text = re.sub(r"\W+", " ", text)
    # Raw string so that \d reaches the regex engine instead of being treated
    # as an (invalid) Python string escape.
    text = re.sub(r"\d+", "", text)
    # split()/join collapses whitespace runs and trims both ends in one step.
    return " ".join(text.split())

In [28]:
# Normalize every raw comment; output order matches input order.
clean_train_text = [clean_text(comment) for comment in train_text]
clean_test_text = [clean_text(comment) for comment in test_text]

In [29]:
def remove_stopwords(text, stop_words=None):
    """Drop stop words from a whitespace-tokenized string.

    Matching is an exact, case-sensitive membership test on each token.

    Args:
        text: Input string; tokens are whatever ``str.split()`` yields.
        stop_words: Optional container of tokens to remove. When omitted, the
            module-level ``eng_stop`` set is used.

    Returns:
        The surviving tokens joined by single spaces ("" if none survive).
    """
    if stop_words is None:
        stop_words = eng_stop
    return " ".join(word for word in text.split() if word not in stop_words)

In [30]:
def lemmatize(text, lemmatizer=None):
    """Lemmatize each whitespace-separated token of ``text``.

    Args:
        text: Input string; tokens are whatever ``str.split()`` yields.
        lemmatizer: Optional object exposing ``lemmatize(word)``. When omitted,
            the module-level ``word_lemmatizer`` is used.

    Returns:
        The lemmatized tokens joined by single spaces ("" for empty input).
    """
    if lemmatizer is None:
        lemmatizer = word_lemmatizer
    return " ".join(lemmatizer.lemmatize(word) for word in text.split())

In [31]:
def tokenizer(texts, max_words):
    """Create a Keras ``Tokenizer`` and fit it on ``texts``.

    Args:
        texts: Collection of strings handed to ``fit_on_texts``.
        max_words: Value forwarded as the tokenizer's ``num_words``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer(num_words=max_words)
    fitted.fit_on_texts(texts)
    return fitted

In [32]:
# Second preprocessing pass: strip stop words, then lemmatize what is left.
clean_train_text = [lemmatize(remove_stopwords(doc)) for doc in clean_train_text]
clean_test_text = [lemmatize(remove_stopwords(doc)) for doc in clean_test_text]

In [34]:
def train_word2vec(corpus, vector_size=64, epochs=20):
    """Train a gensim Word2Vec model on whitespace-tokenized documents.

    Args:
        corpus: Iterable of strings; each one is split on whitespace and
            treated as a single sentence.
        vector_size: Dimensionality of the learned word vectors (default 64).
        epochs: Number of training passes over the corpus (default 20).

    Returns:
        The trained ``Word2Vec`` model. It is built with ``min_count=1``.
    """
    sentences = [text.split() for text in corpus]
    try:
        # gensim >= 4.0 names the dimensionality argument `vector_size`.
        word2vec_model = Word2Vec(min_count=1, vector_size=vector_size)
    except TypeError:
        # gensim < 4.0 only accepts the older `size` keyword.
        word2vec_model = Word2Vec(min_count=1, size=vector_size)
    word2vec_model.build_vocab(sentences)
    word2vec_model.train(
        sentences,
        total_examples=word2vec_model.corpus_count,
        epochs=epochs,
    )
    return word2vec_model

In [39]:
# Pool the cleaned train and test comments; this corpus feeds both Word2Vec
# and the tokenizer.
corpus = [*clean_train_text, *clean_test_text]
# The embedding-matrix cell reads `w2v_model`, so it has to be created here:
# with this line commented out, a fresh "Restart & Run All" stops on a NameError.
w2v_model = train_word2vec(corpus)

In [40]:

# Fit the vocabulary on the pooled corpus.
# NOTE: this rebinds the name `tokenizer` from the helper function to the
# fitted Tokenizer object, so this cell can only be run once per kernel.
tokenizer = tokenizer(texts=corpus, max_words=MAX_WORDS)

In [41]:
# Map each cleaned comment to its list of integer word indices.
train_seq, test_seq = (
    tokenizer.texts_to_sequences(texts)
    for texts in (clean_train_text, clean_test_text)
)

In [42]:
def find_max_len(sequences):
    """Return the length of the longest item in ``sequences``.

    Raises:
        ValueError: If ``sequences`` is empty.
    """
    return max(len(sequence) for sequence in sequences)

In [43]:
# Pad train and test to the longest sequence found in either set so both
# share one input width.
max_len = find_max_len(list(train_seq) + list(test_seq))
pad_options = {"maxlen": max_len, "padding": "post"}  # pad after the tokens
train_seq = pad_sequences(train_seq, **pad_options)
test_seq = pad_sequences(test_seq, **pad_options)

In [116]:
# Build the Embedding weight matrix: row i holds the L2-normalized Word2Vec
# vector of the word whose tokenizer index is i.
EMBEDDING_DIM = 64  # must equal the Word2Vec vector size and the Embedding width
word_index = tokenizer.word_index
embedding_matrix = np.zeros((MAX_WORDS, EMBEDDING_DIM))
# Go through `.wv`: indexing / `in` on the Word2Vec model object itself is
# deprecated in gensim 3.x and removed in 4.0.
word_vectors = w2v_model.wv
for word, i in word_index.items():
    # Only indices below MAX_WORDS have a row; words Word2Vec does not know
    # keep their all-zero row.
    if i < MAX_WORDS and word in word_vectors:
        embedding_matrix[i] = word_vectors[word]

# Scale every non-zero row to unit length in one vectorized step; all-zero
# rows are skipped to avoid dividing by zero.
row_norms = np.linalg.norm(embedding_matrix, axis=1, keepdims=True)
np.divide(embedding_matrix, row_norms, out=embedding_matrix, where=row_norms != 0)

  """
  import sys


In [117]:
# Classifier: embedding -> single LSTM -> batch norm -> one sigmoid unit per label.
model = Sequential()
# mask_zero=True: index 0 is the padding value and the sequences are padded
# with padding="post", so without a mask the LSTM's final state would be
# computed after a long run of padding steps rather than after the last word.
model.add(Embedding(MAX_WORDS, 64, input_length=max_len, mask_zero=True))
#model.add(BatchNormalization())
#model.add(LSTM(64, dropout=0.1, return_sequences=True, activation='relu'))
# NOTE(review): a non-default LSTM activation rules out TensorFlow's fused
# cuDNN kernel (it requires tanh) - consider the default if training is slow.
model.add(LSTM(32, dropout=0.1, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(NUM_CLASSES, activation='sigmoid'))

In [118]:
# Load the prepared embedding matrix into the first layer and exclude it
# from training, then print the architecture.
embedding_layer = model.layers[0]
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 1426, 64)          640000    
_________________________________________________________________
lstm_7 (LSTM)                (None, 32)                12416     
_________________________________________________________________
batch_normalization_3 (Batch (None, 32)                128       
_________________________________________________________________
dense_7 (Dense)              (None, 6)                 198       
Total params: 652,742
Trainable params: 12,678
Non-trainable params: 640,064
_________________________________________________________________


In [119]:
# Each label has its own sigmoid output, scored with binary cross-entropy.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [123]:
# Train on the GPU for one epoch; the last 20% of the training rows are
# held out for validation via validation_split.
with tf.device('/gpu:0'):
    model.fit(
        x=train_seq,
        y=train_labels,
        batch_size=64,
        epochs=1,
        validation_split=0.2,
    )



In [127]:
# Model outputs for every test comment: one sigmoid score per label.
test_labels = model.predict(x=test_seq)

In [126]:
#model.save('word2vec-model-1')

In [131]:
# Pair each test id with its predicted scores: one [id, *scores] row per comment.
ids = test["id"].tolist()
res = [[sample_id, *scores] for sample_id, scores in zip(ids, test_labels)]

In [132]:
# Submission table: the id column followed by one score column per label.
column_names = ["id"] + list(LABELS)
out = pd.DataFrame(data=res, columns=column_names)

In [133]:
# Write the submission file without the DataFrame index column.
out.to_csv(path_or_buf="out.csv", index=False)