# Toxic comment classification with Keras

Original notebook: https://www.kaggle.com/sarvajna/keras-sequential-model-lb-0-052

In [None]:
!pip install bentoml
!pip install keras

In [None]:
import numpy as np
import pandas as pd
from keras.preprocessing import text, sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from sklearn.model_selection import train_test_split


In [None]:
# Label columns predicted by the model, in output order.
list_of_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# --- Text preprocessing ---
max_features = 20000   # vocabulary size shared by the tokenizer and the embedding
max_text_length = 400  # length every tokenized comment is padded to

# --- Network architecture ---
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250

# --- Training ---
batch_size = 32
epochs = 2

## Download data
Download `train.csv`, `test.csv`, and `sample_submission.csv` from Kaggle at https://www.kaggle.com/sarvajna/keras-sequential-model-lb-0-052/data and place them in the notebook's working directory.

In [None]:
# Load the training set and preview the first rows. Leaving the frame as the
# cell's last expression uses the notebook's rich table display.
train_df = pd.read_csv('./train.csv')
train_df.head()

In [None]:
# Raw comment strings. Missing comments are replaced by '' so the tokenizer
# only ever receives str values (a NaN float would make it raise).
x = train_df['comment_text'].fillna('').values
x[:5]  # preview only the first few comments

In [None]:
# Target matrix: one column per entry of list_of_classes, in that order.
y = train_df[list_of_classes].values
print(y)

In [None]:
# Fit the vocabulary on the training comments; num_words caps how many of the
# most frequent words are used when converting text to sequences.
x_tokenizer = text.Tokenizer(num_words=max_features)
x_tokenizer.fit_on_texts(x)

# texts_to_sequences returns a list of integer lists (one per comment),
# not a NumPy array.
x_tokenized = x_tokenizer.texts_to_sequences(x)

# pad_sequences turns that list into a 2D NumPy array of shape
# (num_samples, max_text_length).
x_train_val = sequence.pad_sequences(x_tokenized, maxlen=max_text_length)

In [None]:
# Hold out 10% of the samples for validation; the fixed random_state keeps
# the split the same on every run.
x_train, x_val, y_train, y_val = train_test_split(
    x_train_val,
    y,
    test_size=0.1,
    random_state=1,
)

In [None]:
print('Build model...')
model = Sequential()

# We start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions.
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=max_text_length))
model.add(Dropout(0.2))

# We add a Conv1D layer, which will learn `filters` word-group
# filters, each spanning kernel_size consecutive tokens.
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
# We use max pooling over the sequence dimension:
model.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto one output unit per class and squash each with a sigmoid,
# so every label gets its own independent probability. The width is derived
# from list_of_classes so the model stays in sync with the label columns.
model.add(Dense(len(list_of_classes)))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

In [None]:
# Train the network, passing the held-out split as validation data.
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_val, y_val),
)

In [None]:
# Load the test set that predictions will be generated for.
test_df = pd.read_csv('./test.csv')

In [None]:
# Missing comments are replaced by '' so the tokenizer only ever receives
# str values (a NaN float would make texts_to_sequences raise).
x_test = test_df['comment_text'].fillna('').values

In [None]:
# Encode the test comments with the tokenizer fitted on the training data,
# then pad them to the same length used for the training inputs.
x_test_tokenized = x_tokenizer.texts_to_sequences(x_test)
x_testing = sequence.pad_sequences(
    x_test_tokenized,
    maxlen=max_text_length,
)

In [None]:
# Score every padded test comment with the trained model.
y_testing = model.predict(x_testing, verbose=1)

In [None]:
# Use the sample submission as a template: overwrite its label columns with
# the predicted scores and write the result without the index column.
sample_submission = pd.read_csv("./sample_submission.csv")
sample_submission[list_of_classes] = y_testing
sample_submission.to_csv(
    "toxic_comment_classification.csv",
    index=False,
)

In [None]:
# Show the class of the trained model object that is packed into the service below.
type(model)

## Creating Service with BentoML

In [None]:
%%writefile toxic_comment_classifier.py

from bentoml import api, artifacts, env, BentoService
from bentoml.artifact import PickleArtifact
from bentoml.handlers import DataframeHandler

from keras.preprocessing import text, sequence
import numpy as np

# These duplicate the notebook's configuration and must stay in sync with it:
# the label order has to match the model's output columns, and
# max_text_length has to match the padded length the model was trained on.
list_of_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
max_text_length = 400

# NOTE(review): keras needs a backend package at serving time and none is
# listed here — confirm the deployment environment provides one.
@env(conda_pip_dependencies=['keras', 'pandas', 'numpy'])
@artifacts([PickleArtifact('x_tokenizer'), PickleArtifact('model')])
class ToxicCommentClassification(BentoService):
    """BentoML service that labels each comment with its highest-scoring class.

    Artifacts:
        x_tokenizer: tokenizer used to turn comment text into integer sequences.
        model: model whose ``predict`` yields one score per entry of
            ``list_of_classes`` for every input row.
    """

    def tokenize_df(self, df):
        """Convert the ``comment_text`` column of ``df`` into padded sequences.

        Missing values are replaced by an empty string and every value is cast
        to ``str`` so that rows parsed as NaN or as numbers do not make the
        tokenizer raise.

        Args:
            df: DataFrame with a ``comment_text`` column.

        Returns:
            The tokenized comments padded to ``max_text_length``.

        Raises:
            KeyError: if ``df`` has no ``comment_text`` column.
        """
        comments = df['comment_text'].fillna('').astype(str).values
        tokenized = self.artifacts.x_tokenizer.texts_to_sequences(comments)
        return sequence.pad_sequences(tokenized, maxlen=max_text_length)

    @api(DataframeHandler)
    def predict(self, df):
        """Return the highest-scoring class name for every row of ``df``.

        Args:
            df: DataFrame with a ``comment_text`` column.

        Returns:
            A list with one entry of ``list_of_classes`` per input row.
        """
        input_data = self.tokenize_df(df)
        prediction = self.artifacts.model.predict(input_data)
        # NOTE: argmax always picks a label, so a row whose scores are all low
        # is still reported as its highest-scoring class.
        return [list_of_classes[np.argmax(scores)] for scores in prediction]

In [154]:
# Import the service class from the module written by the previous cell.
from toxic_comment_classifier import ToxicCommentClassification

# Bundle the fitted tokenizer and the trained model as the service's
# artifacts, then save the packed service and keep the returned path.
svc = ToxicCommentClassification.pack(
    x_tokenizer=x_tokenizer,
    model=model,
)
saved_path = svc.save('/tmp/bento_archive')

[2019-07-26 14:58:36,376] INFO - Copying local python module '/Users/bozhaoyu/src/bento_gallery/keras/toxic-comment-classification/toxic_comment_classifier.py'
[2019-07-26 14:58:36,378] INFO - Done copying local python dependant modules
[2019-07-26 14:58:36,468] INFO - BentoService ToxicCommentClassification:2019_07_26_e5011225 saved to /tmp/bento_archive/ToxicCommentClassification/2019_07_26_e5011225


## Use the archived service in another Python application

In [None]:
from bentoml import load

# A handful of test rows to send through the reloaded service.
sample_test = test_df.iloc[40:45]

# Load the service from the saved archive and call its predict API.
bento_service = load(saved_path)
print(bento_service.predict(sample_test))