In [104]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip


In [105]:
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import keras
from keras import layers

In [106]:
# constants
# Directory holding the competition's zipped CSV files, and the file names inside it.
BASE_PATH = '../input/jigsaw-toxic-comment-classification-challenge/'
TRAIN_PATH = 'train.csv.zip'
TEST_PATH = 'test.csv.zip'
# Target columns read from the training frame; also used as the output column names.
LABELS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# Derived from LABELS so the class count cannot drift out of sync with the label list.
NUM_CLASSES = len(LABELS)

In [107]:
# Read the zipped train and test CSVs into DataFrames.
train = pd.read_csv(BASE_PATH + TRAIN_PATH)
test = pd.read_csv(BASE_PATH + TEST_PATH)

In [108]:
# Pull the raw comment strings out as Python lists and the label columns as a NumPy array.
train_text = train['comment_text'].tolist()
train_labels = train[LABELS].to_numpy()
test_text = test['comment_text'].tolist()

In [109]:
def clean_text(text):
    """
    Normalize a raw comment into lowercase, space-separated word tokens.

    Steps: lowercase, drop apostrophes (so "don't" becomes "dont"), replace
    every run of non-word characters with a single space, delete digits, and
    collapse the remaining whitespace.

    :param text: raw comment; non-string values (e.g. None or the NaN pandas
        uses for an empty CSV cell) are treated as an empty comment
    :return: cleaned string with no leading/trailing whitespace
    """
    if not isinstance(text, str):
        # Missing cells come through as float NaN / None; they carry no words.
        return ""
    text = text.lower().replace("'", "")
    # Any run of non-word characters becomes one separator.
    text = re.sub(r"\W+", " ", text)
    # Digits are deleted (not replaced by a space), so "abc123def" -> "abcdef".
    text = re.sub(r"\d+", "", text)
    # split()/join collapses repeated spaces and trims both ends.
    return " ".join(text.split())

In [110]:
# Run every train and test comment through clean_text.
clean_train_text = [clean_text(comment) for comment in train_text]
clean_test_text = [clean_text(comment) for comment in test_text]

In [111]:
# get count vectorizer and tf-idf transformer
def get_vectorizers(texts, max_features):
    """
    Fit a bag-of-words vectorizer and a TF-IDF transformer on the given texts.

    :param texts: list of strings
    :param max_features: max number of words in vocab
    :return: tuple of (fitted count vectorizer, fitted tf-idf transformer)
    """
    # max_df=0.5 drops terms that occur in more than half of the documents.
    count_vectorizer = CountVectorizer(max_features=max_features, max_df=0.5)
    # fit_transform learns the vocabulary and builds the count matrix in a
    # single pass, rather than tokenizing the whole corpus twice.
    counts = count_vectorizer.fit_transform(texts)
    tfidf_transformer = TfidfTransformer().fit(counts)
    return count_vectorizer, tfidf_transformer

In [112]:
# The vocabulary and IDF weights are fit on the train and test comments together.
corpus = clean_train_text + clean_test_text
count_vectorizer, tfidf_transformer = get_vectorizers(corpus, 100)

In [113]:
# Counts -> TF-IDF weights -> dense arrays, using the already-fitted transformers.
train_counts = count_vectorizer.transform(clean_train_text)
train_data = tfidf_transformer.transform(train_counts).toarray()

test_counts = count_vectorizer.transform(clean_test_text)
test_data = tfidf_transformer.transform(test_counts).toarray()

In [114]:
# Display the dimensions of the dense training feature matrix.
train_data.shape

(159571, 100)

In [115]:
# Display the dimensions of the dense test feature matrix.
test_data.shape

(153164, 100)

In [116]:
# model definition: one hidden block followed by one sigmoid unit per label
# The input width is read from the feature matrix so it always matches the
# number of columns the vectorizer actually produced, instead of a hard-coded 100.
inputs = keras.Input(shape=(train_data.shape[1],))
dense_1 = layers.Dense(32, activation='relu')(inputs)
dense_1 = layers.BatchNormalization()(dense_1)
dense_1 = layers.Dropout(0.2)(dense_1)

# Sigmoid (rather than softmax) yields a separate probability for each label.
outputs = layers.Dense(NUM_CLASSES, activation='sigmoid')(dense_1)

model = keras.Model(inputs=inputs, outputs=outputs)

In [117]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                3232      
_________________________________________________________________
batch_normalization_1 (Batch (None, 32)                128       
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 198       
Total params: 3,558
Trainable params: 3,494
Non-trainable params: 64
_________________________________________________________________


In [118]:
# Binary cross-entropy pairs with the sigmoid outputs: each label column is
# scored as its own binary problem.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [124]:
# Train for 20 epochs in mini-batches of 64, holding out 20% of the rows for validation.
# Keras takes the validation split from the tail of the arrays, before any shuffling.
# NOTE(review): the saved output for this cell shows 10 epochs, and the cell's
# execution count is out of order, so it predates the current epochs=20 setting.
# Restart the kernel and run all cells to refresh it.
history = model.fit(
    train_data,
    train_labels,
    validation_split=0.2,
    batch_size=64,
    epochs=20,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [120]:
# Despite the name, test_labels holds the model's sigmoid outputs for the test
# features (one value per label for each row), not thresholded 0/1 labels.
test_labels = model.predict(test_data)

In [121]:
# Pair each test id with its row of predicted scores: [id, score_1, ..., score_k].
ids = test["id"].to_list()
res = [[comment_id, *scores] for comment_id, scores in zip(ids, test_labels)]

In [122]:
# Assemble the output table: an id column followed by one column per label.
out_columns = ["id"] + LABELS
out_df = pd.DataFrame(res, columns=out_columns)

In [123]:
# Write the predictions to out.csv in the current directory, omitting the DataFrame index.
out_df.to_csv("out.csv", index=False)