In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm

import tensorflow as tf


from tensorflow import keras

from tensorflow.keras.preprocessing import sequence,text
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.layers import Embedding, SimpleRNN


# Kaggle input directory holding the competition CSVs read in this notebook
# (English training set, validation.csv and test.csv).
D = '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/'
# Kaggle input directory holding the Google-translated training CSVs that
# load_jigsaw_trans() reads, one file per language code.
D_TRANS = '/kaggle/input/jigsaw-train-multilingual-coments-google-api/'

import os


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Choose the tf.distribute strategy that matches the available hardware.
try:
    # The resolver is built without arguments; per the original note this
    # works whenever TPU_NAME is set in the environment, as it is on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError here is treated as "no TPU available".
    tpu = None

if not tpu:
    # No TPU: fall back to TensorFlow's default strategy (CPU / single GPU).
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
def load_jigsaw_trans(langs=('tr', 'it', 'es', 'ru', 'fr', 'pt'),
                      columns=('comment_text', 'toxic')):
    """Load the Google-translated Jigsaw training sets, one frame per language.

    For every language code the file
    ``jigsaw-toxic-comment-train-google-<lang>-cleaned.csv`` is read from
    ``D_TRANS``, restricted to ``columns`` and passed through ``downsample``.

    Parameters
    ----------
    langs : iterable of str
        Language codes substituted into the file name.
    columns : list or tuple of str
        Columns to keep from each CSV.

    Returns
    -------
    list
        One ``downsample``-d frame per entry of ``langs``, in the same order.
    """
    # The defaults are tuples so they cannot be mutated between calls. pandas
    # treats a tuple as a single key, so it is turned into a list before
    # selecting columns; any other value is passed through unchanged.
    wanted = list(columns) if isinstance(columns, tuple) else columns

    frames = []
    for lang in langs:
        fn = D_TRANS + 'jigsaw-toxic-comment-train-google-%s-cleaned.csv' % lang
        frames.append(downsample(pd.read_csv(fn)[wanted]))

    return frames

def downsample(df, frac=0.5, random_state=42):
    """Return a reproducible random subset of the rows of ``df``.

    With the default arguments a random half of the rows is kept. No
    per-class balancing is performed: rows are drawn without looking at
    any label column.
    NOTE(review): the previous docstring said "50%-50%", which reads like a
    class-balanced sample; the code never did that - confirm which was meant.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to subsample.
    frac : float, default 0.5
        Fraction of rows to keep, forwarded to ``DataFrame.sample``.
    random_state : int, default 42
        Seed forwarded to ``DataFrame.sample`` so repeated runs pick the
        same rows.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, keeping their original index labels.
    """
    return df.sample(frac=frac, random_state=random_state)

# English training comments, restricted to the text and label columns.
en_train = pd.read_csv(D + 'jigsaw-toxic-comment-train.csv')[['comment_text', 'toxic']]

# Training frame: the translated (downsampled) frames first, then the full
# English frame appended last.
train = pd.concat([*load_jigsaw_trans(), en_train])

# Validation and test sets are read with all of their columns.
validation = pd.read_csv(D + 'validation.csv')
test = pd.read_csv(D + 'test.csv')


In [None]:
# Preview the first rows of the combined training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Largest whitespace-separated word count of any training comment.
# str(x) keeps the count working for entries that are not strings.
# Presumably this informs the max_len used for padding later - confirm.
train['comment_text'].apply(lambda x:len(str(x).split())).max()

In [None]:
import sklearn.metrics as metrics
def roc_auc(predictions,target):
    """Area under the ROC curve of ``predictions`` scored against ``target``.

    Note the argument order: predicted scores first, true labels second.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(target, predictions)
    return metrics.auc(false_pos_rate, true_pos_rate)

In [None]:

# Pull the comment texts and their `toxic` values out as arrays,
# for the training and the validation frame respectively.
xtrain = train['comment_text'].values
ytrain = train['toxic'].values
xvalid = validation['comment_text'].values
yvalid = validation['toxic'].values


In [None]:
# Show the first training comment followed by its `toxic` value, one per line.
print(xtrain[0], ytrain[0], sep='\n')

In [None]:

# Fixed length every sequence is padded (or cut) to before entering the model.
max_len = 1500

# Tokenizer created with num_words=None, i.e. no explicit vocabulary cap is requested.
token = text.Tokenizer(num_words=None)

# The tokenizer must be fitted before texts can be converted to sequences;
# it is fitted on the training and the validation comments together.
token.fit_on_texts([*xtrain, *xvalid])
word_index = token.word_index

# Convert every comment into its sequence of token ids.
xtrain_seq, xvalid_seq = (
    token.texts_to_sequences(comments) for comments in (xtrain, xvalid)
)

# Zero-pad the id sequences to the common length max_len.
xtrain_pad, xvalid_pad = (
    sequence.pad_sequences(ids, maxlen=max_len) for ids in (xtrain_seq, xvalid_seq)
)

In [None]:
# Number of distinct tokens in the fitted tokenizer's word index.
len(word_index)

In [None]:
# Build and compile the network inside the distribution strategy's scope.
with strategy.scope():
    model = tf.keras.Sequential()
    # Embedding table with len(word_index) + 1 rows of width 100; the extra
    # row presumably covers id 0 used for padding - confirm.
    model.add(Embedding(len(word_index) + 1, 100, input_length=max_len))
    # Single recurrent layer with 100 units.
    model.add(SimpleRNN(100))
    # One sigmoid unit for the binary toxic / non-toxic output.
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

In [None]:
# Train for 2 epochs on the padded training sequences. No batch_size or
# validation data is passed, so Keras defaults are used for both.
# NOTE(review): when running under a multi-replica strategy, consider whether
# batch_size should scale with strategy.num_replicas_in_sync - confirm.
model.fit(xtrain_pad,ytrain,epochs=2)

In [None]:
# Model outputs for the padded validation sequences.
scores = model.predict(xvalid_pad)
# roc_auc() returns the area as a fraction, while the format string ends with
# a percent sign; scale by 100 so the printed number really is a percentage.
print("Auc: %.2f%%" % (100 * roc_auc(scores, yvalid)))