This notebook is adapted from the following Kaggle notebook:

https://www.kaggle.com/code/tanulsingh077/deep-learning-for-nlp-zero-to-transformers-bert

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import seaborn as sns
import tensorflow as tf
from keras.models import Sequential
from plotly import graph_objs as go
from sklearn.model_selection import train_test_split
from tqdm import tqdm

%matplotlib inline

In [2]:
from tensorflow.python.client import device_lib

def get_available_devices():
    '''
    Return the names of the local devices TensorFlow reports whose
    device_type is 'GPU' or 'CPU', in the order they are reported.
    '''
    wanted_types = ('GPU', 'CPU')
    device_names = []
    for device in device_lib.list_local_devices():
        if device.device_type in wanted_types:
            device_names.append(device.name)
    return device_names

In [3]:
# Show which CPU/GPU devices TensorFlow can see in this kernel.
get_available_devices()

['/device:CPU:0', '/device:GPU:0']

In [4]:
from tensorflow import keras as keras
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline

In [5]:
# Choose a tf.distribute strategy that matches the hardware of this kernel.
try:
    # The resolver takes no arguments when the TPU_NAME environment variable
    # is set: this is always the case on Kaggle. A ValueError raised here is
    # treated as "no TPU available".
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # TensorFlow's default strategy. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


In [6]:
# Directory that holds the three competition CSV files. Keeping it in a single
# constant means only this line changes when the notebook runs elsewhere.
DATA_DIR = Path(r"C:\Users\anike\Documents\ML Data\Kaggle_Language_Classifier")

# Load the training, validation and test tables from DATA_DIR.
train = pd.read_csv(DATA_DIR / "jigsaw-toxic-comment-train.csv")
validation = pd.read_csv(DATA_DIR / "validation.csv")
test = pd.read_csv(DATA_DIR / "test.csv")

In [7]:
# Remove the fine-grained label columns so that `toxic` is the only label left.
# Reassigning instead of dropping in place, together with errors='ignore',
# keeps this cell safe to re-run after the columns have already been removed
# (the in-place version raised a KeyError on the second run).
train = train.drop(
    columns=['severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
    errors='ignore',
)

In [8]:
# Work on a subset of the rows. `.loc` slices by index label and includes the
# end label, so every row up to and including label 12000 is kept.
train = train.loc[:12000]
train.shape

(12001, 3)

In [9]:
# Preview the first rows of the reduced training frame.
train.head()

Unnamed: 0,id,comment_text,toxic
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0


In [10]:
def _word_count(text):
    '''Number of whitespace-separated tokens in str(text).'''
    return len(str(text).split())

# Largest whitespace-token count over all comments.
train['comment_text'].map(_word_count).max()

1403

In [11]:
def roc_auc(predictions,target):
    '''
    Return the area under the ROC curve.

    `target` (true labels) and `predictions` (scores) are passed to
    metrics.roc_curve, and the resulting curve is integrated with metrics.auc.
    '''
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(target, predictions)
    return metrics.auc(false_pos_rate, true_pos_rate)

In [12]:
# Hold out 20% of the rows for validation, stratified on the `toxic` label,
# with a fixed seed so the split is reproducible.
comments = train['comment_text'].values
labels = train['toxic'].values
xtrain, xvalid, ytrain, yvalid = train_test_split(
    comments,
    labels,
    test_size=0.2,
    shuffle=True,
    stratify=labels,
    random_state=42,
)

## RNNs

In [13]:
# Fixed sequence length passed to pad_sequences for both splits.
max_len = 1500

# Fit the Keras tokenizer on the texts of both splits.
token = keras.preprocessing.text.Tokenizer(num_words=None)
token.fit_on_texts([*xtrain, *xvalid])
word_index = token.word_index

# Turn each split into integer sequences ...
xtrain_seq, xvalid_seq = (
    token.texts_to_sequences(texts) for texts in (xtrain, xvalid)
)

# ... and zero-pad them to max_len.
xtrain_pad, xvalid_pad = (
    keras.preprocessing.sequence.pad_sequences(seqs, maxlen=max_len)
    for seqs in (xtrain_seq, xvalid_seq)
)

In [14]:
%%time
with strategy.scope():
    # A simpleRNN without any pretrained embeddings and one dense layer
    model = keras.Sequential()
    model.add(keras.layers.Embedding(len(word_index) + 1,
                     300,
                     input_length=max_len))
    model.add(keras.layers.SimpleRNN(100))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1500, 300)         13049100  
                                                                 
 simple_rnn (SimpleRNN)      (None, 100)               40100     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 13,089,301
Trainable params: 13,089,301
Non-trainable params: 0
_________________________________________________________________
CPU times: total: 31.2 ms
Wall time: 277 ms


In [15]:
# Batch size is 64 multiplied by the number of replicas in the strategy.
global_batch_size = 64 * strategy.num_replicas_in_sync
model.fit(xtrain_pad, ytrain, batch_size=global_batch_size, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x11b8da2f070>

In [16]:
# Score the validation split with the SimpleRNN and report its ROC AUC.
scores = model.predict(xvalid_pad)
# roc_auc returns a value on a 0-1 scale, not a percentage, so no '%' sign is
# printed (the old format string rendered 0.85 as "0.85%").
print("Auc: %.2f" % (roc_auc(scores,yvalid)))

Auc: 0.85%


In [17]:
# Running list of validation AUC results, one dict per model, seeded with the
# SimpleRNN result.
scores_model = [{'Model': 'SimpleRNN', 'AUC_Score': roc_auc(scores, yvalid)}]

In [18]:
# Load the GloVe vectors into a dictionary mapping word to coefficient array.
GLOVE_PATH = Path(r'C:\Users\anike\Documents\ML Data\glove.6B.50d.txt')

embeddings_index = {}
# The context manager closes the file even if a line fails to parse, which the
# previous manual open()/close() pair did not guarantee.
with open(GLOVE_PATH, 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        # First space-separated field is the word, the rest are its
        # coefficients; float() tolerates the trailing newline on the last one.
        values = line.split(' ')
        word = values[0]
        coefs = np.asarray([float(val) for val in values[1:]])
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

400000it [00:05, 78334.24it/s]

Found 400000 word vectors.





## LSTM

In [19]:
# Build one 50-column row per tokenizer index (plus row 0). Rows for words
# with no GloVe vector are left as zeros.
embedding_matrix = np.zeros((len(word_index) + 1, 50))
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

100%|██████████| 43496/43496 [00:00<00:00, 836407.11it/s]


In [20]:
%%time
with strategy.scope():
    
    # A simple LSTM with glove embeddings and one dense layer
    model = keras.Sequential()
    model.add(keras.layers.Embedding(len(word_index) + 1,
                     50,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))

    model.add(keras.layers.LSTM(100))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])
    
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 1500, 50)          2174850   
                                                                 
 lstm (LSTM)                 (None, 100)               60400     
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 2,235,351
Trainable params: 60,501
Non-trainable params: 2,174,850
_________________________________________________________________
CPU times: total: 0 ns
Wall time: 190 ms


In [21]:
# Batch size is 64 multiplied by the number of replicas in the strategy.
global_batch_size = 64 * strategy.num_replicas_in_sync
model.fit(xtrain_pad, ytrain, batch_size=global_batch_size, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x11c5c8bb940>

In [22]:
# Score the validation split with the LSTM and report its ROC AUC.
scores = model.predict(xvalid_pad)
# roc_auc returns a value on a 0-1 scale, not a percentage, so no '%' sign is
# printed (the old format string rendered 0.95 as "0.95%").
print("Auc: %.2f" % (roc_auc(scores,yvalid)))

Auc: 0.95%


In [23]:
# Add the LSTM's validation AUC to the results list.
lstm_auc = roc_auc(scores, yvalid)
scores_model.append({'Model': 'LSTM', 'AUC_Score': lstm_auc})

## GRU

In [24]:
%%time
with strategy.scope():
    # GRU with glove embeddings and two dense layers
     model = keras.Sequential()
     model.add(keras.layers.Embedding(len(word_index) + 1,
                     50,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))
     model.add(keras.layers.SpatialDropout1D(0.3))
     model.add(keras.layers.GRU(300))
     model.add(keras.layers.Dense(1, activation='sigmoid'))

     model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])   
    
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 1500, 50)          2174850   
                                                                 
 spatial_dropout1d (SpatialD  (None, 1500, 50)         0         
 ropout1D)                                                       
                                                                 
 gru (GRU)                   (None, 300)               316800    
                                                                 
 dense_2 (Dense)             (None, 1)                 301       
                                                                 
Total params: 2,491,951
Trainable params: 317,101
Non-trainable params: 2,174,850
_________________________________________________________________
CPU times: total: 31.2 ms
Wall time: 239 ms


In [26]:
# Batch size is 64 multiplied by the number of replicas in the strategy.
global_batch_size = 64 * strategy.num_replicas_in_sync
model.fit(xtrain_pad, ytrain, batch_size=global_batch_size, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x11b9aba7be0>

In [27]:
# Score the validation split with the GRU and report its ROC AUC.
scores = model.predict(xvalid_pad)
# roc_auc returns a value on a 0-1 scale, not a percentage, so no '%' sign is
# printed (the old format string rendered 0.96 as "0.96%").
print("Auc: %.2f" % (roc_auc(scores,yvalid)))

Auc: 0.96%


In [28]:
# Add the GRU's validation AUC to the results list.
gru_auc = roc_auc(scores, yvalid)
scores_model.append({'Model': 'GRU', 'AUC_Score': gru_auc})