<a href="https://colab.research.google.com/github/andy311p/Udemy_advanced_NLP/blob/master/NLP_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Installations
!pip install kaggle
!cp drive/My\ Drive/nlp_course/kaggle.json /root/.kaggle/
!git clone https://github.com/lazyprogrammer/machine_learning_examples.git

In [None]:
#Imports and HyperParameters
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPool1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from sklearn.metrics import roc_auc_score

# Length every tokenized comment is padded/truncated to; also the model input length.
MAX_SEQUENCE_LENGTH = 100
# Upper bound on the vocabulary kept by the tokenizer and on embedding matrix rows.
MAX_VOCAB_SIZE = 20000
# Width of each word vector; also selects which glove.6B.<dim>d.txt file is read.
EMBEDDING_DIM = 100
# Fraction of the data passed to model.fit as validation_split.
VALIDATION_SPLIT = 0.2
# Mini-batch size passed to model.fit.
BATCH_SIZE = 128
# Number of training epochs passed to model.fit.
EPOCHS = 10

In [None]:
#load word vectors
# Build a word -> vector lookup from the pre-trained GloVe text file. Each
# non-empty line is split on whitespace: the first field is the token and the
# remaining fields are parsed as its float32 components.
print("Loading word vectors START")
word2vec = {}
glove_path = os.path.join('drive/My Drive/nlp_course', 'glove.6B.%sd.txt' % EMBEDDING_DIM)
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open(glove_path, encoding='utf-8') as f:
  for line in f:
    fields = line.split()
    if not fields:
      # A blank line (e.g. a trailing newline) has no token to index.
      continue
    word2vec[fields[0]] = np.asarray(fields[1:], dtype='float32')
print("Loaded %s word vectors" % len(word2vec))


Loading word vectors START
Loaded 17117 word vectors


In [None]:
#Load training data
# Read the labelled comments and split them into raw text and a label matrix.
print("Loading training data START")
train = pd.read_csv("drive/My Drive/nlp_course/train.csv")

# Every column after the first two is treated as a label column.
# (.values turns the pandas objects into plain numpy arrays.)
possible_labels = train.columns.values[2:]
targets = train.loc[:, possible_labels].values
sentences = train.loc[:, 'comment_text'].values
# Uncomment to inspect the longest comment (in characters):
#print("max length: ", max(len(s) for s in sentences))

Loading training data START


In [None]:
#Pre process data
# Fit a tokenizer on the comments, then turn every comment into a fixed-length
# sequence of integer word indices.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)

# word -> index dictionary learned by the tokenizer
word2idx = tokenizer.word_index

# tokenized sentences (variable length)
sequences = tokenizer.texts_to_sequences(sentences)
# padded tokenized sentences (MAX_SEQUENCE_LENGTH entries each)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Uncomment to compare a raw sequence with its padded form:
# print(sequences[0])
# print(data[0])

In [None]:
#Prepare embedding matrix
# Row i holds the pre-trained vector of the word whose tokenizer index is i;
# rows for words without a pre-trained vector (and row 0) stay all-zero.
# The actual vocabulary size is capped at MAX_VOCAB_SIZE; +1 accounts for padding.
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, idx in word2idx.items():
  if idx >= MAX_VOCAB_SIZE:
    # Index falls outside the capped vocabulary.
    continue
  if word not in word2vec:
    continue
  embedding_matrix[idx] = word2vec[word]

In [None]:
#Embedding layer
# Initialised from the pre-built embedding matrix and kept frozen
# (trainable=False), so the vectors are not updated during training.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

#model architecture
# 1-D CNN over the embedded token sequence: two conv + max-pool stages, a third
# conv layer reduced by global max pooling, then a dense head with one sigmoid
# output per label.
input_ = Input(shape=(MAX_SEQUENCE_LENGTH,))
x = embedding_layer(input_)
for _stage in range(2):
    x = Conv1D(128, 3, activation='relu')(x)
    x = MaxPooling1D(3)(x)
x = Conv1D(128, 3, activation='relu')(x)
x = GlobalMaxPool1D()(x)
x = Dense(128, activation='relu')(x)
output = Dense(len(possible_labels), activation='sigmoid')(x)

model = Model(inputs=input_, outputs=output)
# Binary cross-entropy is applied to each sigmoid output.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Fit the model; the returned History object (res) keeps the per-epoch metrics.
print("Training time.....")
res = model.fit(
    x=data,
    y=targets,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT,
)

Training time.....
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f348ec0ac50>

In [None]:
# Training curves: one figure for the losses, then one for the accuracies.
# Each entry is (train key, validation key, train label, validation label).
history_curves = (
    ('loss', 'val_loss', 'loss', 'val_loss'),
    ('accuracy', 'val_accuracy', 'acc', 'val_acc'),
)
for train_key, val_key, train_label, val_label in history_curves:
    plt.plot(res.history[train_key], label=train_label)
    plt.plot(res.history[val_key], label=val_label)
    plt.legend()
    plt.show()

# print the mean ROC AUC over the labels
# Predictions are made on the whole of `data`, which was also passed to
# model.fit, so this score is not a held-out estimate.
p = model.predict(data)
# One AUC per label column; the label count comes from `targets` rather than a
# hard-coded 6, so the cell keeps working if the label set changes.
aucs = [
    roc_auc_score(targets[:, j], p[:, j])
    for j in range(targets.shape[1])
]
print(np.mean(aucs))

NameError: ignored

[[[-0.5801525   0.06808588 -0.04721453]
  [ 0.32034228 -0.62727557 -2.44219065]
  [ 0.23404343  1.38661925  0.07614053]]

 [[ 0.83267963  0.19208428  1.02801091]
  [-0.88827598 -1.47248712 -0.57494242]
  [ 0.11928063 -0.08006667 -2.4718639 ]]

 [[-0.04368834 -0.26788151  0.65371636]
  [ 1.62533369  0.80078791  0.06781377]
  [ 1.03668321  0.50337205 -1.04663385]]]
