In [1]:
# Import libraries
import nltk
import numpy as np
import tensorflow as tf
import pandas as pd
import csv
import string

from sklearn.model_selection import train_test_split      
from keras.models import Sequential
from keras import layers
from keras.preprocessing.text import Tokenizer                    
from keras.preprocessing.sequence import pad_sequences

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer 

# Fetch the NLTK data packages this notebook relies on.
# 'punkt': tokenizer models — presumably what word_tokenize / sent_tokenize load.
nltk.download('punkt')
# 'stopwords': corpus read later through stopwords.words('english').
nltk.download('stopwords')
# 'wordnet': presumably the lexical database behind WordNetLemmatizer.
nltk.download('wordnet')
# NOTE(review): no POS tagging is visible in this part of the notebook — confirm this package is still needed.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [2]:
# Meeting id -> open transcript file (model input).
# The handles are only opened here; whoever reads them is responsible for closing them.
data_X = {
  meeting: open(meeting + '.transcript.txt', 'r')
  for meeting in ('ES2002a', 'ES2002b', 'ES2002c', 'ES2002d')
}

# Meeting id -> open extractive-summary file (label source), same meetings in the same order.
data_Y = {
  meeting: open(meeting + '.extsumm.txt', 'r')
  for meeting in data_X
}

# Accumulators filled by the labelling loop: cleaned sentences and their 0/1 labels.
X, Y = [], []

lemmatizer = WordNetLemmatizer()
sb_stemmer = SnowballStemmer('english')

# Tokens to drop from every sentence: English stop words, spoken fillers, and
# every single punctuation character.
nltk_stopwords = stopwords.words('english')
custom_stopwords = ['hmm', 'um', 'uh-huh', 'okay', 'uh', 'yeah', 'mm-hmm', 'uhm']
filter_word = [*nltk_stopwords, *custom_stopwords, *string.punctuation]

# Clean one sentence: drop filtered tokens, lower-case and lemmatize the rest.
def stem(sentence):
  """Return `sentence` with filtered tokens removed and the rest lemmatized.

  Despite the name, this lemmatizes (via the module-level `lemmatizer`); it does
  not apply a stemmer.

  Args:
    sentence: raw sentence text; it is split with `word_tokenize`.

  Returns:
    A string in which every kept token is lower-cased, lemmatized and preceded
    by a single space (so a non-empty result starts with " "). Returns "" when
    no token survives the filter.
  """
  kept = []
  for word in word_tokenize(sentence):
    lowered = word.lower()
    # Filter on the lower-cased form: the custom filler words are written in
    # lower case, so testing the raw token let capitalised ones such as
    # "Okay" or "Yeah" through the filter.
    if lowered not in filter_word:
      kept.append(lemmatizer.lemmatize(lowered))

  # Keep the " tok1 tok2" layout (leading space) that callers already receive.
  return "".join(" " + token for token in kept)

# Check how many tokens of a summary sentence occur in a transcript sentence.
def compare(x_token, y_token):
  """Match transcript tokens against summary tokens without touching the inputs.

  Every token of `x_token` that is still present among the summary tokens
  consumes one occurrence of it.

  Args:
    x_token: list of tokens of the transcript sentence.
    y_token: list of tokens of the summary sentence. It is NOT modified; the
      function works on a copy.

  Returns:
    A tuple `(is_match, remaining)`:
      is_match  -- True when more than one summary token was consumed.
      remaining -- new list with the summary tokens that were not consumed.
        It is returned in both cases, so the caller can detect a fully
        consumed summary sentence by checking for an empty list.
  """
  # list(...) makes a real copy; a plain assignment would only alias the
  # caller's list and the removals below would leak out of this function.
  remaining = list(y_token)

  for x in x_token:
    if x in remaining:
      remaining.remove(x)

  matched = len(y_token) - len(remaining)
  return matched > 1, remaining

# Build X (cleaned transcript sentences) and Y (1 = sentence matches the
# current summary sentence, 0 = it does not) for every meeting.
total_tokens = 0      # token count of the raw transcript sentences
total_tokenized = 0   # token count after stem() has filtered them
for document in data_X:
  # Read both files and close the handles immediately instead of leaving
  # them open for the lifetime of the kernel.
  with data_X[document] as transcript_file, data_Y[document] as summary_file:
    document_x = transcript_file.read()
    document_y = summary_file.read()
  sentence_x = nltk.sent_tokenize(document_x)
  sentence_y = nltk.sent_tokenize(document_y)
  
  print("Document: {}, X: {}, Y: {}".format(document,len(sentence_x),len(sentence_y)))
  y_count = 0   # index of the summary sentence currently being matched
  for value_x in sentence_x:
    x_token = word_tokenize(value_x)
    if y_count < len(sentence_y):
      y_token = word_tokenize(sentence_y[y_count])
      is_decision, y_token = compare(x_token, y_token)
      # Move on to the next summary sentence once this one is fully consumed.
      if not y_token:
        y_count += 1
    else:
      # Every summary sentence has been consumed (or the summary is empty):
      # the remaining transcript sentences cannot match anything, and
      # indexing sentence_y here would raise IndexError.
      is_decision = False
    Y.append(1 if is_decision else 0)

    # Tokenise / clean each sentence once and reuse the results.
    stemmed = stem(value_x)
    X.append(stemmed)
    total_tokens += len(x_token)
    total_tokenized += len(word_tokenize(stemmed))



Document: ES2002a, X: 332, Y: 41
Document: ES2002b, X: 691, Y: 143
Document: ES2002c, X: 633, Y: 94
Document: ES2002d, X: 854, Y: 107


In [None]:
# Hold out 25% of the sentences for testing (75% train), stratified on the labels.
x_train, x_test, y_train, y_test = train_test_split(
  X, Y, test_size=0.25, stratify=Y, random_state=1000
)

# Fit the word index on the training sentences only.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(x_train)

# +1 because index 0 is reserved.
vocab_size = 1 + len(tokenizer.word_index)

# Turn each sentence into a sequence of word indices, post-padded to `maxlen`.
maxlen = 100
X_train = pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=maxlen, padding='post')
X_test = pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=maxlen, padding='post')

# Labels as numpy arrays.
y_train, y_test = np.array(y_train), np.array(y_test)

# Build an embedding matrix for our vocabulary from a GloVe-style text file.
def create_embedding_matrix(filepath, word_index, embedding_dim, encoding='utf-8'):
  """Look up pre-trained vectors for every word in `word_index`.

  Args:
    filepath: path to a text file with one entry per line, formatted as
      `word v1 v2 ...` separated by whitespace.
    word_index: mapping word -> row index (indices start at 1; row 0 is
      reserved).
    embedding_dim: number of leading vector components to keep per word.
    encoding: text encoding of the file. Defaults to UTF-8 so that reading
      does not depend on the platform's default encoding.

  Returns:
    A numpy array of shape `(len(word_index) + 1, embedding_dim)`. Rows of
    words that do not occur in the file (and row 0) stay all-zero.
  """
  # +1 because index 0 is reserved.
  vocab_size = len(word_index) + 1
  embedding_matrix = np.zeros((vocab_size, embedding_dim))

  with open(filepath, encoding=encoding) as f:
    for line in f:
      parts = line.split()
      if not parts:
        # Blank line (e.g. a trailing newline at end of file): nothing to unpack.
        continue
      word, *vector = parts
      if word in word_index:
        idx = word_index[word]
        # Convert only the components that are actually stored.
        embedding_matrix[idx] = np.asarray(vector[:embedding_dim], dtype=np.float32)

  return embedding_matrix

# Vector size to keep per word; the file name suggests 50-dimensional GloVe vectors.
embedding_dim = 50
embedding_matrix = create_embedding_matrix(
  filepath='glove.6B.50d.txt',
  word_index=tokenizer.word_index,
  embedding_dim=embedding_dim,
)


Train the CNN

In [None]:
from keras.models import Sequential
from keras import layers
import keras.backend as K

def get_f1(y_true, y_pred): #taken from old keras source code
  """F1 score of `y_pred` against `y_true`, built from Keras backend ops.

  Values are clipped to [0, 1] and rounded before counting, so predictions are
  effectively thresholded at 0.5. `K.epsilon()` is added to every denominator
  to avoid division by zero.

  Args:
    y_true: tensor of ground-truth labels.
    y_pred: tensor of predicted scores.

  Returns:
    A tensor holding 2 * precision * recall / (precision + recall).
  """
  def _rounded_sum(tensor):
    # Number of entries that round to 1 after clipping to [0, 1].
    return K.sum(K.round(K.clip(tensor, 0, 1)))

  eps = K.epsilon()
  hits = _rounded_sum(y_true * y_pred)          # true positives
  precision = hits / (_rounded_sum(y_pred) + eps)
  recall = hits / (_rounded_sum(y_true) + eps)
  return 2*(precision*recall)/(precision+recall+eps)

# Grid search over the Conv1D hyper-parameters. Every configuration is trained
# from scratch `n_runs` times and the test metrics are averaged.
kernel_size = [3,5,20]
num_fil = [4,8,64,128]
n_runs = 10   # single source of truth for the loop count and the averaging divisor

for f in num_fil:
  for k in kernel_size:
    f1 = 0
    precision = 0
    recall = 0
    for i in range(n_runs):
      # Many models are created in this loop; release the global Keras state
      # of the previous one so memory does not keep growing.
      K.clear_session()
      model = Sequential()
      # NOTE(review): `embedding_matrix` built earlier is not passed to this
      # layer, so the embedding is trained from scratch — confirm this is intended.
      model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
      model.add(layers.Conv1D(f, k, activation='relu'))
      model.add(layers.GlobalMaxPooling1D())
      model.add(layers.Dense(10, activation='relu'))
      model.add(layers.Dense(1, activation='sigmoid'))
      model.compile(optimizer='adam', loss='binary_crossentropy', metrics= [tf.keras.metrics.Precision(), tf.keras.metrics.Recall(), get_f1])
      history = model.fit(X_train, y_train, epochs=10, batch_size=10, verbose=0)
      # result = [loss, precision, recall, f1], following the order of `metrics` above.
      result = model.evaluate(X_test, y_test, verbose=0)
      precision += result[1]
      recall += result[2]
      f1 += result[3]

    print("Number of filter: ", f, ", Kernel size: ", k)
    print("Precision: ",precision/n_runs, "Recall: ", recall/n_runs ,"F1: ",f1/n_runs)  

#print(model.summary())

Number of filter:  4 , Kernel size:  3
Precision:  0.8082627534866333 Recall:  0.760638302564621 F1:  0.7663241446018219
Number of filter:  4 , Kernel size:  5
Precision:  0.8040926575660705 Recall:  0.758156031370163 F1:  0.7650553762912751
Number of filter:  4 , Kernel size:  20
Precision:  0.8066249251365661 Recall:  0.7585106432437897 F1:  0.7651508748531342
Number of filter:  8 , Kernel size:  3
Precision:  0.8145659565925598 Recall:  0.7499999940395355 F1:  0.7653832733631134
Number of filter:  8 , Kernel size:  5
Precision:  0.8086940050125122 Recall:  0.7482269465923309 F1:  0.7613737165927887
Number of filter:  8 , Kernel size:  20
Precision:  0.8004379391670227 Recall:  0.7641843974590301 F1:  0.7660239279270172
Number of filter:  64 , Kernel size:  3
Precision:  0.8057088673114776 Recall:  0.7496453881263733 F1:  0.7623980522155762
Number of filter:  64 , Kernel size:  5
Precision:  0.7957610845565796 Recall:  0.7695035457611084 F1:  0.7674879789352417
Number of filter:  64 