# Experiment initialization
Experiment variables:
- learning rate
- batch size
- number of epochs
- maximum sequence length

In [1]:
# Experiment parameters shared by the cells below.
# Step size handed to the Adam and SGD optimizers.
LEARNING_RATE = 1e-3
# Number of epochs passed to every model.fit call.
EPOCH = 10
# Length every token sequence is padded/truncated to by pad_sequences.
MAX_SEQ_LEN = 50
# NOTE(review): not referenced by the visible training cells, which pass
# literal batch sizes (1000 / 400) to model.fit -- confirm whether it should be used.
BATCH_SIZE = 1

# Loading Required Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import re

# Importing required libraries
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

# keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

from keras.layers import Embedding

from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D, Dropout,concatenate
from keras.layers.core import Reshape, Flatten
from keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import SGD, RMSprop, Adagrad, Adadelta, Adam, Adamax, Nadam
from keras.models import Model
from keras import regularizers

# gensim
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

from gensim.models.keyedvectors import KeyedVectors

In [3]:
# Colab only: mount Google Drive at /content/drive so the data files are reachable.
from google.colab import drive
drive.mount('/content/drive')

# Base directory (note the trailing slash) that later cells prepend to the
# CSV and GloVe file names via plain string concatenation.
path = '/content/drive/My Drive/semester 7/nlp/data_worthcheck/'

KeyboardInterrupt: ignored

In [None]:
from sklearn.preprocessing import LabelEncoder

# Define the text-cleaning function and the loader for the train/validation/test datasets
def cleantxt(txt):
    """
    Clean a raw text string.

    Steps, applied in this order:
    1. replace newlines with spaces
    2. drop every span that starts with ``<`` or ``[`` and ends at the
       next ``>`` or ``]`` (non-greedy)
    3. convert text to lower-case
    4. retain only the letters a-z and spaces
    5. remove words of 1 to 3 characters
    6. remove English stop-words (NLTK list plus 'www', 'http', 'utc')

    Args:
        txt: the string to clean.

    Returns:
        The cleaned string, with the surviving words joined by single spaces.
    """
    # English stop words from NLTK plus a few custom ones; kept as a set so
    # the membership test in the final filter is cheap.
    stpw = set(stopwords.words('english'))
    stpw.update(['www', 'http', 'utc'])

    txt = re.sub(r"\n", " ", txt)
    # Raw string: the same regex as before, but without relying on Python
    # passing the invalid escapes "\<" and "\[" through unchanged (which
    # emits a warning on recent interpreters).
    txt = re.sub(r"[\<\[].*?[\>\]]", " ", txt)
    txt = txt.lower()
    txt = re.sub(r"[^a-z ]", " ", txt)
    # {1,3} removes words of up to three characters.
    txt = re.sub(r"\b\w{1,3}\b", " ", txt)
    return " ".join(word for word in txt.split() if word not in stpw)


def load_data():
    """
    Load the train and test CSV files and return train, val, and test splits.

    Reads ``train.csv`` (first column used as index) and ``test.csv`` from
    the module-level ``path``. Labels are integer-encoded with a
    LabelEncoder that is fitted on the training labels only and then reused
    for the test labels, so both sets share one label-to-integer mapping.
    Training rows with a missing ``text_a`` are dropped, duplicate
    ``text_a`` rows are removed, and the remainder is split 60/40 into
    train and validation with a fixed random seed.

    Returns:
        Tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)`` where the
        ``X_*`` values are ``text_a`` columns and the ``y_*`` values are the
        encoded ``label`` columns.

    Raises:
        ValueError: if ``test.csv`` contains a label that does not occur in
            ``train.csv`` (raised by ``LabelEncoder.transform``).
    """
    le = LabelEncoder()
    ps_train = pd.read_csv(path+'train.csv', index_col=0)
    ps_train['label'] = le.fit_transform(ps_train['label'])

    ps_test = pd.read_csv(path+'test.csv')
    # transform, not fit_transform: re-fitting on the test labels could give
    # the test set a different mapping than the one the model was trained on.
    ps_test['label'] = le.transform(ps_test['label'])

    data_train = ps_train.dropna(subset=['text_a'])
    data_train_single = data_train.drop_duplicates('text_a')

    X_train, X_val, y_train, y_val = train_test_split(
        data_train_single['text_a'],
        data_train_single['label'],
        test_size=0.4,
        random_state=123,
    )
    X_test = ps_test['text_a']
    y_test = ps_test['label']

    return X_train, X_val, X_test, y_train, y_val, y_test


# Implementation of CNN and RNN with word embeddings (word2vec and GloVe) for Multi-Label text classification


## 1. Loading Data

In [None]:
# Load the text (X_*) and encoded label (y_*) series for the train,
# validation and test splits.
X_train, X_val, X_test, y_train, y_val, y_test = load_data()

### 2. Tokenize text of the training data with keras text preprocessing functions ###

In [None]:
# Maximum number of words to be embedded (handed to the tokenizer as num_words)
NUM_WORDS = 30000

# Lower-casing tokenizer that strips the listed punctuation and whitespace
# characters, fitted on the training texts only.
tokenizer = Tokenizer(
    num_words=NUM_WORDS,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'',
    lower=True,
)
tokenizer.fit_on_texts(X_train)

# word -> integer index mapping learned from the training texts
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
# Encode the train and validation texts as lists of word indices using the
# tokenizer fitted on the training data.
sequences_train, sequences_valid = map(
    tokenizer.texts_to_sequences, (X_train, X_val)
)

In [None]:
# Pad/truncate the training sequences to MAX_SEQ_LEN, then give the
# validation sequences the same width as the training matrix.
X_train = pad_sequences(sequences_train, maxlen=MAX_SEQ_LEN)
X_val = pad_sequences(sequences_valid, maxlen=X_train.shape[1])

# Targets as plain arrays
y_train, y_val = np.asarray(y_train), np.asarray(y_val)

# Report the resulting shapes
print('Shape of X train and X validation tensor:', X_train.shape, X_val.shape)
print('Shape of label train and validation tensor:', y_train.shape, y_val.shape)

# word embedding

## Use pretrained GloVe model from Stanford  <http://nlp.stanford.edu/data/glove.6B.zip>
## Contains 300-dimensional vectors for 0.4 million words and phrases (the code below currently loads the smaller 50-dimensional file, glove.6B.50d.txt)

In [None]:
# Map each GloVe token to its float32 coefficient vector.
word_vectors = dict()

# Load the whole embedding into memory. The 300d file is the alternative:
# path+'glove.6B/glove.6B.300d.txt'
# The smaller 50d file is used for now to keep the download light.
# `with` guarantees the file is closed even if a line fails to parse.
with open(path+'glove.6B/glove.6B.50d.txt', encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_vectors[word] = coefs
print('Loaded %s word vectors.' % len(word_vectors))

In [None]:
# Width of one embedding row; each loaded GloVe vector is copied into a row
# of this width, so it has to match the file loaded above.
EMBEDDING_DIM = 50
# One row per usable word index (index 0 included), capped at NUM_WORDS.
vocabulary_size = min(len(word_index) + 1, NUM_WORDS)

# Rows start as zeros; words without a pretrained vector keep their zero row.
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))

for word, i in word_index.items():
    if i < NUM_WORDS:
        embedding_vector = word_vectors.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# The raw GloVe dictionary is no longer needed.
del word_vectors

In [None]:
# Define the Embedding layer, initialised from embedding_matrix and left
# trainable so the vectors are updated during training.
embedding_layer = Embedding(
    input_dim=vocabulary_size,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    trainable=True,
)

# The layer was given the initial weights above; drop the module-level name.
del embedding_matrix

# Build network and train it with specific parameter



In [None]:
# Hyper-parameters of the convolutional text classifier.
sequence_length = X_train.shape[1]
filter_sizes = [3,4]   # heights (in tokens) of the convolution windows
num_filters = 100      # filters per window size
drop = 0.4             # dropout rate applied before the dense layers

inputs = Input(shape=(sequence_length,))
embedding = embedding_layer(inputs)
# Add a trailing channel axis so the embedded sequence can be fed to Conv2D.
reshape = Reshape((sequence_length, EMBEDDING_DIM, 1))(embedding)

# One convolution + max-pool branch per entry of filter_sizes. Each kernel
# spans the full embedding width, and the pool window covers every remaining
# position, leaving one value per filter.
pooled_outputs = []
for filter_size in filter_sizes:
    conv = Conv2D(
        num_filters,
        (filter_size, EMBEDDING_DIM),
        activation='relu',
        kernel_regularizer=regularizers.l2(0.01),
    )(reshape)
    pooled = MaxPooling2D(
        (sequence_length - filter_size + 1, 1), strides=(1, 1)
    )(conv)
    pooled_outputs.append(pooled)

merged_tensor = concatenate(pooled_outputs, axis=1)
flatten = Flatten()(merged_tensor)
# The original built an extra Reshape layer on `flatten` here whose output
# was never used; it has been removed.
dropout = Dropout(drop)(flatten)
# Dense layer without an activation argument, as in the original.
conc = Dense(40)(dropout)
output = Dense(units=1, activation='sigmoid', kernel_regularizer=regularizers.l2(0.01))(conc)

# Model mapping the padded index sequences to a single sigmoid output.
model = Model(inputs, output)

In [None]:
# Compile the model with Adam. `learning_rate` replaces the deprecated
# `lr` keyword alias.
opt = Adam(learning_rate=LEARNING_RATE)
model.compile(loss='binary_crossentropy', optimizer=opt)

# Fit on the training data and evaluate on the validation split after each
# epoch. No early stopping is used for this run.
hist_adam = model.fit(X_train, y_train, batch_size=1000, epochs=EPOCH, verbose=2, validation_data=(X_val, y_val))  # starts training

In [None]:
# Training vs. validation loss per epoch for the Adam run.
for history_key, line_color, line_label in (
    ('loss', 'b', 'Training Loss'),
    ('val_loss', 'r', 'Validation Loss'),
):
    plt.plot(hist_adam.history[history_key], color=line_color, label=line_label)
plt.xlabel('Epoch', fontsize=14)
plt.ylabel('Loss', fontsize=16)
plt.suptitle('Optimizer : Adam', fontsize=10)
plt.legend(loc='upper right')

In [None]:
# Compile the model with SGD + momentum.
# `lr=` and `decay=` are deprecated optimizer keywords. The legacy `decay`
# scaled the rate as lr / (1 + decay * iterations), which is what
# InverseTimeDecay with decay_steps=1 computes.
from tensorflow.keras.optimizers.schedules import InverseTimeDecay

lr_schedule = InverseTimeDecay(LEARNING_RATE, decay_steps=1, decay_rate=0.0001)
opt = SGD(learning_rate=lr_schedule, momentum=0.9)
# NOTE(review): `model` is the same object that was fitted in the previous
# cell, so this run starts from the weights that run left behind -- confirm
# that is intended for the optimizer comparison.
model.compile(loss='binary_crossentropy', optimizer=opt)

# Fit on the training data and evaluate on the validation split after each
# epoch. No early stopping is used for this run.
hist_sgd = model.fit(X_train, y_train, batch_size=1000, epochs=EPOCH, verbose=2, validation_data=(X_val, y_val),)  # starts training

In [None]:
# Training vs. validation loss per epoch for the SGD run.
for history_key, line_color, line_label in (
    ('loss', 'b', 'Training Loss'),
    ('val_loss', 'r', 'Validation Loss'),
):
    plt.plot(hist_sgd.history[history_key], color=line_color, label=line_label)
plt.xlabel('Epoch', fontsize=14)
plt.ylabel('Loss', fontsize=16)
plt.suptitle('Optimizer : SGD', fontsize=10)
plt.legend(loc='upper right')

In [None]:
# Compile the model with Adadelta using the optimizer's default settings.
opt = Adadelta()
model.compile(optimizer=opt, loss='binary_crossentropy')

# Fit with early stopping that monitors the validation loss.
callbacks = [EarlyStopping(monitor='val_loss')]
hist_adad = model.fit(
    X_train,
    y_train,
    batch_size=400,
    epochs=EPOCH,
    verbose=2,
    validation_data=(X_val, y_val),
    callbacks=callbacks,
)  # starts training

In [None]:
# Training vs. validation loss per epoch for the Adadelta run.
for history_key, line_color, line_label in (
    ('loss', 'b', 'Training Loss'),
    ('val_loss', 'r', 'Validation Loss'),
):
    plt.plot(hist_adad.history[history_key], color=line_color, label=line_label)
plt.xlabel('Epoch', fontsize=14)
plt.ylabel('Loss', fontsize=16)
plt.suptitle('Optimizer : Adadelta', fontsize=10)
plt.legend(loc='upper right')

# Best Model

In [None]:
# Compile the model with Adam. `learning_rate` replaces the deprecated
# `lr` keyword alias.
opt = Adam(learning_rate=LEARNING_RATE)
# NOTE(review): `model` is the same object fitted in the cells above, so
# this run starts from the weights those runs left behind -- confirm that
# is intended for the "best model".
model.compile(loss='binary_crossentropy', optimizer=opt)

# Fit on the training data and evaluate on the validation split after each epoch.
model.fit(X_train, y_train, batch_size=400, epochs=EPOCH, verbose=2, validation_data=(X_val, y_val))  # starts training

In [None]:
# Print the model summary.
model.summary()

# Predict on test data

In [None]:
# Encode the test texts with the tokenizer fitted on the training data and
# pad them to the same width as the training matrix.
sequences_test = tokenizer.texts_to_sequences(X_test)
X_test2 = pad_sequences(
    sequences_test,
    maxlen=X_train.shape[1],
)

In [None]:
from sklearn.metrics import classification_report

# Predict on the test dataset (only the test split is scored here).
pred_test = model.predict(X_test2)
# Threshold the single sigmoid output at 0.5 in one vectorised step. This
# yields integer labels regardless of what Python's round() returns for a
# NumPy scalar, and matches round() for probabilities in [0, 1]
# (exactly 0.5 maps to class 0, as round-half-to-even did).
pred_test_class = (pred_test[:, 0] > 0.5).astype(int)

print(classification_report(y_test, pred_test_class))