In [11]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout
import re
import nltk
import chart_studio
from nltk.corpus import stopwords
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))
from bs4 import BeautifulSoup
import plotly.graph_objects as go
import chart_studio.plotly as py
import cufflinks
from IPython.core.interactiveshell import InteractiveShell
import plotly.figure_factory as ff
InteractiveShell.ast_node_interactivity = 'all'
from chart_studio.plotly import iplot
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')
from keras import backend as K

In [19]:
data_location_ori = 'data/ISLANDORA/ISLANDORA.csv'

df = pd.read_csv(data_location_ori) # 원본 데이터

df['text'] = list(df.title + " " + df.description)

refined_data = []
for item in df['text']:
    #1. Remove \r 
    current_desc = item.replace('\r', ' ')    
    #2. Remove URLs
    current_desc = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', current_desc)    
    #4. Remove hex code
    current_desc = re.sub(r'(\w+)0x\w+', '', current_desc) 
    #5. Change to lower case
    current_desc = current_desc.lower()   
    #6. Tokenize
    #current_desc_tokens = tokenizer(current_desc, add_special_tokens= True)
    #7. Strip trailing punctuation marks
    #current_desc_filter = [word.strip(string.punctuation) for word in current_desc_tokens]
    #8. Join the lists
    #current_data = current_desc_filter
    #current_data = list(filter(None, current_data))
    refined_data.append(current_desc)
df['text'] = refined_data

for x in range(len(df.component)):
    df.component[x] = df.component[x].split(',')[0]

print(df.text.head())

Y = pd.get_dummies(df[set(df.component)])
print(df.component.value_counts())


df = df.reset_index(drop=True)

# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 25000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(df['text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

X = df['text'].sample(frac=1).reset_index(drop=True)
X = tokenizer.texts_to_sequences(X.values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

0    when a user is created we will create an objec...
1    create a script to install islandora demo obje...
2    port to drupal 6 port the islandora core code ...
3    update the islandora drupal form api code to w...
4    update the islandora drupal menu/path array's ...
Name: text, dtype: object
Islandora Module (Core)     285
Solr Search                 140
XML Form Builder            102
Book Solution Pack           72
Scholar Module               66
                           ... 
Simple Workflow               3
Object Field (PID Field)      2
XACML Policies                1
Sync (Deprecated)             1
Objective Forms               1
Name: component, Length: 66, dtype: int64
Found 7612 unique tokens.
Shape of data tensor: (1428, 250)
(1142, 250) (1142, 66)
(286, 250) (286, 66)


In [13]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.5, recurrent_dropout=0.2))
model.add(Dense(37, activation='sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[tf.keras.metrics.Recall(top_k = 5)])
print(model.summary())

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 250, 100)          2500000   
                                                                 
 spatial_dropout1d_5 (Spatia  (None, 250, 100)         0         
 lDropout1D)                                                     
                                                                 
 lstm_5 (LSTM)               (None, 100)               80400     
                                                                 
 dense_5 (Dense)             (None, 37)                3737      
                                                                 
Total params: 2,584,137
Trainable params: 2,584,137
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
from tensorflow.keras import activations

In [14]:
def run_model(topK):
    model = Sequential()
    model.add(Embedding(25000, EMBEDDING_DIM, input_length=X.shape[1]))
    
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(100, dropout=0.5, recurrent_dropout=0.2))
    #model.add(LSTM(100, return_sequences=True))
    model.add(Dense(37, activation='sigmoid'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[tf.keras.metrics.Recall(top_k = topK)])
    print(model.summary())

    epochs = 10
    batch_size = 80

    history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

In [15]:
df_history = []
for topk in range(10, 16, 5):
    run_model(topk)
    accr = model.evaluate(X_test,Y_test)
    print('Test set\n  Loss: {:0.3f}\n  recall: {:0.3f}'.format(accr[0],accr[1]))

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 250, 100)          2500000   
                                                                 
 spatial_dropout1d_6 (Spatia  (None, 250, 100)         0         
 lDropout1D)                                                     
                                                                 
 lstm_6 (LSTM)               (None, 100)               80400     
                                                                 
 dense_6 (Dense)             (None, 37)                3737      
                                                                 
Total params: 2,584,137
Trainable params: 2,584,137
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Test set
  Loss: 

In [7]:
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  recall: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 3.951
  recall: 0.149


In [22]:
df['component'].value_counts().sort_values(ascending=False).iplot(kind = 'bar', yTitle = 'Number of Component', title = '# of Islandora dataset component')

In [35]:
df_history[1]

{'loss': [3.3167531490325928,
  3.166748523712158,
  3.1394078731536865,
  3.059082508087158,
  2.839726448059082,
  2.5945162773132324,
  2.3461005687713623,
  2.126883029937744,
  1.9392240047454834,
  1.765804409980774],
 'recall': [0.5076357126235962,
  0.5295308232307434,
  0.529714822769165,
  0.5459061861038208,
  0.6318307518959045,
  0.7000920176506042,
  0.7538178563117981,
  0.7858325839042664,
  0.8154553771018982,
  0.841766357421875],
 'val_loss': [3.2292885780334473,
  3.209724187850952,
  3.1954212188720703,
  3.1329751014709473,
  3.1253695487976074,
  2.9463164806365967,
  2.9692487716674805,
  2.895751714706421,
  2.859379768371582,
  2.9884543418884277],
 'val_recall': [0.5340909361839294,
  0.524350643157959,
  0.5211039185523987,
  0.5324675440788269,
  0.5389610528945923,
  0.6071428656578064,
  0.5827922224998474,
  0.625,
  0.6396104097366333,
  0.6412337422370911]}

In [33]:
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  recall: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 2.940
  recall: 0.665


In [None]:
def run_model(topK):
    model = Sequential()
    model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(37, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[tf.keras.metrics.Recall(top_k = topK)])
    print(model.summary())

    epochs = 20
    batch_size = 64

    history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
    df_history.append(history.history)

In [40]:
filter_length = 300
num_classes = 37

modelCNN = tf.keras.Sequential()
modelCNN.add(Embedding(MAX_NB_WORDS, 20, input_length= MAX_SEQUENCE_LENGTH))
modelCNN.add(Dropout(0.1))
modelCNN.add(tf.keras.layers.Conv1D(filter_length, 3, padding = 'valid', activation = 'relu', strides = 1))
modelCNN.add(tf.keras.layers.GlobalMaxPool1D())
modelCNN.add(Dense(num_classes))
modelCNN.add(tf.keras.layers.Activation('sigmoid'))

modelCNN.compile(optimizer = 'adam', loss  = 'binary_crossentropy', metrics = [tf.keras.metrics.Recall(top_k = 5)])

modelCNN.summary()
modelCNN.fit(X_train, Y_train, epochs=10, batch_size = 64, validation_split=0.1)

accr = modelCNN.evaluate(X_test, Y_test)
print('Test set\n  Loss: {:0.3f}\n  recall: {:0.3f}'.format(accr[0],accr[1]))

Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, 250, 20)           1000000   
_________________________________________________________________
dropout_4 (Dropout)          (None, 250, 20)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 248, 300)          18300     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 300)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 37)                11137     
_________________________________________________________________
activation_3 (Activation)    (None, 37)                0         
Total params: 1,029,437
Trainable params: 1,029,437
Non-trainable params: 0
___________________________________________

<tensorflow.python.keras.callbacks.History at 0x1df9fce8d48>

Test set
  Loss: 0.074
  recall: 0.786
