In [6]:
# load raw data.
import os
train_data_path = os.path.join("data", "blurbs_train.txt")
test_data_path = os.path.join("data", "blurbs_test.txt")


In [8]:
from bs4 import BeautifulSoup
parser = BeautifulSoup(open(train_data_path, "r", encoding="utf-8"), "html.parser")
parser.find_all("book")

[<book date="2019-01-04" xml:lang="de">
 <title>Die Klinik</title>
 <body>Ein Blick hinter die Kulissen eines Krankenhauses vom Autor der Bestseller "Der Medicus" und "Der Medicus von Saragossa". Der Wissenschaftler Adam Silverstone, der kubanische Aristokrat Rafael Meomartino und der Farbige Spurgeon Robinson - sie sind drei grundverschiedene Klinik-Ärzte, die unter der unerbittlichen Aufsicht von Dr. Longwood praktizieren. Eines Tages stirbt eine Patientin, und Dr. Longwood wittert einen Behandlungsfehler. Sofort macht er sich auf die Suche nach einem Schuldigen, dem er die Verantwortung in die Schuhe schieben könnte ...</body>
 <copyright>(c) Verlagsgruppe Random House GmbH</copyright>
 <categories>
 <category>
 <topic d="0">Literatur &amp; Unterhaltung</topic>
 <topic d="1" label="True">Romane &amp; Erzählungen</topic>
 </category>
 </categories>
 <authors>Noah Gordon</authors>
 <published>2013-12-02</published>
 <isbn>9783641136291</isbn>
 <url>https://www.randomhouse.de/ebook/Die

In [None]:
#load train or test data
from bs4 import BeautifulSoup

def load_data(path):
    doc = open(path,encoding='utf8').read()
    xmldata = BeautifulSoup(doc, "html.parser")
    book_entries = xmldata.findAll("book")

    data = []
    for book in book_entries:
        parse_xml = BeautifulSoup(str(book), "html.parser")
        blurb = str(parse_xml.find('body').string)
        topcategory = str(parse_xml.find("topic",{"d":"0"}).string)
        data.append((blurb, topcategory))

    return data

#train data
train_data = load_data('blurbs_train.txt')
text_train = [text[0] for text in train_data]
y_train = [text[1] for text in train_data]

#dev/validation data
validation_data = load_data('blurbs_dev.txt')
text_validation = [text[0] for text in validation_data]
y_validation = [text[1] for text in validation_data]


#test data
test_data = load_data('blurbs_test.txt')
text_test = [text[0] for text in test_data]
y_test = [text[1] for text in test_data]

AttributeError: ignored

In [None]:
labels = [y_train, y_validation, y_test]
titles = ['Train Label', 'Validation Label', 'Test Label']

plt.figure(figsize=(20,8))

#plot train label
ax = plt.subplot(1,3,1)
plt.hist(x=y_train, bins=15)
plt.title('Train Label Histogram',fontsize=15)
plt.xlabel('Category')
plt.ylabel('Frequency')
plt.xticks(rotation=90)

#plot validation label
plt.subplot(1,3,2)
plt.hist(x=y_validation, bins=15)
plt.title('Holdout Label Histogram',fontsize=15)
plt.xlabel('Category')
plt.ylabel('Frequency')
plt.xticks(rotation=90)

#plot test label
plt.subplot(1,3,3)
plt.hist(x=y_test, bins=15)
plt.title('Test Label Histogram',fontsize=15)
plt.xlabel('Category')
plt.ylabel('Frequency')
plt.xticks(rotation=90)

pickle.dump(ax, open('myplot.pickle', 'wb'))
plt.close()

**Sebaran label dalam data training, validation dan test**

Grafik di bawah menunjukan sebaran target label atau kategori buku dalam dataset

In [None]:
ax = pickle.load(open('myplot.pickle','rb'))
plt.show()

In [None]:
from transformers import BertTokenizer, BertModel
import numpy as np

#Load pre-trained BERT tokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-german-cased', do_lower_case=True)
#Load pre-trained BERT model
bert_model = BertModel.from_pretrained('bert-base-german-cased', output_hidden_states=True)
bert_model.eval()


def get_tokenize_data(docs):
    sentence_tokens = []
    
    for text in docs:
        encoded = bert_tokenizer.encode_plus(
            text = text,
            add_special_tokens=True,
            max_length=512,
            pad_to_max_length=True,
            return_attention_mask=False
        )
        encode_token = np.array(encoded.get('input_ids'))
        sentence_tokens.append(encode_token)
    
    return np.array(sentence_tokens)

def get_context_feature_concat_lastfourlayer(tokens):
    features = []
    for token in sentence_tokens:
        indexed_tokens = bert_tokenizer.convert_tokens_to_ids(token)
        segments_ids = [1] * len(token)
        
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        
        with torch.no_grad():
            outputs = bert_model(tokens_tensor, segments_tensors)
            hidden_states = outputs[2]
        
        token_embeddings = torch.stack(hidden_states, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        token_embeddings = token_embeddings.permute(1,0,2)
        
        token_vecs_cat = []
        
        for token in token_embeddings:
            cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)
            token_vecs_cat.append(cat_vec)
            
        features.append(token_vecs_cat)
        
    return np.array(features)

def get_context_feature_concat_sumfourlayer(sentence_tokens):
    features = []
    for token in sentence_tokens:
        indexed_tokens = bert_tokenizer.convert_tokens_to_ids(token)
        segments_ids = [1] * len(token)
        
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        
        with torch.no_grad():
            outputs = bert_model(tokens_tensor, segments_tensors)
            hidden_states = outputs[2]
        
        token_embeddings = torch.stack(hidden_states, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        token_embeddings = token_embeddings.permute(1,0,2)
        
        token_vecs_sum = []
        
        for token in token_embeddings:
            sum_vec = torch.sum(token[-4:], dim=0)
            token_vecs_sum.append(sum_vec.numpy())
            
        features.append(token_vecs_sum)
        
    return np.array(features)
            
        

#def get_context_feature_sum_alllayers(tokens):
    
#def get_context_feature_lasthiddenlayers(tokens):

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=254728.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=438869143.0, style=ProgressStyle(descri…




In [None]:
from keras.utils import to_categorical

categories = list(set(y_train))
num_class = len(categories)
#Convert label to categorical
def label_to_categorical(y):
    
    target_train = []
    for target in y:
        target_train.append(categories.index(target))
    
    target_train = to_categorical(np.array(target_train), num_classes=num_class)
    
    return target_train

In [None]:
print(np.array(text_train).shape)
#Tokenize docs
train_tokens = get_tokenize_data(text_train)
validation_tokens = get_tokenize_data(text_validation)
test_tokens = get_tokenize_data(text_test)

#Convert label to categorical
Y_train = label_to_categorical(y_train)
Y_validation = label_to_categorical(y_validation)
Y_test = label_to_categorical(y_test)

print(train_tokens.shape)
print(validation_tokens.shape)
print(test_tokens.shape)

print(Y_train.shape)
print(Y_validation.shape)
print(Y_test.shape)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


(14548,)




(14548, 512)
(1840, 512)
(2307, 512)
(14548, 8)
(1840, 8)
(2307, 8)


**Penanganan Imbalance dataset**


Berdasarkan hasil grafik di atas dapat dilihat bahwa dalam keseluruhan dataset, kategori buku Literatur & Unterhaltung mendominasi. Untuk menangani ini, oversampling akan dilakukan. Dengan menduplikasi data yang minor diharapkan akan didapat sebaran yang lebih merata. Undersampling tidak diambil untuk menangani imbalance data disini, karena dengan undersampling data dengan major kategori Literatur & Unterhaltung akan dihapus sehingga akan menghilangkan beberapa informasi.

In [None]:
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
import numpy as np

print(Counter(y_train))

# BERT with no Over Sampling
X_train_normal = train_tokens
Y_train_normal = label_to_categorical(y_train)

# BERT with Over Sampling
oversample = RandomOverSampler()
X_over, y_over = oversample.fit_resample(train_tokens, y_train)
# summarize class distribution
print(Counter(y_over))

Y_over = label_to_categorical(y_over)



Counter({'Literatur & Unterhaltung': 7622, 'Sachbuch': 1999, 'Kinderbuch & Jugendbuch': 1897, 'Ratgeber': 1630, 'Ganzheitliches Bewusstsein': 638, 'Glaube & Ethik': 502, 'Künste': 133, 'Architektur & Garten': 127})
Counter({'Literatur & Unterhaltung': 7622, 'Ratgeber': 7622, 'Künste': 7622, 'Sachbuch': 7622, 'Glaube & Ethik': 7622, 'Kinderbuch & Jugendbuch': 7622, 'Ganzheitliches Bewusstsein': 7622, 'Architektur & Garten': 7622})


In [None]:
#Training

from keras.models import Sequential
from keras.layers import Embedding, Dense, Conv1D, MaxPooling1D, Dropout, Flatten

#Create model
vocab_size = len(bert_tokenizer.vocab.keys())
max_length = 512

cnn_model_bert_no_over = Sequential()
cnn_model_bert_no_over.add(Embedding(vocab_size, 100, input_length=max_length))
cnn_model_bert_no_over.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
cnn_model_bert_no_over.add(MaxPooling1D(pool_size=2))
cnn_model_bert_no_over.add(Flatten())
cnn_model_bert_no_over.add(Dense(10, activation='relu'))
cnn_model_bert_no_over.add(Dense(8, activation='softmax')) 

print(cnn_model_bert_no_over.summary())

#Train data
cnn_model_bert_no_over.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#cnn_model_bert_no_over.fit(X_train_normal, Y_train_normal, epochs=10, verbose=2)


history = model.fit(X_train_normal, Y_train_normal, epochs=10, 
                    validation_data=(test_images, test_labels))
plt.plot(history.history['accuracy'], label='accuracy')
plt.plot(history.history['val_accuracy'], label = 'val_accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.ylim([0.5, 1])
plt.legend(loc='lower right')

test_loss, test_acc = model.evaluate(test_images,  test_labels, verbose=2)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 512, 100)          3000000   
_________________________________________________________________
conv1d (Conv1D)              (None, 505, 32)           25632     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 252, 32)           0         
_________________________________________________________________
flatten (Flatten)            (None, 8064)              0         
_________________________________________________________________
dense (Dense)                (None, 10)                80650     
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 88        
Total params: 3,106,370
Trainable params: 3,106,370
Non-trainable params: 0
______________________________________________

<tensorflow.python.keras.callbacks.History at 0x7f44eabf8518>

In [None]:
#Evaluate through validation data
loss, acc = cnn_model_bert_no_over.evaluate(validation_tokens, Y_validation, verbose=0)
print('Validation Accuracy: %f' % (acc*100))
print('Validation Loss: %f' % (loss*100))

Validation Accuracy: 70.380437
Validation Loss: 225.750709


In [None]:
#Test
loss, acc = cnn_model_bert_no_over.evaluate(test_tokens, Y_test, verbose=0)
print('Test Accuracy: %f' % (acc*100))
print('Test loss: %f' % (loss*100))

In [None]:
#Training

from keras.models import Sequential
from keras.layers import Embedding, Dense, Conv1D, MaxPooling1D, Dropout, Flatten

#Create model
vocab_size = len(bert_tokenizer.vocab.keys())
max_length = 512

cnn_model = Sequential()
cnn_model.add(Embedding(vocab_size, 100, input_length=max_length))
cnn_model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
cnn_model.add(MaxPooling1D(pool_size=2))
cnn_model.add(Flatten())
cnn_model.add(Dense(10, activation='relu'))
cnn_model.add(Dense(8, activation='softmax')) 

print(cnn_model.summary())

#Train data
cnn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
cnn_model.fit(X_over, Y_over, epochs=10, verbose=2)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 512, 100)          3000000   
_________________________________________________________________
conv1d (Conv1D)              (None, 505, 32)           25632     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 252, 32)           0         
_________________________________________________________________
flatten (Flatten)            (None, 8064)              0         
_________________________________________________________________
dense (Dense)                (None, 10)                80650     
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 88        
Total params: 3,106,370
Trainable params: 3,106,370
Non-trainable params: 0
______________________________________________

<tensorflow.python.keras.callbacks.History at 0x7fc4334c6588>

In [None]:
#Evaluate through validation data
loss, acc = cnn_model.evaluate(validation_tokens, Y_validation, verbose=0)
print('Validation Accuracy: %f' % (acc*100))

Validation Accuracy: 72.582972


In [None]:
#Test
loss, acc = cnn_model.evaluate(test_tokens, Y_test, verbose=0)
print('Test Accuracy: %f' % (acc*100))
print('Test loss: %f' % (loss*100))

Test Accuracy: 73.225886
Test loss: 333.219218


In [None]:
pip install fasttext

Collecting fasttext
[?25l  Downloading https://files.pythonhosted.org/packages/f8/85/e2b368ab6d3528827b147fdb814f8189acc981a4bc2f99ab894650e05c40/fasttext-0.9.2.tar.gz (68kB)
[K     |████▊                           | 10kB 15.0MB/s eta 0:00:01[K     |█████████▌                      | 20kB 20.8MB/s eta 0:00:01[K     |██████████████▎                 | 30kB 11.5MB/s eta 0:00:01[K     |███████████████████             | 40kB 9.6MB/s eta 0:00:01[K     |███████████████████████▉        | 51kB 5.1MB/s eta 0:00:01[K     |████████████████████████████▋   | 61kB 5.2MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 3.3MB/s 
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp36-cp36m-linux_x86_64.whl size=3044500 sha256=4e03a340faf776e46f6548f6ab39a1a4c8d1103fdb4c9677eb328af505aa53cb
  Stored in directory: /root/.cache/pip/wheels/98/ba/7f/b154944a1cf5a8cee91c1

In [None]:
def create_cnn_model(_vocab_size):
  # vocab_size = len(bert_tokenizer.vocab.keys())
  vocab_size = _vocab_size
  max_length = 512

  cnn_model = Sequential()
  cnn_model.add(Embedding(vocab_size, 100, input_length=max_length))
  cnn_model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
  cnn_model.add(MaxPooling1D(pool_size=2))
  cnn_model.add(Flatten())
  cnn_model.add(Dense(10, activation='relu'))
  cnn_model.add(Dense(8, activation='softmax')) 

  print(cnn_model.summary())

  return cnn_model

In [None]:
import fasttext

def get_fasttext_vector(text):
  model = fasttext.train_unsupervised(text)

  return model

get_fasttext_vector(text_train)

TypeError: ignored