# Tugas 2 Pemrosesan Bahasa Alami - POSTagger

## Import All Depedencies

In [1]:
import pandas as pd
import operator
from keras.layers import Embedding, LSTM, Bidirectional, TimeDistributed, Dense, Input
from keras.models import Model
import numpy as np
import copy

from nltk import word_tokenize
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier

Using TensorFlow backend.


## Load Dataset
Load 1020 first line of dataset and split into 1000 data for training set and 20 data for test set.

In [2]:
def load_dataset(filename, number_of_sentences=1020):
    '''Fungsi untuk membaca dataset sejumlah kalimat yang diinginkan.'''
    sentences = []
    tags = []
    
    with open(filename) as file:
        contents = file.readlines()
    
    contents = [content.strip() for content in contents]
    idx_line = 0
    while idx_line < len(contents):
        sentence = []
        tag = []
        while not contents[idx_line].startswith('</kalimat'):
            if not contents[idx_line].startswith('<kalimat'):
                content = contents[idx_line].split('\t')
                sentence.append(content[0].lower())
                tag.append(content[1])
            idx_line += 1
        sentences.append(sentence)
        tags.append(tag)
        idx_line += 2
        if len(sentences) >= number_of_sentences:
            break
    return sentences, tags


sentences, tags = load_dataset('dataset/dataset.tsv')
sentences_train, tags_train, sentences_test, tags_test = sentences[:1000], tags[:1000], sentences[1000:1020], tags[1000:1020]

print(len(tags), len(sentences))
print(sentences_train[0], tags_train[0])
print(sentences_test[0], tags_test[0])

1020 1020
['kera', 'untuk', 'amankan', 'pesta olahraga'] ['NN', 'SC', 'VB', 'NN']
['pemeringkat', 'efek', 'indonesia', 'pefindo', 'menegaskan', 'peringkat', 'ida-', 'untuk', 'pt', 'serasi', 'autoraya', 'sera', 'dan', 'outstanding', 'obligasi', 'i', 'tahun', '2003', 'sebesar', 'rp', '165', 'miliar', 'yang', 'akan', 'jatuh tempo', 'pada', 'juli', '2008', 'dengan', 'prospek', 'peringkat', 'tersebut', 'stabil', ',', 'kata', 'pefindo', 'dalam', 'pernyataan', '-nya', 'di', 'jakarta', ',', 'rabu', '.'] ['NNP', 'NNP', 'NNP', 'NNP', 'VB', 'NN', 'SYM', 'IN', 'NNP', 'NNP', 'NNP', 'NNP', 'CC', 'FW', 'NNP', 'NNP', 'NN', 'CD', 'JJ', 'SYM', 'CD', 'CD', 'SC', 'MD', 'VB', 'IN', 'NNP', 'CD', 'SC', 'NN', 'NN', 'PR', 'JJ', 'Z', 'VB', 'NNP', 'IN', 'NN', 'PRP', 'IN', 'NNP', 'Z', 'NNP', 'Z']


## POSTagger Method
Implement 3 POSTagger method:
1. Baseline
2. Statistic based (Feature Extraction and Neural Network)
3. Sequence Model based (HMM-Viterbi)
4. Long-Short Term Memory (Bonus - On Progress)

### Baseline Method
Get the highest tag frequencies of the words.

In [3]:
def get_word_tag_frequencies(sentences, tags):
    '''Fungsi untuk mendapatkan semua kata dan jumlah tagnya pada keseluruhan dataset.'''
    words_tag = {}
    for sentence, tag in zip(sentences, tags):
        for word, tag_word in zip(sentence, tag):
            if word in words_tag.keys():
                if tag_word in words_tag[word].keys():
                    words_tag[word][tag_word] += 1
                else:
                    words_tag[word][tag_word] = 1
            else:
                words_tag[word] = {tag_word: 1}
    return words_tag

def get_baseline_model(word_tag_frequencies):
    '''Mendapatkan model baseline. Kata dengan tag terbanyak pada tiap kata.'''
    baseline_model = {}
    for word, tags in word_tag_frequencies.items():
        baseline_model[word] = max(tags.items(), key=operator.itemgetter(1))[0] # Megambil tag dengan jumlah terbanyak pada suatu kata
    baseline_model['OOV'] = 'NN' # Menambahkan token Out Of Vocabulary untuk kata yang tidak ada di vocabulary akan dijadikan NN
    return baseline_model

def get_accuracy(sentences, tags, baseline_model):
    '''Mendapatkan akurasi dari model baseline'''
    n_true = 0
    n_word = 0
    for sentence, tag in zip(sentences, tags):
        for word, tag_word in zip(sentence, tag):
            n_word += 1
            if word not in baseline_model:
                word = 'OOV' # Kata menjadi OOV jika tidak ada di dalam baseline model
            if baseline_model[word] == tag_word:
                n_true += 1
    return n_true / n_word * 100

def post_tag_baseline_model(sentence, baseline_model):
    tag = []
    for word in sentence.split():
        if word not in baseline_model:
            tag.append('NN')
        else:
            tag.append(baseline_model[word])
    return tag
    
    
word_tag = get_word_tag_frequencies(sentences_train, tags_train)
baseline_model = get_baseline_model(word_tag)
# print(baseline_model)
print('Baseline Method Accuracy: ', get_accuracy(sentences_test, tags_test, baseline_model), '%')
print('saya menyukai makan nasi')
print(post_tag_baseline_model('saya menyukai makan nasi', baseline_model))

Baseline Method Accuracy:  85.44117647058823 %
saya menyukai makan nasi
['PRP', 'VB', 'VB', 'NN']


### Neural Network

In [4]:
def features(sentence, index):
    """ sentence: [w1, w2, ...], index: the index of the word """
    return {
        'word': sentence[index],
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
    }


 
def transform_to_dataset(sentences, tags):
    '''Mengubah kalimat di dataset menjadi siap untuk diolah'''
    X, y = [], []
    for sentence_idx in range(len(sentences)):
        for index in range(len(sentences[sentence_idx])):
            X.append(features(sentences[sentence_idx], index))
            y.append(tags[sentence_idx][index])
 
    return X, y


def get_neural_network_classifier():
    '''Mendapatkan model neural network.'''
    return Pipeline([
        ('vectorizer', DictVectorizer(sparse=False)),
        ('classifier', MLPClassifier(hidden_layer_sizes=(50, 25), learning_rate='adaptive', verbose=True, max_iter=5))
    ])


def pos_tag(sentence):
    '''Menampilkan prediksi POS tag pada suatu kalimat.'''
    tags = clf.predict([features(sentence, index) for index in range(len(sentence))])
    return tags


X, y = transform_to_dataset(sentences_train, tags_train)

clf = get_neural_network_classifier()
clf.fit(X, y)   

print('Training completed')

X_test, y_test = transform_to_dataset(sentences_test, tags_test) 
print("Neural Network Accuracy:")
print(clf.score(X_test, y_test))
 
print('saya menyukai makan nasi')
print(pos_tag(word_tokenize('saya menyukai makan nasi')))

Iteration 1, loss = 1.76796206
Iteration 2, loss = 0.39087079
Iteration 3, loss = 0.17088821
Iteration 4, loss = 0.09164622
Iteration 5, loss = 0.05681595
Training completed




Neural Network Accuracy:
0.9264705882352942
saya menyukai makan nasi
['PRP' 'VB' 'VB' 'NN']


### Sequence based model - HMM with Viterbi Algorithm

In [5]:
def get_tag_count(tags):
    '''Mendapatkan tag dan jumlahnya'''
    tag_count = {}
    for tag in tags:
        for tag_word in tag:
            if tag_word in tag_count:
                tag_count[tag_word] += 1
            else:
                tag_count[tag_word] = 1
    tag_count['<start>'] = len(tags)
    return tag_count


def get_tag_transition(tags):
    '''Mendapatkan transisi dari tag ke tag lainnya dan jumlahnya.'''
    tag_trans = {}
    for tag in tags:
        previous_word = '<start>'
        for tag_word in tag:
            if previous_word in tag_trans:
                if tag_word in tag_trans[previous_word]:
                    tag_trans[previous_word][tag_word] += 1
                else:
                    tag_trans[previous_word][tag_word] = 1
            else:
                tag_trans[previous_word] = {tag_word: 1}
            previous_word = tag_word
    return tag_trans


def get_trans_prob_table(tag_trans, tag_count):
    '''Mendapatkan table transisi dengan inputan transisi tag dan jumlah tag'''
    trans_prob_table = {}
    for tag1 in tag_count.keys():
        trans_prob_table[tag1] = {}
        for tag2 in tag_count.keys():
            if tag2 in tag_trans[tag1].keys():
                trans_prob_table[tag1][tag2] = tag_trans[tag1][tag2]/tag_count[tag1]
            else:
                continue
    return trans_prob_table


def get_emission_prob_table(word_tag, tag_count):
    '''Mendapatkan tabel emisi dengan input tag dari word dan perhitungan jumlah tag.'''
    emission_prob_table = copy.deepcopy(word_tag)
    for word, tags in word_tag.items():
        for tag in tags.keys():
            emission_prob_table[word][tag] = word_tag[word][tag] / tag_count[tag]
    return emission_prob_table
    
def viterbi_algorithm(sentence, transition_table, emission_table):
    '''Memprediksi POS tag kalimat dari kalimat yang diberikan berdasarkan algoritma Viterbi'''
    sentence_tag = []
    viterbi_table  = {}
    max_previous_value = 1
    previous_tag = '<start>'
    for word in sentence.split():
        viterbi_table[word] = {}
        if word not in emission_table:
#             sentence_tag.append('NN')
            viterbi_table[word]['NN'] = max_previous_value * transition_table[previous_tag]['NN']
        else:
            for tag in emission_table[word]:
                if tag in transition_table[previous_tag]:
                    viterbi_table[word][tag] = max_previous_value * transition_table[previous_tag][tag] * emission_table[word][tag]
        previous_tag = max(viterbi_table[word].items(), key=operator.itemgetter(1))[0]
        max_previous_value = viterbi_table[word][previous_tag]
        sentence_tag.append(previous_tag)
    return sentence_tag, viterbi_table

def get_viterbi_accuracy(sentences, tags, viterbi_algorithm, transition_table, emission_table):
    '''Mendapatkan akurasi dari model Viterbi'''
    n_true = 0
    n_word = 0
    for sentence, tag in zip(sentences, tags):
        sentence = ' '.join(sentence)
        pred_tag, _ = viterbi_algorithm(sentence, transition_table, emission_table)
        for pred, truth in zip(pred_tag, tag):
            n_word +=1
            if pred == truth:
                n_true += 1
    return n_true / n_word * 100

tag_count = get_tag_count(tags_train)
word_tag = get_word_tag_frequencies(sentences_train, tags_train)
tag_trans = get_tag_transition(tags_train)
trans_prob_table = get_trans_prob_table(tag_trans, tag_count)
emission_prob_table = get_emission_prob_table(word_tag, tag_count)
print("Viterbi Algorithm Accuracy:")
print(get_viterbi_accuracy(sentences_test, tags_test, viterbi_algorithm, trans_prob_table, emission_prob_table))
print('saya menyukai makan nasi')
print(viterbi_algorithm('saya menyukai makan nasi', trans_prob_table, emission_prob_table)[0])

Viterbi Algorithm Accuracy:
63.382352941176464
saya menyukai makan nasi
['PRP', 'VB', 'NN', 'NN']


### Statistical Method using Long-Short Term Memory

#### Preprocess Data
Preprocess data to fit for training using LSTM model.

In [6]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

tokenizer_sentences = Tokenizer()
tokenizer_sentences.fit_on_texts(sentences)

tokenizer_tags = Tokenizer()
tokenizer_tags.fit_on_texts(tags)


def preprocess_sentences(tokenizer, sentences, max_sequence_length=100):
    sequence_sentences = tokenizer.texts_to_sequences(sentences)
    padded_sequence = pad_sequences(sequence_sentences, maxlen=max_sequence_length, padding='pre', truncating='pre')
    return padded_sequence

def tags_to_categorical(tokenizer, tags, max_sequence_length=100):
    sequence_tags = tokenizer.texts_to_sequences(tags)
    padded_tags = pad_sequences(sequence_tags, maxlen=max_sequence_length, padding='pre', truncating='pre')
    return padded_tags

def one_hot_tags(tags_categorical):
    one_hot = []
    for tags in tags_categorical:
        one_hot_sentence = []
        for tag in tags:
            one_hot_tag = np.zeros(len(tokenizer_tags.word_counts)+1)
            one_hot_tag[tag] = 1
            one_hot_sentence.append(one_hot_tag)
        one_hot.append(one_hot_sentence)
    return one_hot

sequence_train = preprocess_sentences(tokenizer_sentences, sentences_train)
tags_categorical = tags_to_categorical(tokenizer_tags, tags_train)

tags_categorical = one_hot_tags(tags_categorical)

In [7]:
def get_LSTM_model(n_word, n_tag, max_sequence_length=100):
    embedding_layer = Embedding(input_dim=n_word+1, output_dim=128)
    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedding_sequences = embedding_layer(sequence_input)
    
    lstm = Bidirectional(LSTM(64, return_sequences=True))(embedding_sequences)
    predictions = TimeDistributed(Dense(n_tag + 1, activation='softmax'))(lstm)
    
    return Model(sequence_input, predictions)

model = get_LSTM_model(len(tokenizer_sentences.word_counts), len(tokenizer_tags.word_counts))
model.compile(loss='categorical_crossentropy',
             optimizer='rmsprop',
             metrics=['accuracy'])
model.fit(sequence_train, tags_categorical[0])

ValueError: Error when checking model target: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 1 array(s), but instead got the following list of 100 arrays: [array([[1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
    ...