In [None]:
from google.colab import drive
# Mount google drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/SLU Semesters/SLU 3rd Semester/NLP/Second Competition/
%ls

In [79]:
import pandas as pd
# Load dataset file
df_total = pd.read_csv("train.tsv", sep='\t', names=["word", "tag"])
df_total.head()

Unnamed: 0,word,tag
0,ansin,N
1,),N
2,tá,N
3,níos,N
4,lú,N


In [80]:
# df2 = df_total[:15000]
df2 = df_total.copy()
df2.describe()
print((df2['tag'].value_counts()))
lbl = df2['tag'].value_counts().max()
max_label = df2['tag'][lbl]
max_label

N    4339849
S     493102
U     165818
H      40569
T      17721
Name: tag, dtype: int64


'N'

In [81]:
# converting input shape for tagger
df3 = [[(df2.loc[i,'word'], df2.loc[i,'tag'])] for i in range(len(df2))]

In [82]:
# creating train and test datasets - source: https://colab.research.google.com/drive/1pAuj6j5UAzBrGHPmgP5IufT35O9E6lvt?usp=sharing
split = int(len(df3)*0.9)
train = df3[:split]
test = df3[split:]
len(train)

4551353

**Unigram tagger**

In [83]:
# unigram tagger 
import nltk
unigram_tagger = nltk.UnigramTagger(train)
print("Unigram tagger accuracy without backoff tagger is: ", unigram_tagger.accuracy(test))

Unigram tagger accuracy without backoff tagger is:  0.8941875318861157


In [84]:
from nltk.tag import SequentialBackoffTagger
from nltk.tag import DefaultTagger 
from nltk.tag import UnigramTagger 

back_tagger = DefaultTagger(max_label)
unigram_tagger2 = UnigramTagger(train, backoff = back_tagger)
print("Unigram tagger accuracy with backoff tagger is: ", unigram_tagger2.accuracy(test))

Unigram tagger accuracy with backoff tagger is:  0.907707244920962


**RNN**

In [104]:
df_rnn = df_total[:15000]
df_rnn2 = [[(df2.loc[i,'word'], df2.loc[i,'tag'])] for i in range(len(df_rnn))]

# creating vocab and tags lists from df for word2index and tag2index - source: https://colab.research.google.com/drive/1D8aeXQVb_aVcfr7thA7UYlmD0Tq_XjPU?usp=sharing
vocab = list(df_rnn['word'])
vocab.append('<PAD>')

tags = list(df_rnn['tag'])
tags.append('<PAD>')

print(len(vocab))
print(len(tags))

15001
15001


In [105]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# source: https://colab.research.google.com/drive/1D8aeXQVb_aVcfr7thA7UYlmD0Tq_XjPU?usp=sharing
max_len = 2
word2index = {w: i for i, w in enumerate(vocab)}
tag2index = {t: i for i, t in enumerate(tags)}
onehot = [[word2index[w[0]] for w in s] for s in df_rnn2]
X = pad_sequences(maxlen=max_len, sequences=onehot, padding="post", value=len(vocab)-1)

In [106]:
print(len(X))
print(type(X))
print(X.shape)
X[1]

15000
<class 'numpy.ndarray'>
(15000, 2)


array([14746, 15000], dtype=int32)

In [109]:
from tensorflow.keras.utils import to_categorical

onehot_y = [[tag2index[w[1]] for w in s] for s in df_rnn2]
y = pad_sequences(maxlen=max_len, sequences=onehot_y, padding="post", value=tag2index["<PAD>"])
y = to_categorical(y, num_classes=len(tags))

In [110]:
print(len(y))
print(type(y))
y.shape

15000
<class 'numpy.ndarray'>


(15000, 2, 15001)

In [111]:
# creaint train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

print(len(X_train))
print(len(X_test))
print(len(y_train))
print(len(y_test))
print(X_train.shape)
print(X_test.shape)

13500
1500
13500
1500
(13500, 2)
(1500, 2)


In [112]:
%pip install -q -U keras-tuner
import keras_tuner as kt

In [113]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional 

# source: Tensorflow tutorials https://www.tensorflow.org/tutorials/keras/keras_tuner

def model_builder(hp):
  model = keras.Sequential()

  # Tune the number of units in the first Dense layer - Choose an optimal value between 32-512
  hp_units = hp.Int('units', min_value=32, max_value=512, step=32)
  model.add(Embedding(input_dim=len(vocab), output_dim=50, input_length=max_len))
  model.add(Bidirectional(LSTM(units=hp_units, return_sequences=True, recurrent_dropout=0.1)))
  model.add(TimeDistributed(Dense(len(tags), activation="softmax")))


  # Tune the learning rate for the optimizer - Choose an optimal value from 0.01, 0.001, or 0.0001
  hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4]) 
  model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
  

  return model

In [114]:
tuner = kt.Hyperband(model_builder,
                     objective='val_accuracy',
                     max_epochs=3,
                     factor=3)
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)



In [115]:
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

In [117]:
tuner.search(X_train, y_train, epochs= 3, validation_split=0.2, callbacks=[stop_early])

# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

print("The optimal number of units in the first densely-connected layer is: ", best_hps.get('units') , " and the optimal learning rate for the optimizer is "
, best_hps.get('learning_rate'))

Trial 42 Complete [00h 01m 15s]
val_accuracy: 0.9444444179534912

Best val_accuracy So Far: 0.9972222447395325
Total elapsed time: 00h 15m 15s
The optimal number of units in the first densely-connected layer is:  96  and the optimal learning rate for the optimizer is  0.001


In [100]:
# Build the model with the optimal hyperparameters and train it on the data for number of epochs
model = tuner.hypermodel.build(best_hps)
history = model.fit(X_train, y_train, epochs= 5, validation_split=0.2)

val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Best epoch: 2


**HMM**

In [101]:
# Import the toolkit and tags
import nltk

# Import HMM module
from nltk.tag import hmm

# Setup a trainer with default(None) values and train with the data
tagger = nltk.HiddenMarkovModelTagger.train(train)
# Prints the basic data about the tagger
print(tagger)

<HiddenMarkovModelTagger 5 states and 118706 output symbols>


In [102]:
print(tagger.accuracy(test))

0.9078239134991477
