In [None]:
import numpy as np
from sklearn.metrics import classification_report, f1_score
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dropout, TimeDistributed, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical


def load_data(file_path):
    """Read a CoNLL-style column file into parallel token / tag sequences.

    Each non-blank line is split on whitespace; the first column is taken as
    the token (lower-cased) and the last column as its tag. Blank or
    whitespace-only lines and ``-DOCSTART-`` markers end the current sentence.

    Args:
        file_path: Path to the annotated text file.

    Returns:
        A ``(sentences, labels)`` tuple. ``sentences`` is a list of lists of
        lower-cased tokens and ``labels`` is a list of lists of tag strings;
        ``sentences[i]`` and ``labels[i]`` always have the same length.

    Raises:
        ValueError: If a data line has fewer than two columns.
    """
    sentences, labels = [], []
    sentence, label = [], []

    # Stream the file instead of materialising every line with readlines().
    with open(file_path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            stripped = line.strip()

            # Sentence boundary: empty / whitespace-only line or document marker.
            if not stripped or stripped.startswith("-DOCSTART-"):
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence, label = [], []
                continue

            columns = stripped.split()
            if len(columns) < 2:
                raise ValueError(
                    "{}:{}: expected at least a token and a tag, got {!r}".format(
                        file_path, line_no, stripped
                    )
                )
            sentence.append(columns[0].lower())  # Normalize the case
            label.append(columns[-1])

    # Flush the final sentence: files that do not end with a blank line would
    # otherwise silently lose their last sentence.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# Load the annotated train / test corpora
train_file_path = "data/conllpp_train.txt"
test_file_path = "data/conllpp_test.txt"

train_sentences, train_labels = load_data(train_file_path)
test_sentences, test_labels = load_data(test_file_path)

# Load pre-trained word embeddings
# Every line of the GloVe file is parsed as "<token> <float> <float> ...":
# the first whitespace-separated field is the word, the rest is its vector.
embeddings_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        word, *vector = line.split()
        embeddings_index[word] = np.asarray(vector, dtype='float32')

# Create word-to-index and tag-to-index dictionaries
# The vocabularies are sorted so that index assignment is deterministic:
# iterating a set of strings yields a different order on every kernel restart
# (string hashes are randomised), which would change word/tag ids between runs.
# NOTE: test-set words are included because the encoding step looks every
# token up in word_index directly and has no out-of-vocabulary fallback.
words = sorted({word for sentence in train_sentences + test_sentences for word in sentence})
# 'ENDPAD' is appended last, so its index is n_words - 1 (the padding id).
words.append('ENDPAD')
n_words = len(words)
tags = sorted({tag for label in train_labels + test_labels for tag in label})
n_tags = len(tags)

word_index = {w: i for i, w in enumerate(words)}
label_index = {t: i for i, t in enumerate(tags)}

# Convert words and tags to sequences of indices
# Every sequence is padded (at the end) to exactly max_len positions;
# pad_sequences also truncates longer sequences with its default settings.
# Tokens and tags go through the same call, so they stay aligned.
max_len = 50
pad_word_id = word_index['ENDPAD']  # same value as n_words - 1
pad_tag_id = label_index['O']       # padded positions are labelled 'O'


def encode_tokens(sentences):
    """Map tokenised sentences to a (n_sentences, max_len) array of word ids."""
    ids = [[word_index[w] for w in sentence] for sentence in sentences]
    return pad_sequences(maxlen=max_len, sequences=ids, padding='post', value=pad_word_id)


def encode_tags(tag_sequences):
    """Map tag sequences to a one-hot (n_sentences, max_len, n_tags) array."""
    ids = [[label_index[t] for t in tag_sequence] for tag_sequence in tag_sequences]
    padded = pad_sequences(maxlen=max_len, sequences=ids, padding='post', value=pad_tag_id)
    # to_categorical appends the class axis to the whole 2-D id array at once.
    return to_categorical(padded, num_classes=n_tags)


X_train = encode_tokens(train_sentences)
y_train = encode_tags(train_labels)

X_test = encode_tokens(test_sentences)
y_test = encode_tags(test_labels)

# Create embedding matrix
# Reuses the embeddings_index built when the GloVe file was first parsed;
# re-reading the file here would only rebuild the same dictionary.
embedding_dim = 100
# One more row than the vocabulary, matching the input_dim handed to the
# Embedding layer. Words without a pre-trained vector (including 'ENDPAD')
# keep an all-zero row.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Define model architecture
# Frozen pre-trained embeddings -> dropout -> BiLSTM -> per-token softmax.
input_layer = Input(shape=(max_len,))
embedding_layer = Embedding(
    # Sizes are read from the matrix itself so the layer can never disagree
    # with the weights it is initialised from.
    input_dim=embedding_matrix.shape[0],
    output_dim=embedding_matrix.shape[1],
    embeddings_initializer=Constant(embedding_matrix),
    trainable=False,
)(input_layer)
dropout_layer = Dropout(0.5)(embedding_layer)
bidirectional_layer = Bidirectional(LSTM(128, return_sequences=True))(dropout_layer)
output_layer = TimeDistributed(Dense(len(label_index), activation='softmax'))(bidirectional_layer)
model = Model(inputs=input_layer, outputs=output_layer)

# Compile model
model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

# Train model
# restore_best_weights rolls the model back to the epoch with the lowest
# val_loss when training is stopped early, instead of keeping the weights
# from `patience` epochs later.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# Assigning the result keeps the History object available for inspection and
# stops its repr from being echoed as the cell output.
history = model.fit(
    X_train,
    np.asarray(y_train),
    validation_split=0.1,
    batch_size=32,
    epochs=10,
    callbacks=[early_stopping],
)

# # Evaluate model
# y_pred = model.predict(X_test)
# y_pred = np.argmax(y_pred, axis=-1)
# y_test_labels = [[tags[i] for i in row] for row in np.argmax(y_test, axis=-1)]
# y_pred_labels = [[tags[i] for i in row] for row in y_pred]
# f1 = f1_score(y_test_labels, y_pred_labels, average='weighted')
# print("F1-score: {:.2f}".format(f1))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff59c7c1210>

In [None]:
# Evaluate model
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=-1)
y_true = np.argmax(y_test, axis=-1)

# Per-sentence tag strings (padding positions included).
y_test_labels = [[tags[i] for i in row] for row in y_true]
y_pred_labels = [[tags[i] for i in row] for row in y_pred]

# Flatten to one tag per real token for the report. Positions holding the
# 'ENDPAD' word id are padding: they were labelled 'O' only to fill the
# sequence, so scoring them would inflate accuracy and the 'O' class.
token_mask = np.asarray(X_test) != word_index['ENDPAD']
y_test_labels_flat = [tags[i] for i in y_true[token_mask]]
y_pred_labels_flat = [tags[i] for i in y_pred[token_mask]]

# Print classification report
report = classification_report(y_test_labels_flat, y_pred_labels_flat)
print(report)


              precision    recall  f1-score   support

       B-LOC       0.86      0.75      0.80      1630
      B-MISC       0.82      0.68      0.74       721
       B-ORG       0.69      0.65      0.67      1708
       B-PER       0.91      0.53      0.67      1579
       I-LOC       0.71      0.58      0.64       258
      I-MISC       0.68      0.46      0.55       252
       I-ORG       0.51      0.60      0.55       880
       I-PER       0.93      0.61      0.73      1119
           O       0.99      1.00      0.99    164453

    accuracy                           0.98    172600
   macro avg       0.79      0.65      0.71    172600
weighted avg       0.98      0.98      0.98    172600

