In [1]:
%pip install numpy scikit-learn tensorflow

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
def transform_dataset(page_dataset, for_inference):
    labeled_text_dataset = []
    for page in page_dataset:
        page_words = page["representativeData"]["page_data_words"]
        
        geo_dictionary = {}
        if not for_inference:
            page_answers = page.get("answers")
            for page_answer in page_answers[0]["answer"]:
                geo_label = page_answer["id"]
                for geo_part in page_answer["data"]:
                    for index in range(geo_part["start"], geo_part["end"]):
                        geo_dictionary[index] = geo_label
        
        labeled_text = []
        for word_index, word in enumerate(page_words):
            word_label = "0" if for_inference else geo_dictionary.get(word_index, "O")
            labeled_text.append((word, word_label))
        
        if not for_inference:
            labeled_text_dataset.append(labeled_text)
        else:
            labeled_text_dataset.append((page["taskId"], labeled_text))
    
    return labeled_text_dataset

In [3]:
import json

def get_labeled_dataset(dataset_path, for_inference=False):
    with open(dataset_path, encoding="utf-8") as json_dataset:
        dataset = json.load(json_dataset)
        
    labeled_dataset = transform_dataset(dataset["data"]["results"], for_inference)
    return labeled_dataset

In [4]:
def get_validation_result(X_validation, y_pred):
    validation_result = []
    
    for ((task_id, _), predictions) in zip(X_validation, y_pred):
        answers = {}
        current_label = None
        start_index = None
        
        for current_index, label in enumerate(predictions):
            if label == current_label:
                continue
            else:
                if current_label is not None and current_label != "O":
                    if current_label not in answers:
                        answers[current_label] = []
                    answers[current_label].append({"start": start_index, "end": current_index})
                
                if label != "0":
                    current_label = label
                    start_index = current_index
                else:
                    current_label = None
    
        if current_label is not None and current_label != "O":
            if current_label not in answers:
                answers[current_label] = []
            answers[current_label].append({"start": start_index, "end": len(predictions)})
        
        validation_answers = []
        for label, segments in answers.items():
            validation_answers.append({"id": label, "data": segments})
        
        validation_result.append({
            "taskId": task_id,
            "answer": validation_answers
        })
        
    return validation_result

In [5]:
import tensorflow as tf

def focal_loss(alpha=0.25, gamma=2.):
    def focal_loss_parametrized(y_true, y_pred):
        e = 1.e-9
        y_true = tf.convert_to_tensor(y_true, tf.float32)
        y_pred = tf.convert_to_tensor(y_pred, tf.float32)
        
        model_output = tf.add(y_pred, e)
        ce = tf.multiply(y_true, -tf.math.log(model_output))
        w = tf.multiply(y_true, tf.pow(tf.subtract(1., model_output), gamma))
        fl = tf.multiply(alpha, tf.multiply(w, ce))
        reduced_fl = tf.reduce_max(fl, axis=1)
        return tf.reduce_mean(reduced_fl)
    
    return focal_loss_parametrized

In [6]:
train_dataset = get_labeled_dataset("../jsons/train_geo_extractor.json")

In [7]:
max_text_length = max([len(text) for text in train_dataset])

words = [word for text in train_dataset for word, _ in text]
words.append("UNKNOWN")
words.append("ENDPAD")
words = list(set(words))

labels = list(set([label for text in train_dataset for _, label in text]))

In [8]:
word2index = {word: index for index, word in enumerate(words)}
label2index = {label: index for index, label in enumerate(labels)}

In [9]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_train = [[word2index[word] for word, _ in text] for text in train_dataset]
X_train = pad_sequences(maxlen=max_text_length, sequences=X_train, padding="post", value=len(words) - 1)

y_train = [[label2index[label] for _, label in text] for text in train_dataset]
y_train = pad_sequences(maxlen=max_text_length, sequences=y_train, padding="post", value=label2index["O"])

In [10]:
from tensorflow.keras.utils import to_categorical

y_train = [to_categorical(index, num_classes=len(labels)) for index in y_train]

In [13]:
import numpy as np
from uuid import uuid4
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, TimeDistributed, BatchNormalization
from tensorflow.keras.layers import Embedding, Dropout, Dense, Bidirectional, LSTM

# Your model architecture
model_input = Input(shape=(max_text_length,))
embedding_output = Embedding(input_dim=len(words), output_dim=max_text_length, input_length=max_text_length)(model_input)
dropout_output = Dropout(0.1)(embedding_output)
lstm_output = Bidirectional(LSTM(units=300, return_sequences=True))(dropout_output)
model_output = TimeDistributed(Dense(len(labels), activation="softmax"))(lstm_output)
model = Model(model_input, model_output)

# Compilation
model.compile(optimizer="adam", loss=focal_loss(), metrics=["accuracy"])

# Training
model.fit(X_train, np.array(y_train), batch_size=16, epochs=5)

# Save the model
random_model_name = str(uuid4())
model.save(f"saved_model/{random_model_name}")
print(f"Model {random_model_name} has successfully been saved!")


Epoch 1/5
[1m  2/175[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:46:29[0m 37s/step - accuracy: 0.2490 - loss: 0.2186

In [None]:
%pip install numpy scikit-learn gensim tf2crf tensorflow

In [None]:
import os
from gensim.models import FastText

sentences = [[word for word, _ in text] for text in train_dataset]

model = FastText(sentences, vector_size=100, window=3, min_count=1, workers=os.cpu_count(), sg=1)

In [None]:
import numpy as np

embedding_matrix = np.zeros((len(word2index), 100))

for word, index in word2index.items():
    embedding_vector = model.wv[word]
    embedding_matrix[index] = embedding_vector

In [None]:
from tensorflow.keras.layers import Embedding

embedding_layer = Embedding(input_dim=len(word2index),
                           output_dim=100,
                           weights=[embedding_matrix],
                           input_length=max_text_length,
                           trainable=False)

In [None]:
import os
from uuid import uuid4
import numpy as np
from tf2crf import CRF, ModelWithCRFLoss
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, TimeDistributed
from tensorflow.keras.layers import Embedding, Dropout, Dense
from tensorflow.keras.layers import Bidirectional, LSTM, Attention

model_input = Input(shape=(max_text_length, ))
model = Embedding(input_dim=len(words), output_dim=max_text_length, input_length=max_text_length)(model_input)
model = Dropout(0.1)(model)

lstm_output = Bidirectional(LSTM(units=100, return_sequences=True))(model)

crf = CRF(dtype="float32")
model_output = crf(lstm_output)

hybrid_model = Model(model_input, model_output)
model = ModelWithCRFLoss(hybrid_model)
model.compile(optimizer="adam")

model.fit(X_train, np.array(y_train), batch_size=8, epochs=5, use_multiprocessing=True, workers=os.cpu_count())

random_model_name = str(uuid4())
model.save(f"saved_model/{random_model_name}")
print(f"Model {random_model_name} has successfully been saved!")

In [None]:
test_dataset = get_labeled_dataset("datasets/test_geo_extractor.json")

In [None]:
import os
from tensorflow.keras import models

model_name = input("Enter the model name: ")
model_path = "saved_model/" + model_name

if not os.path.exists(model_path):
    print(f"The model {model_name} does not exist!")

recognizer = models.load_model(model_path, custom_objects={"focal_loss_parametrized": focal_loss})

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_test = [[word2index.get(word, word2index["UNKNOWN"]) for word, _ in text] for text in test_dataset]
X_test = pad_sequences(maxlen=max_text_length, sequences=X_test, padding="post", value=len(word2index) - 1)

In [None]:
predictions = recognizer.predict(X_test, use_multiprocessing=True, workers=os.cpu_count())

In [None]:
import numpy as np

y_pad_pred_test = [[labels[np.argmax(prediction)] for prediction in text_prediction]
                   for text_prediction in predictions]

In [None]:
y_pred_test = []

for i, text in enumerate(test_dataset):
    text_predictions = []
    for j, (word, _) in enumerate(text):
        if j < len(y_pad_pred_test[i]):
            text_predictions.append((word, y_pad_pred_test[i][j]))
            
    y_pred_test.append(text_predictions)

In [None]:
from sklearn.metrics import classification_report, matthews_corrcoef

y_test_flat = [label for text in test_dataset for _, label in text]
y_pred_flat = [label for text in y_pred_test for _, label in text]

print(classification_report(y_test_flat, y_pred_flat))
print(f"Matthews Correlation Coefficient: {matthews_corrcoef(y_test_flat, y_pred_flat)}")

In [None]:
validation_dataset = get_labeled_dataset("datasets/val_no_answer_geo_extractor.json", for_inference=True)

In [None]:
X_validation = [[word2index.get(word, word2index["UNKNOWN"]) for word, _ in text]
                for task_id, text in validation_dataset]
X_validation = pad_sequences(maxlen=max_text_length, sequences=X_validation, padding="post",
                             value=len(word2index) - 1)

In [None]:
y_pred_validation = recognizer.predict(X_validation)

X_validation = [(task_id, text) for task_id, text in validation_dataset]

In [None]:
y_pad_pred_validation = [[labels[np.argmax(prediction)] for prediction in text_prediction]
              for text_prediction in y_pred_validation]

In [None]:
y_pred_validation = []

for i, text in enumerate(validation_dataset):
    text_predictions = []
    for j, (word, _) in enumerate(text[1]):
        if j < len(y_pad_pred_validation[i]):
            text_predictions.append((word, y_pad_pred_validation[i][j]))
            
    y_pred_validation.append(text_predictions)

In [None]:
import json

y_pred_validation = [[label for _, label in text] for text in y_pred_validation]

validation_result = get_validation_result(X_validation, y_pred_validation)

with open("lstm_validation_result.json", "w", encoding="utf-8") as file:
    json.dump(validation_result, file, ensure_ascii=False, indent=4)

print("Validation result has been saved!")

In [None]:
!pip install numpy scikit-learn torch datasets transformers

In [None]:
import torch

print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.current_device())

In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer

model_name = "xlm-roberta-large-finetuned-conll03-english"
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=10, ignore_mismatched_sizes=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
label_list = ["O", "central_city", "geo_address", "geo_building", "geo_city",
              "geo_district", "geo_microdistrict", "geo_region",
              "geo_region_oblast", "geo_street"]

label_dictionary = {label: i for i, label in enumerate(label_list)}

In [None]:
def tokenize_and_align_labels(dataset):
    texts = [" ".join([word for word, label in text]) for text in dataset]
    tokenized_inputs = tokenizer(texts, padding=True, truncation=True, 
                                 is_split_into_words=False, return_tensors="pt")
    
    encoded_labels = []
    for i, text in enumerate(dataset):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(0)
            elif word_idx != previous_word_idx:
                label_ids.append(label_dictionary[text[word_idx][1]])
            else:
                label_ids.append(0)
            previous_word_idx = word_idx
        encoded_labels.append(label_ids)
    
    tokenized_inputs["labels"] = encoded_labels
    return tokenized_inputs

In [None]:
from datasets import Dataset

tokenized_train_dataset = tokenize_and_align_labels(train_dataset)
tokenized_test_dataset = tokenize_and_align_labels(test_dataset)

transformed_train_dataset = Dataset.from_dict(tokenized_train_dataset)
transformed_test_dataset = Dataset.from_dict(tokenized_test_dataset)
print(transformed_test_dataset)

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != 0]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != 0]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=0)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    "ner",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=20,
    weight_decay=0.01,
    save_strategy='no',
    report_to='none'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=transformed_train_dataset,
    eval_dataset=transformed_test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

In [None]:
predictions = trainer.predict(test_dataset)

In [None]:
import numpy as np

predictions_logits = predictions.predictions
true_labels = predictions.label_ids

predicted_labels = np.argmax(predictions_logits, axis=2)

true_labels = [[label for label in sentence if label != -100] for sentence in true_labels]
predicted_labels = [
    [p for (p, label) in zip(prediction, labels) if label != -100]
    for prediction, labels in zip(predicted_labels, true_labels)
]

In [None]:
id_to_label = {id: label for label, id in label_to_id.items()}

def labels_to_names(labels, id_to_label):
    return [[id_to_label[label] for label in sentence] for sentence in labels]

predicted_label_names = labels_to_names(predicted_labels, id_to_label)
true_label_names = labels_to_names(true_labels, id_to_label)

In [None]:
from sklearn.metrics import classification_report
import itertools

true_labels_flat = list(itertools.chain(*true_label_names))
predicted_labels_flat = list(itertools.chain(*predicted_label_names))

print(classification_report(true_labels_flat, predicted_labels_flat))

In [None]:
!pip install numpy pandas tqdm scikit-learn tensorflow transformers

In [None]:
train_dataset = get_labeled_dataset("datasets/train_geo_extractor.json")

In [None]:
from sklearn import preprocessing

sentences = []
labels = []

label_list = ["O", "central_city", "geo_address", "geo_building", "geo_city",
              "geo_district", "geo_microdistrict", "geo_region",
              "geo_region_oblast", "geo_street"]

label_dictionary = {label: i for i, label in enumerate(label_list)}

for text in train_dataset:
    sentences.append([word for word, _ in text])
    labels.append([label for _, label in text])

encoded_labels = []

for labels_for_sentence in labels:
    encoded_labels.append([label_dictionary.get(label) for label in labels_for_sentence])

In [None]:
import numpy as np
from tqdm import tqdm
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

max_sentence_length = max([len(text) for text in train_dataset])

def tokenize(data, max_len):
    input_ids = list()
    attention_mask = list()
    
    for index in tqdm(range(len(data))):
        encoded_data = tokenizer.encode_plus(data[index],
                                            add_special_tokens=True,
                                            max_length=max_len,
                                            is_split_into_words=True,
                                            return_attention_mask=True,
                                            padding="max_length",
                                            truncation=True,
                                            return_tensors="np")
        
        input_ids.append(encoded_data["input_ids"])
        attention_mask.append(encoded_data["attention_mask"])
        
    return np.vstack(input_ids), np.vstack(attention_mask)

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(sentences, encoded_labels, test_size=0.1, random_state=42)

input_ids, attention_mask = tokenize(X_train, max_len=max_sentence_length)
val_input_ids, val_attention_mask = tokenize(X_test, max_len=max_sentence_length)

In [None]:
def pad_labels(input_labels, max_len):
    padded_labels = list()

    for index in range(len(input_labels)):
        padded_labels.append(np.array(input_labels[index] + [0] * (max_len - len(input_labels[index]))))
    
    return padded_labels

In [None]:
train_labels = pad_labels(y_train, max_sentence_length)
test_labels = pad_labels(y_test, max_sentence_length)

In [None]:
from tensorflow.keras.utils import to_categorical

train_labels = [to_categorical(index, num_classes=len(label_list)) for index in train_labels]
test_labels = [to_categorical(index, num_classes=len(label_list)) for index in test_labels]

In [None]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Accuracy

def instantiate_model(bert_model, max_len):
    input_ids = Input(shape=(max_len, ), dtype="int32")
    attention_mask = Input(shape=(max_len, ), dtype="int32")
    bert_layer = bert_model(input_ids, attention_mask=attention_mask, return_dict=True)
    
    embedding_layer = Dropout(0.3)(bert_layer["last_hidden_state"])
    output_layer = Dense(len(label_list), activation="softmax")(embedding_layer)
    
    model = Model(inputs=[input_ids, attention_mask], outputs=[output_layer])
    
    model.compile(optimizer=Adam(learning_rate=0.00001), loss=SparseCategoricalCrossentropy(), metrics=[Accuracy()])
    
    return model

In [None]:
from transformers import TFBertModel

bert_model = TFBertModel.from_pretrained("bert-base-uncased")
model = instantiate_model(bert_model, max_sentence_length)

In [None]:
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping

early_stopping_callback = EarlyStopping(mode="min", patience=5)

bert_history = model.fit([input_ids, attention_mask], np.array(train_labels),
                        validation_data=([val_input_ids, val_attention_mask], np.array(test_labels)),
                        epochs=25, batch_size=32,
                        callbacks=early_stopping_callback)