In [1]:
import pandas as pd
import gensim
import spacy
from spacy.vocab import Vocab
from spacy.tokens import DocBin, Doc
from spacy.training import Example
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

In [2]:
# Location of the saved Sumerian FastText embeddings (a gensim model file).
FASTTEXT_MODEL_PATH = "../embeddings/sumerian_fasttext.model"

# Restore the saved gensim object from disk.
# NOTE(review): later code reads `ft_model.wv`, so this file presumably holds a
# full FastText model rather than bare KeyedVectors -- confirm.
ft_model = gensim.models.KeyedVectors.load(FASTTEXT_MODEL_PATH)

In [None]:
@spacy.util.registry.languages("sux")
class SumerianLanguage(spacy.Language):
    """Bare spaCy language class for Sumerian, registered under the code "sux"."""

    lang = "sux"


# Start from an empty pipeline built on the language registered above.
nlp = spacy.blank("sux")

def add_fasttext_vectors(nlp, ft_model):
    """Copy every string-keyed embedding from ``ft_model`` into ``nlp.vocab``.

    The pipeline's existing vocabulary is updated in place. Replacing
    ``nlp.vocab`` with a fresh object would leave anything that already holds
    a reference to the old vocabulary (tokenizer, previously added pipes)
    pointing at a different object, and would make re-running the cell unsafe.

    Args:
        nlp: Pipeline object whose ``vocab`` provides ``has_vector`` and
            ``set_vector``.
        ft_model: Either a model exposing its vectors as ``ft_model.wv`` or
            the keyed-vectors object itself; it must provide ``key_to_index``
            and ``get_vector``.

    Returns:
        None. ``nlp.vocab`` is modified as a side effect.
    """
    # Accept both a full model (vectors under `.wv`) and bare keyed vectors.
    keyed_vectors = getattr(ft_model, "wv", ft_model)
    vocab = nlp.vocab
    for word in keyed_vectors.key_to_index:
        # Non-string keys are skipped, as are words that already have a
        # vector, so calling this function twice is harmless.
        if isinstance(word, str) and not vocab.has_vector(word):
            vocab.set_vector(word, keyed_vectors.get_vector(word))

In [4]:
# Attach the FastText vectors to the pipeline before the parser is created.
add_fasttext_vectors(nlp, ft_model)

# Settings handed to the parser factory.
# NOTE(review): a `min_action_freq` of 1 presumably keeps even labels seen only
# once instead of backing them off -- confirm against the spaCy parser docs.
config = {
    "min_action_freq": 1
}

# Always bind `dep_parser`, whether the parser is created now or already exists
# in the pipeline; later cells call `dep_parser.add_label(...)` and would fail
# with a NameError if the name were only set on the "not yet added" path.
if "parser" in nlp.pipe_names:
    dep_parser = nlp.get_pipe("parser")
else:
    dep_parser = nlp.add_pipe("parser", config=config)

In [None]:
# Load the extra annotations: one CSV row per token; rows without a
# `sentence_id` are discarded.
extra_data = pd.read_csv("../UD-ETCSUX/extra_data.csv").dropna(subset=['sentence_id'])

# Number each run of consecutive rows that share a `sentence_id`; every run is
# one sentence. Grouping by run (instead of emitting a sentence only when the
# *next* one starts) ensures the final sentence of the file is not dropped.
extra_sentence_runs = (
    (extra_data["sentence_id"] != extra_data["sentence_id"].shift())
    .cumsum()
    .rename("sentence_run")
)

# Each entry is (text, tokens, annotations), the shape later cells unpack.
added_data = []
for _, sentence_rows in extra_data.groupby(extra_sentence_runs, sort=False):
    sumerian = sentence_rows["form"].tolist()
    # `head` is shifted by one to become a 0-based token index. A head of 0
    # would turn into -1, which is not a valid token index; it presumably marks
    # the sentence root (CoNLL-U style -- confirm against the CSV), and spaCy
    # expects a root token to be its own head.
    heads = [
        position if head == 0 else head - 1
        for position, head in enumerate(sentence_rows["head"].tolist())
    ]
    deps = sentence_rows["dependency"].tolist()
    # Only the first token of each sentence opens a sentence.
    sent_starts = [1] + [0] * (len(sumerian) - 1)

    sux_text = " ".join(sumerian)
    sux_dict = {"heads": heads, "deps": deps, "sent_starts": sent_starts}
    # The token list is taken straight from the rows rather than re-split from
    # the joined text, so it always has one entry per head/dep even if a form
    # contains a space.
    added_data.append((sux_text, sumerian, sux_dict))

print(f"Loaded {len(added_data)} extra sentences")

In [None]:
# Load the treebank: one CSV row per token; rows without a `sentence_id` are
# discarded.
all_data = pd.read_csv("../UD-ETCSUX/UD-ETCSUX.csv").dropna(subset=['sentence_id'])

# Number each run of consecutive rows that share a `sentence_id`; every run is
# one sentence. Grouping by run (instead of emitting a sentence only when the
# *next* one starts) ensures the final sentence of the file is not dropped.
train_sentence_runs = (
    (all_data["sentence_id"] != all_data["sentence_id"].shift())
    .cumsum()
    .rename("sentence_run")
)

# Each entry is (text, tokens, annotations), the shape later cells unpack.
train_data = []
for _, sentence_rows in all_data.groupby(train_sentence_runs, sort=False):
    sumerian = sentence_rows["form"].tolist()
    # `head` is shifted by one to become a 0-based token index. A head of 0
    # would turn into -1, which is not a valid token index; it presumably marks
    # the sentence root (CoNLL-U style -- confirm against the CSV), and spaCy
    # expects a root token to be its own head.
    heads = [
        position if head == 0 else head - 1
        for position, head in enumerate(sentence_rows["head"].tolist())
    ]
    deps = sentence_rows["dependency"].tolist()
    # Only the first token of each sentence opens a sentence.
    sent_starts = [1] + [0] * (len(sumerian) - 1)

    sux_text = " ".join(sumerian)
    sux_dict = {"heads": heads, "deps": deps, "sent_starts": sent_starts}
    # The token list is taken straight from the rows rather than re-split from
    # the joined text, so it always has one entry per head/dep even if a form
    # contains a space.
    train_data.append((sux_text, sumerian, sux_dict))

print(f"Loaded {len(train_data)} training sentences")


In [7]:
# Register every dependency label that occurs in the training annotations.
# Labels are collected once each, in order of first appearance, then handed to
# the parser.
train_labels = dict.fromkeys(
    str(dep)
    for _text, _tokens, annotations in train_data
    for dep in annotations.get("deps", [])
)
for label in train_labels:
    dep_parser.add_label(label)

In [8]:
# Register the dependency labels that occur in the extra annotations as well,
# once each and in order of first appearance.
extra_labels = dict.fromkeys(
    str(dep)
    for _text, _tokens, annotations in added_data
    for dep in annotations.get("deps", [])
)
for label in extra_labels:
    dep_parser.add_label(label)

In [None]:
# Cross-validation settings (values unchanged from the original run).
N_SPLITS = 10               # number of folds
N_ITERATIONS = 20           # passes over the training examples per fold
BATCH_SIZE = 12             # examples per parser update
DROPOUT = 0.8               # NOTE(review): unusually high dropout -- confirm it is intended
SEED = 0                    # fixed seed so repeated runs are comparable
USE_AUGMENTED_DATA = False  # set to True to also train on `added_data`


def _make_example(tokens, annotations):
    """Build a training Example from pre-split tokens and their gold heads/deps."""
    doc = Doc(nlp.vocab, words=tokens)
    gold = {'heads': annotations['heads'], 'deps': annotations['deps']}
    return Example.from_dict(doc, gold)


def _attachment_counts(doc, annotations):
    """Return (correct heads, correct heads with correct label, tokens compared)."""
    head_hits = 0
    labelled_hits = 0
    compared = 0
    gold = zip(annotations['heads'], annotations['deps'])
    for token, (t_head, t_dep) in zip(doc, gold):
        if token.head.i == t_head:
            head_hits += 1
            if token.dep_ == t_dep:
                labelled_hits += 1
        compared += 1
    return head_hits, labelled_hits, compared


spacy.util.fix_random_seed(SEED)

kf = KFold(n_splits=N_SPLITS)
uas_scores = []
las_scores = []

for fold, (train_idx, test_idx) in enumerate(kf.split(train_data)):
    print(f"Starting fold {fold + 1}")
    train_fold = [train_data[i] for i in train_idx]
    test_fold = [train_data[i] for i in test_idx]

    # Create training examples
    train_examples = [
        _make_example(tokens, annotations)
        for _text, tokens, annotations in train_fold
    ]
    if USE_AUGMENTED_DATA:
        # The augmented sentences are only ever trained on, never evaluated.
        train_examples.extend(
            _make_example(tokens, annotations)
            for _text, tokens, annotations in added_data
        )

    # Initialise the pipeline once per fold. `initialize()` already returns the
    # optimizer, so the deprecated `begin_training()` (which would initialise a
    # second time) is not needed.
    optimizer = nlp.initialize()

    for i in range(N_ITERATIONS):
        losses = {}
        batches = spacy.util.minibatch(train_examples, size=BATCH_SIZE)
        for idx, batch in enumerate(batches):
            try:
                nlp.update(batch, drop=DROPOUT, sgd=optimizer, losses=losses)
            except Exception as e:
                # Best effort: a failing batch is skipped, but the reason is
                # reported so bad annotations can be tracked down.
                print(f"Skipping batch {idx + 1} in iteration {i}: {e!r}")
                continue
        print(f"Losses at iteration {i}: {losses}")

    # Evaluate the model on the held-out fold
    uas = 0
    las = 0
    total = 0
    for _text, tokens, annotations in test_fold:
        # Run the pipeline on the gold tokenisation to get predicted heads/deps.
        doc = nlp(Doc(nlp.vocab, words=tokens))
        head_hits, labelled_hits, compared = _attachment_counts(doc, annotations)
        uas += head_hits
        las += labelled_hits
        total += compared

    # Guard against a fold with no tokens to avoid a ZeroDivisionError.
    uas_score = uas / total if total else 0.0
    las_score = las / total if total else 0.0

    uas_scores.append(uas_score)
    las_scores.append(las_score)

    print(f"Fold {fold + 1} - UAS: {uas_score:.4f}, LAS: {las_score:.4f}")

print(f"Average UAS: {sum(uas_scores) / len(uas_scores):.4f}")
print(f"Average LAS: {sum(las_scores) / len(las_scores):.4f}")