In [41]:
!pip install pandas transformers torch



In [42]:
import pandas as pd

# Download the file directly from the UniMorph GitHub
!wget -O eng_unimorph.tsv https://raw.githubusercontent.com/unimorph/eng/master/eng

# Load into a DataFrame
df = pd.read_csv("eng_unimorph.tsv", sep="\t", header=None, names=["lemma", "form", "features"])
print(df.head())


--2025-06-12 21:31:52--  https://raw.githubusercontent.com/unimorph/eng/master/eng
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 18022905 (17M) [text/plain]
Saving to: ‘eng_unimorph.tsv’


2025-06-12 21:31:53 (79.8 MB/s) - ‘eng_unimorph.tsv’ saved [18022905/18022905]

       lemma         form      features
0  microtome   microtomes          N;PL
1  microtome   microtomes    V;PRS;3;SG
2  microtome  microtoming  V;V.PTCP;PRS
3  microtome   microtomed         V;PST
4  microtome   microtomed  V;V.PTCP;PST


In [43]:
# Drop missing values and duplicates
df = df.dropna().drop_duplicates()

# Keep only rows where both lemma and form are alphabetic
df = df[df['lemma'].str.isalpha() & df['form'].str.isalpha()]

# Lowercase
df['lemma'] = df['lemma'].str.lower()
df['form'] = df['form'].str.lower()

In [44]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

def is_single_token(word):
    ids = tokenizer.encode(word, add_special_tokens=False)
    return len(ids) == 1 and word in tokenizer.get_vocab()

# Batch process lemmas and forms
lemmas = df['lemma'].tolist()
forms = df['form'].tolist()

lemma_encodings = tokenizer.batch_encode_plus(lemmas, add_special_tokens=False)['input_ids']
form_encodings = tokenizer.batch_encode_plus(forms, add_special_tokens=False)['input_ids']

# Both lemma and form must be a single token and not equal
mask = [
    len(l) == 1 and len(f) == 1 and lemmas[i] != forms[i]
    for i, (l, f) in enumerate(zip(lemma_encodings, form_encodings))
]

filtered_df = df[mask]
print(f"Filtered pairs: {len(filtered_df)}")
print(filtered_df.head(10))

Filtered pairs: 1077
      lemma     form      features
6       eat   eating  V;V.PTCP;PRS
93     mile    miles          N;PL
354    lead  leading  V;V.PTCP;PRS
799   fillo   fillos          N;PL
1800  serve   served         V;PST
1801  serve   served  V;V.PTCP;PST
2157   city   cities          N;PL
2583   dure   during  V;V.PTCP;PRS
2695  belle   belles          N;PL
2713  place   places          N;PL


In [45]:
filtered_df.to_csv("filtered_unimorph_pairs.csv", index=False)

In [46]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
model = AutoModel.from_pretrained("xlm-roberta-base")
model.eval()  # Set model to evaluation mode

# Get the embedding matrix
embedding_matrix = model.get_input_embeddings().weight.data  # shape: (vocab_size, hidden_dim)

In [47]:
def get_token_id(word):
    # Returns the token id if word is a single token, else None
    ids = tokenizer.encode(word, add_special_tokens=False)
    return ids[0] if len(ids) == 1 else None

# Add token ids to DataFrame (should already be single tokens from filtering)
filtered_df['lemma_id'] = filtered_df['lemma'].apply(get_token_id)
filtered_df['form_id'] = filtered_df['form'].apply(get_token_id)

# Drop any rows where tokenization failed (should be none, but just in case)
filtered_df = filtered_df.dropna(subset=['lemma_id', 'form_id'])
filtered_df['lemma_id'] = filtered_df['lemma_id'].astype(int)
filtered_df['form_id'] = filtered_df['form_id'].astype(int)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['lemma_id'] = filtered_df['lemma'].apply(get_token_id)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['form_id'] = filtered_df['form'].apply(get_token_id)


In [48]:
# Convert to torch tensor for indexing
lemma_ids = torch.tensor(filtered_df['lemma_id'].values)
form_ids = torch.tensor(filtered_df['form_id'].values)

# Get embeddings
lemma_embeddings = embedding_matrix[lemma_ids]
form_embeddings = embedding_matrix[form_ids]

In [49]:
import numpy as np

np.save("lemma_embeddings.npy", lemma_embeddings.cpu().numpy())
np.save("form_embeddings.npy", form_embeddings.cpu().numpy())
filtered_df.to_csv("embedding_pairs_metadata.csv", index=False)

In [50]:
print("Lemma embeddings shape:", lemma_embeddings.shape)
print("Form embeddings shape:", form_embeddings.shape)
print("Sample lemma embedding:", lemma_embeddings[0][:5])

Lemma embeddings shape: torch.Size([1077, 768])
Form embeddings shape: torch.Size([1077, 768])
Sample lemma embedding: tensor([0.0729, 0.1250, 0.2510, 0.4988, 0.0096])


In [51]:
import torch

lemma_embeddings = torch.tensor(np.load("lemma_embeddings.npy"))
form_embeddings = torch.tensor(np.load("form_embeddings.npy"))

In [52]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    lemma_embeddings, form_embeddings, test_size=0.2, random_state=42
)

In [56]:
import torch.nn as nn
import torch.optim as optim

class MLPMapper(nn.Module):
    def __init__(self, dim, hidden=512):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, dim)
        )
    def forward(self, x):
        return self.net(x)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MLPMapper(X_train.shape[1]).to(device)
X_train, y_train = X_train.to(device), y_train.to(device)

optimizer = optim.Adam(model.parameters(), lr=5e-4)
loss_fn = nn.MSELoss()

for epoch in range(1000):
    model.train()
    optimizer.zero_grad()
    pred = model(X_train)
    loss = loss_fn(pred, y_train)
    loss.backward()
    optimizer.step()
    if (epoch+1) % 20 == 0:
        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")


Epoch 20, Loss: 0.0287
Epoch 40, Loss: 0.0256
Epoch 60, Loss: 0.0229
Epoch 80, Loss: 0.0202
Epoch 100, Loss: 0.0178
Epoch 120, Loss: 0.0158
Epoch 140, Loss: 0.0142
Epoch 160, Loss: 0.0129
Epoch 180, Loss: 0.0118
Epoch 200, Loss: 0.0109
Epoch 220, Loss: 0.0101
Epoch 240, Loss: 0.0095
Epoch 260, Loss: 0.0089
Epoch 280, Loss: 0.0084
Epoch 300, Loss: 0.0080
Epoch 320, Loss: 0.0076
Epoch 340, Loss: 0.0072
Epoch 360, Loss: 0.0069
Epoch 380, Loss: 0.0067
Epoch 400, Loss: 0.0064
Epoch 420, Loss: 0.0062
Epoch 440, Loss: 0.0060
Epoch 460, Loss: 0.0058
Epoch 480, Loss: 0.0057
Epoch 500, Loss: 0.0055
Epoch 520, Loss: 0.0054
Epoch 540, Loss: 0.0053
Epoch 560, Loss: 0.0051
Epoch 580, Loss: 0.0050
Epoch 600, Loss: 0.0049
Epoch 620, Loss: 0.0049
Epoch 640, Loss: 0.0048
Epoch 660, Loss: 0.0047
Epoch 680, Loss: 0.0046
Epoch 700, Loss: 0.0046
Epoch 720, Loss: 0.0045
Epoch 740, Loss: 0.0045
Epoch 760, Loss: 0.0044
Epoch 780, Loss: 0.0044
Epoch 800, Loss: 0.0043
Epoch 820, Loss: 0.0043
Epoch 840, Loss: 0.0

In [57]:
import torch.nn.functional as F

model.eval()
X_test, y_test = X_test.to(device), y_test.to(device)
with torch.no_grad():
    pred = model(X_test)
    cosine_sim = F.cosine_similarity(pred, y_test, dim=1)
    avg_cosine = cosine_sim.mean().item()
print(f"Average Cosine Similarity on Test Set: {avg_cosine:.4f}")


Average Cosine Similarity on Test Set: 0.7494


In [58]:
def top_k_accuracy(preds, targets, k=5):
    # preds, targets: (N, D)
    sims = torch.mm(preds, targets.t())
    topk = sims.topk(k, dim=1).indices
    correct = torch.arange(preds.size(0), device=preds.device)
    hits = [(correct[i] in topk[i]) for i in range(preds.size(0))]
    return sum(hits) / len(hits)

with torch.no_grad():
    acc = top_k_accuracy(pred, y_test, k=5)
print(f"Top-5 Nearest Neighbor Accuracy: {acc:.2%}")


Top-5 Nearest Neighbor Accuracy: 89.35%
