In [1]:
import os
train_data = open(os.path.join('train_test_data', 'train_set.txt'), 'r').read().splitlines()
test_data = open(os.path.join('train_test_data', 'test_set.txt'), 'r').read().splitlines()
tot_data = open(os.path.join('train_test_data', 'tot_set.txt'), 'r').read().splitlines()


In [2]:
from alignment_methods import *

english_model = Word2Vec.load(os.path.join('data', 'english_model_lemmatized'))
german_model = Word2Vec.load(os.path.join('data', 'german_model_lemmatized'))

english_embeddings, (english_idx, english_iidx) = w2v_to_numpy(english_model)
german_embeddings, (german_idx, german_iidx) = w2v_to_numpy(german_model)

In [3]:
# print size of vocab
print(len(english_embeddings))
print(len(german_embeddings))

26019
60960


In [4]:
# get random 50 samples from the training data
import random
random.seed(42)
train_data_trunc = random.sample(train_data, 50)

In [5]:
en_words, de_words, en_indices, de_indices = create_words_and_indices(train_data_trunc)
en_test_words, de_test_words, en_test_indices, de_test_indices = create_words_and_indices(test_data)

en_train_matrix = create_matrix_slice(english_embeddings, en_indices)
de_train_matrix = create_matrix_slice(german_embeddings, de_indices)

In [6]:
X = direct_alignment(de_train_matrix, en_train_matrix)
X = procrustes(de_train_matrix, en_train_matrix)

In [7]:
de_aligned = german_embeddings @ X

print('Direct Alignment')

## Training Accuracy
accuracy, total = get_accuracy_scores(english_embeddings, de_aligned, en_indices, de_indices, english_iidx, 1)
print('top 1 accuracy')
print(f"Accuracy: {accuracy}, Total: {total}")

# top five accuracy
accuracy, total = get_accuracy_scores(english_embeddings, de_aligned, en_indices, de_indices, english_iidx, 5)
print('top 5 accuracy')
print(f"Accuracy: {accuracy}, Total: {total}")

### Testing Accuracy
accuracy, total = get_accuracy_scores(english_embeddings, de_aligned, en_test_indices, de_test_indices, english_iidx, 1)
print('top 1 accuracy')
print(f"Accuracy: {accuracy}, Total: {total}")

# top five accuracy
accuracy, total = get_accuracy_scores(english_embeddings, de_aligned, en_test_indices, de_test_indices, english_iidx, 5)
print('top 5 accuracy')
print(f"Accuracy: {accuracy}, Total: {total}")


Direct Alignment
top 1 accuracy
Accuracy: 0.98, Total: 50
top 5 accuracy
Accuracy: 1.0, Total: 50
top 1 accuracy
Accuracy: 0.2785388127853881, Total: 219
top 5 accuracy
Accuracy: 0.5251141552511416, Total: 219


In [8]:
formatted_train_data = []
for item in train_data:
    i = item.split()
    formatted_train_data.append((i[0], i[1], int(i[2]), int(i[3])))
    

In [9]:
ordered = sorted(formatted_train_data, key=lambda x: abs(x[2] - x[3]))

In [10]:
ordered[:5]

[('mr', 'herr', 1, 1),
 ('commission', 'kommission', 2, 2),
 ('european', 'europäisch', 0, 0),
 ('june', 'juni', 808, 808),
 ('criterion', 'kriterium', 546, 546)]

In [11]:
# save ordered
with open('train_test_data/ordered_by_idff.txt', 'w') as f:
    for item in ordered:
        f.write(f"{item[0]} {item[1]} {item[2]} {item[3]}\n")

In [12]:
ordered_train = ordered[:100]

In [13]:
formatted_train_data = []
for item in train_data:
    i = item.split()
    formatted_train_data.append((i[0], i[1], int(i[2]), int(i[3])))
    
    

In [14]:
ordered_by_freq = sorted(formatted_train_data, key=lambda x: abs(x[2] + x[3]))


In [15]:
with open('train_test_data/ordered_by_freq.txt', 'w') as f:
    for item in ordered_by_freq:
        f.write(f"{item[0]} {item[1]} {item[2]} {item[3]}\n")

In [16]:
en_words = []
de_words = []
en_indices = []
de_indices = []
for i in range(len(ordered_train)):
    en_words.append(ordered_train[i][0])
    de_words.append(ordered_train[i][1])
    en_indices.append(ordered_train[i][2])
    de_indices.append(ordered_train[i][3])
    

In [17]:
ordered_by_freq = ordered_by_freq[:50]

In [18]:
en_words = []
de_words = []
en_indices = []
de_indices = []
for i in range(len(ordered_by_freq)):
    en_words.append(ordered_by_freq[i][0])
    de_words.append(ordered_by_freq[i][1])
    en_indices.append(ordered_by_freq[i][2])
    de_indices.append(ordered_by_freq[i][3])

In [19]:
en_train_matrix = create_matrix_slice(english_embeddings, en_indices)
de_train_matrix = create_matrix_slice(german_embeddings, de_indices)

In [20]:
print(len(en_indices))

50


In [21]:
#X = direct_alignment(de_train_matrix, en_train_matrix)
X = procrustes(de_train_matrix, en_train_matrix)
#X = closed_form_linear_regression(de_train_matrix, en_train_matrix)

In [22]:
print(len(en_test_indices))

219


In [23]:
de_aligned = german_embeddings @ X

print('Direct Alignment')

## Training Accuracy
accuracy, total = get_accuracy_scores(english_embeddings, de_aligned, en_indices, de_indices, english_iidx, 1)
print('top 1 accuracy')
print(f"Accuracy: {accuracy}, Total: {total}")

# top five accuracy
accuracy, total = get_accuracy_scores(english_embeddings, de_aligned, en_indices, de_indices, english_iidx, 5)
print('top 5 accuracy')
print(f"Accuracy: {accuracy}, Total: {total}")

### Testing Accuracy
accuracy, total = get_accuracy_scores(english_embeddings, de_aligned, en_test_indices, de_test_indices, english_iidx, 1)
print('top 1 accuracy')
print(f"Accuracy: {accuracy}, Total: {total}")

# top five accuracy
accuracy, total = get_accuracy_scores(english_embeddings, de_aligned, en_test_indices, de_test_indices, english_iidx, 5)
print('top 5 accuracy')
print(f"Accuracy: {accuracy}, Total: {total}")


Direct Alignment
top 1 accuracy
Accuracy: 0.96, Total: 50
top 5 accuracy
Accuracy: 1.0, Total: 50
top 1 accuracy
Accuracy: 0.3242009132420091, Total: 219
top 5 accuracy
Accuracy: 0.5388127853881278, Total: 219


In [24]:
sum_indices = [x + y for x, y in zip(en_indices, de_indices)]
max_val = max(sum_indices)
min_val = min(sum_indices)
weight = [2 -(x - min_val)/(max_val - min_val) for x in sum_indices]
print(weight)

[2.0, 1.9907407407407407, 1.9814814814814814, 1.9490740740740742, 1.9398148148148149, 1.925925925925926, 1.8981481481481481, 1.8472222222222223, 1.8194444444444444, 1.787037037037037, 1.7731481481481481, 1.7268518518518519, 1.6898148148148149, 1.6574074074074074, 1.6527777777777777, 1.6203703703703702, 1.5972222222222223, 1.574074074074074, 1.574074074074074, 1.5694444444444444, 1.5324074074074074, 1.4953703703703702, 1.4907407407407407, 1.4722222222222223, 1.449074074074074, 1.449074074074074, 1.4305555555555556, 1.4305555555555556, 1.4074074074074074, 1.3935185185185186, 1.3472222222222223, 1.3333333333333335, 1.3148148148148149, 1.3101851851851851, 1.3101851851851851, 1.2638888888888888, 1.25, 1.2453703703703702, 1.2453703703703702, 1.1898148148148149, 1.1574074074074074, 1.1435185185185186, 1.1435185185185186, 1.0925925925925926, 1.0925925925925926, 1.0648148148148149, 1.0277777777777777, 1.0185185185185186, 1.0138888888888888, 1.0]


In [29]:
import torch
import torch.nn as nn
import torch.optim as optim

# Data
X = torch.tensor(de_train_matrix, dtype=torch.float32)
Y = torch.tensor(en_train_matrix, dtype=torch.float32)
weights = torch.tensor(weight, dtype=torch.float32)

# Model
model = nn.Linear(de_train_matrix.shape[1], en_train_matrix.shape[1])

# Loss Function
def weighted_mse_loss(input, target, weight):
    print(input.shape)
    print(target.shape)
    print(weight.shape)
    return torch.sum(weight * (input - target) ** 2)

# Optimizer
optimizer = optim.SGD(model.parameters(), lr=0.001)

# Training loop
for epoch in range(10):
    model.train()
    optimizer.zero_grad()
    
    # Forward pass
    predictions = model(X)
    
    # Compute weighted loss
    loss = weighted_mse_loss(predictions, Y, weights)
    
    # Backward pass
    loss.backward()
    optimizer.step()
    
    if (epoch + 1) % 1000 == 0:
        
        print(f'Epoch {epoch+1}: Loss = {loss.item()}')
        #print accuracy
        de_aligned = german_embeddings @ model.weight.detach().numpy()
    

#print model weights
print(model.weight)

torch.Size([50, 50])
torch.Size([50, 50])
torch.Size([50])
torch.Size([50, 50])
torch.Size([50, 50])
torch.Size([50])
torch.Size([50, 50])
torch.Size([50, 50])
torch.Size([50])
torch.Size([50, 50])
torch.Size([50, 50])
torch.Size([50])
torch.Size([50, 50])
torch.Size([50, 50])
torch.Size([50])
torch.Size([50, 50])
torch.Size([50, 50])
torch.Size([50])
torch.Size([50, 50])
torch.Size([50, 50])
torch.Size([50])
torch.Size([50, 50])
torch.Size([50, 50])
torch.Size([50])
torch.Size([50, 50])
torch.Size([50, 50])
torch.Size([50])
torch.Size([50, 50])
torch.Size([50, 50])
torch.Size([50])
Parameter containing:
tensor([[-0.0635,  0.1146, -0.0910,  ..., -0.0178, -0.0756,  0.0119],
        [ 0.0273, -0.0472,  0.0826,  ...,  0.0219, -0.0307, -0.0719],
        [ 0.1087, -0.0239, -0.0850,  ...,  0.0507, -0.0951, -0.0804],
        ...,
        [ 0.0841,  0.1327, -0.0043,  ..., -0.0897, -0.0919,  0.0975],
        [ 0.0843, -0.0717,  0.0654,  ...,  0.1299, -0.0024, -0.0802],
        [ 0.0161,  0.1240

In [26]:
from utils import experiment_setup

experiment_setup('data/english_model_lemmatized', 'data/german_model_lemmatized', 'train_test_data/train_set.txt', 'train_test_data/test_set.txt', alignment_method='w')


RuntimeError: The size of tensor a (875) must match the size of tensor b (50) at non-singleton dimension 1

In [27]:
print(model(X))
x_numpy = model(X).detach().numpy()

tensor([[-0.1175,  0.1433, -0.0080,  ...,  0.0852, -0.0720,  0.0635],
        [-0.1274, -0.1894,  0.1919,  ...,  0.2766,  0.1149, -0.0833],
        [-0.1934,  0.0564,  0.0264,  ..., -0.1313, -0.0798, -0.2504],
        ...,
        [-0.0451,  0.0143,  0.0526,  ...,  0.1197, -0.0095, -0.0868],
        [-0.0021, -0.0072, -0.0559,  ...,  0.0232, -0.0808,  0.0088],
        [ 0.0638,  0.0091, -0.1604,  ..., -0.2108, -0.0061, -0.2440]],
       grad_fn=<AddmmBackward0>)


In [28]:
x_numpy.shape

(50, 50)

In [None]:
de_aligned = model(torch.tensor(german_embeddings, dtype=torch.float32)).detach().numpy()

print('Direct Alignment')

## Training Accuracy
accuracy, total = get_accuracy_scores(english_embeddings, de_aligned, en_indices, de_indices, english_iidx, 1)
print('top 1 accuracy')
print(f"Accuracy: {accuracy}, Total: {total}")

# top five accuracy
accuracy, total = get_accuracy_scores(english_embeddings, de_aligned, en_indices, de_indices, english_iidx, 5)
print('top 5 accuracy')
print(f"Accuracy: {accuracy}, Total: {total}")

### Testing Accuracy
accuracy, total = get_accuracy_scores(english_embeddings, de_aligned, en_test_indices, de_test_indices, english_iidx, 1)
print('top 1 accuracy')
print(f"Accuracy: {accuracy}, Total: {total}")

# top five accuracy
accuracy, total = get_accuracy_scores(english_embeddings, de_aligned, en_test_indices, de_test_indices, english_iidx, 5)
print('top 5 accuracy')
print(f"Accuracy: {accuracy}, Total: {total}")
