In [None]:
import collections
import os
import re
import torch
import torch.nn as nn
import random
import numpy as np

# --- Configuration -----------------------------------------------------------
# Seed every RNG the notebook relies on (`random` picks the training examples,
# `torch` initialises the weights) so that a fresh Restart & Run All
# reproduces the same training run.
SEED = 0
random.seed(SEED)
torch.manual_seed(SEED)

lr = 0.003  # learning rate passed to the Adam optimiser in train()
wdlen = 6  # characters per word; word files are named <lang>_len<wdlen>.txt
lang_list = ['uk', 'la', 'it']  # language codes, used as word-file prefixes and class labels

class PredictLanguage(nn.Module):
    """Feed-forward classifier that guesses the language of a fixed-length word.

    Each character ("phoneme") of the word is looked up in a learned embedding
    table, the embeddings are concatenated, and the result is passed through
    two sigmoid hidden layers to produce one logit per language.

    The model is assembled in stages that must be called in this order:
    ``create_wordlists`` -> ``get_phoneme_list`` ->
    ``create_embeddings_for_phonemes`` -> ``set_up_model``.
    """

    def __init__(self, lang_list, word_len=None):
        """
        Args:
            lang_list: language codes; the position in this list is the class
                index the network is trained to predict.
            word_len: number of characters per word. Defaults to the
                notebook-level ``wdlen`` setting.
        """
        super(PredictLanguage, self).__init__()
        self.num_to_train = 23000  # number of training iterations run by train()
        self.lang_list = lang_list
        self.lang2ix = {key: i for (i, key) in enumerate(self.lang_list)}
        # Fall back to the notebook-level setting so that existing
        # ``PredictLanguage(lang_list)`` calls behave exactly as before.
        self.word_len = wdlen if word_len is None else word_len

    def create_wordlists(self):
        """Load ``<lang>_len<word_len>.txt`` for every language into ``self.wordlist_dict``.

        Words are lower-cased. Lines whose length is not ``word_len`` (blank
        lines included) are skipped, because ``forward`` cannot process them.
        A language whose file is missing is reported and left without an entry.
        """
        self.wordlist_dict = collections.defaultdict(list)
        for lang in self.lang_list:
            path = lang + '_len' + str(self.word_len) + '.txt'
            if not os.path.isfile(path):
                print('Language', lang, 'with path', path, 'not found!')
                continue
            skipped = 0
            with open(path) as f0:
                for line in f0:
                    wd = line.rstrip().lower()
                    # Check the length after lower-casing: that is the form
                    # the model will actually be fed.
                    if len(wd) != self.word_len:
                        skipped += 1
                        continue
                    self.wordlist_dict[lang].append(wd)
            print('Wordlist of words of length', self.word_len, 'successfully created for language', lang)
            if skipped:
                print('Skipped', skipped, 'lines of the wrong length for language', lang)

    def get_phoneme_list(self):
        """Collect the distinct characters of all loaded words.

        Sets ``self.phonemes`` (in first-seen order) and ``self.phoneme2ix``
        (character -> 0-dim index tensor).
        """
        # dict.fromkeys de-duplicates while preserving first-seen order and
        # avoids a linear list scan per character.
        self.phonemes = list(dict.fromkeys(
            phoneme
            for words in self.wordlist_dict.values()
            for wd in words
            for phoneme in wd
        ))
        print('There are', len(self.phonemes), 'phonemes across the', len(self.lang_list), 'languages')
        self.phoneme2ix = {
            phoneme: torch.tensor(i) for i, phoneme in enumerate(self.phonemes)
        }

    def create_embeddings_for_phonemes(self):
        """Create the embedding table: one row per phoneme, row width = number of phonemes."""
        self.phoneme_embeddings = nn.Embedding(len(self.phonemes), len(self.phonemes))

    def set_up_model(self):
        """Create the linear layers; requires ``get_phoneme_list`` to have run."""
        self.dim_hid1 = 32
        self.dim_hid2 = 32
        # Input is the concatenation of word_len embeddings, each of width
        # len(self.phonemes).
        self.input2hidden1 = nn.Linear(len(self.phonemes) * self.word_len, self.dim_hid1)
        self.hidden1hidden2 = nn.Linear(self.dim_hid1, self.dim_hid2)
        # One logit per language known to *this* model, so the output size
        # always agrees with self.lang2ix.
        self.hidden2output = nn.Linear(self.dim_hid2, len(self.lang_list))
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_word):
        """Return the language logits for a single word.

        Args:
            input_word: string of exactly ``word_len`` characters, all of
                which occurred in the loaded word lists.

        Returns:
            Tensor of shape ``(1, len(lang_list))`` holding unnormalised scores.

        Raises:
            ValueError: if the word does not have ``word_len`` characters.
            KeyError: if the word contains a character without an embedding.
        """
        if len(input_word) != self.word_len:
            raise ValueError(
                'Expected a word of %d characters, got %r' % (self.word_len, input_word)
            )
        unknown = [phoneme for phoneme in input_word if phoneme not in self.phoneme2ix]
        if unknown:
            raise KeyError(
                'Word %r contains characters not seen in training: %s'
                % (input_word, ' '.join(unknown))
            )
        # Single batched lookup; flattening row by row gives the same layout
        # as concatenating the per-character embedding vectors.
        indices = torch.stack([self.phoneme2ix[phoneme] for phoneme in input_word])
        vector_for_word = self.phoneme_embeddings(indices).reshape(1, -1)
        hid1 = self.sigmoid(self.input2hidden1(vector_for_word))
        hid2 = self.sigmoid(self.hidden1hidden2(hid1))
        return self.hidden2output(hid2)



In [None]:

def train(model, report_every=1000, learning_rate=None):
    """Train ``model`` one randomly drawn word at a time and print progress.

    On each of ``model.num_to_train`` iterations a language is drawn uniformly
    from ``model.lang_list``, then a word is drawn uniformly from that
    language's word list, and one Adam step is taken on the cross-entropy loss.

    Args:
        model: object exposing ``num_to_train``, ``lang_list``, ``lang2ix``,
            ``wordlist_dict`` and ``parameters()``, and callable on a word to
            return logits of shape ``(1, number of languages)``.
        report_every: number of iterations per progress report.
        learning_rate: Adam learning rate. Defaults to the notebook-level
            ``lr`` setting.

    Raises:
        ValueError: if any language in ``model.lang_list`` has no words loaded.
    """
    if learning_rate is None:
        learning_rate = lr

    # Fail up front with a clear message rather than with an IndexError from
    # random.choice() at whichever iteration first draws the empty language.
    missing = [lang for lang in model.lang_list if not model.wordlist_dict.get(lang)]
    if missing:
        raise ValueError('No training words loaded for language(s): ' + ', '.join(missing))

    criterion = nn.CrossEntropyLoss()
    optimiser = torch.optim.Adam(model.parameters(), lr=learning_rate)
    num_tested = 0
    num_correct = 0
    ep_loss = 0.0
    for i in range(model.num_to_train):
        # Sample from the model's own language list so the drawn label is
        # always one that model.lang2ix knows.
        lang = random.choice(model.lang_list)
        wd = random.choice(model.wordlist_dict[lang])
        target_lang_ix = model.lang2ix[lang]
        target = torch.tensor([target_lang_ix])

        optimiser.zero_grad()
        pred = model(wd)
        loss = criterion(pred, target)
        loss.backward()
        optimiser.step()

        num_tested += 1
        # detach() so the bookkeeping never touches the autograd graph.
        if pred.detach().argmax(dim=1).item() == target_lang_ix:
            num_correct += 1
        ep_loss += loss.item()

        # Report after every full window of `report_every` examples; dividing
        # by the real example count keeps loss and accuracy on the same window.
        if (i + 1) % report_every == 0:
            average_loss = ep_loss / num_tested
            print(i + 1, round(average_loss, 4), 'training accuracy on last', report_every, 'examples', round(num_correct / num_tested, 4))
            ep_loss = 0.0
            num_tested = 0
            num_correct = 0

In [None]:
# Build the classifier step by step. The order matters: each call reads
# attributes that the previous call set on `model`.
model = PredictLanguage(lang_list)
model.create_wordlists()  # reads the <lang>_len<wdlen>.txt files into model.wordlist_dict
model.get_phoneme_list()  # collects the distinct characters of the loaded words
model.create_embeddings_for_phonemes()  # embedding table sized by the number of phonemes
model.set_up_model()  # linear layers sized by the number of phonemes
train(model)  # runs model.num_to_train iterations, printing progress every 1000


#with torch.no_grad():
#    embeddings = model.phoneme_embeddings.weight.numpy()
#    #print(embeddings)
#    print(embeddings.shape)
#    labels = model.phonemes

#/Y = tsne(embeddings, 2, 50, 5.0)


Wordlist of words of length 6 successfully created for language uk
Wordlist of words of length 6 successfully created for language la
Wordlist of words of length 6 successfully created for language it
There are 26 phonemes across the 3 languages
1000 tensor(0.7661) training accuracy on last 1000 examples 0.6593
2000 tensor(0.5166) training accuracy on last 1000 examples 0.806
3000 tensor(0.5304) training accuracy on last 1000 examples 0.787
4000 tensor(0.4585) training accuracy on last 1000 examples 0.836
5000 tensor(0.4691) training accuracy on last 1000 examples 0.822
6000 tensor(0.4481) training accuracy on last 1000 examples 0.822
7000 tensor(0.3993) training accuracy on last 1000 examples 0.845
8000 tensor(0.4116) training accuracy on last 1000 examples 0.84
9000 tensor(0.3473) training accuracy on last 1000 examples 0.864
10000 tensor(0.3432) training accuracy on last 1000 examples 0.869
11000 tensor(0.3631) training accuracy on last 1000 examples 0.851
12000 tensor(0.3413) train

In [None]:

# Keep prompting until the word is something the model can process: it must
# have exactly `wdlen` characters, and every character must have an embedding
# (a character never seen in the word lists would raise a KeyError).
while True:
    test_wd = input('Type a word of '+str(wdlen)+' letters to see what language the model predicts for it.\n')
    if len(test_wd.lower()) != wdlen:
        print('Your word does not have', wdlen, 'letters!')
        continue
    unknown_chars = sorted(set(test_wd.lower()) - set(model.phoneme2ix))
    if unknown_chars:
        print('Your word contains characters the model has never seen:', ' '.join(unknown_chars))
        continue
    break

# Inference only, so no gradients are needed.
with torch.no_grad():
    pred = model(test_wd.lower())
    pred_numpy = np.argmax(pred.numpy(), axis=1).tolist()[0]
    # Index into the model's own language list, which defines the class order.
    print('Prediction is', model.lang_list[pred_numpy])

Type a word of 6 letters to see what language the model predicts for it.
canine
Prediction is it
