In [3]:
import numpy as np
import numpy.random as rand
import math
import random
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
import tensorflow as tf
from keras import backend as K
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, matthews_corrcoef
from torch.utils.data import TensorDataset, DataLoader
from itertools import chain
!pip install shap
import shap
from collections import Counter
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.over_sampling import SMOTEN, RandomOverSampler, ADASYN
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
plt.style.use('dark_background')



In [None]:
# Select the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise,
# and report which one was chosen.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('Script is running with GPU' if device.type == "cuda" else 'Script is running WITHOUT GPU')

In [4]:
# Download the NLTK resource packages 'punkt', 'words' and 'stopwords'.
# The value of the last call is what the cell displays.
# NOTE(review): none of the cells visible in this part of the notebook use
# these resources - confirm they are still needed.
nltk.download('punkt')
nltk.download('words')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
# Load the spelling/pronunciation table. Later cells read its 'words' and 'IPA'
# columns. The path is relative to the kernel's working directory.
df = pd.read_csv('orthodata.csv')

In [10]:
# Sanity check: (rows, columns) of the table as loaded.
df.shape

(52447, 3)

In [11]:
# Keep only the rows whose spelling is at most 10 characters long, then display
# the filtered table. Rows where the length is undefined compare as False and
# are dropped as well.
short_enough = df['words'].str.len() <= 10
df = df.loc[short_enough]
df

Unnamed: 0.1,Unnamed: 0,words,IPA
0,0,a,eI
1,1,a,eI
2,2,a,eI
3,3,A,eI
4,4,AA,eIeI
...,...,...,...
52442,52442,zoomlens,zumlEnz
52443,52443,zoophyte,z@U@faIt
52444,52444,zootsuit,zutsut
52445,52445,Zouave,zuAv


In [12]:
# Pull the two columns out of the DataFrame as NumPy string arrays; from here on
# the notebook works on these arrays rather than on `df`.
words = np.asarray(df['words'], dtype = 'str')
IPA = np.asarray(df['IPA'], dtype = 'str')

In [19]:
# Remove punctuation and split every entry into a list of single characters.
# The character class matches: apostrophe, the range ',' through '.' (comma,
# hyphen, period), slash and tilde. Spellings are lower-cased; IPA symbols keep
# their case because upper and lower case are distinct symbols there.
_punct_re = re.compile("[\\'\\,-\\./'~]")
wordlist = [list(_punct_re.sub('', entry).lower()) for entry in words]
IPAlist = [list(_punct_re.sub('', entry)) for entry in IPA]

In [30]:
# The two lists are built entry-by-entry from the same rows, so their lengths
# must match.
len(wordlist),len(IPAlist)

(40402, 40402)

In [26]:
# Spot-check: the last raw spelling next to its cleaned, character-split form.
words[-1],wordlist[-1]

('zucchini', ['z', 'u', 'c', 'c', 'h', 'i', 'n', 'i'])

In [28]:
# Build the symbol inventories: every distinct character that occurs in the
# spellings and in the IPA transcriptions. Sorting fixes each symbol's position,
# which is what the one-hot encoding uses as its row index.
uniques_words = sorted(set(chain.from_iterable(wordlist)))
uniques_ipa = sorted(set(chain.from_iterable(IPAlist)))
print(uniques_words)
print(uniques_ipa)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
['3', '@', 'A', 'D', 'E', 'I', 'N', 'O', 'S', 'T', 'U', 'V', 'Z', 'a', 'b', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'z', 'ɑ', 'ɞ']


In [38]:
# Spot-check: the first cleaned entry as a list of characters.
wordlist[0]

['a']

In [29]:
def to_feature_matrix(data, uniques, inherit_val = False, maxlist = False):
  """One-hot encode symbol sequences as a stack of (symbol x position) matrices.

  Args:
    data: iterable of sequences (lists of characters or plain strings).
    uniques: list of known symbols; a symbol's position in this list is the
      row that gets set to 1.
    inherit_val: when True, the matrix width is taken from the longest
      sequence in `maxlist` instead of from `data`.
    maxlist: sequences that define the width when `inherit_val` is True.

  Returns:
    Float array of shape (len(data), len(uniques), width). Entry [n, s, p] is 1
    when sequence n has symbol `uniques[s]` at position p; positions past the
    end of a sequence stay all-zero. Empty `data` yields an array with zero
    entries along the first axis.

  Raises:
    ValueError: a sequence contains a symbol that is not in `uniques`.
    IndexError: a sequence is longer than the width inherited from `maxlist`.
  """
  # Symbol -> row lookup built once. The first occurrence wins, which matches
  # what list.index would return if `uniques` contained duplicates.
  lookup = {}
  for row, symbol in enumerate(uniques):
    lookup.setdefault(symbol, row)

  encoded = []
  for sequence in data:
    try:
      encoded.append([lookup[symbol] for symbol in sequence])
    except KeyError as err:
      raise ValueError(f"symbol {err.args[0]!r} is not in uniques") from None

  width_source = maxlist if inherit_val else encoded
  maxval = max((len(sequence) for sequence in width_source), default=0)

  out = np.zeros((len(encoded), len(uniques), maxval))
  for n, rows in enumerate(encoded):
    if len(rows) > maxval:
      raise IndexError(
        f"sequence {n} has {len(rows)} symbols but the matrix width is {maxval}"
      )
    # Explicit integer dtype keeps the fancy index valid for empty sequences.
    out[n, np.asarray(rows, dtype=np.intp), np.arange(len(rows))] = 1
  return out

In [32]:
# One-hot encode both sides: `data` holds the IPA transcriptions (model input),
# `labels` the spellings (model target). Each side uses its own inventory and
# its own maximum sequence length.
data = to_feature_matrix(IPAlist, uniques_ipa)
labels = to_feature_matrix(wordlist, uniques_words)

In [33]:
# Inventory sizes = number of one-hot rows on the label side and the input side.
len(uniques_words),len(uniques_ipa)

(26, 37)

In [46]:
# Append one extra "padding" class row to every label matrix. A position whose
# letter column is all zeros (i.e. past the end of the word) gets a 1 in the new
# row, so every column of the result sums to exactly 1.
# The width comes from `labels` itself instead of a hard-coded 10, so the cell
# keeps working if the maximum word length changes.
pad_row = (labels.sum(axis=1, keepdims=True) == 0).astype(labels.dtype)
newlabels = np.concatenate((labels, pad_row), axis=1)

In [48]:
# Hold out 20% of the samples for validation. train_test_split returns
# (X_train, X_test, y_train, y_test) in that order, so the names below match
# what each array actually contains. The fixed random_state makes the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    data, newlabels, test_size=0.2, random_state=42
)

# (classes, positions) of one label matrix; used to size the network output.
input_size = y_test.shape[1:3]

# Inputs get an explicit channel axis: (batch, 1, symbols, positions).
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).unsqueeze(1)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).unsqueeze(1)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

batch_size = 64
# Reshuffle the training samples every epoch; evaluation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [49]:
# Shape of a single label matrix: (letter classes + padding class, positions).
input_size

(27, 10)

In [50]:
class Model(nn.Module):
    """Base class bundling a training loop, loss evaluation and batch prediction.

    This class defines neither ``forward`` nor ``loss_fn``; both are called via
    ``self`` and must be provided by the subclass.
    """

    def __init__(self):
        super().__init__()

    def fit(self, train_dl, val_dl, epochs):
        """Train with Adam (lr=0.001) and record per-epoch losses.

        Args:
            train_dl: iterable of (X, y) batches used for optimisation.
            val_dl: iterable of (X, y) batches used only for reporting.
            epochs: number of passes over `train_dl`.

        After every epoch the mean loss over `train_dl` and `val_dl` (as
        computed by `evaluate`) is appended to `self.trainlosses` and
        `self.vallosses` and printed.
        """
        optimizer = torch.optim.Adam(self.parameters(), lr=0.001)
        self.trainlosses = []
        self.vallosses = []
        for epoch in range(epochs):
            self.train()
            for X, y in train_dl:
                optimizer.zero_grad()
                pred = self.forward(X)
                # Sum the loss over every slice of the last axis, each slice
                # counted exactly once. NOTE: `evaluate` applies loss_fn to the
                # full tensors instead, so the reported losses may be on a
                # different scale than this training objective.
                loss = sum(
                    self.loss_fn(pred[:, :, i], y[:, :, i])
                    for i in range(pred.shape[2])
                )
                loss.backward()
                optimizer.step()
            train_loss = self.evaluate(train_dl).item()
            val_loss = self.evaluate(val_dl).item()
            self.trainlosses.append(train_loss)
            self.vallosses.append(val_loss)
            print(f"Epoch [{epoch + 1}/{epochs}] - TrainLoss: {train_loss:.4f}, ValLoss: {val_loss:.4f}")

    def evaluate(self, val_dl):
        """Return the mean of the per-batch losses over `val_dl` as a 0-dim tensor.

        Runs without gradient tracking and in eval mode; the previous
        train/eval mode is restored afterwards. An empty loader yields NaN.
        """
        losses = []
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                for X, y in val_dl:
                    pred = self.forward(X)
                    # .item() keeps this independent of the tensor's device.
                    losses.append(self.loss_fn(pred, y).item())
        finally:
            self.train(was_training)
        return torch.tensor(losses, dtype=torch.float32).mean()

    def predict(self, test_dl):
        """Run the model over `test_dl` and collect outputs sample by sample.

        Returns:
            (preds, ys): two lists with one tensor per sample (batches are
            split along their first axis), in loader order.
        """
        ys = []
        preds = []
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                for X, y in test_dl:
                    pred = self.forward(X)
                    # Iterating a tensor yields its slices along dim 0.
                    ys.extend(y.detach())
                    preds.extend(pred.detach())
        finally:
            self.train(was_training)

        return preds, ys


In [None]:
# Number of features that reach the first fully connected layer. A dummy batch
# shaped like one encoded input is pushed through two 3x3 convolutions with the
# same channel counts as Network_CNN.cv1 / cv2, then flattened.
# NOTE(review): this should equal the in_features of Network_CNN.fc1 - confirm.
with torch.no_grad():
    probe = torch.zeros(1, 1, *data.shape[1:])
    probe = nn.Conv2d(in_channels=1, out_channels=64, kernel_size=3)(probe)
    probe = nn.Conv2d(in_channels=64, out_channels=32, kernel_size=3)(probe)
fc1_input_size = probe.reshape(probe.size(0), -1).shape[1]

In [55]:
class Network_CNN(Model):
    """Two-layer CNN mapping a one-hot input matrix to per-position class scores.

    Args:
        input_size: (classes, positions) of the output; the network emits
            classes * positions values per sample and reshapes them to
            (batch, classes, positions).
        fc1_in_features: number of flattened features after the second
            convolution. The default 19008 is the value this notebook was
            written with; pass a different value when the encoded input shape
            changes.
    """

    def __init__(self, input_size, fc1_in_features=19008):
        super().__init__()
        # Keep the output shape on the instance so forward() does not depend on
        # a global variable that happens to share the name.
        self.input_size = tuple(input_size)

        self.cv1 = nn.Conv2d(in_channels=1, out_channels=64, kernel_size=3)
        self.cv2 = nn.Conv2d(in_channels=64, out_channels=32, kernel_size=3)
        self.fc1 = nn.Linear(fc1_in_features, 512)
        self.fc2 = nn.Linear(512, self.input_size[0] * self.input_size[1])

        self.relu = nn.ReLU()
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, x):
        """Return raw scores of shape (batch, classes, positions) for input x."""
        x = self.relu(self.cv1(x))
        x = self.relu(self.cv2(x))
        x = x.reshape(x.size(0), -1)  # flatten everything except the batch axis
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        x = x.reshape(x.shape[0], self.input_size[0], self.input_size[1])

        return x

In [56]:
# Build the CNN for the label shape computed above and train it for 6 epochs,
# reporting the loss on the held-out loader after each one.
# NOTE(review): in the recorded run the validation loss is lowest at epoch 4 and
# rises afterwards while the training loss keeps falling.
network_cnn = Network_CNN(input_size)
network_cnn.fit(train_dl=train_loader, val_dl=test_loader, epochs=6)

Epoch [1/6] - TrainLoss: 0.8182, ValLoss: 0.9199
Epoch [2/6] - TrainLoss: 0.6780, ValLoss: 0.8350
Epoch [3/6] - TrainLoss: 0.5935, ValLoss: 0.8037
Epoch [4/6] - TrainLoss: 0.5205, ValLoss: 0.7944
Epoch [5/6] - TrainLoss: 0.4695, ValLoss: 0.8118
Epoch [6/6] - TrainLoss: 0.4391, ValLoss: 0.8556


In [70]:
def wordpred(ipa, model = network_cnn):
  """Predict a spelling for each IPA string, print it and return the letters.

  Args:
    ipa: list of IPA transcriptions (strings, or sequences of IPA symbols).
    model: trained network exposing `predict`. The default is bound to the
      `network_cnn` object that exists when this function is defined.

  Returns:
    One list of letters per input. Each prediction is cut at the first position
    where the padding class wins; if the padding class never wins, all
    positions are kept.
  """
  # Encode with the same width as the training inputs.
  m = to_feature_matrix(ipa, uniques_ipa, True, IPAlist)
  features = torch.tensor(m, dtype=torch.float32).unsqueeze(1)
  # TensorDataset needs a target tensor; the features double as a placeholder.
  dl = DataLoader(TensorDataset(features, features), batch_size=1)
  p, _ = model.predict(dl)

  # The padding class is the extra row appended after all letter classes.
  pad_index = len(uniques_words)
  res = []
  for scores in p:
    indices = torch.argmax(scores, dim=0).tolist()
    if pad_index in indices:
      indices = indices[:indices.index(pad_index)]
    letters = [uniques_words[k] for k in indices]

    res.append(letters)
    print("".join(letters))
  return res

In [71]:
# Try the trained model on a handful of hand-written IPA transcriptions.
wordpred(["@beIs","ɑbsInT","krEdIt","E@r@nOt","E@h@Ul"])

obase
absinue
creddt
airenott
airhll


[['o', 'b', 'a', 's', 'e'],
 ['a', 'b', 's', 'i', 'n', 'u', 'e'],
 ['c', 'r', 'e', 'd', 'd', 't'],
 ['a', 'i', 'r', 'e', 'n', 'o', 't', 't'],
 ['a', 'i', 'r', 'h', 'l', 'l']]