# morphemic analysis

In [10]:
!pip install TorchCRF



In [1]:
import pandas as pd
import torch

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [3]:
dataframe = pd.read_csv("/kaggle/input/morpheme/morpheme_train.csv", names=["word", "morph"])
dataframe.head()

Unnamed: 0,word,morph
0,обескровленный,о:PREF/бес:PREF/кровл:ROOT/енн:SUFF/ый:END
1,подбелка,под:PREF/бел:ROOT/к:SUFF/а:END
2,якутянин,якут:ROOT/ян:SUFF/ин:SUFF
3,скомкиваться,с:PREF/ком:ROOT/к:PREF/ива:SUFF/ть:END/ся:POST
4,приоткрыться,при:PREF/от:PREF/кры:ROOT/ть:END/ся:POST


In [4]:
targets = ["ROOT", "PREF", "SUFF", "END", "POST", "LINK", "HYPH", "O"]
uniq_chars = set()

for word in dataframe['word']:
    uniq = set([ch for ch in word])
    uniq_chars = uniq_chars | uniq

vocab_size = len(uniq_chars) + 1
vocab_size

34

In [5]:
target_tokenizer = {"O": 0} # padding

for i, targ in enumerate(targets[:-1]):
    target_tokenizer["B-" + targ] = i * 2 + 1
    target_tokenizer["I-" + targ] = i * 2 + 2

In [6]:
tokenizer = {token : i + 1 for i, token in enumerate(uniq_chars)} # 0 - padding

In [7]:
def split_morpheme(word):
    parts = word.split('/')
    splited = []
    tags = []
    for part in parts:
        begin = 1
        symbols, mask = part.split(':')
        for char in symbols:
            if begin:
                tags.append(target_tokenizer["B-" + mask])                
                begin = 0
            else:
                tags.append(target_tokenizer["I-" + mask])
            if char == 'e':
                char = 'е'
            splited.append(tokenizer[char])
    return splited, tags

In [8]:
split_morpheme(dataframe["morph"][0])

([4, 23, 10, 14, 19, 29, 4, 3, 27, 10, 33, 33, 16, 12],
 [3, 3, 4, 4, 1, 2, 2, 2, 2, 5, 6, 6, 7, 8])

In [9]:
import torch
from torch import nn
from TorchCRF import CRF

class MorphModel(nn.Module):
    def __init__(self, vocab_size, target_size, layers=1, emb_size=50, hid_size=100):
        super(MorphModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.biLSTM = nn.LSTM(emb_size, hid_size, num_layers=layers, bidirectional=True, batch_first=True)
        self.hid2target = nn.Linear(hid_size * 2, target_size)
        self.crf = CRF(target_size)

    def forward(self, x, tags=None, mask=None):
        emb = self.embedding(x)
        lstm, _ = self.biLSTM(emb)
        emiss = self.hid2target(lstm)

        if tags != None:
            loss = -self.crf(emiss, tags, mask=mask)
            return loss
        else:
            morph = self.crf.viterbi_decode(emiss, mask=mask)
            return morph

model = MorphModel(vocab_size, len(target_tokenizer), layers=8).to(device)

In [207]:
from torchinfo import summary

summary(model, (1, 5), dtypes=[torch.long])

Layer (type:depth-idx)                   Output Shape              Param #
MorphModel                               --                        255
├─Embedding: 1-1                         [1, 5, 50]                1,700
├─LSTM: 1-2                              [1, 5, 200]               121,600
├─Linear: 1-3                            [1, 5, 15]                3,015
Total params: 126,570
Trainable params: 126,570
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 0.61
Input size (MB): 0.00
Forward/backward pass size (MB): 0.01
Params size (MB): 0.51
Estimated Total Size (MB): 0.52

In [10]:
from sklearn.model_selection import train_test_split

train_dataframe, valid_dataframe = train_test_split(dataframe, test_size=0.2, shuffle=True)
valid_dataframe = valid_dataframe.reset_index(drop=True)

In [11]:
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence


class TrainDataset(Dataset):
    def __init__(self, dataframe):
        self.df = dataframe.reset_index(drop=True)
        self.max_length = len(max(dataframe["word"], key=lambda x: len(x)))
        
    def __getitem__(self, idx):
        letters, tags = split_morpheme(self.df["morph"][idx])
        mask = [1 for i in range(len(letters))]

        letters = torch.tensor(letters, dtype=torch.long)
        tags = torch.tensor(tags, dtype=torch.long)
        mask = torch.tensor(mask, dtype=torch.bool)
        
        return letters, tags, mask

    def __len__(self):
        return len(self.df)

def collate_fn(batch):
    tokens, labels, masks = zip(*batch)
    
    tokens = pad_sequence(tokens, batch_first=True, padding_value=0)
    labels = pad_sequence(labels, batch_first=True, padding_value=target_tokenizer["O"])
    masks = pad_sequence(masks, batch_first=True, padding_value=0)
    return tokens, labels, masks


train_dataset = TrainDataset(train_dataframe)

dataloader = DataLoader(train_dataset, 
                        batch_size=16, 
                        collate_fn=collate_fn, 
                        shuffle=True, 
                        num_workers=4)

In [None]:
from torch import optim

optimizator = optim.Adam(model.parameters(), lr=0.001)
epoch = 10

for ep in range(epoch):
    sum_loss = 0
    model.train()
    for iteration, batch in enumerate(dataloader):
        letters, tags, mask = batch
        letters = letters.to(device)
        tags = tags.to(device)
        mask = mask.to(device)
        loss = model(letters, tags, mask=mask).sum()
        sum_loss += loss.item()/16
        
        if iteration % 100 == 0:
            print(f"iter {iteration}/{len(dataloader)}: {loss.item()/16}")
        
        loss.backward()
        optimizator.step()
        optimizator.zero_grad()

    model.eval()
    valid_sum = 0
    for i in range(len(valid_dataframe)):
        letters, tags = split_morpheme(valid_dataframe["morph"][i])
        mask = torch.tensor([1 for i in range(len(letters))], dtype=torch.bool).to(device).unsqueeze(0)
        letters = torch.tensor(letters, dtype=torch.long).to(device).unsqueeze(0)
        tags = torch.tensor(tags, dtype=torch.long).to(device).unsqueeze(0)

        valid_sum += model(letters, tags, mask).item()
    print(f"Epoch {ep}, Train: {sum_loss/len(dataloader)}, Validation: {valid_sum/len(valid_dataframe)}")

iter 0/2320: 23.922388076782227
iter 100/2320: 9.723794937133789
iter 200/2320: 6.5184783935546875
iter 300/2320: 6.271370887756348
iter 400/2320: 4.901465892791748
iter 500/2320: 5.170162677764893
iter 600/2320: 5.366547107696533
iter 700/2320: 3.5657942295074463
iter 800/2320: 5.006108283996582
iter 900/2320: 3.713489532470703
iter 1000/2320: 2.4658055305480957
iter 1100/2320: 3.010169744491577
iter 1200/2320: 2.7519514560699463
iter 1300/2320: 2.379098892211914
iter 1400/2320: 2.1198277473449707
iter 1500/2320: 2.5058813095092773
iter 1600/2320: 3.6669559478759766
iter 1700/2320: 2.3804259300231934
iter 1800/2320: 1.5029759407043457
iter 1900/2320: 2.728142261505127
iter 2000/2320: 2.3422579765319824
iter 2100/2320: 3.0906500816345215
iter 2200/2320: 1.9958736896514893
iter 2300/2320: 1.3129208087921143
Epoch 0, Train: 3.812208751203685, Validation: 1.9997827324869304
iter 0/2320: 2.2191359996795654
iter 100/2320: 1.444195032119751
iter 200/2320: 1.0104520320892334
iter 300/2320: 2.

In [24]:
def concat(word, tags):
    concated = ''
    part = word[0]
    last = tags[0]
    for letter, tag in zip(word[1:], tags[1:]):
        if tag[:2] == "B-":
            concated += part + ":" + last[2:] + '/'
            part = ''
        part += letter
        last = tag
    concated += part + ":" + last[2:]
        
    return concated

In [25]:
dataframe.head()

Unnamed: 0,word,morph
0,обескровленный,о:PREF/бес:PREF/кровл:ROOT/енн:SUFF/ый:END
1,подбелка,под:PREF/бел:ROOT/к:SUFF/а:END
2,якутянин,якут:ROOT/ян:SUFF/ин:SUFF
3,скомкиваться,с:PREF/ком:ROOT/к:PREF/ива:SUFF/ть:END/ся:POST
4,приоткрыться,при:PREF/от:PREF/кры:ROOT/ть:END/ся:POST


In [26]:
test_df = pd.read_csv("/kaggle/input/morpheme/morpheme_test.csv", names=["word"])

In [27]:
reverse_target = {i : token for token, i in target_tokenizer.items()}
reverse_target

{0: 'O',
 1: 'B-ROOT',
 2: 'I-ROOT',
 3: 'B-PREF',
 4: 'I-PREF',
 5: 'B-SUFF',
 6: 'I-SUFF',
 7: 'B-END',
 8: 'I-END',
 9: 'B-POST',
 10: 'I-POST',
 11: 'B-LINK',
 12: 'I-LINK',
 13: 'B-HYPH',
 14: 'I-HYPH'}

In [28]:
words = []
for word in test_df["word"]:
    letters = [tokenizer[ch] for ch in word]
    mask = torch.tensor([1 for i in range(len(letters))], dtype=torch.bool).to(device).unsqueeze(0)
    letters = torch.tensor(letters, dtype=torch.long).to(device).unsqueeze(0)

    ans = model(letters, mask=mask)
    lst = []
    for i in ans[0]:
        lst.append(reverse_target[i])
        
    words.append(concat(word, lst))

test_df["morph"] = words
test_df.head()

Unnamed: 0,word,morph
0,елизаветинский,елизавет:ROOT/ин:SUFF/ск:SUFF/ий:END
1,кинофикация,кинофик:ROOT/аци:SUFF/я:END
2,хиджра,хиджр:ROOT/а:END
3,магистерство,магистер:ROOT/ств:SUFF/о:END
4,педантический,педантич:ROOT/еск:SUFF/ий:END


In [None]:
model.eval()
valid_sum = 0

for i in range(len(valid_dataframe)):
    letters, tags = split_morpheme(valid_dataframe["morph"][i])
    mask = torch.tensor([1 for i in range(len(letters))], dtype=torch.bool).to(device).unsqueeze(0)
    letters = torch.tensor(letters, dtype=torch.long).to(device).unsqueeze(0)
    tags = torch.tensor(tags, dtype=torch.long).to(device).unsqueeze(0)

    model(letters, tags, mask).item()

In [38]:
test_df.to_csv("/kaggle/working/submission.csv", index=False)

In [43]:
word = ""
letters = [tokenizer[ch] for ch in word]
mask = torch.tensor([1 for i in range(len(letters))], dtype=torch.bool).to(device).unsqueeze(0)
letters = torch.tensor(letters, dtype=torch.long).to(device).unsqueeze(0)

ans = model(letters, mask=mask)
lst = []
for i in ans[0]:
    lst.append(reverse_target[i])
    
concat(word, lst)

'литв:ROOT/ин:SUFF'