In [104]:
!pip install transformers datasets conllu

In [9]:
#imports
import pandas as pd
import torch
from collections import Counter
from transformers import AutoModel, AutoTokenizer
import unicodedata
from datasets import load_dataset
import itertools

import os
import re
import numpy as np 
from sklearn.metrics import accuracy_score

import transformers

import torch
from torch import cuda
from tqdm import tqdm_notebook as tqdm
device = 'cuda' if cuda.is_available() else 'cpu'

device

'cpu'

# The Data

Converting test, train and val datasets which include kanji to be purely kana (furigana)

In [None]:
data_files = {"train": "train_split.csv", "test": "test_split.csv", 'val': 'val_split.csv'}
# df = pd.DataFrame([['ホッケーにはデンジャラスプレーの反則があるので、膝より上にボールを浮かすことは基本的に反則に','ホッケーにはデンジャラスプレーのはんそくがあるので、しつよりじょうにボールをふかすことはきほ'],
#              ['また行きたい、そんな気持ちにさせてくれるお店です。','またこうきたい、そんなきじちにさせてくれるおてんです'],
#              ['手に持った特殊な刃物を使ったアクロバティックな体術や、揚羽と薄羽同様にクナイや忍具を使って攻','しゅにじったとくしゅなじんぶつをしったアクロバティックなたいじゅつや、よううとはくうどうよう'],
#              ['3年次にはトータルオフェンスで2,892ヤードを獲得し、これは大学記録となった。','3ねんじにはトータルオフェンスで2,892ヤードをかくとくし、これはだいがくきろくとなった。']],
#              columns = ['text','kana'])
dataset = load_dataset('universal_dependencies', 'ja_gsd')
train_split = dataset['train']
test_split = dataset['test']
val_split = dataset['validation']
train_split = df
test_split = df
val_split = df

#train_split_df = pd.DataFrame(data=train_split, columns=train_split.features)
train_split_df = train_split_df.drop(['lemmas','upos','xpos','feats','head','deprel','deps','misc'], axis=1)
train_split_df

test_split_df = pd.DataFrame(data=test_split, columns=test_split.features)
test_split_df = test_split_df.drop(['lemmas','upos','xpos','feats','head','deprel','deps','misc'], axis=1)
test_split_df

val_split_df = pd.DataFrame(data=val_split, columns=val_split.features)
val_split_df = val_split_df.drop(['lemmas','upos','xpos','feats','head','deprel','deps','misc'], axis=1)
val_split_df

In [None]:
print(len(train_split))
print(len(test_split))
print(len(val_split))
#train_split['text']

In [None]:
#load kanji data
kanji_df = pd.read_json("kanji.json")
kanji_df = kanji_df.drop(['strokes', 'grade', 'freq', 'jlpt_old', 'jlpt_new', 'meanings', 'wk_radicals', 'wk_readings_kun', 'wk_readings_on', 'wk_meanings', 'wk_level'], axis=0)

In [None]:
# As part of data collection, we need a dataset that is ONLY kana. 
# We could not find a dataset that had both kanji AND kana only version,
# so we took a dataset and converted the kanji to kana. 
# kanji to kana is less ambiguous with how on vs kun readings work, but there is still some abiguity.
# we recognize that here, in the dataset.
def unkanjify(df):
    new_sent = []
    for word in df['tokens']:
        #iterating list of chararacters
        for char in word:
            if char in kanji_df.columns:#is kanji
                if char in kanji_df.columns or char in kanji_df.columns:
                    if kanji_df[char].loc['readings_on']:
                        new_sent.append(kanji_df[char].loc['readings_on'][0])
                    else:
                        new_sent.append(char)
                #kun reading
                else:
                    for c in kanji_df[char].loc['readings_kun'][0]:
                        if c == '.':
                            continue
                        new_sent.append(c)
            else:#is not kanji
                new_sent.append(char)
    joined_sent = ''.join(new_sent)
    return joined_sent

# Applying the function to all splits (train, test and val)
train_split_df['kana'] = train_split_df.apply(lambda x: unkanjify(x), axis=1) 
test_split_df['kana'] = test_split_df.apply(lambda x: unkanjify(x), axis=1) 
val_split_df['kana'] = val_split_df.apply(lambda x: unkanjify(x), axis=1) 
train_split_df

# download and intialize models

In [None]:
from transformers import BertTokenizer, TFBertForMaskedLM

tokenizer = BertTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")
model = TFBertForMaskedLM.from_pretrained("cl-tohoku/bert-base-japanese")

# Concatinate the two stirngs

``` kana = "one two thirty-four"
kanji = "1 2 34"

"one two thirty-four" + " [SEP] " + "1 2 34"
kana + " [SEP] " + kanji

"one two thirty-four [SEP] 1 2 34"
3 = [SEP]
```

In [30]:
train_split_df['comibned']= train_split_df.text + "[SEP]" + train_split_df.kana
train_split_df.comibned

0    ホッケーにはデンジャラスプレーの反則があるので、膝より上にボールを浮かすことは基本的に反則に...
1    また行きたい、そんな気持ちにさせてくれるお店です。[SEP]またこうきたい、そんなきじちにさ...
2    手に持った特殊な刃物を使ったアクロバティックな体術や、揚羽と薄羽同様にクナイや忍具を使って攻...
3    3年次にはトータルオフェンスで2,892ヤードを獲得し、これは大学記録となった。[SEP]3...
Name: comibned, dtype: object

# Tokenize

convert everything into appropriate format for tenosrflow

In [89]:
inputs = tokenizer(train_split_df.comibned.values.tolist(),max_length=512,truncation=True,padding='max_length',return_tensors='np')
inputs

{'input_ids': array([[    2, 24543, 24893, ...,     0,     0,     0],
       [    2,   106,    77, ...,     0,     0,     0],
       [    2,   319,     7, ...,     0,     0,     0],
       [    2,    48,    19, ...,     0,     0,     0]]), 'token_type_ids': array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]]), 'attention_mask': array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])}

# save the kanji text before we mask, on a seprate varibal

In [99]:
inputs['labels'] = inputs['input_ids']

# MASK kanji text

the apple [sep] fell down on the chari
1  2   64   3    7    234  34  75  12


the [sep][MASK] 
1   3   37   0   0 0   0   0  0 0

In [93]:
### inp_ids = []
inp_ids = []
idx = 0
for inp in inputs.input_ids:
    
    inp = np.array(inp)
    
    locatio_of_sep = np.where(inp == 3)[0][0]
    
    
    actual_tokens = list(set(range(locatio_of_sep,inp.shape[0])) - 
                         set(np.where((inp == tokenizer.sep_token_id) 
                            | (inp == 0))[0].tolist()))
    #We need to select 15% random tokens from the given list
    num_of_token_to_mask = int(len(actual_tokens)*1.0)
    token_to_mask = np.random.choice(np.array(actual_tokens), 
                                     size=num_of_token_to_mask, 
                                     replace=False).tolist()
    #Now we have the indices where we need to mask the tokens
    inp[token_to_mask] = tokenizer.mask_token_id
    inp_ids.append(inp)
    idx += 1
#inp_ids = tf.convert_to_tensor(inp_ids)
inputs['input_ids'] = np.array(inp_ids)


In [94]:
import tensorflow as tf

In [100]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0002),loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))
history = model.fit([inputs.input_ids,inputs.attention_mask],inputs.labels,verbose=1,batch_size=32,epochs=6)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [37]:
model.save_pretrained("kana_to_kanji")
tokenizer.save_pretrained("kana_to_kanji")

('kana_to_kanji/tokenizer_config.json',
 'kana_to_kanji/special_tokens_map.json',
 'kana_to_kanji/vocab.txt',
 'kana_to_kanji/added_tokens.json')

# run it, convert 

In [38]:
from transformers import BertTokenizer, TFBertForMaskedLM

tokenizer = BertTokenizer.from_pretrained("kana_to_kanji")
model = TFBertForMaskedLM.from_pretrained("kana_to_kanji")

All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at kana_to_kanji.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


sentance = "自らがオウム真理教ではない別のカルト団体に12年間所属していた経験をもとに [SEP] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK]"
inp = tokenizer(sentance,return_tensors='np')
mask_loc = np.where(inp.input_ids[0] == 103)[0].tolist()
out = model(inp).logits[0]
predicted_tokens = np.argmax(out[mask_loc],axis=1).tolist()
tokenizer.decode(predicted_tokens)

In [39]:
test_split_df['masked_text'] = test_split_df.text + '[SEP]' +   "[MASK]"*150
test_split_df['unmasked_text']= test_split_df.text + "[SEP]" + test_split_df.kana

In [None]:
score = 0
base_score = 0

converted_sentance_list = []


def jaccard_similarity(sentence1, sentence2):
    set1 = set(sentence1)
    set2 = set(sentence2)
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    
    return len(intersection) / len(union)

for index,row in test_split_df.head(5).iterrows():
    #sentance = "自らがオウム真理教ではない別のカルト団体に12年間所属していた経験をもとに [SEP] [MASK] [MASK] [MASK] [MASK] [MASK] [MASK]"
    
    inp = tokenizer(row['masked_text'],return_tensors='np')
    inp2 = tokenizer(row['unmasked_text'],return_tensors='np')
    sep_loc = np.where(inp2.input_ids[0] == tokenizer.sep_token_id)[0][0] +1
    truth_sentance = (inp2.input_ids[0][sep_loc:-1])
    truth_sentance = tokenizer.decode(truth_sentance)
    original_sentance = (inp2.input_ids[0][1:sep_loc -1])
    original_sentance = tokenizer.decode(original_sentance)
    print(inp)

    mask_loc = np.where(inp.input_ids[0] == tokenizer.mask_token_id)[0].tolist()


    out = model(inp).logits[0].numpy()
    predicted_tokens = np.argmax(out[mask_loc],axis=1).tolist()
    predicted_sentance = tokenizer.decode(predicted_tokens)

    print(original_sentance)
    print(truth_sentance)
    print(predicted_sentance)
    base_score+=jaccard_similarity(truth_sentance,original_sentance)
    score+=jaccard_similarity(truth_sentance,predicted_sentance)
    
    
print(base_score/test_split_df.shape[0])
print(score/test_split_df.shape[0])
converted_sentance_list