In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file below it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import torch
# Prefer CUDA when it is available; `device_name` is the matching accelerator
# string ("gpu" / "cpu") handed to the Lightning Trainer further down.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device_name = "gpu" if device.type == "cuda" else "cpu"
device

In [3]:
# Load the raw texts and their labels from the attached Kaggle dataset.
# NOTE(review): read_pickle unpickles arbitrary objects; this is only acceptable
# for trusted files — never point these paths at untrusted data.
data = pd.read_pickle("/kaggle/input/japanesetext/data.pkl")
label = pd.read_pickle("/kaggle/input/japanesetext/label.pkl")

In [4]:
# Pair each text with its label, one row per (text, label) tuple.
df = pd.DataFrame(list(zip(data, label)), columns=["text", "label"])

In [5]:
# Install the MeCab bindings and the unidic-lite dictionary needed for Japanese tokenization.
# NOTE(review): versions are unpinned; `fugashi[unidic-lite]` presumably already pulls in
# unidic-lite, which would make the last line redundant — confirm.
!pip install fugashi[unidic-lite]
!pip install mecab-python3
!pip install unidic-lite

In [8]:
#https://www.kaggle.com/code/kaerunantoka/my-preprocessing-for-japanese-text-data
import MeCab

class MecabTokenizer:
    """Thin wrapper around a MeCab tagger created with the ``-Owakati`` option."""

    def __init__(self):
        tagger = MeCab.Tagger('-Owakati')
        # Parse an empty string once right after construction, before any real use.
        tagger.parse('')
        self.wakati = tagger

    def tokenize(self, line):
        """Return the whitespace-separated tokens MeCab produces for ``line``."""
        return self.wakati.parse(line).split()

    def mecab_tokenizer(self, line):
        """Return the surface forms of all nodes whose first feature field is 名詞 (noun)."""
        nouns = []
        current = self.wakati.parseToNode(line)
        while current:
            part_of_speech = current.feature.split(",")[0]
            if part_of_speech == "名詞":
                nouns.append(current.surface)
            current = current.next
        return nouns

In [9]:
# Instantiate the MeCab wrapper and try noun extraction on a mixed English/Japanese sample.
# NOTE(review): the name `tok` is reused for the Keras Tokenizer in a later cell.
tok = MecabTokenizer()
tok.mecab_tokenizer("kaggle days 楽しいイベントでしたね")

In [10]:
# Same sample sentence, this time split into all tokens rather than nouns only.
tok.tokenize("kaggle days 楽しいイベントでしたね")

In [11]:
# Show the first document three ways: raw text, extracted nouns, and full tokenization.
print(df["text"][0])
print("-----------------")
print(tok.mecab_tokenizer(df["text"][0]))
print("-----------------")
print(tok.tokenize(df["text"][0]))

In [12]:
from tqdm.notebook import tqdm
tqdm.pandas()
import re

# Characters that clean_puncts surrounds with spaces so they become separate tokens.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '\n', '\xa0', '\t',
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '\u3000', '\u202f',
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '«',
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]


# Literal HTML tag strings that clean_html_tags deletes from the text.
# Matching is exact substring matching, so tags carrying attributes are not covered.
html_tags = ['<p>', '</p>', '<table>', '</table>', '<tr>', '</tr>', '<ul>', '<ol>', '<dl>', '</ul>', '</ol>',
             '</dl>', '<li>', '<dd>', '<dt>', '</li>', '</dd>', '</dt>', '<h1>', '</h1>',
             '<br>', '<br/>', '<strong>', '</strong>', '<span>', '</span>', '<blockquote>', '</blockquote>',
             '<pre>', '</pre>', '<div>', '</div>', '<h2>', '</h2>', '<h3>', '</h3>', '<h4>', '</h4>', '<h5>', '</h5>',
             '<h6>', '</h6>', '<blck>', '<pr>', '<code>', '<th>', '</th>', '<td>', '</td>', '<em>', '</em>']

# HTML character entities that clean_html_tags replaces with a single space.
# Every element needs its own trailing comma: adjacent string literals are
# concatenated by Python, which would silently merge two entities into one.
empty_expressions = ['&lt;', '&gt;', '&amp;', '&nbsp;',
                     '&emsp;', '&ndash;', '&mdash;', '&ensp;',
                     '&quot;', '&#39;']

# Extra substrings that preprocess passes to clean_html_tags as `stop_words` (removed wherever they occur).
other = ['span', 'style', 'href', 'input']


def pre_preprocess(x):
    """Coerce ``x`` to ``str`` and return it lowercased."""
    as_text = str(x)
    return as_text.lower()

def rm_spaces(text):
    """Replace each of the listed invisible / control / exotic-space characters with a plain space."""
    odd_chars = (
        '\u200b', '\u200e', '\u202a', '\u2009', '\u2028', '\u202c', '\ufeff', '\uf0d8', '\u2061', '\u3000', '\x10', '\x7f', '\x9d', '\xad',
        '\x97', '\x9c', '\x8b', '\x81', '\x80', '\x8c', '\x85', '\x92', '\x88', '\x8d', '\x80', '\x8e', '\x9a', '\x94', '\xa0',
        '\x8f', '\x82', '\x8a', '\x93', '\x90', '\x83', '\x96', '\x9b', '\x9e', '\x99', '\x87', '\x84', '\x9f',
    )
    # Every entry is a single character mapped to ' ', so one translate pass
    # gives the same result as replacing them one after another.
    table = str.maketrans({ch: ' ' for ch in odd_chars})
    return text.translate(table)

def remove_urls(x):
    """Remove http(s) URLs and ``quote=...`` fragments from ``x``.

    Args:
        x: input string.

    Returns:
        The string with every matched URL and quote fragment deleted
        (nothing is inserted in their place).
    """
    # Match the whole URL — host, path, query and fragment — not just the host,
    # so no "/path?query" remnants are left behind. The class is limited to the
    # ASCII characters allowed in URLs, so the match stops at whitespace and at
    # the first Japanese character that directly follows a URL.
    x = re.sub(r"https?://[A-Za-z0-9\-._~:/?#\[\]@!$&'()*+,;=%]*", '', x)

    # Strip "quote=<word>[ ]<word>[;]<word>" fragments.
    x = re.sub(r'(quote=\w+\s?\w+;?\w+)', r'', x)
    return x

def clean_html_tags(x, stop_words=()):
    """Delete known HTML tags and stop words from ``x`` and blank out HTML entities.

    Args:
        x: input string.
        stop_words: iterable of substrings to delete wherever they occur.
            Defaults to an immutable empty tuple so the default object can
            never be shared and mutated between calls.

    Returns:
        The cleaned string: entries of ``html_tags`` and ``stop_words`` are
        removed, entries of ``empty_expressions`` are replaced by one space.
    """
    for tag in html_tags:
        x = x.replace(tag, '')
    for entity in empty_expressions:
        x = x.replace(entity, ' ')
    for word in stop_words:
        x = x.replace(word, '')
    return x

def replace_num(text):
    """Delete every run of two or more consecutive ASCII digits; lone digits are kept."""
    # One pass over maximal runs of length >= 2 removes exactly what the
    # separate 5+, 4, 3 and 2 digit passes remove together.
    return re.sub('[0-9]{2,}', '', text)

def get_url_num(x):
    """Return the number of http(s) URLs found in ``x``."""
    # Raw string: the regex escapes (\w, \$, \?, ...) are not valid Python string
    # escapes and trigger invalid-escape warnings in a normal string literal.
    pattern = r"https?://[\w/:%#\$&\?\(\)~\.=\+\-]+"
    return len(re.findall(pattern, x))


def clean_puncts(x):
    """Surround every occurrence of each entry of ``puncts`` with a space on both sides."""
    spaced = x
    for symbol in puncts:
        spaced = spaced.replace(symbol, ' ' + symbol + ' ')
    return spaced

#zenkaku = '０,１,２,３,４,５,６,７,８,９,（,）,＊,「,」,［,］,【,】,＜,＞,？,・,＃,＠,＄,％,＝'.split(',')
#hankaku = '0,1,2,3,4,5,6,7,8,9,q,a,z,w,s,x,c,d,e,r,f,v,b,g,t,y,h,n,m,j,u,i,k,l,o,p'.split(',')

def clean_text_jp(x):
    """Strip Japanese sentence punctuation, line-control characters and ASCII punctuation.

    Args:
        x: input string.

    Returns:
        The string with '。', '、', newlines, tabs and carriage returns deleted,
        every ASCII punctuation character replaced by a space, and runs of
        spaces collapsed to a single space.
    """
    # Delete Japanese full stop / comma and line-control characters outright.
    for ch in ('。', '、', '\n', '\t', '\r'):
        x = x.replace(ch, '')
    # Replace all ASCII punctuation (the ranges !-/ :-@ [-` {-~) with a space.
    x = re.sub(r'[!-\/:-@[-`{-~]', ' ', x)
    # No separate handling of "[math]", "[/math]" or backslashes is needed here:
    # the substitution above has already replaced '[', ']', '/' and '\' with
    # spaces, so patterns containing those characters can no longer match.
    x = re.sub(' +', ' ', x)
    return x


def preprocess(data):
    """Run the full text-cleaning pipeline over a pandas Series of documents.

    Args:
        data: Series of raw texts; ``progress_apply`` must be available
            (registered by ``tqdm.pandas()``).

    Returns:
        A new Series of cleaned strings.
    """
    data = data.progress_apply(lambda x: pre_preprocess(x))
    data = data.progress_apply(lambda x: rm_spaces(x))
    data = data.progress_apply(lambda x: remove_urls(x))
    # HTML tags and entities must be removed BEFORE clean_puncts: that step puts
    # spaces around '<', '>', '/', '&', ';' and '#', after which literal strings
    # such as '<p>' or '&lt;' no longer occur in the text and would never match.
    data = data.progress_apply(lambda x: clean_html_tags(x, stop_words=other))
    data = data.progress_apply(lambda x: clean_puncts(x))
    data = data.progress_apply(lambda x: replace_num(x))
    data = data.progress_apply(lambda x: clean_text_jp(x))
    return data

In [13]:
# Overwrite the text column with its cleaned version (the raw strings are not kept in df).
df['text'] = preprocess(df['text'])
df.head()

In [14]:
# Use a dedicated MeCab tokenizer instance here: the name `tok` is rebound to the
# Keras Tokenizer in a later cell, so relying on `tok` makes this cell fail when it
# is re-run after that point.
noun_tokenizer = MecabTokenizer()
# Keep only the nouns of each document, joined by spaces.
df['mecab_tokenizer'] = df['text'].progress_apply(lambda x: ' '.join(noun_tokenizer.mecab_tokenizer(x)))
df.head()

In [15]:
import gensim
# Load the pretrained word2vec vectors (binary format) from the attached dataset.
# NOTE(review): despite its name, `url` is a local filesystem path.
url = "/kaggle/input/japanese-vector/entity_vector.model.bin"
embeddings = gensim.models.KeyedVectors.load_word2vec_format(url, binary=True)

In [16]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Fit a Keras word-index tokenizer on the space-joined noun strings and encode
# every document as a list of integer word indices.
# NOTE(review): this rebinds `tok`, which until here held the MecabTokenizer
# instance; earlier cells calling tok.mecab_tokenizer / tok.tokenize will fail if
# re-run after this cell.
tok = Tokenizer()
tok.fit_on_texts(df['mecab_tokenizer'])
vocab_size = len(tok.word_index) + 1  # +1 presumably leaves index 0 for padding — confirm
encd_rev = tok.texts_to_sequences(df['mecab_tokenizer'])

In [17]:
# Display the vocabulary size computed above.
vocab_size

In [18]:
# Sequence / embedding hyper-parameters used by the padding and embedding-matrix cells.
max_rev_len=40  # length every encoded document is brought to by pad_sequences
vocab_size = len(tok.word_index) + 1  # same value as computed when the tokenizer was fitted
embed_dim=200  # NOTE(review): presumably the dimensionality of the pretrained vectors — confirm

In [19]:
# Bring every encoded document to max_rev_len entries; padding='post' appends the padding at the end.
pad_rev= pad_sequences(encd_rev, maxlen=max_rev_len, padding='post')
pad_rev.shape

In [20]:
# Build the embedding matrix: row i receives the pretrained vector of the word
# whose tokenizer index is i. Rows that are never assigned stay all-zero.
embed_matrix = np.zeros(shape=(vocab_size, embed_dim))
for word, i in tok.word_index.items():
    try:
        embed_matrix[i] = embeddings[word]
    except KeyError:
        # Word is missing from the pretrained vocabulary: leave its row at zero
        # instead of reusing the vector left over from the previous lookup.
        continue

In [21]:
# Expect (vocab_size, embed_dim).
embed_matrix.shape

In [None]:
# Targets (y): df["label"]
# Features (X): pad_rev

In [22]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows; the fixed random_state makes the split (and every
# result derived from it) reproducible across kernel restarts.
train_X, test_X, train_y, test_y = train_test_split(pad_rev, df.label.values, test_size=0.25, random_state=42)

In [23]:
# Label features and targets separately so the four lines are unambiguous.
print("Train X shape : ", train_X.shape)
print("Test X shape : ", test_X.shape)
print("Train y shape : ", train_y.shape)
print("Test y shape : ", test_y.shape)

In [24]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
batch_size = 64


# Token indices and class labels are both converted to int64 (torch.long) tensors.
x_train = torch.tensor(train_X, dtype=torch.long)
y_train = torch.tensor(train_y, dtype=torch.long)
x_cv = torch.tensor(test_X, dtype=torch.long)
y_cv = torch.tensor(test_y, dtype=torch.long)


# Create Torch datasets
train = TensorDataset(x_train, y_train)
valid = TensorDataset(x_cv, y_cv)

# Create Data Loaders
# Only the training loader shuffles; both use one worker process per CPU reported by os.cpu_count().
train_loader = DataLoader(train, batch_size=batch_size, num_workers = os.cpu_count(), shuffle=True)
valid_loader = DataLoader(valid, batch_size=batch_size, num_workers = os.cpu_count(), shuffle=False)

In [25]:
# Sanity-check tensor shapes on a single batch instead of iterating over (and
# printing) every batch of the epoch.
X, Y = next(iter(train_loader))
print(X.shape, Y.shape)

In [26]:
import math
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import CountVectorizer
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, Dataset
from sklearn.feature_extraction.text import CountVectorizer
import os
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [27]:
from torch import nn, Tensor
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to batch-first embeddings, then apply dropout.

    Args:
        d_model: embedding dimension (even or odd).
        dropout: dropout probability applied after adding the encodings.
        max_len: longest sequence length that can be encoded.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))
        # Stored as (max_len, d_model) so it broadcasts over the batch dimension.
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [batch_size, seq_len, embedding_dim]

        Returns:
            Tensor of the same shape with position encodings added and dropout applied.
        """
        # Slice by sequence length (dim 1), not batch size, and never reassign the
        # registered buffer: the result must not depend on the batch size or on
        # how many times forward has already been called.
        x = x + self.pe[: x.size(1)]
        return self.dropout(x)

In [None]:
# Scratch cell: recomputes the sinusoidal table by hand for 40 positions and 200
# dimensions to inspect its shape. It defines the module-level names `position`,
# `div_term` and `pe`, which the model classes below do not reference.
position = torch.arange(40).unsqueeze(1)
div_term= torch.exp(torch.arange(0, 200, 2) * (-math.log(10000.0) / 200))
pe = torch.zeros(40, 1, 200)
pe[:, 0, 0::2] = torch.sin(position * div_term)
pe[:, 0, 1::2] = torch.cos(position * div_term)

In [None]:
# Shape of the scratch table built in the previous cell.
pe.shape

In [28]:
class TransformerDecoder(nn.Module):
    """Classification head: flattens everything after the batch dimension and applies one linear layer."""

    def __init__(self, seq_len, embedding_dimension, n_cat):
        super().__init__()
        flattened_size = seq_len * embedding_dimension
        self.decoder = nn.Linear(flattened_size, n_cat)

    def forward(self, x):
        """Map ``x`` (flattened from dim 1 onward) to ``n_cat`` output scores per row."""
        flat = x.flatten(start_dim=1)
        return self.decoder(flat)

In [29]:
# float32 copy of the embedding matrix, used to initialise the model's embedding layer.
weight = torch.tensor(embed_matrix, dtype=torch.float32)

class TransformerNET(pl.LightningModule):
    """Transformer-encoder text classifier on top of pretrained word embeddings.

    Pipeline: token ids -> embedding (initialised from the module-level ``weight``)
    -> positional encoding -> causally masked Transformer encoder -> flatten ->
    linear layer producing ``n_cat`` scores.

    Args:
        vocab_size: vocabulary size. NOTE(review): not used to size the embedding,
            whose shape comes from ``weight`` — confirm the two agree.
        nhead: number of attention heads.
        seq_len: fixed (padded) sequence length of the inputs.
        n_cat: number of output classes.
        embedding_dimension: embedding size; must match ``weight``'s second dimension.
        n_layers: number of stacked encoder layers.
        dropout: dropout probability for the positional encoding and encoder layers.
    """

    def __init__(self,
                 vocab_size,
                 nhead,
                 seq_len,
                 n_cat,
                 embedding_dimension=200,
                 n_layers = 2,
                 dropout=0.1
                ):
        super().__init__()

        self.embedding_dimension = embedding_dimension

        # freeze=False keeps the pretrained embedding trainable (requires_grad=True).
        self.encoder = nn.Embedding.from_pretrained(weight, freeze=False)

        self.pos_encoder = PositionalEncoding(max_len = seq_len, d_model = embedding_dimension, dropout = dropout)

        # batch_first=True: inputs reach the encoder as (batch, seq, emb). With the
        # default sequence-first layout, attention would run across the samples of
        # a batch instead of across the tokens of each sample.
        encoder_layers = nn.TransformerEncoderLayer(embedding_dimension, nhead, dropout=dropout, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, n_layers)
        self.decoder = TransformerDecoder(seq_len, embedding_dimension, n_cat)
        self.relu = nn.ReLU()
        self.ce = nn.CrossEntropyLoss()

    def accuracy(self, pred, y):
        """Return the fraction of rows of ``pred`` whose argmax equals ``y``."""
        pred = torch.argmax(pred, 1)
        correct_pred = (pred == y).float()
        acc = correct_pred.sum() / len(correct_pred)
        return acc

    def _generate_square_subsequent_mask(self, sz):
        """Return a (sz, sz) float mask: 0.0 on and below the diagonal, -inf above it."""
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def forward(self, x):
        """Compute class scores for a batch of token-id sequences.

        Args:
            x: integer tensor of shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, n_cat) with unnormalised class scores.
        """
        # The mask is sized by sequence length (dim 1) and created on the input's
        # own device rather than on a notebook-level global.
        mask = self._generate_square_subsequent_mask(x.size(1)).to(x.device)
        x = self.encoder(x) * math.sqrt(self.embedding_dimension)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x, mask)
        x = self.decoder(x)
        return x

    def _predict_batch(self, batch):
        """Shared step logic: return ``(scores, targets)`` for one batch."""
        x, y = batch
        # reshape(-1) keeps y one-dimensional even for a batch of one sample,
        # where squeeze() would yield a 0-dim tensor.
        y = y.reshape(-1)
        x = x.view(x.size(0), -1)
        return self(x), y

    def training_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss on a training batch."""
        y_hat, y = self._predict_batch(batch)
        loss = self.ce(y_hat, y)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss on a validation batch."""
        y_hat, y = self._predict_batch(batch)
        loss = self.ce(y_hat, y)
        self.log("val_loss", loss)
        return loss

    def test_step(self, batch, batch_idx):
        """Compute and log the accuracy on a test batch."""
        y_hat, y = self._predict_batch(batch)
        accuracy = self.accuracy(y_hat, y)
        self.log('test_accuracy', accuracy)
        # Return a dict (key: value), not a set literal.
        return {"test_accuracy": accuracy}

    def configure_optimizers(self):
        """Use Adam over all parameters with a learning rate of 1e-4."""
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-4)
        return optimizer

In [33]:
# Build the model and train it until early stopping triggers.
print(vocab_size)
max_seq_len = 40  # same value as max_rev_len, the length pad_rev was padded to
Num_Catagories = 26  # NOTE(review): hard-coded class count; presumably the number of distinct labels — confirm against df.label
model = TransformerNET(vocab_size = vocab_size, nhead = 2, seq_len = max_seq_len, n_cat = Num_Catagories)
# No epoch limit (max_epochs=-1): training ends only when val_loss has not improved for 30 validation checks.
trainer = pl.Trainer(callbacks=[EarlyStopping(monitor='val_loss', mode='min', patience= 30)], max_epochs = -1, accelerator= device_name, devices= 1)
trainer.fit(model, train_loader, valid_loader)

In [34]:
# model.eval() switches the module to evaluation mode and returns it, so this also prints the architecture.
print(model.eval())
# NOTE(review): the "test" run reuses valid_loader, the same data that drove early
# stopping, so the reported accuracy is not an independent held-out estimate.
trainer.test(model, valid_loader)