In [1]:
# install MeCab
!apt install aptitude swig
!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y
!pip install mecab-python3
# install NEologd
!git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git
!echo yes | mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n -a
# install neologdn
!pip install neologdn
# install emoji
!pip install emoji

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  aptitude-common libcgi-fast-perl libcgi-pm-perl libclass-accessor-perl
  libcwidget3v5 libencode-locale-perl libfcgi-perl libhtml-parser-perl
  libhtml-tagset-perl libhttp-date-perl libhttp-message-perl libio-html-perl
  libio-string-perl liblwp-mediatypes-perl libparse-debianchangelog-perl
  libsigc++-2.0-0v5 libsub-name-perl libtimedate-perl liburi-perl libxapian30
  swig3.0
Suggested packages:
  aptitude-doc-en | aptitude-doc apt-xapian-index debtags tasksel
  libcwidget-dev libdata-dump-perl libhtml-template-perl libxml-simple-perl
  libwww-perl xapian-tools swig-doc swig-examples swig3.0-examples swig3.0-doc
The following NEW packages will be installed:
  aptitude aptitude-common libcgi-fast-perl libcgi-pm-perl
  libclass-accessor-perl libcwidget3v5 libencode-locale-perl libfcgi-perl
  libhtml-parser-perl libhtml-tagset-perl libhttp

In [0]:
import numpy as np
import pandas as pd
import MeCab
import re
import neologdn
import string
import emoji
import torch
import torchtext
from torchtext.vocab import Vectors
import random
import os

In [0]:
def seed_everything(seed=1234):
        random.seed(seed)
        os.environ['PYTHONHASHSEED'] = str(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True

seed_everything(seed=1234)

In [0]:
# テキスト前処理
def preprocessing_text(text):
    # 英語の小文字化(表記揺れの抑制)
    text = text.lower()
    # URLの除去(neologdnの後にやるとうまくいかないかも(URL直後に文章が続くとそれも除去される)))
    text = re.sub(r'https?://[\w/:%#\$&\?\(\)~\.=\+\-…]+', '', text)
    # tweetの前処理
    text = re.sub(r"@([A-Za-z0-9_]+) ", '', text) # リプライ
    text = re.sub(r'#(\w+)', '', text) # ハッシュタグ
    # neologdnを用いて文字表現の正規化(全角・半角の統一と重ね表現の除去)
    text = neologdn.normalize(text)
    # 数字を全て0に置換(解析タスク上、数字を重要視しない場合は語彙数増加を抑制するために任意の数字に統一したり除去することもある)
    text = re.sub(r'[0-9０-９]+', '0', text)
    # 半角記号の除去
    text = re.sub(r'[!-/:-@【】[-`{-~]', "", text)
    # 改行
    text = re.sub('\n', '', text)
    # 絵文字
    text = ''.join(['' if c in emoji.UNICODE_EMOJI else c for c in text])
    # 中黒や三点リーダ
    text = re.sub(r'[・…]', '', text)
    return text

# MeCab + NEologdによるtokenizer
def tokenizer_mecab(text):
    tagger = MeCab.Tagger('-Owakati -d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd') # -Owakatiで分かち書きのみ出力
    text = tagger.parse(text)
    text = text.strip().split()
    return text

# pipeline
def tokenizer_with_preprocessing(text):
    text = preprocessing_text(text)
    ret = tokenizer_mecab(text)
    
    return ret

In [0]:
PATH = '/content/drive/My Drive/Colab Notebooks/NLP/RionTweetClassifier/data/rion_corpus.csv'
max_length = 256

In [27]:
TEXT = torchtext.data.Field(sequential=True, tokenize=tokenizer_with_preprocessing, use_vocab=True,
                            lower=True, include_lengths=True, batch_first=True, fix_length=max_length, init_token="<cls>", eos_token="<eos>")

LABEL = torchtext.data.Field(sequential=False, use_vocab=False)

ds = torchtext.data.TabularDataset(path=PATH, format='csv', skip_header=True, fields=[('Text', TEXT), ('Label', LABEL)])

train_ds, test_ds, val_ds = ds.split(split_ratio=[0.92, 0.04, 0.04])

print('train_ds len :', len(train_ds))
print('val_ds len :', len(val_ds))
print('test_ds len :', len(test_ds))

train_ds len : 2515
val_ds len : 109
test_ds len : 110


In [22]:
# embedding
FASTTEXT = '/content/drive/My Drive/Colab Notebooks/NLP/nlp_tutorial/model.vec'
fastText_vectors = Vectors(name=FASTTEXT)
# build vocab
TEXT.build_vocab(train_ds, vectors=fastText_vectors, min_freq=1)

  0%|          | 0/351122 [00:00<?, ?it/s]Skipping token b'351122' with 1-dimensional vector [b'300']; likely a header
100%|█████████▉| 350832/351122 [00:38<00:00, 8826.51it/s]

In [0]:
train_dl = torchtext.data.Iterator(train_ds, batch_size=2, train=True)
val_dl = torchtext.data.Iterator(val_ds, batch_size=2, train=False, sort=False)
test_dl = torchtext.data.Iterator(test_ds, batch_size=2, train=False, sort=False)

In [24]:
batch = next(iter(train_dl))
print("="*50)
print(batch.Text[0][0])
print("="*50)
print(batch.Label)

tensor([  2, 140,  13,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,  