<a href="https://colab.research.google.com/github/Yoshiki0418/Deep_Learning/blob/main/ner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## NER

### データ準備

**データロード**

In [1]:
# Google Driveとの接続
from google.colab import drive
drive_path = '/content/drive'
drive.mount(drive_path)

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/Deep_Learning/DL_Lecture3

/content/drive/MyDrive/Deep_Learning/DL_Lecture3


In [3]:
import pickle

In [4]:
def load_dataset(sentences_file_name="ner_dataset_sentences.txt", labels_file_name="ner_dataset_labels.txt"):
    with open(sentences_file_name, "rb") as fp:
        sentences = pickle.load(fp)
    with open(labels_file_name, "rb") as fp:
        labels = pickle.load(fp)
    return sentences, labels

In [5]:
sentences, labels = load_dataset()

In [6]:
sentences[:10]

['The cat sat on the mat .',
 'John lives in New York .',
 'I have two dogs .',
 'She works at Google .',
 'The Eiffel Tower is in Paris .',
 'He is from Spain .',
 'I visited the Great Wall of China .',
 'She is studying at Oxford University .',
 'He works for the United Nations .',
 'Berlin is the capital of Germany .']

In [7]:
labels[:10]

[['O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['B-per', 'O', 'O', 'B-geo', 'I-geo', 'O'],
 ['O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'B-org', 'O'],
 ['O', 'B-geo', 'I-geo', 'O', 'O', 'B-geo', 'O'],
 ['O', 'O', 'O', 'B-geo', 'O'],
 ['O', 'O', 'O', 'B-geo', 'I-geo', 'I-geo', 'I-geo', 'O'],
 ['O', 'O', 'O', 'O', 'B-org', 'I-org', 'O'],
 ['O', 'O', 'O', 'O', 'B-org', 'I-org', 'O'],
 ['B-geo', 'O', 'O', 'O', 'O', 'B-geo', 'O']]

**ラベルエンコーディング**

In [8]:
from sklearn.preprocessing import LabelEncoder

In [11]:
all_labels = [label for sublist in labels for label in sublist]
label_encoder = LabelEncoder()
label_encoder.fit(all_labels)
encoded_labels = [label_encoder.transform(label) for label in labels]

In [13]:
encoded_labels[:3]

[array([6, 6, 6, 6, 6, 6, 6]),
 array([2, 6, 6, 0, 3, 6]),
 array([6, 6, 6, 6, 6])]

**辞書作成とエンコード**

In [14]:
# 辞書の初期化
word2idx = {"<PAD>": 0}

In [15]:
encoded_sentences = []
for sentence in sentences:
    encoded_sentence = [word2idx.setdefault(word, len(word2idx)) for word in sentence.split()]
    encoded_sentences.append(encoded_sentence)

In [16]:
encoded_sentences[:3]

[[1, 2, 3, 4, 5, 6, 7], [8, 9, 10, 11, 12, 7], [13, 14, 15, 16, 7]]

**学習データと検証データ分割**

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_val, y_train, y_val, train_sentences, val_sentences = train_test_split(encoded_sentences, encoded_labels, sentences, test_size=0.2, random_state=42)

In [19]:
X_train[:3]

[[94, 51, 95, 96, 97, 98, 7], [17, 18, 19, 87, 7], [1, 50, 51, 52, 10, 53, 7]]

In [20]:
train_sentences[:3]

['They are visiting London this summer .',
 'She works at Facebook .',
 'The Pyramids are located in Egypt .']

**padding**

In [22]:
import torch
from torch.nn.utils.rnn import pad_sequence

seq1 = torch.tensor([1, 2, 3])
seq2 = torch.tensor([4, 5])
seq3 = torch.tensor([6, 7, 8, 9])

sequences = [seq1, seq2, seq3]
padded_senquences = pad_sequence(sequences, batch_first=True)
print(padded_senquences)

tensor([[1, 2, 3, 0],
        [4, 5, 0, 0],
        [6, 7, 8, 9]])


In [23]:
X_train = pad_sequence([torch.tensor(x) for x in X_train], batch_first=True)
X_val = pad_sequence([torch.tensor(x) for x in X_val], batch_first=True)
y_train = pad_sequence([torch.tensor(y) for y in y_train], batch_first=True)
y_val = pad_sequence([torch.tensor(y) for y in y_val], batch_first=True)

In [24]:
X_train

tensor([[ 94,  51,  95,  96,  97,  98,   7,   0,   0,   0],
        [ 17,  18,  19,  87,   7,   0,   0,   0,   0,   0],
        [  1,  50,  51,  52,  10,  53,   7,   0,   0,   0],
        [  1,   2,   3,   4,   5,   6,   7,   0,   0,   0],
        [  1,  21,  22,  23,  10,  24,   7,   0,   0,   0],
        [ 25,  23,  60,  61,  24,  36,  62,  63,   7,   0],
        [ 25,  23,  26,  27,   7,   0,   0,   0,   0,   0],
        [  1,  54,  55,  23,  10,  56,   7,   0,   0,   0],
        [  1,  47,  23,   5,  48,  49,  10,   5,  46,   7],
        [  1,  82,  22,  31,  83,  23,  10,  84,   7,   0],
        [  8,   9,  10,  11,  12,   7,   0,   0,   0,   0],
        [ 13,  14,  15,  16,   7,   0,   0,   0,   0,   0],
        [  1,  88,  89,  23,  10,  90,   7,   0,   0,   0],
        [ 17,  18,  19,  20,   7,   0,   0,   0,   0,   0],
        [ 78,  79,  80,  62,  81,   7,   0,   0,   0,   0],
        [ 17,  23,  33,  19,   5,  35,  31,  91,   7,   0],
        [ 25,  67,  19,  68,   7,   0,  

In [25]:
X_train.shape

torch.Size([24, 10])

### NERのモデル

In [26]:
from torch import nn

In [27]:
class Model(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size, num_layers=1, rnn_type="LSTM", bidirectional=False):
        super().__init__()
        self.num_directions = 2 if bidirectional else 1

        # embedding layer追加 (vocab_size x embedding_size)
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        input_size = embedding_dim

        if rnn_type == 'RNN':
            self.rnn = nn.RNN(input_size, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=bidirectional)
        elif rnn_type == 'LSTM':
            self.rnn = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=bidirectional)
        elif rnn_type == 'GRU':
            self.rnn = nn.GRU(input_size, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=bidirectional)
        else:
            raise ValueError('Unsupported RNN type. Choose from ["LSTM", "RNN", "GRU", "UGRNN"]')

        self.fc = nn.Linear(hidden_size*self.num_directions, output_size)

    def forward(self, x):
        x = self.embedding(x)
        output_seq, _ = self.rnn(x)

        # many to many の場合は，
        # output_seq: [batch_size, seq_len, hidden_size*num_directions]
        out = self.fc(output_seq)
        return out

In [31]:
# BiRNNのテスト
# input_size = 10
vocab_size = 300 # モデルが扱う語彙の総数（ユニークなトークンの数 + 特殊トークン）
embedding_dim = 50
hidden_size = 3
batch_size = 24
seq_len = 10. # paddingを含めた文章の長さ
output_size = 3

model = Model(vocab_size, embedding_dim, hidden_size, output_size, bidirectional=True, )
out = model(X_train)

In [32]:
out[0]

tensor([[-0.1755, -0.1338,  0.0672],
        [-0.0896, -0.4480,  0.1995],
        [-0.0361, -0.3296,  0.1930],
        [ 0.0121, -0.4118,  0.2468],
        [ 0.0512, -0.4223,  0.2425],
        [ 0.1781, -0.3577,  0.2110],
        [ 0.0789, -0.5249,  0.4050],
        [ 0.3242, -0.5079,  0.3783],
        [ 0.3157, -0.4999,  0.3796],
        [ 0.2968, -0.4966,  0.3711]], grad_fn=<SelectBackward0>)

### モデルの学習

In [39]:
from torch.utils.data import DataLoader, TensorDataset
vocab_size = len(word2idx)
num_classes = len(label_encoder.classes_)
embedding_dim = 50
hidden_size = 40
output_size = num_classes
batch_size = 3
learning_rate = 0.003
num_epochs = 40

# モデル作成
model = Model(vocab_size, embedding_dim, hidden_size, output_size, num_layers=1, rnn_type='LSTM', bidirectional=True)

# Data Loader作成
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# 損失関数とOptimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss(ignore_index=0)

# 学習ループ
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for i, (sentences, labels) in enumerate(train_loader):
        optimizer.zero_grad()

        outputs = model(sentences)
        loss = criterion(outputs.view(-1, num_classes), labels.view(-1))

        loss.backward()

        optimizer.step()
        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Training Loss: {avg_train_loss:.4f}")

    # 検証データを使用して検証エラーを計算
    model.eval()
    val_loss = 0
    total_samples = 0
    total_correct = 0
    for sentences, labels in val_loader:

        outputs = model(sentences)
        # loss計算
        loss = criterion(outputs.view(-1, num_classes), labels.view(-1))
        val_loss += loss.item()

        # accuracy計算
        _, predicted = torch.max(outputs, dim=-1)
        # Padding部分を除外するマスクを作成
        non_pad_elements = labels != 0
        # Paddingを除いた予測結果とラベルを比較し、正解数をカウント
        total_correct += (predicted[non_pad_elements] == labels[non_pad_elements]).sum().item()
        # 正確なサンプル数（Paddingを除いた要素数）をカウント
        total_samples += non_pad_elements.sum().item()

    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = total_correct / total_samples
    print(f"Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

Epoch 1/40, Average Training Loss: 1.7674
Val Loss: 1.3924, Val Accuracy: 0.8485
Epoch 2/40, Average Training Loss: 1.0996
Val Loss: 0.6816, Val Accuracy: 0.8485
Epoch 3/40, Average Training Loss: 0.6477
Val Loss: 0.6044, Val Accuracy: 0.8485
Epoch 4/40, Average Training Loss: 0.5886
Val Loss: 0.5278, Val Accuracy: 0.8485
Epoch 5/40, Average Training Loss: 0.4375
Val Loss: 0.4967, Val Accuracy: 0.8485
Epoch 6/40, Average Training Loss: 0.3448
Val Loss: 0.4987, Val Accuracy: 0.8485
Epoch 7/40, Average Training Loss: 0.2558
Val Loss: 0.5331, Val Accuracy: 0.8485
Epoch 8/40, Average Training Loss: 0.1877
Val Loss: 0.4645, Val Accuracy: 0.8788
Epoch 9/40, Average Training Loss: 0.1430
Val Loss: 0.4722, Val Accuracy: 0.8788
Epoch 10/40, Average Training Loss: 0.1061
Val Loss: 0.5197, Val Accuracy: 0.8788
Epoch 11/40, Average Training Loss: 0.0812
Val Loss: 0.5475, Val Accuracy: 0.8788
Epoch 12/40, Average Training Loss: 0.0610
Val Loss: 0.5741, Val Accuracy: 0.8788
Epoch 13/40, Average Trai

### 検証データの出力確認

In [40]:
model.eval()
val_outputs = model(X_val)

In [41]:
_, predicted_labels = torch.max(val_outputs, dim=-1)

In [42]:
for sentence, label in zip(val_sentences, predicted_labels):
    words = sentence.split()
    decoded_labels = label_encoder.inverse_transform(label[:len(words)])
    print(f'original sentence: {sentence}')
    print(f'predicted labels: {decoded_labels}')
    print()

original sentence: He lives in Los Angeles .
predicted labels: ['O' 'O' 'O' 'O' 'O' 'O']

original sentence: She is from Australia .
predicted labels: ['O' 'O' 'O' 'O' 'O']

original sentence: The Great Barrier Reef is in Australia .
predicted labels: ['O' 'B-per' 'I-geo' 'I-geo' 'O' 'O' 'O' 'O']

original sentence: The Amazon is the largest rainforest .
predicted labels: ['O' 'O' 'O' 'O' 'O' 'O' 'O']

original sentence: He works for the United Nations .
predicted labels: ['O' 'O' 'O' 'O' 'O' 'O' 'O']

original sentence: Berlin is the capital of Germany .
predicted labels: ['O' 'O' 'O' 'O' 'O' 'O' 'O']

