# 第9章: RNN, CNN

In [105]:
import string
import nltk
import torch 
from torch import nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch.utils.data import TensorDataset, DataLoader, Dataset
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm

In [3]:
! wc -l "./data/train.txt"

   10684 ./data/train.txt


In [4]:
! head "./data/train.txt"

Bank Of America To Pay $9 Billion To Settle Mortgage Securities Suit	b
DOJ Pushing Credit Suisse To Plead Guilty To Aiding Tax Evasion	b
AT&T Says It May Avoid FCC Airwaves Auction Over Restrictions	t
US Airways Explains How It Tweeted That Infamous Nude Photo	e
WRAPUP 1-Vietnam stops anti-China protests after deadly riots, China evacuates	b
Sorry, Miss USA: Self-Defense Is Not The Solution To Sexual Assault	e
Our Mobile Apps	b
Legal Challenge To Alabama Abortion Law Will Go To Trial	m
George Clooney and fiancée Amal Alamuddin share a romantic dinner in Mexico	e
Mickey Rooney, a Hollywood icon	e


# 51. 特徴量抽出 ##
学習データ，検証データ，評価データから特徴量を抽出し，それぞれtrain.feature.txt，valid.feature.txt，test.feature.txtというファイル名で保存せよ．   
なお，カテゴリ分類に有用そうな特徴量は各自で自由に設計せよ．  
記事の見出しを単語列に変換したものが最低限のベースラインとなるであろう．

In [5]:
# tokenization 用の辞書をダウンロード
nltk.download('punkt') 

#単語が格納されたリストを返す関数
def tokenize_title(input_file):
    with open(input_file) as f:
        words = []
        for line in f:
            text , label = line.rstrip('\n').split('\t')
            table = str.maketrans("", "", string.punctuation)
            #語彙のリストに単語を追記していく
            #単語は小文字に直し、記号は除去
            text = text.translate(table).lower()
            words.extend(nltk.wordpunct_tokenize(text))
    
        return words
    
#カテゴリを抽出し、数字（b:0,e:1,t:2,m:3)に変換する関数
def cate2num(input_file):
    with open(input_file) as f:
        title_list = []
        category_list = []
        categories = ['b','e','t','m']
        
        for line in f:
            text , label = line.rstrip('\n').split('\t')
            category_list.append(label)
            title_list.append(text)
         
        #リストのインデックス番号を当てはめる
        data_y = [categories.index(i) for i in category_list]
    
        return torch.tensor(data_y)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yukikoishizuki/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 80. ID番号への変換
問題51で構築した学習データ中の単語にユニークなID番号を付与したい．学習データ中で最も頻出する単語に1，2番目に頻出する単語に2，……といった方法で，学習データ中で2回以上出現する単語にID番号を付与せよ．そして，与えられた単語列に対して，ID番号の列を返す関数を実装せよ．ただし，出現頻度が2回未満の単語のID番号はすべて0とせよ．

In [6]:
#問題５１の学習データの読み込み
train_data = "./data/train.txt"
train_words = tokenize_title(train_data)
train_y = cate2num(train_data)

In [7]:
import json
from collections import Counter
from collections import defaultdict

#ID番号の付与
#学習データの単語のうち２回以上出現するものを取得
vocab = {}
cnt = Counter(train_words).most_common()
#1から始まるので注意
vocab = {cnt[0]:idx+1 for idx , cnt in enumerate(cnt) if cnt[1]>1}

#vocabを中間ファイルに出力
with open('./data/vocab.json','w') as f:
    d = json.dumps(vocab)
    f.write(d)

In [8]:
#与えられた単語列に関してID番号の列を返す関数
def word_to_id(words,vocab):
    
    word_to_id = []
    for word in words:
        
        #もしvocabに存在する単語なら、  
        if word in vocab.keys():
            #そのままvocabのID=vocabのvalueを付与
            idx = vocab[word]
            word_to_id.append(idx)
    
        #存在しない＝出現頻度が2回未満であるなら、
        else:
            #IDを0に 
            idx = 0
            word_to_id.append(idx)
        
    return torch.tensor(word_to_id)

with open('./data/vocab.json', 'r') as f:
    v = json.load(f)
            
words = tokenize_title(train_data)[:10]
print(word_to_id(words,vocab))

tensor([ 71,   4, 182,   1, 124, 724,  49,   1, 725, 500])


# 81. RNNによる予測

## MEMO
https://pytorch.org/docs/stable/generated/torch.nn.RNN.html
https://www.kaggle.com/kanncaa1/recurrent-neural-network-with-pytorch
#### RNN：Recurrent Neural Network
- 再帰ニューラルネットワーク
- 時系列データや文章など、可変長の系列データに使われる
- ベクトル化→word embeddings→hidden state→output distribution っていうのが基本的なRNN

#### 1.RNNについて
- 入力テンソルの形状は(seq_len,batch,input_size)
- nn.RNN()の出力→隠れ層rnn_outのうち最後の時刻のものh_nを次の全結合層に入力させる
-　h_nに全時刻分の情報が含まれる

#### 2.Embedding層について
入力は単語IDの並び、出力は単語埋め込みベクトル
入力データ型は<class 'torch.Tensor'>
https://gotutiyan.hatenablog.com/entry/2020/09/02/200144https://gotutiyan.hatenablog.com/entry/2020/09/02/200144

ミニバッチ化しているとき、出力はTorch.size(seq_len, batch, input_size)

In [9]:
#上で用意した関数を使いやすい形に書き換え(tensorを返り値にする)

#与えられた単語列に関してID番号の列を返す関数
def word2id(title,vocab):
    
    word2id = []
    for word in title:
        
        #もしvocabに存在する単語なら、  
        if word in vocab.keys():
            #そのままvocabのID=vocabのvalueを付与
            idx = vocab[word]
            word2id.append(idx)
    
        #存在しない＝出現頻度が2回未満であるなら、
        else:
            #IDを0に 
            idx = 0
            word2id.append(idx)
        
    return word2id

#titleを単語列にし、id化、さらにone-hotに直す関数(この章では必要…？)
def title2id(title_data,vocab):
    
    title = nltk.wordpunct_tokenize(title_data)
    idxed_title = word_to_id(title,vocab) #返り値はword_to_idのtensor
    
    #ゼロ行列を準備（タイトルの系列数,vocab+未知語）
    data_x = torch.zeros(len(idxed_title),len(vocab)+1)
    for i, idx in enumerate(idxed_title):
        #i行目のidx列目の0を1に
        data_x[i][idx] = 1
    return data_x
        
#カテゴリを抽出し、数字（b:0,e:1,t:2,m:3)に変換する関数
#引数に指定するものをファイルからカテゴリに変更
def cate2id(cate_data):

    categories = ['b','e','t','m']

    #リストのインデックス番号を当てはめる
    data_y = [categories.index(cate_data)]
    return torch.tensor(data_y)

In [10]:
#RNNの定義
class RNN(nn.Module):
    def __init__(self):
        super(RNN,self).__init__()
        self.emb = nn.Embedding(input_size,emb_size,padding_idx=padding_idx)
        #デフォルトでは活性化関数はtanh
        #batch_firstをTrueにすると，(seq_len, batch, input_size)→(batch, seq_len, input_size)にできる
        self.rnn = nn.RNN(emb_size,hidden_size,batch_first=True)
        #readout layer
        self.fc = nn.Linear(hidden_size,output_size)
        self.softmax = nn.Softmax(dim=-1)
        
    def forward(self,x):
        #x=batch_size*sequence_size
        #隠れ層の初期化
        print("x",x.shape)
        #h_0 of shape(num_layers * num_directions（双方向ではないので１), batch, hidden_size)
        h = torch.zeros(1,1,hidden_size)
        emb = self.emb(x)
        
        #embの入力に次元を揃える(→view),出力：[batch, seq_len, input_size]
        emb = emb.view(1,len(x),-1)
        print('emb shape:',emb.shape)
        print('h:',h.shape)
        output,h = self.rnn(emb,h)
        
        
        #データの最後だけを全結合層に送り込む
        output = self.fc(output[:,-1,:])
        print('output shape:',output.shape)
        y = self.softmax(output)
        return y
                      

In [11]:
input_size = len(vocab)+1
emb_size = 300
hidden_size = 50
output_size = 4
padding_idx = len(vocab)

#モデルの定義
model = RNN()


#予測
#適当な系列を準備
train_data = "./data/train.txt"
words = tokenize_title(train_data)[:10]
a = torch.tensor(word2id(words,vocab))
print(a)

print(model(a))

#埋め込み層の表示
print('\nEmbedding layer\n',model.emb(a))

tensor([ 71,   4, 182,   1, 124, 724,  49,   1, 725, 500])
x torch.Size([10])
emb shape: torch.Size([1, 10, 300])
h: torch.Size([1, 1, 50])
output shape: torch.Size([1, 4])
tensor([[0.2143, 0.4433, 0.1439, 0.1985]], grad_fn=<SoftmaxBackward>)

Embedding layer
 tensor([[-0.0143,  0.2950, -1.8347,  ..., -0.7566, -0.4555,  0.6362],
        [ 0.6020, -0.4254, -1.4406,  ...,  1.1061, -1.3562,  0.0595],
        [ 0.1806, -0.3414,  0.9043,  ..., -0.8388, -0.4679, -0.2461],
        ...,
        [-0.0920, -0.4319, -0.9209,  ...,  0.1086,  0.4595, -1.0695],
        [-2.9050, -0.7189, -0.7964,  ..., -1.2088, -1.8805,  1.3720],
        [-0.6694, -0.4699, -0.1329,  ...,  0.4213, -0.4114, -0.7081]],
       grad_fn=<EmbeddingBackward>)


# 82.確率的勾配降下法による学習
確率的勾配降下法（SGD: Stochastic Gradient Descent）を用いて，問題81で構築したモデルを学習せよ．訓練データ上の損失と正解率，評価データ上の損失と正解率を表示しながらモデルを学習し，適当な基準（例えば10エポックなど）で終了させよ．

## MEMO
- カテゴリ列の取得は文単位でなされるので、これを単語とカテゴリのペアとしてdatasetを作成する必要がある
- 入力と正解のラベルのTensor dimが異なるとAssertionError
    - https://discuss.pytorch.org/t/assert-all-tensors-0-size-0-tensor-size-0-for-tensor-in-tensors-assertionerror/41608
- torch.utils.data
  - https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader
  - https://pytorch.org/docs/stable/_modules/torch/utils/data/dataset.html#Dataset
  - DataLoaderはDataset継承クラスのオブジェクトを渡す
  - 全データ数を返す関数として__len__関数を定義し、さらに、番号を受け取りその番号にあたるデータを返す関数として__getitem__を定義
- Padding
    - https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.pad_sequence.html


Datasetを作るときは入力と出力のtensorの次元を揃えないといけないので、　

train_x:10684個のタイトル・train_y:10684個のラベル　

にしたが、実際のRNNでは入力は系列データ108805個（padding前)
1文の系列（例えば10単語）に対して1つの正解ラベルを渡すはず?
この時の挙動はどうなる？→__len__関数を設定する理由はこれ！

TensorDatasetを使えばこれをしなくてもできる

In [12]:
#Tensordataに入れる出力と同じ系列長のTensorを返す関数
def get_x_data(data_f,vocab):
    #１タイトルの単語列とそのラベルをペアにする
    #
    titles = []
    words = []
    labels = []
    categories = ['b','e','t','m']
    with open(data_f) as f:
        for line in f:
            text , label = line.rstrip('\n').split('\t')
            label = categories.index(label)
            labels.append(label)
         
            table = str.maketrans("", "", string.punctuation)
            #語彙のリストに単語を追記していく
            #単語は小文字に直し、記号は除去
            text = text.translate(table).lower()
            words = nltk.wordpunct_tokenize(text)
            ided_words = word2id(words,vocab)
            titles.append(ided_words)

    return titles , labels
        

In [13]:
with open('./data/vocab.json', 'r') as f:
    vocab = json.load(f)
    
#系列長の異なるリストをpaddingし、datasetを作成する関数
def get_data(titles, labels ,paddin_value=padding_idx):
    data = []
    for title in titles:
        data.append(torch.tensor(title)[:,None])

    #xは(バッチ，系列長，特徴量)
    x = pad_sequence(data,batch_first=True, padding_value=padding_idx)
    print(x.size())
    #(バッチ、特徴量、系列長)に変換
    print(x.permute(0,2,1))
    print(x.size(0))
    y = torch.tensor(labels)
    print(y.size(0))
    x_len = torch.tensor([len(i) for i in data])
    #各データの系列長を取得しておく
   
    return  x,x_len,y


#挙動を確かめるために小さいデータを用意
train_f = "./data/train2.txt"
titles , labels = get_x_data(train_f,vocab)
train_x , train_x_len , train_y = get_data(titles,labels)
TensorDataset(train_x,train_x_len,train_y)
print(train_y)

torch.Size([10, 12, 1])
tensor([[[  71,    4,  182,    1,  124,  724,   49,    1,  725,  500, 1982,
           812]],

        [[4554, 3214,  272,  898,    1, 2445,  501,    1, 5772,  366, 5773,
          8186]],

        [[ 726,   17,   38,   33, 1649,  650,    0, 1398,   23, 2775, 8186,
          8186]],

        [[  10, 1983, 1529,  100,   38, 5774,   39, 4555, 1287,  410, 8186,
          8186]],

        [[ 206, 4556, 2776, 3215, 1984,   15,  686, 5775,   30, 5776, 8186,
          8186]],

        [[1399,  945, 1985, 5777,   14,   36,    3, 3778,    1,  946,  947,
          8186]],

        [[ 526,  566, 1650, 8186, 8186, 8186, 8186, 8186, 8186, 8186, 8186,
          8186]],

        [[ 948,  527,    1, 4557, 2446,  727,   32,  354,    1,  687, 8186,
          8186]],

        [[ 168,  502,    9, 3216, 2184, 2447,  550,   11, 3217, 3218,    2,
          1400]],

        [[1796, 1651,   11,  772, 3779, 8186, 8186, 8186, 8186, 8186, 8186,
          8186]]])
10
10
tensor([0, 0, 2, 1, 

In [14]:
#RNNの定義
class RNN(nn.Module):
    def __init__(self,input_size,emb_size,hidden_size,output_size):
        super(RNN,self).__init__()
        self.emb = nn.Embedding(input_size,emb_size,padding_idx=padding_idx)
        #デフォルトでは活性化関数はtanh
        #batch_firstをTrueにすると，(seq_len, batch, input_size)→(batch, seq_len, input_size)にできる
        self.rnn = nn.RNN(emb_size,hidden_size,batch_first=True)
        #readout layer
        self.fc = nn.Linear(hidden_size,output_size)
        #self.softmax = nn.Softmax(dim=-1)
        
    def forward(self,x,x_len):
        #x=batch_size*sequence_size
        #隠れ層の初期化
        #h_0 of shape(num_layers * num_directions（双方向ではないので１), batch, hidden_size)
        #h = torch.zeros(1,1,hidden_size)
        emb = self.emb(x)
        #print('emb',emb.size)
        packed_emb = pack_padded_sequence(emb, x_len, batch_first=True,enforce_sorted=False)
        output, h_n = self.rnn(packed_emb)
        
        #データの最後だけを全結合層に送り込む
        y = self.fc(h_n[-1])
        #print('y shape:',y.shape)
        #y = self.softmax(output)
        return y

In [17]:
from sklearn.metrics import accuracy_score

#損失と正解率の計算
def calc_loss_and_acc(rnn, loss_func, dataset):
    
    data_loader = DataLoader(dataset,batch_size=batch_size,shuffle=True)
    loss = 0
    correct = 0
    total = 0

    for x, x_len, y in data_loader:
        logit = rnn.forward(x,x_len)
        loss += loss_func(logit, y).item()
        total += len(y)
        pred = torch.argmax(logit, dim=-1)
        correct += torch.sum(pred == y).item()
    
    loss = total_loss / len(data_loader)
    acc = correct / total
    return loss, acc

def train(batch_size, num_epoch, learning_rate, train_ds, valid_ds, output_file, rnn):
    
    loss_func = nn.CrossEntropyLoss()
    optimizer = optim.SGD(rnn.parameters(), lr=lr)
    train_loader = DataLoader(train_ds,batch_size=batch_size,shuffle=True)
    


    #訓練
    for epoch in tqdm(range(1,num_epoch+1)):
        for x, x_len, y in train_loader:
            optimizer.zero_grad()
            logit = rnn(x,x_len)
            loss = loss_func(logit, y)
            loss.backward()
            optimizer.step()
            

        train_loss , train_acc = calc_loss_and_acc(rnn,loss_func,train_ds)
        valid_loss , valid_acc = calc_loss_and_acc(rnn,loss_func,valid_ds)
        print("Epoch:{}\ttrain loss:{}\ttrain acc{}".format(epoch, train_loss, train_acc))        
        print("Epoch:{}\tvalid loss:{}\tvalid acc{}".format(epoch, valid_loss, valid_acc))

    #model.state_dictメソッドでモデルのパラメータを保存できる
    #optimizer.state_dict()→内部状態を保存するメソッド
    torch.save({'model_state_dict':model.state_dict(), 'optimizer_state_dict':optimizer.state_dict()} , output_file)

In [19]:
from tqdm import tqdm

#モデルの定義
input_size = len(vocab)+1
emb_size = 300
hidden_size = 50
output_size = 4
padding_idx = len(vocab)
batch_size = 1
num_epoch = 5
lr = 0.01

train_f = "./data/train.txt"
train_titles , train_labels = get_x_data(train_f,vocab)
train_x, train_x_len, train_y = get_data(train_titles,train_labels)
train_x = torch.squeeze(train_x)
train_ds = TensorDataset(train_x,train_x_len,train_y)

valid_f = "./data/valid.txt"
valid_titles , valid_labels = get_x_data(valid_f,vocab)
valid_x, valid_x_len, valid_y = get_data(valid_titles,valid_labels)
valid_x = torch.squeeze(valid_x)
valid_ds = TensorDataset(valid_x,valid_x_len,valid_y)

rnn = RNN(input_size,emb_size,hidden_size,output_size)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.SGD(rnn.parameters(), lr=lr)
train_loader = DataLoader(train_ds,batch_size=batch_size,shuffle=True)

train(batch_size, num_epoch, lr, train_ds, valid_ds, "./work/model82.pt", rnn)

  0%|          | 0/5 [00:00<?, ?it/s]

torch.Size([10684, 19, 1])
tensor([[[  71,    4,  182,  ..., 8186, 8186, 8186]],

        [[4554, 3214,  272,  ..., 8186, 8186, 8186]],

        [[ 726,   17,   38,  ..., 8186, 8186, 8186]],

        ...,

        [[1606, 1707,    6,  ..., 8186, 8186, 8186]],

        [[ 340,  388, 2119,  ..., 8186, 8186, 8186]],

        [[  45,    0,    3,  ..., 8186, 8186, 8186]]])
10684
10684
torch.Size([1336, 16, 1])
tensor([[[1475, 7683,  857,  ..., 8186, 8186, 8186]],

        [[  30,  555, 1015,  ..., 8186, 8186, 8186]],

        [[1265,    0,   23,  ..., 8186, 8186, 8186]],

        ...,

        [[ 206,    0,    0,  ..., 8186, 8186, 8186]],

        [[   8,    0,   44,  ..., 8186, 8186, 8186]],

        [[2055,   46, 7759,  ..., 8186, 8186, 8186]]])
1336
1336


 20%|██        | 1/5 [01:24<05:36, 84.24s/it]

Epoch:1	train loss:1.0072693397834596	train acc0.7323099962560838
Epoch:1	valid loss:8.055138941801259	valid acc0.7058383233532934


 40%|████      | 2/5 [02:40<03:59, 79.69s/it]

Epoch:2	train loss:1.0072693397834596	train acc0.7616997379258704
Epoch:2	valid loss:8.055138941801259	valid acc0.7267964071856288


 60%|██████    | 3/5 [03:36<02:17, 68.97s/it]

Epoch:3	train loss:1.0072693397834596	train acc0.8082178959191314
Epoch:3	valid loss:8.055138941801259	valid acc0.7425149700598802


 80%|████████  | 4/5 [04:37<01:05, 65.62s/it]

Epoch:4	train loss:1.0072693397834596	train acc0.8371396480718832
Epoch:4	valid loss:8.055138941801259	valid acc0.7544910179640718


100%|██████████| 5/5 [05:33<00:00, 66.72s/it]

Epoch:5	train loss:1.0072693397834596	train acc0.8495881692250093
Epoch:5	valid loss:8.055138941801259	valid acc0.750748502994012





# 83. ミニバッチ化・GPU上での学習
問題82のコードを改変し，B事例ごとに損失・勾配を計算して学習を行えるようにせよ（Bの値は適当に選べ）．また，GPU上で学習を実行せよ．



## MEMO
- 作ったデータは一度ファイルに書き出すべし
- paddingとpackingの違いは？
    - https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.pack_padded_sequence.html
    - https://cod-aid.com/pytorch-pack
    - pad_sequence:sequenceをpadding,pad_packed_sequence:packed sequenceをpadding
    - 系列長の異なるTensorをリスト化、padding,packするという流れ→RNNへ、
    

In [20]:
#GPU上での学習は省略
#パラメータの更新に全ての訓練データを用いるのがバッチ学習
#それに対して一部の訓練データを取り出すのがミニバッチ学習

import torch 
from torch import nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

#Tensordataに入れる出力と同じ系列長のTensorを返す関数
def get_x_data(data_f,vocab):
    #１タイトルの単語列とそのラベルをペアにする
    titles = []
    words = []
    labels = []
    categories = ['b','e','t','m']
    with open(data_f) as f:
        for line in f:
            text , label = line.rstrip('\n').split('\t')
            label = categories.index(label)
            labels.append(label)
         
            table = str.maketrans("", "", string.punctuation)
            #語彙のリストに単語を追記していく
            #単語は小文字に直し、記号は除去
            text = text.translate(table).lower()
            words = nltk.wordpunct_tokenize(text)
            ided_words = word2id(words,vocab)
            titles.append(ided_words)

    return titles , labels
        
with open('./data/vocab.json', 'r') as f:
    vocab = json.load(f)
    
#系列長の異なるリストをpaddingし、datasetを作成する関数
def get_data(titles, labels ,paddin_value=padding_idx):
    data = []
    for title in titles:
        data.append(torch.tensor(title)[:,None])

    #xは(バッチ，系列長，特徴量)
    x = pad_sequence(data,batch_first=True, padding_value=padding_idx)
    #print(x.size())
    #(バッチ、特徴量、系列長)に変換
    #print(x.permute(0,2,1))
    #print(x.size(0))
    y = torch.tensor(labels).long()
    #print(y.size(0))
    x_len = torch.tensor([len(i) for i in data]).long()
    #各データの系列長を取得しておく
   
    return  x,x_len,y


In [21]:
#RNNの定義
class RNN(nn.Module):
    def __init__(self,input_size,emb_size,hidden_size,output_size):
        super(RNN,self).__init__()
        self.emb = nn.Embedding(input_size,emb_size,padding_idx=padding_idx)
        #デフォルトでは活性化関数はtanh
        #batch_firstをTrueにすると，(seq_len, batch, input_size)→(batch, seq_len, input_size)にできる
        self.rnn = nn.RNN(emb_size,hidden_size,batch_first=True)
        #readout layer
        self.fc = nn.Linear(hidden_size,output_size)
        #self.softmax = nn.Softmax(dim=-1)
        
    def forward(self,x,x_len):
        #x=batch_size*sequence_size
        #隠れ層の初期化
        #h_0 of shape(num_layers * num_directions（双方向ではないので１), batch, hidden_size)
        #h = torch.zeros(1,1,hidden_size)
        emb = self.emb(x)
        packed_emb = pack_padded_sequence(emb, x_len, batch_first=True,enforce_sorted=False)
        #print('packed emb shape:',packed_emb)
        output, h_n = self.rnn(packed_emb)
        
        #データの最後だけを全結合層に送り込む
        y = self.fc(h_n[-1])
        return y

In [22]:
from sklearn.metrics import accuracy_score

#損失と正解率の計算
def calc_loss_and_acc(rnn, loss_func, dataset):
    
    data_loader = DataLoader(dataset,batch_size=batch_size,shuffle=True)
    loss = 0
    correct = 0
    total = 0

    for x, x_len, y in data_loader:
        logit = rnn.forward(x,x_len)
        loss += loss_func(logit, y).item()
        total += len(y)
        pred = torch.argmax(logit, dim=-1)
        correct += torch.sum(pred == y).item()
    
    loss = total_loss / len(data_loader)
    acc = correct / total
    return loss, acc

def train(batch_size, num_epoch, learning_rate, train_ds, valid_ds, output_file, rnn):
    
    loss_func = nn.CrossEntropyLoss()
    optimizer = optim.SGD(rnn.parameters(), lr=lr)
    train_loader = DataLoader(train_ds,batch_size=batch_size,shuffle=True)
    


    #訓練
    for epoch in tqdm(range(1,num_epoch+1)):
        for x, x_len, y in train_loader:
            optimizer.zero_grad()
            logit = rnn(x,x_len)
            loss = loss_func(logit, y)
            loss.backward()
            optimizer.step()
            

        train_loss , train_acc = calc_loss_and_acc(rnn,loss_func,train_ds)
        valid_loss , valid_acc = calc_loss_and_acc(rnn,loss_func,valid_ds)
        print("Epoch:{}\ttrain loss:{}\ttrain acc{}".format(epoch, train_loss, train_acc))        
        print("Epoch:{}\tvalid loss:{}\tvalid acc{}".format(epoch, valid_loss, valid_acc))

    #model.state_dictメソッドでモデルのパラメータを保存できる
    #optimizer.state_dict()→内部状態を保存するメソッド
    torch.save({'model_state_dict':model.state_dict(), 'optimizer_state_dict':optimizer.state_dict()} , output_file)

In [23]:
#モデルの定義
input_size = len(vocab)+1
emb_size = 300
hidden_size = 50
output_size = 4
padding_idx = len(vocab)
batch_size = 32

rnn = RNN(input_size,emb_size,hidden_size,output_size)

train_f = "./data/train.txt"
train_titles , train_labels = get_x_data(train_f,vocab)
train_x, train_x_len, train_y = get_data(train_titles,train_labels)
train_x = torch.squeeze(train_x)
train_ds = TensorDataset(train_x,train_x_len,train_y)

valid_f = "./data/valid.txt"
valid_titles , valid_labels = get_x_data(valid_f,vocab)
valid_x, valid_x_len, valid_y = get_data(valid_titles,valid_labels)
valid_x = torch.squeeze(valid_x)
valid_ds = TensorDataset(valid_x,valid_x_len,valid_y)

train(batch_size, 10, 0.01, train_ds, valid_ds, "./work/model83.pt", rnn)

 10%|█         | 1/10 [00:03<00:34,  3.85s/it]

Epoch:1	train loss:32.220555767205035	train acc0.4772557094721078
Epoch:1	valid loss:256.23013395824955	valid acc0.43637724550898205


 20%|██        | 2/10 [00:07<00:32,  4.02s/it]

Epoch:2	train loss:32.220555767205035	train acc0.5029015350056159
Epoch:2	valid loss:256.23013395824955	valid acc0.45434131736526945


 30%|███       | 3/10 [00:12<00:29,  4.24s/it]

Epoch:3	train loss:32.220555767205035	train acc0.5276113815050543
Epoch:3	valid loss:256.23013395824955	valid acc0.47305389221556887


 40%|████      | 4/10 [00:16<00:24,  4.01s/it]

Epoch:4	train loss:32.220555767205035	train acc0.5517596405840509
Epoch:4	valid loss:256.23013395824955	valid acc0.4977544910179641


 50%|█████     | 5/10 [00:19<00:19,  3.89s/it]

Epoch:5	train loss:32.220555767205035	train acc0.5712280044926994
Epoch:5	valid loss:256.23013395824955	valid acc0.5194610778443114


 60%|██████    | 6/10 [00:23<00:15,  3.88s/it]

Epoch:6	train loss:32.220555767205035	train acc0.5936915013103706
Epoch:6	valid loss:256.23013395824955	valid acc0.5419161676646707


 70%|███████   | 7/10 [00:27<00:11,  3.82s/it]

Epoch:7	train loss:32.220555767205035	train acc0.616061400224635
Epoch:7	valid loss:256.23013395824955	valid acc0.5576347305389222


 80%|████████  | 8/10 [00:31<00:07,  3.76s/it]

Epoch:8	train loss:32.220555767205035	train acc0.6416136278547361
Epoch:8	valid loss:256.23013395824955	valid acc0.5770958083832335


 90%|█████████ | 9/10 [00:34<00:03,  3.74s/it]

Epoch:9	train loss:32.220555767205035	train acc0.6714713590415574
Epoch:9	valid loss:256.23013395824955	valid acc0.6115269461077845


100%|██████████| 10/10 [00:38<00:00,  3.87s/it]

Epoch:10	train loss:32.220555767205035	train acc0.7036690378135529
Epoch:10	valid loss:256.23013395824955	valid acc0.6549401197604791





# 84. 単語ベクトルの導入
事前学習済みの単語ベクトル（例えば，Google Newsデータセット（約1,000億単語）での学習済み単語ベクトル）で単語埋め込みemb(x)を初期化し，学習せよ．

### MEMO
- https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html
- https://kento1109.hatenablog.com/entry/2018/03/15/153652
- https://stackoverflow.com/questions/49710537/pytorch-gensim-how-to-load-pre-trained-word-embeddings/49802495#49802495

In [24]:
from gensim.models import KeyedVectors
#gensimで埋め込んだ単語とその分散表現をRNNの埋め込み層に組み込む

#学習済み単語ベクトルの習得（300次元)
vecs = KeyedVectors.load_word2vec_format('./data/GoogleNews-vectors-negative300.bin', binary=True)

In [25]:
weights = torch.FloatTensor(vecs.vectors)
#weights = vecs.wv
#weights = weights.syn0
print(weights.shape)
#300万単語、300次元

#vocabに存在する単語について、その単語ID番目の単語ベクトルを紐付ける
weights_with_word2vec = np.zeros((input_size,300))
print(weights_with_word2vec.shape)
words_in_w2v = 0
for word , idx in vocab.items():
    if word in vecs:
        weights_with_word2vec[idx] = vecs[word].astype(np.float32)
        words_in_w2v += 1
weights_with_word2vec = torch.from_numpy(weights_with_word2vec).float()
print(weights_with_word2vec.size())

print(f'words in w2v / vocab : {words_in_w2v} / {input_size}')

torch.Size([3000000, 300])
(8187, 300)
torch.Size([8187, 300])
words in w2v / vocab : 6774 / 8187


In [26]:
print(weights.dtype)

torch.float32


In [27]:
#RNNの再定義
class RNN_pretrained_weight(nn.Module):
    def __init__(self,input_size,emb_size,hidden_size,output_size,weights):
        super().__init__()
        #emb_size:300
        #emb_size = weights[1]
        #重みのセット
        #print(weights.type())
        self.emb = nn.Embedding.from_pretrained(weights.float(),padding_idx=padding_idx)
        self.rnn = nn.RNN(emb_size,hidden_size,batch_first=True)
        #readout layer
        self.fc = nn.Linear(hidden_size,output_size)
        #self.softmax = nn.Softmax(dim=-1)
        
    def forward(self,x,x_len):
        emb = self.emb(x).float()
        #print('emb:',emb.size())
        packed_emb = pack_padded_sequence(emb, x_len, batch_first=True,enforce_sorted=False)
        #print('packed emb shape:',packed_emb)
        output, h = self.rnn(packed_emb)
        #データの最後だけを全結合層に送り込む
        y = self.fc(h[-1])
        return y

In [28]:
#モデルの定義
input_size = len(vocab)+1
emb_size = 300
hidden_size = 50
output_size = 4
padding_idx = len(vocab)
batch_size = 1
weights = weights_with_word2vec

#適当な系列を準備
f = "./data/train2.txt"
titles , labels = get_x_data(f,vocab)
x, x_len, y = get_data(titles,labels)
x = torch.squeeze(x)
ds = TensorDataset(x,x_len,y)

#print(rnn(x,x_len))
#埋め込み層の表示
#print('\nEmbedding layer\n',rnn.emb(x))

#変更後
new_rnn = RNN_pretrained_weight(input_size,emb_size,hidden_size,output_size,weights)
#print(new_rnn.forward(x,x_len))
#print('\nPretrained Embedding layer\n',rnn.emb(x))

In [31]:
#モデルの定義
input_size = len(vocab)+1
emb_size = 300
hidden_size = 50
output_size = 4
padding_idx = len(vocab)
batch_size = 32
num_epoch = 10
lr = 0.01

train_f = "./data/train.txt"
train_titles , train_labels = get_x_data(train_f,vocab)
train_x, train_x_len, train_y = get_data(train_titles,train_labels)
train_x = torch.squeeze(train_x).long()
train_ds = TensorDataset(train_x,train_x_len,train_y)

valid_f = "./data/valid.txt"
valid_titles , valid_labels = get_x_data(valid_f,vocab)
valid_x, valid_x_len, valid_y = get_data(valid_titles,valid_labels)
valid_x = torch.squeeze(valid_x).long()
valid_ds = TensorDataset(valid_x,valid_x_len,valid_y)

new_rnn = RNN_pretrained_weight(input_size,emb_size,hidden_size,output_size,weights)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.SGD(rnn.parameters(), lr=lr)
train_loader = DataLoader(train_ds,batch_size=batch_size,shuffle=True)

train(batch_size, num_epoch, lr, train_ds, valid_ds, "./work/model83.pt", new_rnn)

 10%|█         | 1/10 [00:02<00:18,  2.01s/it]

Epoch:1	train loss:32.220555767205035	train acc0.6014601272931487
Epoch:1	valid loss:256.23013395824955	valid acc0.5898203592814372


 20%|██        | 2/10 [00:04<00:16,  2.04s/it]

Epoch:2	train loss:32.220555767205035	train acc0.7623549232497192
Epoch:2	valid loss:256.23013395824955	valid acc0.7537425149700598


 30%|███       | 3/10 [00:06<00:13,  1.99s/it]

Epoch:3	train loss:32.220555767205035	train acc0.7668476226132535
Epoch:3	valid loss:256.23013395824955	valid acc0.7597305389221557


 40%|████      | 4/10 [00:07<00:11,  1.98s/it]

Epoch:4	train loss:32.220555767205035	train acc0.7954885810557843
Epoch:4	valid loss:256.23013395824955	valid acc0.7776946107784432


 50%|█████     | 5/10 [00:10<00:10,  2.04s/it]

Epoch:5	train loss:32.220555767205035	train acc0.8258143017596405
Epoch:5	valid loss:256.23013395824955	valid acc0.812125748502994


 60%|██████    | 6/10 [00:12<00:07,  1.99s/it]

Epoch:6	train loss:32.220555767205035	train acc0.8072819168850618
Epoch:6	valid loss:256.23013395824955	valid acc0.7956586826347305


 70%|███████   | 7/10 [00:13<00:05,  1.96s/it]

Epoch:7	train loss:32.220555767205035	train acc0.8390116061400225
Epoch:7	valid loss:256.23013395824955	valid acc0.8270958083832335


 80%|████████  | 8/10 [00:15<00:03,  1.94s/it]

Epoch:8	train loss:32.220555767205035	train acc0.8483713964807188
Epoch:8	valid loss:256.23013395824955	valid acc0.8398203592814372


 90%|█████████ | 9/10 [00:18<00:02,  2.02s/it]

Epoch:9	train loss:32.220555767205035	train acc0.8534256832646949
Epoch:9	valid loss:256.23013395824955	valid acc0.8315868263473054


100%|██████████| 10/10 [00:20<00:00,  2.01s/it]

Epoch:10	train loss:32.220555767205035	train acc0.8522089105204044
Epoch:10	valid loss:256.23013395824955	valid acc0.8278443113772455





# 85. 双方向RNN・多層化
順方向と逆方向のRNNの両方を用いて入力テキストをエンコードし，モデルを学習せよ．
さらに，双方向RNNを多層化して実験せよ．

### MEMO
- 双方向RNN：中間層の出力を未来への伝播と過去への逆伝播の両方向に伝播するネットワーク
- https://pytorch.org/docs/master/generated/torch.cat.html

In [32]:
#双方向RNN
class BidirectionalRNN(nn.Module):
    def __init__(self,input_size,emb_size,hidden_size,output_size,weights):
        super().__init__()
        #nn.Embedding.weightで重みのセット
        self.emb = nn.Embedding.from_pretrained(weights,padding_idx=padding_idx)
        self.rnn = nn.RNN(emb_size,hidden_size,batch_first=True,bidirectional=True)
        #readout layer
        #num_layers * num_directions, batch, hidden_size
        self.fc = nn.Linear(2*hidden_size,output_size)
        #self.softmax = nn.Softmax(dim=-1)

    def forward(self,x,x_len):
        emb = self.emb(x)
        packed_emb = pack_padded_sequence(emb, x_len, batch_first=True,enforce_sorted=False)
        #print('packed emb shape:',packed_emb)
        output, h = self.rnn(packed_emb)
        #print('h:',h.shape)
        
        #データの最後だけを全結合層に送り込む
        #We should take output[-1, :, :hidden_size] (normal RNN) and output[0, :, hidden_size:] (reverse RNN), concatenate them
        #torch.cat(tensors, dim=0, *, out=None),連結する軸をdimで指定する
        #last layer of foward/backward
        h_f , h_b = h[-2] , h[-1]
        y = self.fc(torch.cat([h_f,h_b],1))  
        return y

In [33]:
#確認
#モデルの定義
input_size = len(vocab)+1
emb_size = 300
hidden_size = 50
output_size = 4
padding_idx = len(vocab)
batch_size = 1
weights = weights_with_word2vec

#適当な系列を準備
f = "./data/train2.txt"
titles , labels = get_x_data(f,vocab)
x, x_len, y = get_data(titles,labels)
x = torch.squeeze(x)
ds = TensorDataset(x,x_len,y)

bi_rnn = BidirectionalRNN(input_size,emb_size,hidden_size,output_size,weights)
print(bi_rnn(x,x_len))

tensor([[-0.1820,  0.0124, -0.2815, -0.1403],
        [-0.0109,  0.2259,  0.1550, -0.0236],
        [-0.0969,  0.0236, -0.1268,  0.2704],
        [-0.2930,  0.0014, -0.1283,  0.0967],
        [-0.3449,  0.0022,  0.1767,  0.3557],
        [-0.0879, -0.1100, -0.1798,  0.1042],
        [-0.0749,  0.0277, -0.0839, -0.1257],
        [-0.0268,  0.0493,  0.0313,  0.1637],
        [ 0.1645, -0.0099,  0.0639,  0.2855],
        [-0.0580, -0.0333,  0.1374,  0.2007]], grad_fn=<AddmmBackward>)


In [34]:
#損失と正解率の計算
def calc_loss_and_acc(rnn, loss_func, dataset):
    
    data_loader = DataLoader(dataset,batch_size=batch_size,shuffle=True)
    loss = 0
    correct = 0
    total = 0

    for x, x_len, y in data_loader:
        logit = rnn.forward(x,x_len)
        loss += loss_func(logit, y).item()
        total += len(y)
        pred = torch.argmax(logit, dim=-1)
        correct += torch.sum(pred == y).item()
    
    loss = total_loss / len(data_loader)
    acc = correct / total
    return loss, acc

def train_rnn(batch_size, num_epoch, learning_rate, train_ds, valid_ds, output_file,rnn):
    
    loss_func = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr)
    train_loader = DataLoader(train_ds,batch_size=batch_size,shuffle=True)


    #訓練
    for epoch in tqdm(range(1,num_epoch+1)):
        for x, x_len, y in train_loader:
            optimizer.zero_grad()
            logit = rnn(x,x_len)
            loss = loss_func(logit, y)
            loss.backward()
            optimizer.step()

        train_loss , train_acc = calc_loss_and_acc(rnn,loss_func,train_ds)
        valid_loss , valid_acc = calc_loss_and_acc(rnn,loss_func,valid_ds)

        print("Epoch:{}\ttrain loss:{}\ttrain acc{}".format(epoch, train_loss, train_acc))        
        print("Epoch:{}\tvalid loss:{}\tvalid acc{}".format(epoch, valid_loss, valid_acc))

    #model.state_dictメソッドでモデルのパラメータを保存できる
    #optimizer.state_dict()→内部状態を保存するメソッド
    torch.save({'model_state_dict':rnn.state_dict(), 'optimizer_state_dict':optimizer.state_dict()} , output_file)

In [35]:
#実験
#モデルの定義
input_size = len(vocab)+1
emb_size = 300
hidden_size = 50
output_size = 4
padding_idx = len(vocab)
batch_size = 32
num_epoch = 10
lr = 0.01
num_layers = 3

train_f = "./data/train.txt"
train_titles , train_labels = get_x_data(train_f,vocab)
train_x, train_x_len, train_y = get_data(train_titles,train_labels)
train_x = torch.squeeze(train_x)
train_ds = TensorDataset(train_x,train_x_len,train_y,)

valid_f = "./data/valid.txt"
valid_titles , valid_labels = get_x_data(valid_f,vocab)
valid_x, valid_x_len, valid_y = get_data(valid_titles,valid_labels)
valid_x = torch.squeeze(valid_x)
valid_ds = TensorDataset(valid_x,valid_x_len,valid_y)

bi_rnn = BidirectionalRNN(input_size,emb_size,hidden_size,output_size,weights)
#bm_rnn = Bi_multi_RNN(input_size,emb_size,hidden_size,output_size,weights,num_layers)

train_rnn(batch_size, num_epoch, lr, train_ds, valid_ds,"./work/model85-1.pt",bi_rnn)


 10%|█         | 1/10 [00:02<00:26,  2.93s/it]

Epoch:1	train loss:32.220555767205035	train acc0.1247660052414826
Epoch:1	valid loss:256.23013395824955	valid acc0.12724550898203593


 20%|██        | 2/10 [00:06<00:24,  3.11s/it]

Epoch:2	train loss:32.220555767205035	train acc0.1247660052414826
Epoch:2	valid loss:256.23013395824955	valid acc0.12724550898203593


 30%|███       | 3/10 [00:09<00:22,  3.26s/it]

Epoch:3	train loss:32.220555767205035	train acc0.1247660052414826
Epoch:3	valid loss:256.23013395824955	valid acc0.12724550898203593


 40%|████      | 4/10 [00:13<00:20,  3.40s/it]

Epoch:4	train loss:32.220555767205035	train acc0.1247660052414826
Epoch:4	valid loss:256.23013395824955	valid acc0.12724550898203593


 50%|█████     | 5/10 [00:16<00:16,  3.22s/it]

Epoch:5	train loss:32.220555767205035	train acc0.1247660052414826
Epoch:5	valid loss:256.23013395824955	valid acc0.12724550898203593


 60%|██████    | 6/10 [00:19<00:12,  3.14s/it]

Epoch:6	train loss:32.220555767205035	train acc0.1247660052414826
Epoch:6	valid loss:256.23013395824955	valid acc0.12724550898203593


 70%|███████   | 7/10 [00:22<00:09,  3.12s/it]

Epoch:7	train loss:32.220555767205035	train acc0.1247660052414826
Epoch:7	valid loss:256.23013395824955	valid acc0.12724550898203593


 80%|████████  | 8/10 [00:25<00:06,  3.25s/it]

Epoch:8	train loss:32.220555767205035	train acc0.1247660052414826
Epoch:8	valid loss:256.23013395824955	valid acc0.12724550898203593


 90%|█████████ | 9/10 [00:28<00:03,  3.25s/it]

Epoch:9	train loss:32.220555767205035	train acc0.1247660052414826
Epoch:9	valid loss:256.23013395824955	valid acc0.12724550898203593


100%|██████████| 10/10 [00:32<00:00,  3.26s/it]

Epoch:10	train loss:32.220555767205035	train acc0.1247660052414826
Epoch:10	valid loss:256.23013395824955	valid acc0.12724550898203593





In [39]:
#双方向RNNの多層化
class Bi_multi_RNN(nn.Module):
    def __init__(self,input_size,emb_size,hidden_size,output_size,weights,num_layers):
        super().__init__()
        #nn.Embedding.weightで重みのセット
        self.emb = nn.Embedding.from_pretrained(weights.float(),padding_idx=padding_idx)
        self.rnn = nn.RNN(emb_size,hidden_size,batch_first=True,bidirectional=True,num_layers=num_layers)
        #readout layer
        #num_layers * num_directions, batch, hidden_size
        self.fc = nn.Linear(2*hidden_size,output_size)
        #self.softmax = nn.Softmax(dim=-1)

    def forward(self,x,x_len):
        emb = self.emb(x)
        packed_emb = pack_padded_sequence(emb, x_len, batch_first=True,enforce_sorted=False)
        #print('packed emb shape:',packed_emb)
        output, h = self.rnn(packed_emb)
        #print('output:', output.shape)
        #データの最後だけを全結合層に送り込む
        #We should take output[-1, :, :hidden_size] (normal RNN) and output[0, :, hidden_size:] (reverse RNN), concatenate them
        h_f , h_b = h[-2] , h[-1]
        y = self.fc(torch.cat([h_f,h_b],1))
        return y

In [40]:
#確認
#モデルの定義
input_size = len(vocab)+1
emb_size = 300
hidden_size = 50
output_size = 4
padding_idx = len(vocab)
batch_size = 1
weights = weights_with_word2vec
num_layers = 3

#適当な系列を準備
f = "./data/train2.txt"
titles , labels = get_x_data(f,vocab)
x, x_len, y = get_data(titles,labels)
x = torch.squeeze(x)
ds = TensorDataset(x,x_len,y)

model = Bi_multi_RNN(input_size,emb_size,hidden_size,output_size,weights,num_layers)
print(model)

Bi_multi_RNN(
  (emb): Embedding(8187, 300, padding_idx=8186)
  (rnn): RNN(300, 50, num_layers=3, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=100, out_features=4, bias=True)
)


In [41]:
#実験
#モデルの定義
input_size = len(vocab)+1
emb_size = 300
hidden_size = 50
output_size = 4
padding_idx = len(vocab)
batch_size = 32
num_epoch = 10
lr = 0.01
num_layers = 3

train_f = "./data/train.txt"
train_titles , train_labels = get_x_data(train_f,vocab)
train_x, train_x_len, train_y = get_data(train_titles,train_labels)
train_x = torch.squeeze(train_x)
train_ds = TensorDataset(train_x,train_x_len,train_y,)

valid_f = "./data/valid.txt"
valid_titles , valid_labels = get_x_data(valid_f,vocab)
valid_x, valid_x_len, valid_y = get_data(valid_titles,valid_labels)
valid_x = torch.squeeze(valid_x)
valid_ds = TensorDataset(valid_x,valid_x_len,valid_y)

#bi_rnn = BidirectionalRNN(input_size,emb_size,hidden_size,output_size,weights)
bm_rnn = Bi_multi_RNN(input_size,emb_size,hidden_size,output_size,weights,num_layers)

train_rnn(batch_size, num_epoch, lr, train_ds, valid_ds,"./work/model85-2.pt",bm_rnn)

 10%|█         | 1/10 [00:07<01:07,  7.45s/it]

Epoch:1	train loss:32.220555767205035	train acc0.1956196181205541
Epoch:1	valid loss:256.23013395824955	valid acc0.19236526946107785


 20%|██        | 2/10 [00:15<01:00,  7.59s/it]

Epoch:2	train loss:32.220555767205035	train acc0.1956196181205541
Epoch:2	valid loss:256.23013395824955	valid acc0.19236526946107785


 30%|███       | 3/10 [00:22<00:53,  7.71s/it]

Epoch:3	train loss:32.220555767205035	train acc0.1956196181205541
Epoch:3	valid loss:256.23013395824955	valid acc0.19236526946107785


 40%|████      | 4/10 [00:30<00:45,  7.58s/it]

Epoch:4	train loss:32.220555767205035	train acc0.1956196181205541
Epoch:4	valid loss:256.23013395824955	valid acc0.19236526946107785


 50%|█████     | 5/10 [00:37<00:36,  7.32s/it]

Epoch:5	train loss:32.220555767205035	train acc0.1956196181205541
Epoch:5	valid loss:256.23013395824955	valid acc0.19236526946107785


 60%|██████    | 6/10 [00:44<00:29,  7.39s/it]

Epoch:6	train loss:32.220555767205035	train acc0.1956196181205541
Epoch:6	valid loss:256.23013395824955	valid acc0.19236526946107785


 70%|███████   | 7/10 [00:51<00:21,  7.33s/it]

Epoch:7	train loss:32.220555767205035	train acc0.1956196181205541
Epoch:7	valid loss:256.23013395824955	valid acc0.19236526946107785


 80%|████████  | 8/10 [00:59<00:14,  7.37s/it]

Epoch:8	train loss:32.220555767205035	train acc0.1956196181205541
Epoch:8	valid loss:256.23013395824955	valid acc0.19236526946107785


 90%|█████████ | 9/10 [01:07<00:07,  7.46s/it]

Epoch:9	train loss:32.220555767205035	train acc0.1956196181205541
Epoch:9	valid loss:256.23013395824955	valid acc0.19236526946107785


100%|██████████| 10/10 [01:13<00:00,  7.39s/it]

Epoch:10	train loss:32.220555767205035	train acc0.1956196181205541
Epoch:10	valid loss:256.23013395824955	valid acc0.19236526946107785





# 86. 畳み込みニューラルネットワーク (CNN)
ID番号で表現された単語列x=(x1,x2,…,xT)がある．ただし，Tは単語列の長さ，xt∈ℝVは単語のID番号のone-hot表記である（Vは単語の総数である）．畳み込みニューラルネットワーク（CNN: Convolutional Neural Network）を用い，単語列xからカテゴリyを予測するモデルを実装せよ．

### MEMO
- 単語埋め込みの次元数: dw
- 畳み込みのフィルターのサイズ: 3 トークン
- 畳み込みのストライド: 1 トークン
- 畳み込みのパディング: あり
- 畳み込み演算後の各時刻のベクトルの次元数: dh
- 畳み込み演算後に最大値プーリング（max pooling）を適用し，入力文をdh次元の隠れベクトルで表現
- http://tkengo.github.io/blog/2016/03/11/understanding-convolutional-neural-networks-for-nlp/
- https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
- https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
    - kernel_size (int or tuple) – Size of the convolving kernel　（スライド窓、フィルタ）
    - stride (int or tuple, optional) – Stride of the convolution. Default: 1
        - どれくらいフィルタをシフトするのか（ここでは１）
- 畳み込み層とプーリング層を繰り返す
- プーリングとは学習サイズを決められたルールにしたがって小さくすること
     - ここでは最大値プーリング→各フィルタの最大値を得る
- 可視化したサイトがわかりやすかった　https://stackoverflow.com/questions/56675943/meaning-of-parameters-in-torch-nn-conv2d

In [42]:
#CNNの構築
class CNN(nn.Module):
    def __init__(self,input_size,emb_size,d_h,output_size,stride,padding_idx,padding):
        super(CNN,self).__init__()
        self.emb = nn.Embedding.from_pretrained(weights.float(),padding_idx=padding_idx)
        #channl=1,conv2d:2次元の畳み込み
        self.conv = nn.Conv2d(1,d_h, (window_size, emb_size), stride ,(padding,0))
        self.fc = nn.Linear(d_h,output_size)
       

    def forward(self,x):
        #input_size= (batch_size,a number of channels, height of input,width)
        #print(x.size())
        emb = self.emb(x)
        #print('emb:',emb.size())
        emb = self.emb(x).unsqueeze(1)
        #print('emb:',emb.size())
        #conv:input size(batch_size,C_in=1,H(系列長),W(emb_size))
        conv = self.conv(emb).squeeze(3)
        #conv:output size(batch_size, C_out=d_h, H_out=系列長)
        #print('conv:',conv.size())
        relu = F.relu(conv)
        #max_pool:input size(batch_size, C=d_h, L=系列長)
        #max_pool1d(inputs, kernel_size, stride)
        max_pool = F.max_pool1d(conv, conv.size(2))
        #print('max_pool:',max_pool.size())
        y = self.fc(max_pool.squeeze(2))
        #print('y:',y.size())
        return y

In [43]:
#確認
#モデルの定義
input_size = len(vocab)+1
emb_size = 300
d_h = 50
window_size = 3
stride = 1
padding = 1
output_size = 4
padidng_idx = len(vocab)
padding = 1 

model = CNN(input_size,emb_size,d_h,output_size,stride,padding_idx,padding)
print(model)

train_data = "./data/train.txt"
words = tokenize_title(train_data)
x = torch.tensor(word2id(words,vocab)).unsqueeze(0)
print(x)

print(model(x))

CNN(
  (emb): Embedding(8187, 300, padding_idx=8186)
  (conv): Conv2d(1, 50, kernel_size=(3, 300), stride=(1, 1), padding=(1, 0))
  (fc): Linear(in_features=50, out_features=4, bias=True)
)
tensor([[  71,    4,  182,  ...,   45,   84, 2702]])
tensor([[ 0.1931,  0.3718, -0.2120,  0.2986]], grad_fn=<AddmmBackward>)


# 87. 確率的勾配降下法によるCNNの学習
確率的勾配降下法（SGD: Stochastic Gradient Descent）を用いて，問題86で構築したモデルを学習せよ．訓練データ上の損失と正解率，評価データ上の損失と正解率を表示しながらモデルを学習し，適当な基準（例えば10エポックなど）で終了させよ．

In [44]:
#損失と正解率の計算
def calc_loss_and_acc(model, loss_func, dataset):
    
    data_loader = DataLoader(dataset,batch_size=batch_size,shuffle=True)
    loss = 0
    correct = 0
    total = 0

    for x, x_len, y in data_loader:
        logit = model.forward(x)
        loss += loss_func(logit, y).item()
        total += len(y)
        pred = torch.argmax(logit, dim=-1)
        correct += torch.sum(pred == y).item()
    
    loss = total_loss / len(data_loader)
    acc = correct / total
    return loss, acc

def train_cnn(batch_size, num_epoch, learning_rate, train_ds, valid_ds, output_file, model):
    
    loss_func = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr)
    train_loader = DataLoader(train_ds,batch_size=batch_size,shuffle=True)
        
    #訓練
    for epoch in tqdm(range(1,num_epoch+1)):
        for x, x_len, y in train_loader:
            optimizer.zero_grad()
            logit = model(x)
            loss = loss_func(logit, y)
            loss.backward()
            optimizer.step()
            
        train_loss , train_acc = calc_loss_and_acc(model,loss_func,train_ds)
        valid_loss , valid_acc = calc_loss_and_acc(model,loss_func,valid_ds)

        print("Epoch:{}\ttrain loss:{}\ttrain acc{}".format(epoch, train_loss, train_acc))        
        print("Epoch:{}\tvalid loss:{}\tvalid acc{}".format(epoch, valid_loss, valid_acc))


    #model.state_dictメソッドでモデルのパラメータを保存できる
    #optimizer.state_dict()→内部状態を保存するメソッド
    torch.save({'model_state_dict':model.state_dict(), 'optimizer_state_dict':optimizer.state_dict()} , output_file)
    

In [45]:
#モデルの定義
input_size = len(vocab)+1
emb_size = 300
d_h = 50
window_size = 3
stride = 1
padding = 1
output_size = 4
padding_idx = len(vocab)
batch_size = 32
padding = 1

model = CNN(input_size,emb_size,d_h,output_size,stride,padding_idx,padding)

train_f = "./data/train.txt"
train_titles , train_labels = get_x_data(train_f,vocab)
train_x, train_x_len, train_y = get_data(train_titles,train_labels)
train_x = torch.squeeze(train_x)
train_ds = TensorDataset(train_x,train_x_len,train_y)

valid_f = "./data/valid.txt"
valid_titles , valid_labels = get_x_data(valid_f,vocab)
valid_x, valid_x_len, valid_y = get_data(valid_titles,valid_labels)
valid_x = torch.squeeze(valid_x)
valid_ds = TensorDataset(valid_x,valid_x_len,valid_y)


train_cnn(batch_size, 10, 0.01, train_ds, valid_ds, "./work/model87.pt", model)

 10%|█         | 1/10 [00:04<00:39,  4.39s/it]

Epoch:1	train loss:32.220555767205035	train acc0.7306252339947585
Epoch:1	valid loss:256.23013395824955	valid acc0.7245508982035929


 20%|██        | 2/10 [00:08<00:34,  4.36s/it]

Epoch:2	train loss:32.220555767205035	train acc0.7558030700112317
Epoch:2	valid loss:256.23013395824955	valid acc0.7417664670658682


 30%|███       | 3/10 [00:12<00:28,  4.08s/it]

Epoch:3	train loss:32.220555767205035	train acc0.762916510670161
Epoch:3	valid loss:256.23013395824955	valid acc0.7477544910179641


 40%|████      | 4/10 [00:16<00:23,  3.92s/it]

Epoch:4	train loss:32.220555767205035	train acc0.7669412205166605
Epoch:4	valid loss:256.23013395824955	valid acc0.7522455089820359


 50%|█████     | 5/10 [00:19<00:19,  3.84s/it]

Epoch:5	train loss:32.220555767205035	train acc0.7737738674653688
Epoch:5	valid loss:256.23013395824955	valid acc0.7567365269461078


 60%|██████    | 6/10 [00:23<00:15,  3.82s/it]

Epoch:6	train loss:32.220555767205035	train acc0.7821976787719955
Epoch:6	valid loss:256.23013395824955	valid acc0.7642215568862275


 70%|███████   | 7/10 [00:27<00:11,  3.82s/it]

Epoch:7	train loss:32.220555767205035	train acc0.8018532384874579
Epoch:7	valid loss:256.23013395824955	valid acc0.780688622754491


 80%|████████  | 8/10 [00:31<00:07,  3.82s/it]

Epoch:8	train loss:32.220555767205035	train acc0.8214152002995133
Epoch:8	valid loss:256.23013395824955	valid acc0.8016467065868264


 90%|█████████ | 9/10 [00:35<00:03,  3.80s/it]

Epoch:9	train loss:32.220555767205035	train acc0.8373268438786972
Epoch:9	valid loss:256.23013395824955	valid acc0.8203592814371258


100%|██████████| 10/10 [00:38<00:00,  3.89s/it]

Epoch:10	train loss:32.220555767205035	train acc0.8516473230999626
Epoch:10	valid loss:256.23013395824955	valid acc0.8278443113772455





# 88. パラメータチューニング
問題85や問題87のコードを改変し，ニューラルネットワークの形状やハイパーパラメータを調整しながら，高性能なカテゴリ分類器を構築せよ．

### MEMO
- Ray Tune https://pytorch.org/tutorials/beginner/hyperparameter_tuning_tutorial.html
- Optuna https://github.com/optuna/optunahttps://github.com/optuna/optuna
- Skorch https://skorch.readthedocs.io/en/latest/?badge=latesthttps://skorch.readthedocs.io/en/latest/?badge=latest を使うとsklearnのような感じでハイパラ探索が出来る

In [46]:
#87のCNNを改変(dropout層の追加)
#CNNの構築
class CNN(nn.Module):
    def __init__(self,input_size,emb_size,d_h,output_size,stride,padding_idx,padding):
        super(CNN,self).__init__()
        self.emb = nn.Embedding.from_pretrained(weights.float(),padding_idx=padding_idx)
        self.conv = nn.Conv2d(1,d_h, (window_size, emb_size), stride ,(padding,0))
        #channel数１なので結局conv1dと同じ
        #dropout rateも調整できる
        self.dropout = nn.Dropout()
        self.fc = nn.Linear(d_h,output_size)
       

    def forward(self,x):
        #input_size= (batch_size,a number of channels, height of input,width)
        emb = self.emb(x)
        emb = self.emb(x).unsqueeze(1)
        #conv:input size(batch_size,C_in=1,H(系列長),W(emb_size))
        conv = self.conv(emb).squeeze(3)
        #conv:output size(batch_size, C_out=d_h, H_out=系列長)
        relu = F.relu(conv)
        #max_pool:input size(batch_size, C=d_h, L=系列長)
        max_pool = F.max_pool1d(conv, conv.size(2))
        #dropout層の追加
        out = self.dropout(max_pool)
        y = self.fc(out.squeeze(2))
        #print('y:',y.size())
        return y

In [185]:
from skorch import NeuralNetClassifier

#グリッドサーチでハイパーパラメータチューニング(skorch：pytorchをsklearnのように使えるようにしたラッパー)
#グリッドサーチのパラメータを設定
def grid_serch(model,params):

    gs = GridSearchCV(model,params,cv=5)
    #tensorではなくてnumpyの配列を渡す
    gs.fit(train_x.numpy(),train_y.numpy())
    return gs.best_score_, gs.best_params_
               
model = NeuralNetClassifier(CNN(input_size,emb_size,d_h,output_size,stride,padding_idx,padding),)
params = {'lr':[0.001,0.01], 'batch_size':[16,32,64]}
best_model = grid_serch(model, params)
print(best_model)

In [48]:
def train_cnn(batch_size, num_epoch, learning_rate, train_ds, valid_ds, output_file, model):
    
    loss_func = nn.CrossEntropyLoss()
    #optimizerの変更
    optimizer = optim.Adam(model.parameters(), lr=lr)
    train_loader = DataLoader(train_ds,batch_size=batch_size,shuffle=True)
        
    #訓練
    for epoch in tqdm(range(1,num_epoch+1)):
        for x, x_len, y in train_loader:
            optimizer.zero_grad()
            logit = model(x)
            loss = loss_func(logit, y)
            loss.backward()
            optimizer.step()
            
        train_loss , train_acc = calc_loss_and_acc(model,loss_func,train_ds)
        valid_loss , valid_acc = calc_loss_and_acc(model,loss_func,valid_ds)

        print("Epoch:{}\ttrain loss:{}\ttrain acc{}".format(epoch, train_loss, train_acc))        
        print("Epoch:{}\tvalid loss:{}\tvalid acc{}".format(epoch, valid_loss, valid_acc))


In [50]:
#グリッドサーチは時間がかかる...
#一番良さげなパラメータ(lr:0.001,batch_size:32)で学習
#モデルの定義
input_size = len(vocab)+1
emb_size = 300
d_h = 50
window_size = 3
stride = 1
padding = 1
output_size = 4
padding_idx = len(vocab)
batch_size = 32
padding = 1
epoch_size = 10
lr = 0.001

model = CNN(input_size,emb_size,d_h,output_size,stride,padding_idx,padding)

train_f = "./data/train.txt"
train_titles , train_labels = get_x_data(train_f,vocab)
train_x, train_x_len, train_y = get_data(train_titles,train_labels)
train_x = torch.squeeze(train_x)
train_ds = TensorDataset(train_x,train_x_len,train_y)

valid_f = "./data/valid.txt"
valid_titles , valid_labels = get_x_data(valid_f,vocab)
valid_x, valid_x_len, valid_y = get_data(valid_titles,valid_labels)
valid_x = torch.squeeze(valid_x)
valid_ds = TensorDataset(valid_x,valid_x_len,valid_y)

train_cnn(batch_size, num_epoch, lr, train_ds, valid_ds, "./work/model88.pt", model)

 10%|█         | 1/10 [00:03<00:32,  3.63s/it]

Epoch:1	train loss:32.220555767205035	train acc0.8614751029576937
Epoch:1	valid loss:256.23013395824955	valid acc0.8510479041916168


 20%|██        | 2/10 [00:07<00:28,  3.62s/it]

Epoch:2	train loss:32.220555767205035	train acc0.8914264320479222
Epoch:2	valid loss:256.23013395824955	valid acc0.8645209580838323


 30%|███       | 3/10 [00:11<00:26,  3.75s/it]

Epoch:3	train loss:32.220555767205035	train acc0.9093036315986522
Epoch:3	valid loss:256.23013395824955	valid acc0.8817365269461078


 40%|████      | 4/10 [00:14<00:22,  3.75s/it]

Epoch:4	train loss:32.220555767205035	train acc0.9235305129165107
Epoch:4	valid loss:256.23013395824955	valid acc0.8809880239520959


 50%|█████     | 5/10 [00:18<00:18,  3.77s/it]

Epoch:5	train loss:32.220555767205035	train acc0.9311119430924747
Epoch:5	valid loss:256.23013395824955	valid acc0.8869760479041916


 60%|██████    | 6/10 [00:22<00:15,  3.80s/it]

Epoch:6	train loss:32.220555767205035	train acc0.9429052789217521
Epoch:6	valid loss:256.23013395824955	valid acc0.8922155688622755


 70%|███████   | 7/10 [00:26<00:11,  3.85s/it]

Epoch:7	train loss:32.220555767205035	train acc0.9530138524897043
Epoch:7	valid loss:256.23013395824955	valid acc0.8802395209580839


 80%|████████  | 8/10 [00:30<00:08,  4.01s/it]

Epoch:8	train loss:32.220555767205035	train acc0.956289779108948
Epoch:8	valid loss:256.23013395824955	valid acc0.8802395209580839


 90%|█████████ | 9/10 [00:35<00:04,  4.12s/it]

Epoch:9	train loss:32.220555767205035	train acc0.9622800449269936
Epoch:9	valid loss:256.23013395824955	valid acc0.8854790419161677


100%|██████████| 10/10 [00:39<00:00,  3.93s/it]

Epoch:10	train loss:32.220555767205035	train acc0.9663047547734931
Epoch:10	valid loss:256.23013395824955	valid acc0.8824850299401198





# 89. 事前学習済み言語モデルからの転移学習
事前学習済み言語モデル（例えばBERTなど）を出発点として，ニュース記事見出しをカテゴリに分類するモデルを構築せよ．

### MEMO
- 転移学習：ある領域の知識を別の領域の学習に適用させる技術, fine-tuning
- Bidirectional Encoder Representations fromTransformers
- https://github.com/google-research/bert
    - BERT is a model with absolute position embeddings so it’s usually advised to pad the inputs on the right rather than the left.
    - BERT was trained with the masked language modeling (MLM) and next sentence prediction (NSP) objectives. It is efficient at predicting masked tokens and at NLU in general, but is not optimal for text generation.
- BERTをpytorchで利用可能にしたのがhuggingface https://github.com/huggingface/transformers
- Traing and fine-tuning https://huggingface.co/transformers/training.html
    - hagging faceのtokenizerについてhttps://huggingface.co/transformers/main_classes/tokenizer.html
    

In [72]:
from transformers import BertTokenizer, BertModel
from transformers import AdamW

In [191]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

text_batch = ["I love Pixar.", "I don't care for Pixar."]
#padding = Trueでpadding, truncation:長すぎる部分の切り捨て
encodings = tokenizer.encode_plus(text, return_tensors='pt', padding='max_length', truncation=True)
input_ids = encodings['input_ids']
attention_mask = encodings['attention_mask']
print(encodings)
print(input_ids.size(),attention_mask.size())

{'input_ids': tensor([[101, 100, 100, 102,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,

In [212]:
#BERTのモデルで訓練するためのデータセットを定義
class BertDataset(Dataset):
    def __init__(self, data_f, tokenizer):
        self.get_data_for_bert(data_f)
        self.tokenizer = tokenizer
        
    def get_data_for_bert(self, data_f):
        titles = []
        labels = []
        categories = ['b','e','t','m']
        with open(data_f) as f:
            for line in f:
                text , label = line.rstrip('\n').split('\t')
                titles.append(text)
                label = categories.index(label)
                labels.append(label)
                
        self.y = torch.tensor(labels).long()
        self.titles = titles
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self,idx):
        text = self.titles[idx]
        #pytorchのテンソルを返すように引数で指定できる
        sequence = self.tokenizer(text, return_tensors='pt', padding='max_length', max_length=50, truncation=True,)
        #input_ids:padding後のID列のtensorが返ってくる
        input_ids = sequence['input_ids']
        #attention_maskをすることで、paddingした部分を区別することができる,return tensor
        attention_mask = sequence['attention_mask']
        y = self.y[idx]

        return torch.LongTensor(input_ids), torch.LongTensor(attention_mask), y


In [213]:
#事前学習済みモデル（BERT)の構築
class BERT(nn.Module):
    def __init__(self,num_labels=4):
        super().__init__()
        #モデル名：bert-base-uncased
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        #bertのhidden_sizeはdefaultで768次元
        self.fc = nn.Linear(768,num_labels)
       
    def forward(self,input_ids, attention_mask):
        #input_idsとattention_masksのshapeは(batch_size, sequence_length)
        _ ,output  = self.bert(input_ids, attention_mask)
        y = self.fc(self.drop(output))
        #print('y:',y.size())
        return y

In [214]:
#損失と正解率の計算
def calc_loss_and_acc(model, loss_func, dataset):
    
    data_loader = DataLoader(dataset,batch_size=batch_size,shuffle=True)
    loss = 0
    correct = 0
    total = 0

    for input_ids, attention_mask, y in data_loader:
        logit = model.forward(input_ids,attention)
        loss += loss_func(logit, y).item()
        total += len(y)
        pred = torch.argmax(logit, dim=-1)
        correct += torch.sum(pred == y).item()
    
    loss = total_loss / len(data_loader)
    acc = correct / total
    return loss, acc

#BERTモデルの学習を行う関数
def train_bert(batch_size, num_epoch, learning_rate, train_ds, valid_ds, output_file, model):
    
    loss_func = nn.CrossEntropyLoss()
    #optimizerの変更, ライブラリが提供しているAdamWというoptimizerを使ってみる
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    train_loader = DataLoader(train_ds,batch_size=batch_size,shuffle=True)
        
    #訓練
    for epoch in tqdm(range(1,num_epoch+1)):
        for input_ids, attention_mask, y in train_loader:
            optimizer.zero_grad()
            logit = model(input_ids,attention_mask)
            loss = loss_func(logit, y)
            loss.backward()
            optimizer.step()

        train_loss , train_acc = calc_loss_and_acc(model,loss_func,train_ds)
        valid_loss , valid_acc = calc_loss_and_acc(model,loss_func,valid_ds)

        print("Epoch:{}\ttrain loss:{}\ttrain acc{}".format(epoch, train_loss, train_acc))        
        print("Epoch:{}\tvalid loss:{}\tvalid acc{}".format(epoch, valid_loss, valid_acc))
    
    torch.save({'model_state_dict':model.state_dict(), 'optimizer_state_dict':optimizer.state_dict()} , output_file)


In [215]:
#モデルの定義
batch_size = 32
epoch_size = 10
lr = 0.001

model = BERT(num_labels)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_f = "./data/train.txt"
train_ds = BertDataset(train_f,tokenizer)

valid_f = "./data/valid.txt"
valid_ds = BertDataset(valid_f,tokenizer)

train_bert(batch_size, num_epoch, lr, train_ds, valid_ds, "./work/model89.pt", model)

#TensorDatasetも渡せるよ！dataset周りがうまく修正できれば多分動く
#全部一気にリストに入れてtokenizeすると、最大系列長に合わせられるよ

  0%|          | 0/10 [00:00<?, ?it/s]


ValueError: too many values to unpack (expected 2)