### 80. ID番号への変換

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Read the three tab-separated, header-less splits in one pass.
train, valid, test = (
    pd.read_csv(path, sep='\t', header=None)
    for path in (
        'data/chap6/train.txt',
        'data/chap6/valid.txt',
        'data/chap6/test.txt',
    )
)

In [None]:
# Build the vocabulary from the lower-cased training titles (column 1).
# min_df=2 drops terms that occur in fewer than two titles.
vectorizer = CountVectorizer(min_df=2)
train_title = train.iloc[:,1].str.lower()
cnt = vectorizer.fit_transform(train_title).toarray()
# Total number of occurrences of each term over all training titles.
sm = cnt.sum(axis=0)
# Term indices ordered from the most frequent to the least frequent.
idx = np.argsort(sm)[::-1]
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Vocabulary terms sorted by descending frequency.
words = np.array(feature_names)[idx]

In [None]:
# Re-fit the vectorizer created earlier on the lower-cased training titles
# (column 1) and rank its vocabulary by total frequency.
train_title = train[1].str.lower()
cnt = vectorizer.fit_transform(train_title).toarray()
# Total number of occurrences of each term over all training titles.
sm = cnt.sum(axis=0)
# Term indices ordered from the most frequent to the least frequent.
idx = np.argsort(sm)[::-1]
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Vocabulary terms sorted by descending frequency.
words = np.array(feature_names)[idx]

In [None]:
# Map each vocabulary word to its 1-based position in `words`;
# ID 0 is left free for words that are not in the vocabulary.
d = {word: rank for rank, word in enumerate(words, start=1)}
def get_id(sentence):
    """Convert an iterable of tokens into a list of vocabulary IDs.

    Tokens missing from the module-level mapping ``d`` are encoded as 0.
    """
    return [d.get(token, 0) for token in sentence]

def df2id(df):
    """Return one list of vocabulary IDs per row of ``df``.

    The text is taken from the second column, lower-cased, split on
    whitespace and encoded with ``get_id``.
    """
    # NOTE(review): tokens here come from a plain whitespace split, which may
    # differ from how the vocabulary was tokenized -- confirm this is intended.
    lowered_titles = df.iloc[:,1].str.lower()
    return [get_id(title.split()) for title in lowered_titles]

In [None]:
# Encode every split as lists of word IDs (one list per title).
X_train, X_valid, X_test = map(df2id, (train, valid, test))

### 81. RNNによる予測

In [None]:
import torch

In [None]:
dw = 300  # dimensionality of the word embeddings
dh = 50  # dimensionality of the RNN hidden state


class RNN(torch.nn.Module):
    """Single-layer RNN classifier that outputs probabilities over 4 classes.

    Args:
        vocab_size: Number of rows in the embedding table. When omitted it is
            ``len(words) + 1`` (ID 0 plus one ID per vocabulary word), read
            from the module-level ``words``.
    """

    def __init__(self, vocab_size=None):
        super().__init__()
        if vocab_size is None:
            vocab_size = len(words) + 1
        self.emb = torch.nn.Embedding(vocab_size, dw)
        self.rnn = torch.nn.RNN(dw, dh, batch_first=True)
        self.linear = torch.nn.Linear(dh, 4)
        # Normalise over the class dimension explicitly; omitting ``dim``
        # relies on a deprecated implicit choice and emits a warning.
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, x, h=None):
        """Return class probabilities for a batch of ID sequences.

        Args:
            x: Integer tensor of word IDs, shape (batch, sequence length).
            h: Optional initial hidden state passed to the RNN.

        Returns:
            Tensor of shape (batch, 4) whose rows sum to 1.
        """
        x = self.emb(x)
        y, h = self.rnn(x, h)
        y = y[:, -1, :]  # output at the last time step
        y = self.linear(y)
        y = self.softmax(y)
        return y

### 82. 確率的勾配降下法による学習

In [None]:
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
%load_ext tensorboard
!rm -rf ./runs
%tensorboard --logdir ./runs
# Writer used by the training loop to log per-epoch loss/accuracy scalars to TensorBoard.
writer = SummaryWriter()

In [None]:
# ID layout: 0 = unknown word, 1..len(words) = vocabulary words, and the last
# ID is reserved for padding sequences to a common length.
n_vocab = len(words) + 2
PAD = n_vocab - 1

def list2tensor(data, max_len, pad=None):
    """Pack variable-length ID lists into one fixed-width int64 tensor.

    Sequences longer than ``max_len`` are truncated; shorter ones are
    right-padded. The input lists are left untouched (the previous
    implementation padded the caller's lists in place).

    Args:
        data: Iterable of lists of integer IDs.
        max_len: Number of IDs per row in the result.
        pad: ID used for padding. Defaults to the module-level ``PAD``.

    Returns:
        ``torch.int64`` tensor with one row of ``max_len`` IDs per sequence.
    """
    if pad is None:
        pad = PAD
    # Slicing copies each sequence, and a negative repeat count yields an
    # empty list, so one expression covers both truncation and padding.
    rows = [list(seq[:max_len]) + [pad] * (max_len - len(seq)) for seq in data]
    return torch.tensor(rows, dtype=torch.int64)

In [None]:
def accuracy(pred, label):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        pred: Tensor of per-class scores, one row per example.
        label: Tensor of integer class labels, one per example.
    """
    predicted_classes = pred.data.numpy().argmax(axis=1)
    true_classes = label.data.numpy()
    return np.mean(predicted_classes == true_classes)

In [None]:
dw = 300  # dimensionality of the word embeddings
dh = 50  # dimensionality of the RNN hidden state


class RNN(torch.nn.Module):
    """Single-layer RNN classifier that returns unnormalised scores for 4 classes.

    Args:
        vocab_size: Number of rows in the embedding table. Defaults to the
            module-level ``n_vocab``.
        padding_idx: ID whose embedding is the padding vector. Defaults to the
            module-level ``PAD``.
    """

    def __init__(self, vocab_size=None, padding_idx=None):
        super().__init__()
        if vocab_size is None:
            vocab_size = n_vocab
        if padding_idx is None:
            padding_idx = PAD
        # Embedding table of shape (vocabulary size, dw); the row at
        # ``padding_idx`` is the embedding of the padding token.
        self.emb = torch.nn.Embedding(vocab_size, dw, padding_idx=padding_idx)
        self.rnn = torch.nn.RNN(dw, dh, batch_first=True)
        self.linear = torch.nn.Linear(dh, 4)
        # Not applied in forward(); ``dim`` is given explicitly so that using
        # it does not fall back on the deprecated implicit dimension.
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, x, h=None):
        """Return class logits for a batch of ID sequences.

        Args:
            x: Integer tensor of word IDs, shape (batch, sequence length).
            h: Optional initial hidden state passed to the RNN.

        Returns:
            Tensor of shape (batch, 4) holding unnormalised scores.
        """
        x = self.emb(x)
        y, h = self.rnn(x, h)
        # NOTE(review): this takes the output at the final position, which is
        # a padding position for any sequence that was right-padded.
        y = y[:, -1, :]  # output at the last time step
        y = self.linear(y)
        # No softmax here: torch.nn.CrossEntropyLoss applies it internally.
        return y

In [None]:
max_len = 10  # every title is truncated or padded to this many IDs

# Encode each split as word-ID lists, then pack them into fixed-width tensors.
X_train = list2tensor(df2id(train), max_len)
X_valid = list2tensor(df2id(valid), max_len)
X_test = list2tensor(df2id(test), max_len)

# Load the labels of each split and store them as int64 tensors.
y_train = torch.tensor(np.loadtxt('data/chap8/y_train.txt'), dtype=torch.int64)
y_valid = torch.tensor(np.loadtxt('data/chap8/y_valid.txt'), dtype=torch.int64)
y_test = torch.tensor(np.loadtxt('data/chap8/y_test.txt'), dtype=torch.int64)

In [None]:
# Model, objective and optimiser for plain SGD training.
model = RNN()
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-3)

# Serve the training set one example at a time, in shuffled order.
ds = TensorDataset(X_train, y_train)
loader = DataLoader(dataset=ds, batch_size=1, shuffle=True)

In [None]:
for epoch in range(10):
    # Training pass: one optimiser update per batch drawn from the loader.
    for xx, yy in loader:
        optimizer.zero_grad()
        y_pred = model(xx)
        loss = loss_fn(y_pred, yy)
        loss.backward()
        optimizer.step()

    # Evaluation pass: gradients are disabled, so no parameters are updated.
    with torch.no_grad():
        for split, loss_tag, acc_tag, inputs, targets in (
            ("train", 'Loss/train', 'Accuracy/train', X_train, y_train),
            ("valid", 'Loss/valid', 'Accuracy/valid', X_valid, y_valid),
        ):
            y_pred = model(inputs)
            loss = loss_fn(y_pred, targets)
            acc = accuracy(y_pred, targets)
            # Log to TensorBoard, then echo the same numbers to stdout.
            writer.add_scalar(loss_tag, loss, epoch)
            writer.add_scalar(acc_tag, acc, epoch)
            print(split)
            print(acc)
            print(loss)

### 84. 単語ベクトルの導入

In [None]:
max_len = 10  # every title is truncated or padded to this many IDs

# Encode each split as word-ID lists, then pack them into fixed-width tensors.
X_train = list2tensor(df2id(train), max_len)
X_valid = list2tensor(df2id(valid), max_len)
X_test = list2tensor(df2id(test), max_len)

# Load the labels of each split and store them as int64 tensors.
y_train = torch.tensor(np.loadtxt('data/chap8/y_train.txt'), dtype=torch.int64)
y_valid = torch.tensor(np.loadtxt('data/chap8/y_valid.txt'), dtype=torch.int64)
y_test = torch.tensor(np.loadtxt('data/chap8/y_test.txt'), dtype=torch.int64)

In [None]:
import gensim
# Load pretrained word2vec vectors stored in the binary word2vec format
# (presumably 300-dimensional, going by the file name -- this must match dw).
emb = gensim.models.KeyedVectors.load_word2vec_format('data/chap7/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [None]:
# Fresh model with randomly initialised weights; its embedding rows are
# replaced with pretrained vectors before training starts.
model = RNN()

In [None]:
# Replace the randomly initialised embedding rows with pretrained vectors for
# every vocabulary word the pretrained model contains; other rows keep their
# initial values.
#
# The copy runs under no_grad because item assignment on a leaf parameter that
# requires grad raises a RuntimeError when autograd is recording. The weight
# stays the same registered Parameter, so it needs no re-wrapping afterwards.
with torch.no_grad():
    for k, v in d.items():
        # `k in emb` works on gensim 3 and 4, whereas `emb.vocab` was removed
        # in gensim 4.
        if k in emb:
            model.emb.weight[v] = torch.tensor(emb[k], dtype=torch.float32)

In [None]:
# Objective and optimiser for plain SGD training of the current model.
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-3)

# Serve the training set one example at a time, in shuffled order.
ds = TensorDataset(X_train, y_train)
loader = DataLoader(dataset=ds, batch_size=1, shuffle=True)

In [None]:
for epoch in range(10):
    print(epoch)
    # Training pass: one optimiser update per batch drawn from the loader.
    for xx, yy in loader:
        optimizer.zero_grad()
        y_pred = model(xx)
        loss = loss_fn(y_pred, yy)
        loss.backward()
        optimizer.step()

    # Evaluation pass: gradients are disabled, so no parameters are updated.
    # Results are only printed here; nothing is written to TensorBoard.
    with torch.no_grad():
        for split, inputs, targets in (
            ("train", X_train, y_train),
            ("valid", X_valid, y_valid),
        ):
            y_pred = model(inputs)
            loss = loss_fn(y_pred, targets)
            print(split)
            print(accuracy(y_pred, targets))
            print(loss)