# RNNスクラッチ
【問題1】SimpleRNNのフォワードプロパゲーション実装

In [1]:
# import
import numpy as np
from copy import deepcopy

# 全結合層
class FC:
    """
    ノード数n_nodes1からn_nodes2への全結合層
    Parameters
    ----------
    n_nodes1 : int
      前の層のノード数
    n_nodes2 : int
      後の層のノード数
    initializer : 初期化方法のインスタンス
    optimizer : 最適化手法のインスタンス
    """
    # initial define
    def __init__(self, n_nodes1, n_nodes2, initializer, optimizer):
        #  SGDrnn, Adam, Adagrad, Momemtum
        self.optimizer = optimizer
        # 初期化
        # initializerのメソッドを使い、self.Wとself.Bを初期化する
        self.W = initializer.W(n_nodes1, n_nodes2)
        self.B = initializer.B(n_nodes2)
        self.Z = 0
        self.dA = 0
        
        
    def forward(self, X):
        """
        フォワード
        Parameters
        ----------
        X : 次の形のndarray, shape (batch_size, n_nodes1)
            入力
        Returns
        ----------
        A : 次の形のndarray, shape (batch_size, n_nodes2)
            出力
        """ 
        self.Z = deepcopy(X)
        A = np.dot(X, self.W) + self.B
        return A
    
    def backward(self, dA):
        """
        バックワード
        Parameters
        ----------
        dA : 次の形のndarray, shape (batch_size, n_nodes2)
            後ろから流れてきた勾配
        Returns
        ----------
        dZ : 次の形のndarray, shape (batch_size, n_nodes1)
            前に流す勾配
        """
        self.dA = deepcopy(dA)
        dW = np.dot(self.Z.T, dA)
        dZ = np.dot(dA, self.W.T) 
        # 更新
        self = self.optimizer.update(self)
        
        return dZ

In [2]:
# Initializer
class SimpleInitializer:
    """
    ガウス分布によるシンプルな初期化
    Parameters
    ----------
    sigma : float
      ガウス分布の標準偏差
    """
    # sigma=0.01
    def __init__(self, sigma=0.01):
        self.sigma = sigma
    # 
    def W(self, n_nodes1, n_nodes2):
        """
        重みの初期化
        Parameters
        ----------
        n_nodes1 : int
          前の層のノード数
        n_nodes2 : int
          後の層のノード数

        Returns
        ----------
        W : 次の形のndarray, shape(n_nodes1, n_nodes2)
        """
        # 重み
        W = self.sigma * np.random.randn(n_nodes1, n_nodes2)
        return W.astype("f") # f32 == f, astype
    
    def B(self, n_nodes2):
        """
        バイアスの初期化
        Parameters
        ----------
        n_nodes2 : int
          後の層のノード数

        Returns
        ----------
        B : 次の形のndarray, shape(1, nodes2)
        """
        B = self.sigma * np.random.randn(1, n_nodes2)
        return B.astype("f")

# ザビエル
class XavierInitializer:
    """
    Xavierによる初期化
    Sigmoid」かTanhに向いている
    """
    
    def __init__(self):
        self.sigma = None
        
    def W(self, n_nodes1, n_nodes2):
        """
        重みの初期化
        Parameters
        ----------
        n_nodes1 : int
          前の層のノード数
        n_nodes2 : int
          後の層のノード数

        Returns
        ----------
        W : 次の形のndarray, shape(n_nodes1, n_nodes2)
        """
        self.sigma = 1 / np.sqrt(n_nodes1)
        W = self.sigma * np.random.randn(n_nodes1, n_nodes2)
        return W.astype("f")
    
    def B(self, n_nodes2):
        """
        バイアスの初期化
        Parameters
        ----------
        n_nodes2 : int
          後の層のノード数

        Returns
        ----------
        B : 次の形のndarray, shape(1, nodes2)
        """
        B = self.sigma * np.random.randn(1, n_nodes2)
        return B.astype("f")

# フー
class HeInitializer:
    """
    Heによる初期化
    ReLUと相性がいい
    """
    
    def __init__(self):
        self.sigma = 0
        
    def W(self, n_nodes1, n_nodes2):
        """
        重みの初期化
        Parameters
        ----------
        n_nodes1 : int
          前の層のノード数
        n_nodes2 : int
          後の層のノード数

        Returns
        ----------
        W : 次の形のndarray, shape(n_nodes1, n_nodes2)
        """
        self.sigma = np.sqrt(2 / n_nodes1)
        W = (self.sigma * np.random.randn(n_nodes1, n_nodes2))
        return W.astype("f")
    
    def B(self, n_nodes2):
        """
        バイアスの初期化
        Parameters
        ----------
        n_nodes2 : int
          後の層のノード数

        Returns
        ----------
        B : 次の形のndarray, shape(1, nodes2)
        """
        B = self.sigma * np.random.randn(1, n_nodes2)
        return B.astype("f")

In [3]:
class SGDrnn:
    """
    確率的勾配降下法
    Parameters
    ----------
    lr : 学習率
    """
    def __init__(self, lr):
        self.lr = lr
        
    def update(self, layer):
        """
        ある層の重みやバイアスの更新
        Parameters
        ----------
        layer : 更新前の層のインスタンス

        Returns
        ----------
        layer : 更新後の層のインスタンス
        """
        layer.WX[...] = layer.WX - self.lr * np.dot(layer.X.T, layer.dA) / len(layer.dA)
        layer.B[...] = layer.B - self.lr * np.mean(layer.dA)
        layer.Wh[...] = layer.Wh[...] - self.lr * np.dot(layer.ht.T, layer.dA) / len(layer.dA)
        return layer
    
class SGD:
    """
    確率的勾配降下法
    Parameters
    ----------
    lr : 学習率
    """
    def __init__(self, lr):
        self.lr = lr
        
    def update(self, layer):
        """
        ある層の重みやバイアスの更新
        Parameters
        ----------
        layer : 更新前の層のインスタンス

        Returns
        ----------
        layer : 更新後の層のインスタンス
        """
        layer.W[...] = layer.W - self.lr * np.dot(layer.Z.T, layer.dA) / len(layer.dA)
        layer.B[...] = layer.B - self.lr * np.mean(layer.dA, axis=0)
        return layer


class AdaGrad:
    """
    学習率を変化を減少させていく勾配降下法
    Parameters
    ----------
    lr : 学習率
    """
    def __init__(self, lr):
        self.lr = lr
        self.HW = 0
        self.HB = 0
        
    def update(self, layer):
        """
        ある層の重みやバイアスの更新
        Parameters
        ----------
        layer : 更新前の層のインスタンス

        Returns
        ----------
        layer : 更新後の層のインスタンス
        """
        
        dW = np.dot(layer.Z.T, layer.dA) / len(layer.dA)
        dB = np.mean(layer.dA, axis=0)
        self.HW += dW**2
        self.HB +=  dB**2
        layer.W[...] = layer.W - self.lr / np.sqrt(self.HW +1e-7) * dW #0で割るとまずいので +le-7
        layer.B[...] = layer.B - self.lr / np.sqrt(self.HB + 1e-7)  * dB
        return layer
    
class Momentum:
    
    """
    momentumSGD
    Parameters
    ----------
    lr : 学習率
    momentum : 学習係数
    """
    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.vW = 0
        self.vB = 0
        
    def update(self, layer):
        """
        ある層の重みやバイアスの更新
        Parameters
        ----------
        layer : 更新前の層のインスタンス

        Returns
        ----------
        layer : 更新後の層のインスタンス
        """

        dW = np.dot(layer.Z.T, layer.dA) / len(layer.dA)
        dB = np.mean(layer.dA, axis=0)
        
        self.vW = self.momentum * self.vW - self.lr * dW
        self.vB =  self.momentum * self.vB - self.lr * dB
        
        layer.W[...] = layer.W + self.vW
        layer.B[...] = layer.B + self.vB
        
        return layer
    
class Adam:

    """
    Adam
    RMSprop に Momentum 法を組み合わせたような形
    Parameters
    ----------
    lr : 学習率
    momentum : 学習係数
    beta1
    beta2
    """

    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.iter = 0
        self.mW = 0
        self.vW = 0
        self.mB = 0
        self.vB = 0
        
    def update(self, layer):
        
        self.iter += 1
        dW = np.dot(layer.Z.T, layer.dA) / len(layer.dA)
        dB = np.mean(layer.dA, axis=0)
        
        lr_t  = self.lr * np.sqrt(1.0 - self.beta2**self.iter) / (1.0 - self.beta1**self.iter) 
        
        self.mW += (1 - self.beta1) * (dW - self.mW)
        self.vW += (1 - self.beta2) * (dW**2 - self.vW)
        self.mB += (1 - self.beta1) * (dB - self.mB)
        self.vB += (1 - self.beta2) * (dB**2 - self.vB)
        
        layer.W -= lr_t * self.mW / (np.sqrt(self.vW) + 1e-7)
        layer.B -= lr_t * self.mB / (np.sqrt(self.vB) + 1e-7)

In [4]:
# activation
class sigmoid:
    """
    シグモイド関数
    """
    
    def __init__(self):
        self.Z = 0
    
    def forward(self, A):
        """
        フォワード
        Parameters
        ----------
        A : 次の形のndarray, shape (batch_size, n_nodes)
            入力
        Returns
        ----------
        Z : 次の形のndarray, shape (batch_size, n_nodes)
            出力
        """ 
        Z = 1 / (1 + np.exp(-A))
        self.Z = Z
        return Z
    
    def backward(self, dZ):
        """
        バックワード
        Parameters
        ----------
        dZ : 次の形のndarray, shape (batch_size, n_nodes)
            後ろから流れてきた勾配
        Returns
        ----------
        dA : 次の形のndarray, shape (batch_size, n_nodes)
            前に流す勾配
        """
        dA = dZ  *  (1 - self.Z) * self.Z 
        return dA
    
class Tanh:
    """
    ハイパボリックタンジェント関数
    """
    
    def __init__(self):
        self.Z = 0
    
    def forward(self, A):
        """
        フォワード
        Parameters
        ----------
        A : 次の形のndarray, shape (batch_size, n_nodes)
            入力
        Returns
        ----------
        Z : 次の形のndarray, shape (batch_size, n_nodes)
            出力
        """ 
        Z = np.tanh(A)
        self.Z = Z
        return Z
    
    def backward(self, dZ):
        """
        バックワード
        Parameters
        ----------
        dZ : 次の形のndarray, shape (batch_size, n_nodes)
            後ろから流れてきた勾配
        Returns
        ----------
        dA : 次の形のndarray, shape (batch_size, n_nodes)
            前に流す勾配
        """
        dA = dZ  *  (1 - self.Z**2)
        return dA

class Softmax:
    """
    ソフトマックス関数
    """
    
    def __init__(self):
        self.Z = 0
    
    def forward(self, A):
        """
        フォワード
        Parameters
        ----------
        A : 次の形のndarray, shape (batch_size, n_nodes)
            入力
        Returns
        ----------
        Z : 次の形のndarray, shape (batch_size, n_nodes)
            出力
        """ 
        
        c = np.max(A)
        A = A - c
        ex = np.exp(A)
        Z = ex / (np.sum(ex, axis=1))[:, np.newaxis]
        self.Z = Z
        return Z
    
    def backward(self, y):
        """
        バックワード
        Parameters
        ----------
        y : 次の形のndarray, shape (batch_size, n_class)
            正解ラベル
        Returns
        ----------
        dA : 次の形のndarray, shape (batch_size, n_class)
            前に流す勾配
        """
        
        dA = self.Z - y
        
        return dA
    
class ReLU:
    """
    ReLU関数
    """
    
    def __init__(self):
        self.Z = None
    
    def forward(self, A):
        """
        フォワード
        Parameters
        ----------
        A : 次の形のndarray, shape (batch_size, n_nodes)
            入力
        Returns
        ----------
        Z : 次の形のndarray, shape (batch_size, n_nodes)
            出力
        """ 
        Z = np.maximum(0, A)
        self.Z = deepcopy(Z)
        return Z
    
    def backward(self, dZ):
        """
        バックワード
        Parameters
        ----------
        dZ : 次の形のndarray, shape (batch_size, n_nodes)
            後ろから流れてきた勾配
        Returns
        ----------
        dA : 次の形のndarray, shape (batch_size, n_nodes)
            前に流す勾配
        """
        
        dA = dZ  *  np.where(self.Z != 0, 1, self.Z)
        
        return dA

In [5]:
# 引数を問題１用に作り変えました
class SimpleRNN:
    """
    RNN
    出力が最終層だけ
    Parameters
    ----------
    initializer : 初期化方法のインスタンス
    optimizer : 最適化手法のインスタンス
    """
    def __init__(self, n_features, n_nodes, initializer, optimizer, activation, w_x, w_h, b):
        self.optimizer = optimizer
        self.activation = activation
        # 初期化
        # initializerのメソッドを使い、self.Wとself.Bを初期化する
        self.WX = w_x
        self.Wh = w_h
        self.B = b
        self.n_nodes = n_nodes
        self.A = None
        self.ht = None
        self.Z = None
        self.dA = None
        self.X = None
        self.X_ar = None
        self.n_features = n_features

    def forward(self, X):
        """
        フォワード
        Parameters
        ----------
        X : 次の形のndarray, shape (batch_size, n_sequences, n_features)
            入力
        Returns
        ----------
        ht : 次の形のndarray, shape (batch_size,n_nodes)
            出力
        """ 
        self.X_ar = X
        m, s, n = X.shape
        ht = np.zeros((m, self.n_nodes))
        A = np.empty((0, m, self.n_nodes))
        for i in range(s):
            ht = np.dot(X[:, i, :].reshape(m, n), self.WX) + np.dot(ht, self.Wh) + self.B
            ht = self.activation.forward(ht)
            A = np.vstack((A, ht[np.newaxis,:])) #shape (シーケンス,　バッチ、n_node)
            
        A = A.transpose(1, 0, 2)
        self.A = A
        return ht
     
    def backward(self, dA ):
        """
        バックワード
        Parameters
        ----------
        dA : 次の形のndarray, shape (batch_size,n_nodes)
            後ろから流れてきた勾配
        Returns
        ----------
        dX : 次の形のndarray, shape (batch_size, n_sequences, n_features)
            前に流す勾配
        """
        #self.dA = deepcopy(dA)
        m, s, n_nodes = self.A.shape
        htd = 0
        dX = np.zeros((s, m, self.n_features))
        for i in reversed(range(s)):
            #da = da + htd
            dA = dA * (1 - self.A[:, i, :]**2)#shape (m,n_nodes)
            self.dA = dA
            self.X = self.X_ar[:, i, :]
            self.ht = self.A[:, i, :]
            self = self.optimizer.update(self)
            dA = np.dot(dA, self.Wh.T) #shape(batch, n_nodes)
            dX[i, :, :] = np.dot(dA, self.WX.T) #dot後のshape (batch, n_features)
            
        dX = dX.transpose(1,0,2)
        return dX

In [52]:
class SimpleRNN2:
    """
    RNN
    出力が最終層だけ
    Parameters
    ----------
    initializer : 初期化方法のインスタンス
    optimizer : 最適化手法のインスタンス
    """
    def __init__(self, n_features, n_nodes, initializer, optimizer, activation):
        self.optimizer = optimizer
        self.activation = activation
        # 初期化
        # initializerのメソッドを使い、self.Wとself.Bを初期化する
        self.WX = initializer.W(n_features, n_nodes)
        self.Wh = initializer.W(n_nodes, n_nodes)
        self.B = initializer.B(1)
        self.n_nodes = n_nodes
        self.A = None
        self.ht = None
        self.Z = None
        self.dA = None
        self.X = None
        self.X_ar = None
        self.n_features = n_features

    def forward(self, X):
        """
        フォワード
        Parameters
        ----------
        X : 次の形のndarray, shape (batch_size, n_sequences, n_features)
            入力
        Returns
        ----------
        ht : 次の形のndarray, shape (batch_size,n_nodes)
            出力
        """ 
        self.X_ar = X
        m, s, n = X.shape
        ht = np.zeros((m, self.n_nodes))
        A = np.empty((0, m, self.n_nodes))
        for i in range(s):
            ht = np.dot(X[:, i, :].reshape(m, n), self.WX) + np.dot(ht, self.Wh) + self.B
            ht = self.activation.forward(ht)
            A = np.vstack((A, ht[np.newaxis,:])) #shape (シーケンス,　バッチ、n_node)
            
        A = A.transpose(1, 0, 2)
        self.A = A
        return ht
     
    def backward(self, dA ):
        """
        バックワード
        Parameters
        ----------
        dA : 次の形のndarray, shape (batch_size,n_nodes)
            後ろから流れてきた勾配
        Returns
        ----------
        dX : 次の形のndarray, shape (batch_size, n_sequences, n_features)
            前に流す勾配
        """
        #self.dA = deepcopy(dA)
        m, s, n_nodes = self.A.shape
        htd = 0
        dX = np.zeros((s, m, self.n_features))
        for i in reversed(range(s)):
            #da = da + htd
            dA = dA * (1 - self.A[:, i, :]**2)#shape (m,n_nodes)
            self.dA = dA
            self.X = self.X_ar[:, i, :]
            self.ht = self.A[:, i, :]
            self = self.optimizer.update(self)
            dA = np.dot(dA, self.Wh.T) #shape(batch, n_nodes)
            dX[i, :, :] = np.dot(dA, self.WX.T) #dot後のshape (batch, n_features)
            
        dX = dX.transpose(1,0,2)
        return dX

# 【問題2】小さな配列でのフォワードプロパゲーションの実験

In [114]:
# data sample

# trainX, testX
x = np.array([[[1, 2], [2, 3], [3, 4]]])/100
# 
w_x = np.array([[1, 3, 5, 7], [3, 5, 7, 8]])/100
w_h = np.array([[1, 3, 5, 7], [2, 4, 6, 8], [3, 5, 7, 8], [4, 6, 8, 10]])/100
batch_size = x.shape[0] # 1
n_sequences = x.shape[1] # 3
n_features = x.shape[2] # 2
n_nodes = w_x.shape[1] # 4
h = np.zeros((batch_size, n_nodes))
b = np.array([1])
b

array([1])

In [57]:
n_features = x.shape[2] 
n_features

2

In [58]:
x.shape

(1, 3, 2)

In [66]:
rnn = SimpleRNN(2, 4, SimpleInitializer(), SGDrnn(lr=0.1), Tanh(), w_x, w_h, b)

In [67]:
h = rnn.forward(x)

In [68]:
h

array([[0.79494228, 0.81839002, 0.83939649, 0.85584174]])

# 映画レビューの分類

In [7]:
from importlib import reload
import sys
from imp import reload
import warnings
warnings.filterwarnings('ignore')
if sys.version[0] == '2':
    reload(sys)
    sys.setdefaultencoding("utf-8")

In [9]:
import pandas as pd

df1 = pd.read_csv('./input/word2vec-nlp-tutorial/labeledTrainData.tsv', delimiter="\t")
df1 = df1.drop(['id'], axis=1)
df1.head()

Unnamed: 0,sentiment,review
0,1,With all this stuff going down at the moment w...
1,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,0,The film starts with a manager (Nicholas Bell)...
3,0,It must be assumed that those who praised this...
4,1,Superbly trashy and wondrously unpretentious 8...


In [10]:
df2 = pd.read_csv('./input/imdb-review-dataset/imdb_master.csv',encoding="latin-1")
df2.head()

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [11]:
df = pd.concat([df1, df2]).reset_index(drop=True)
df.head()

Unnamed: 0.1,Unnamed: 0,file,label,review,sentiment,type
0,,,,With all this stuff going down at the moment w...,1.0,
1,,,,"\The Classic War of the Worlds\"" by Timothy Hi...",1.0,
2,,,,The film starts with a manager (Nicholas Bell)...,0.0,
3,,,,It must be assumed that those who praised this...,0.0,
4,,,,Superbly trashy and wondrously unpretentious 8...,1.0,


In [12]:
import re

import nltk
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english")) 
lemmatizer = WordNetLemmatizer()


def clean_text(text):
    text = re.sub(r'[^\w\s]','',text, re.UNICODE)
    text = re.sub(r'<br>','',text, re.UNICODE)
    text = re.sub(r'</br>','',text, re.UNICODE)
    text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    text = [word for word in text if not word in stop_words]
    text = " ".join(text)
    return text

df['Processed_Reviews'] = df.review.apply(lambda x: clean_text(x))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/KawakamiYohei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/KawakamiYohei/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
df1.shape

(25000, 2)

In [14]:
df2.shape

(100000, 5)

In [15]:
df.shape

(125000, 7)

In [16]:
c_documents = df['Processed_Reviews'].values

In [17]:
from janome.tokenizer import Tokenizer
t = Tokenizer()
corpus = []
for i in range(len(c_documents)):
  a = t.tokenize(c_documents[i], wakati=True)
  corpus += [a]

In [18]:
corpus[9]

['br',
 ' ',
 'br',
 ' ',
 'movie',
 ' ',
 'full',
 ' ',
 'reference',
 ' ',
 'like',
 ' ',
 'mad',
 ' ',
 'max',
 ' ',
 'ii',
 ' ',
 'wild',
 ' ',
 'one',
 ' ',
 'many',
 ' ',
 'others',
 ' ',
 'ladybug',
 ' ',
 'face',
 ' ',
 'clear',
 ' ',
 'reference',
 ' ',
 'tribute',
 ' ',
 'peter',
 ' ',
 'lorre',
 ' ',
 'movie',
 ' ',
 'masterpiece',
 ' ',
 'well',
 ' ',
 'talk',
 ' ',
 'much',
 ' ',
 'future']

In [19]:
from gensim.models import FastText
# from gensim.models import word2vec

model_ted = FastText(corpus, size=100, window=5,workers=4)  
# min_count=5,

In [20]:
model_ted.save("fasttext.model")

In [21]:
mex = 0
sample = 50
for i in range(sample):
    lenge = len(corpus[i])
    if mex < lenge:
        mex = lenge
print(mex) #文章の最大のシーケン数
s = mex

777


In [22]:
#分散表現のアレーを作成
#先頭シーケンス足りないところををゼロで埋める
X_train = np.zeros((sample, s, 100))
for i in range(sample):
    textlists = corpus[i]
    lenge = s - len(textlists)
    for j, text in enumerate(textlists):
        X_train[i, lenge+j, :] = model_ted.wv[text].reshape(-1)

In [46]:
import numpy as np
y_train = df['sentiment'].values[:sample].astype('int')
y_train = np.array(y_train[:, np.newaxis])
len(y_train)

50

In [47]:
y_train_hot = np.zeros((len(y_train), 2))
for i in range(len(y_train)):
    y_train_hot[i, y_train[i]] = 1

In [28]:
from sklearn.metrics import accuracy_score
def _cross_entropy_loss(z, y):
    z += 1e-7
    return - sum(sum(y * np.log(z))) / len(y)

def accuracy(y, y_pred):
    # accuracyを計算して返す
    return accuracy_score(y, y_pred)

def cross_entropy(self, hx, y):
        """
        クロスエントロピーの計算

        Parameters
        ----------
        hx : 次の形のndarray, shape (n_samples,)
          推定した値
        y : 次の形のndarray, shape (n_samples,)
          正解値

        Returns
        ----------
        mse : numpy.float
          平均二乗誤差
        """
        # リシェイプ
        y = y.reshape(len(y), 1)
        m = len(y)
        
        J = (1/m)*sum(sum(-(y*np.log(hx))-(1-y)*np.log(1-hx))) + (self.lamda/(2*m))*sum((sum((self.coef_)**2)))
        return J

In [57]:
rnn = SimpleRNN2(100,50,SimpleInitializer(), SGDrnn(lr=0.01), ReLU())
fc = FC(50, 2,SimpleInitializer(), SGD(lr=0.01))
sm = Softmax()

In [58]:
X_train.shape

(50, 777, 100)

In [59]:
from tqdm import tqdm

#学習
epoch = 100
for i in tqdm(range(epoch)):
    #forward
    X = rnn.forward(X_train)
    X = fc.forward(X)
    pred = sm.forward(X)
    y_pred = np.argmax(pred, axis=1)
    print(str(i+1) + " :epoch")
    print("loss")
    print(_cross_entropy_loss(pred, y_train_hot))
    print("accuracy")
    print(accuracy(y_train,y_pred))
    #back
    d = sm.backward(y_train_hot)
    d = fc.backward(d)
    d = rnn.backward(d)



  0%|          | 0/100 [00:00<?, ?it/s][A[A

  1%|          | 1/100 [00:03<05:20,  3.24s/it][A[A

1 :epoch
loss
0.6962441957302107
accuracy
0.48




  2%|▏         | 2/100 [00:06<05:13,  3.20s/it][A[A

2 :epoch
loss
0.6955498773937033
accuracy
0.48




  3%|▎         | 3/100 [00:09<05:12,  3.22s/it][A[A

3 :epoch
loss
0.6949308290595604
accuracy
0.48




  4%|▍         | 4/100 [00:12<05:07,  3.21s/it][A[A

4 :epoch
loss
0.694334354516177
accuracy
0.48




  5%|▌         | 5/100 [00:15<05:02,  3.18s/it][A[A

5 :epoch
loss
0.6937469304000855
accuracy
0.48




  6%|▌         | 6/100 [00:18<04:56,  3.15s/it][A[A

6 :epoch
loss
0.6931709441697035
accuracy
0.48




  7%|▋         | 7/100 [00:22<04:51,  3.14s/it][A[A

7 :epoch
loss
0.692600075508215
accuracy
0.52




  8%|▊         | 8/100 [00:25<04:49,  3.15s/it][A[A

8 :epoch
loss
0.6920340336446302
accuracy
0.54




  9%|▉         | 9/100 [00:28<04:45,  3.13s/it][A[A

9 :epoch
loss
0.6914643843449096
accuracy
0.54




 10%|█         | 10/100 [00:31<04:42,  3.14s/it][A[A

10 :epoch
loss
0.6908852756502114
accuracy
0.54




 11%|█         | 11/100 [00:34<04:37,  3.11s/it][A[A

11 :epoch
loss
0.6903082531197431
accuracy
0.54




 12%|█▏        | 12/100 [00:37<04:32,  3.09s/it][A[A

12 :epoch
loss
0.6897332927816353
accuracy
0.56




 13%|█▎        | 13/100 [00:40<04:30,  3.11s/it][A[A

13 :epoch
loss
0.6891627379338388
accuracy
0.58




 14%|█▍        | 14/100 [00:44<04:30,  3.15s/it][A[A

14 :epoch
loss
0.6886040613243409
accuracy
0.56




 15%|█▌        | 15/100 [00:47<04:30,  3.18s/it][A[A

15 :epoch
loss
0.6880267650184558
accuracy
0.56
16 :epoch
loss
0.6874499664167583
accuracy
0.56




 16%|█▌        | 16/100 [00:50<04:31,  3.23s/it][A[A

 17%|█▋        | 17/100 [00:54<04:35,  3.32s/it][A[A

17 :epoch
loss
0.686879823703105
accuracy
0.52




 18%|█▊        | 18/100 [00:57<04:35,  3.37s/it][A[A

18 :epoch
loss
0.6863135179429374
accuracy
0.5




 19%|█▉        | 19/100 [01:01<04:35,  3.40s/it][A[A

19 :epoch
loss
0.6857498181460261
accuracy
0.56




 20%|██        | 20/100 [01:04<04:35,  3.44s/it][A[A

20 :epoch
loss
0.6851986252602136
accuracy
0.58




 21%|██        | 21/100 [01:08<04:30,  3.42s/it][A[A

21 :epoch
loss
0.6846616059952035
accuracy
0.56




 22%|██▏       | 22/100 [01:11<04:29,  3.46s/it][A[A

22 :epoch
loss
0.6841408407215266
accuracy
0.56




 23%|██▎       | 23/100 [01:14<04:24,  3.44s/it][A[A

23 :epoch
loss
0.6836362152151022
accuracy
0.58




 24%|██▍       | 24/100 [01:18<04:19,  3.41s/it][A[A

24 :epoch
loss
0.6831404418530278
accuracy
0.58




 25%|██▌       | 25/100 [01:21<04:15,  3.40s/it][A[A

25 :epoch
loss
0.6826623590810137
accuracy
0.58




 26%|██▌       | 26/100 [01:25<04:11,  3.40s/it][A[A

26 :epoch
loss
0.6821997383238317
accuracy
0.58




 27%|██▋       | 27/100 [01:28<04:10,  3.43s/it][A[A

27 :epoch
loss
0.6817468017317697
accuracy
0.56




 28%|██▊       | 28/100 [01:32<04:07,  3.44s/it][A[A

28 :epoch
loss
0.681301893009602
accuracy
0.56
29 :epoch
loss
0.6808788006232479
accuracy
0.58




 29%|██▉       | 29/100 [01:35<04:09,  3.52s/it][A[A

30 :epoch
loss
0.6804689865047564
accuracy
0.56




 30%|███       | 30/100 [01:39<04:12,  3.60s/it][A[A

 31%|███       | 31/100 [01:43<04:10,  3.63s/it][A[A

31 :epoch
loss
0.6800708595906003
accuracy
0.56
32 :epoch
loss
0.6796829124184199
accuracy
0.56




 32%|███▏      | 32/100 [01:46<04:08,  3.65s/it][A[A

 33%|███▎      | 33/100 [01:50<04:04,  3.65s/it][A[A

33 :epoch
loss
0.679303772784146
accuracy
0.56




 34%|███▍      | 34/100 [01:54<03:59,  3.63s/it][A[A

34 :epoch
loss
0.6789365782835262
accuracy
0.58




 35%|███▌      | 35/100 [01:57<03:56,  3.65s/it][A[A

35 :epoch
loss
0.6785785865711594
accuracy
0.58






36 :epoch
loss
0.6782275335599075
accuracy
0.6


 36%|███▌      | 36/100 [02:01<03:49,  3.58s/it][A[A

37 :epoch
loss
0.677883445810669
accuracy
0.6




 37%|███▋      | 37/100 [02:04<03:48,  3.63s/it][A[A

 38%|███▊      | 38/100 [02:08<03:45,  3.64s/it][A[A

38 :epoch
loss
0.6775455151386984
accuracy
0.6




 39%|███▉      | 39/100 [02:12<03:39,  3.60s/it][A[A

39 :epoch
loss
0.6772160919337895
accuracy
0.6




 40%|████      | 40/100 [02:15<03:36,  3.62s/it][A[A

40 :epoch
loss
0.6768942302284192
accuracy
0.62
41 :epoch
loss
0.6765761101275726
accuracy
0.62




 41%|████      | 41/100 [02:19<03:32,  3.60s/it][A[A

 42%|████▏     | 42/100 [02:23<03:29,  3.62s/it][A[A

42 :epoch
loss
0.6762620744029044
accuracy
0.62
43 :epoch
loss
0.6759497400021552
accuracy
0.62




 43%|████▎     | 43/100 [02:26<03:26,  3.63s/it][A[A

 44%|████▍     | 44/100 [02:30<03:25,  3.66s/it][A[A

44 :epoch
loss
0.6756401285402694
accuracy
0.62




 45%|████▌     | 45/100 [02:33<03:19,  3.62s/it][A[A

45 :epoch
loss
0.6753318227537725
accuracy
0.64




 46%|████▌     | 46/100 [02:37<03:15,  3.61s/it][A[A

46 :epoch
loss
0.6750260758716891
accuracy
0.64




 47%|████▋     | 47/100 [02:41<03:10,  3.60s/it][A[A

47 :epoch
loss
0.6747238198661276
accuracy
0.64




 48%|████▊     | 48/100 [02:44<03:07,  3.61s/it][A[A

48 :epoch
loss
0.6744230809456743
accuracy
0.64
49 :epoch
loss
0.6741253720660043
accuracy
0.64




 49%|████▉     | 49/100 [02:48<03:03,  3.61s/it][A[A

 50%|█████     | 50/100 [02:51<02:58,  3.58s/it][A[A

50 :epoch
loss
0.6738286934087967
accuracy
0.64




 51%|█████     | 51/100 [02:55<02:54,  3.55s/it][A[A

51 :epoch
loss
0.6735325838004894
accuracy
0.62




 52%|█████▏    | 52/100 [02:58<02:50,  3.56s/it][A[A

52 :epoch
loss
0.6732366637417959
accuracy
0.62




 53%|█████▎    | 53/100 [03:02<02:47,  3.57s/it][A[A

53 :epoch
loss
0.6729411454337952
accuracy
0.62




 54%|█████▍    | 54/100 [03:06<02:45,  3.60s/it][A[A

54 :epoch
loss
0.6726480948526004
accuracy
0.62




 55%|█████▌    | 55/100 [03:09<02:43,  3.62s/it][A[A

55 :epoch
loss
0.6723553468348765
accuracy
0.62




 56%|█████▌    | 56/100 [03:13<02:37,  3.59s/it][A[A

56 :epoch
loss
0.6720611193951467
accuracy
0.62




 57%|█████▋    | 57/100 [03:17<02:34,  3.60s/it][A[A

57 :epoch
loss
0.6717666764052278
accuracy
0.62
58 :epoch
loss
0.6714746450190066
accuracy
0.62




 58%|█████▊    | 58/100 [03:20<02:34,  3.68s/it][A[A

59 :epoch
loss
0.6711861328826159
accuracy
0.62




 59%|█████▉    | 59/100 [03:24<02:32,  3.73s/it][A[A

 60%|██████    | 60/100 [03:28<02:27,  3.68s/it][A[A

60 :epoch
loss
0.6708936161553194
accuracy
0.62




 61%|██████    | 61/100 [03:31<02:22,  3.66s/it][A[A

61 :epoch
loss
0.6706012037467821
accuracy
0.62




 62%|██████▏   | 62/100 [03:35<02:18,  3.65s/it][A[A

62 :epoch
loss
0.6703095209577569
accuracy
0.62
63 :epoch
loss
0.6700190489601767
accuracy
0.62




 63%|██████▎   | 63/100 [03:39<02:14,  3.64s/it][A[A

 64%|██████▍   | 64/100 [03:42<02:09,  3.59s/it][A[A

64 :epoch
loss
0.6697311149580781
accuracy
0.62




 65%|██████▌   | 65/100 [03:45<02:03,  3.52s/it][A[A

65 :epoch
loss
0.6694434136720955
accuracy
0.62




 66%|██████▌   | 66/100 [03:49<01:57,  3.45s/it][A[A

66 :epoch
loss
0.6691567677524869
accuracy
0.62




 67%|██████▋   | 67/100 [03:52<01:52,  3.42s/it][A[A

67 :epoch
loss
0.6688717212607571
accuracy
0.62




 68%|██████▊   | 68/100 [03:55<01:48,  3.40s/it][A[A

68 :epoch
loss
0.6685870224855773
accuracy
0.62




 69%|██████▉   | 69/100 [03:59<01:44,  3.38s/it][A[A

69 :epoch
loss
0.6683030306649393
accuracy
0.62




 70%|███████   | 70/100 [04:02<01:41,  3.37s/it][A[A

70 :epoch
loss
0.668020238051708
accuracy
0.62




 71%|███████   | 71/100 [04:06<01:37,  3.38s/it][A[A

71 :epoch
loss
0.6677404690124935
accuracy
0.62




 72%|███████▏  | 72/100 [04:09<01:34,  3.37s/it][A[A

72 :epoch
loss
0.6674566139573261
accuracy
0.64




 73%|███████▎  | 73/100 [04:12<01:30,  3.36s/it][A[A

73 :epoch
loss
0.6671715177897062
accuracy
0.64




 74%|███████▍  | 74/100 [04:16<01:27,  3.36s/it][A[A

74 :epoch
loss
0.6668867833531732
accuracy
0.64




 75%|███████▌  | 75/100 [04:19<01:23,  3.36s/it][A[A

75 :epoch
loss
0.6666017236969963
accuracy
0.64




 76%|███████▌  | 76/100 [04:22<01:20,  3.35s/it][A[A

76 :epoch
loss
0.6663157320958757
accuracy
0.64




 77%|███████▋  | 77/100 [04:26<01:17,  3.35s/it][A[A

77 :epoch
loss
0.6660267993531602
accuracy
0.64




 78%|███████▊  | 78/100 [04:29<01:13,  3.34s/it][A[A

78 :epoch
loss
0.6657309449841605
accuracy
0.64




 79%|███████▉  | 79/100 [04:32<01:10,  3.34s/it][A[A

79 :epoch
loss
0.6654350743019728
accuracy
0.64




 80%|████████  | 80/100 [04:36<01:06,  3.33s/it][A[A

80 :epoch
loss
0.6651382269071181
accuracy
0.64




 81%|████████  | 81/100 [04:39<01:03,  3.33s/it][A[A

81 :epoch
loss
0.6648411135440583
accuracy
0.64




 82%|████████▏ | 82/100 [04:42<01:00,  3.37s/it][A[A

82 :epoch
loss
0.6645459823008268
accuracy
0.64




 83%|████████▎ | 83/100 [04:46<00:59,  3.49s/it][A[A

83 :epoch
loss
0.6642554234853836
accuracy
0.64




 84%|████████▍ | 84/100 [04:50<00:55,  3.49s/it][A[A

84 :epoch
loss
0.663964178300995
accuracy
0.64




 85%|████████▌ | 85/100 [04:53<00:52,  3.49s/it][A[A

85 :epoch
loss
0.6636745022335961
accuracy
0.64




 86%|████████▌ | 86/100 [04:57<00:49,  3.50s/it][A[A

86 :epoch
loss
0.663385482886935
accuracy
0.64




 87%|████████▋ | 87/100 [05:00<00:45,  3.51s/it][A[A

87 :epoch
loss
0.6631015625519571
accuracy
0.64




 88%|████████▊ | 88/100 [05:04<00:42,  3.56s/it][A[A

88 :epoch
loss
0.662821903695997
accuracy
0.64
89 :epoch
loss
0.6625409017833941
accuracy
0.64




 89%|████████▉ | 89/100 [05:07<00:39,  3.55s/it][A[A

 90%|█████████ | 90/100 [05:11<00:35,  3.53s/it][A[A

90 :epoch
loss
0.6622597338548072
accuracy
0.64




 91%|█████████ | 91/100 [05:14<00:31,  3.49s/it][A[A

91 :epoch
loss
0.6619788434257532
accuracy
0.64




 92%|█████████▏| 92/100 [05:18<00:27,  3.47s/it][A[A

92 :epoch
loss
0.6616965958904703
accuracy
0.64






93 :epoch
loss
0.6614120312703773
accuracy
0.64


 93%|█████████▎| 93/100 [05:21<00:24,  3.49s/it][A[A

 94%|█████████▍| 94/100 [05:25<00:21,  3.57s/it][A[A

94 :epoch
loss
0.6611244802674244
accuracy
0.64




 95%|█████████▌| 95/100 [05:29<00:17,  3.57s/it][A[A

95 :epoch
loss
0.6608361549308899
accuracy
0.64




 96%|█████████▌| 96/100 [05:32<00:14,  3.52s/it][A[A

96 :epoch
loss
0.6605458419035426
accuracy
0.64




 97%|█████████▋| 97/100 [05:36<00:10,  3.54s/it][A[A

97 :epoch
loss
0.6602537441188409
accuracy
0.64




 98%|█████████▊| 98/100 [05:39<00:07,  3.53s/it][A[A

98 :epoch
loss
0.659960327185771
accuracy
0.64




 99%|█████████▉| 99/100 [05:42<00:03,  3.51s/it][A[A

99 :epoch
loss
0.6596666388823137
accuracy
0.64
100 :epoch
loss
0.659371733776791
accuracy
0.64




100%|██████████| 100/100 [05:46<00:00,  3.60s/it][A[A