# RNNスクラッチ
【問題1】SimpleRNNのフォワードプロパゲーション実装

In [130]:
# import
import numpy as np
from copy import deepcopy

class FC:
    """
    ノード数n_nodes1からn_nodes2への全結合層
    Parameters
    ----------
    n_nodes1 : int
      前の層のノード数
    n_nodes2 : int
      後の層のノード数
    initializer : 初期化方法のインスタンス
    optimizer : 最適化手法のインスタンス
    """
    # initial define
    def __init__(self, n_nodes1, n_nodes2, initializer, optimizer):
        #  SGDrnn, Adam, Adagrad, Momemtum
        self.optimizer = optimizer
        # 初期化
        # initializerのメソッドを使い、self.Wとself.Bを初期化する
        self.W = initializer.W(n_nodes1, n_nodes2)
        self.B = initializer.B(n_nodes2)
        self.Z = 0
        self.dA = 0

    def forward(self, X):
        """
        フォワード
        Parameters
        ----------
        X : 次の形のndarray, shape (batch_size, n_nodes1)
            入力
        Returns
        ----------
        A : 次の形のndarray, shape (batch_size, n_nodes2)
            出力
        """ 
        self.Z = deepcopy(X)
        A = np.dot(X, self.W) + self.B
        return A
    
    def backward(self, dA):
        """
        バックワード
        Parameters
        ----------
        dA : 次の形のndarray, shape (batch_size, n_nodes2)
            後ろから流れてきた勾配
        Returns
        ----------
        dZ : 次の形のndarray, shape (batch_size, n_nodes1)
            前に流す勾配
        """
        self.dA = deepcopy(dA)
        dW = np.dot(self.Z.T, dA)
        dZ = np.dot(dA, self.W.T) 
        # 更新
        self = self.optimizer.update(self)
        
        return dZ

In [119]:
# Initializer
class SimpleInitializer:
    """
    ガウス分布によるシンプルな初期化
    Parameters
    ----------
    sigma : float
      ガウス分布の標準偏差
    """
    # sigma=0.01
    def __init__(self, sigma=0.01):
        self.sigma = sigma
    # 
    def W(self, n_nodes1, n_nodes2):
        """
        重みの初期化
        Parameters
        ----------
        n_nodes1 : int
          前の層のノード数
        n_nodes2 : int
          後の層のノード数

        Returns
        ----------
        W : 次の形のndarray, shape(n_nodes1, n_nodes2)
        """
        # 重み
        W = self.sigma * np.random.randn(n_nodes1, n_nodes2)
        return W.astype("f") # f32 == f, astype
    
    def B(self, n_nodes2):
        """
        バイアスの初期化
        Parameters
        ----------
        n_nodes2 : int
          後の層のノード数

        Returns
        ----------
        B : 次の形のndarray, shape(1, nodes2)
        """
        B = self.sigma * np.random.randn(1, n_nodes2)
        return B.astype("f")

# ザビエル
class XavierInitializer:
    """
    Xavierによる初期化
    Sigmoid」かTanhに向いている
    """
    
    def __init__(self):
        self.sigma = None
        
    def W(self, n_nodes1, n_nodes2):
        """
        重みの初期化
        Parameters
        ----------
        n_nodes1 : int
          前の層のノード数
        n_nodes2 : int
          後の層のノード数

        Returns
        ----------
        W : 次の形のndarray, shape(n_nodes1, n_nodes2)
        """
        self.sigma = 1 / np.sqrt(n_nodes1)
        W = self.sigma * np.random.randn(n_nodes1, n_nodes2)
        return W.astype("f")
    
    def B(self, n_nodes2):
        """
        バイアスの初期化
        Parameters
        ----------
        n_nodes2 : int
          後の層のノード数

        Returns
        ----------
        B : 次の形のndarray, shape(1, nodes2)
        """
        B = self.sigma * np.random.randn(1, n_nodes2)
        return B.astype("f")

# フー
class HeInitializer:
    """
    Heによる初期化
    ReLUと相性がいい
    """
    
    def __init__(self):
        self.sigma = 0
        
    def W(self, n_nodes1, n_nodes2):
        """
        重みの初期化
        Parameters
        ----------
        n_nodes1 : int
          前の層のノード数
        n_nodes2 : int
          後の層のノード数

        Returns
        ----------
        W : 次の形のndarray, shape(n_nodes1, n_nodes2)
        """
        self.sigma = np.sqrt(2 / n_nodes1)
        W = (self.sigma * np.random.randn(n_nodes1, n_nodes2))
        return W.astype("f")
    
    def B(self, n_nodes2):
        """
        バイアスの初期化
        Parameters
        ----------
        n_nodes2 : int
          後の層のノード数

        Returns
        ----------
        B : 次の形のndarray, shape(1, nodes2)
        """
        B = self.sigma * np.random.randn(1, n_nodes2)
        return B.astype("f")

In [131]:
class SGDrnn:
    """
    確率的勾配降下法
    Parameters
    ----------
    lr : 学習率
    """
    def __init__(self, lr):
        self.lr = lr
        
    def update(self, layer):
        """
        ある層の重みやバイアスの更新
        Parameters
        ----------
        layer : 更新前の層のインスタンス

        Returns
        ----------
        layer : 更新後の層のインスタンス
        """
        layer.WX[...] = layer.WX - self.lr * np.dot(layer.X.T, layer.dA) / len(layer.dA)
        layer.B[...] = layer.B - self.lr * np.mean(layer.dA)
        layer.Wh[...] = layer.Wh[...] - self.lr * np.dot(layer.ht.T, layer.dA) / len(layer.dA)
        return layer
    
class SGD:
    """
    確率的勾配降下法
    Parameters
    ----------
    lr : 学習率
    """
    def __init__(self, lr):
        self.lr = lr
        
    def update(self, layer):
        """
        ある層の重みやバイアスの更新
        Parameters
        ----------
        layer : 更新前の層のインスタンス

        Returns
        ----------
        layer : 更新後の層のインスタンス
        """
        layer.W[...] = layer.W - self.lr * np.dot(layer.Z.T, layer.dA) / len(layer.dA)
        layer.B[...] = layer.B - self.lr * np.mean(layer.dA, axis=0)
        return layer


class AdaGrad:
    """
    学習率を変化を減少させていく勾配降下法
    Parameters
    ----------
    lr : 学習率
    """
    def __init__(self, lr):
        self.lr = lr
        self.HW = 0
        self.HB = 0
        
    def update(self, layer):
        """
        ある層の重みやバイアスの更新
        Parameters
        ----------
        layer : 更新前の層のインスタンス

        Returns
        ----------
        layer : 更新後の層のインスタンス
        """
        
        dW = np.dot(layer.Z.T, layer.dA) / len(layer.dA)
        dB = np.mean(layer.dA, axis=0)
        self.HW += dW**2
        self.HB +=  dB**2
        layer.W[...] = layer.W - self.lr / np.sqrt(self.HW +1e-7) * dW #0で割るとまずいので +le-7
        layer.B[...] = layer.B - self.lr / np.sqrt(self.HB + 1e-7)  * dB
        return layer
    
class Momentum:
    
    """
    momentumSGD
    Parameters
    ----------
    lr : 学習率
    momentum : 学習係数
    """
    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.vW = 0
        self.vB = 0
        
    def update(self, layer):
        """
        ある層の重みやバイアスの更新
        Parameters
        ----------
        layer : 更新前の層のインスタンス

        Returns
        ----------
        layer : 更新後の層のインスタンス
        """

        dW = np.dot(layer.Z.T, layer.dA) / len(layer.dA)
        dB = np.mean(layer.dA, axis=0)
        
        self.vW = self.momentum * self.vW - self.lr * dW
        self.vB =  self.momentum * self.vB - self.lr * dB
        
        layer.W[...] = layer.W + self.vW
        layer.B[...] = layer.B + self.vB
        
        return layer
    
class Adam:

    """
    Adam
    RMSprop に Momentum 法を組み合わせたような形
    Parameters
    ----------
    lr : 学習率
    momentum : 学習係数
    beta1
    beta2
    """

    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.iter = 0
        self.mW = 0
        self.vW = 0
        self.mB = 0
        self.vB = 0
        
    def update(self, layer):
        
        self.iter += 1
        dW = np.dot(layer.Z.T, layer.dA) / len(layer.dA)
        dB = np.mean(layer.dA, axis=0)
        
        lr_t  = self.lr * np.sqrt(1.0 - self.beta2**self.iter) / (1.0 - self.beta1**self.iter) 
        
        self.mW += (1 - self.beta1) * (dW - self.mW)
        self.vW += (1 - self.beta2) * (dW**2 - self.vW)
        self.mB += (1 - self.beta1) * (dB - self.mB)
        self.vB += (1 - self.beta2) * (dB**2 - self.vB)
        
        layer.W -= lr_t * self.mW / (np.sqrt(self.vW) + 1e-7)
        layer.B -= lr_t * self.mB / (np.sqrt(self.vB) + 1e-7)

In [121]:
# activation
class sigmoid:
    """
    シグモイド関数
    """
    
    def __init__(self):
        self.Z = 0
    
    def forward(self, A):
        """
        フォワード
        Parameters
        ----------
        A : 次の形のndarray, shape (batch_size, n_nodes)
            入力
        Returns
        ----------
        Z : 次の形のndarray, shape (batch_size, n_nodes)
            出力
        """ 
        Z = 1 / (1 + np.exp(-A))
        self.Z = Z
        return Z
    
    def backward(self, dZ):
        """
        バックワード
        Parameters
        ----------
        dZ : 次の形のndarray, shape (batch_size, n_nodes)
            後ろから流れてきた勾配
        Returns
        ----------
        dA : 次の形のndarray, shape (batch_size, n_nodes)
            前に流す勾配
        """
        dA = dZ  *  (1 - self.Z) * self.Z 
        return dA
    
class Tanh:
    """
    ハイパボリックタンジェント関数
    """
    
    def __init__(self):
        self.Z = 0
    
    def forward(self, A):
        """
        フォワード
        Parameters
        ----------
        A : 次の形のndarray, shape (batch_size, n_nodes)
            入力
        Returns
        ----------
        Z : 次の形のndarray, shape (batch_size, n_nodes)
            出力
        """ 
        Z = np.tanh(A)
        self.Z = Z
        return Z
    
    def backward(self, dZ):
        """
        バックワード
        Parameters
        ----------
        dZ : 次の形のndarray, shape (batch_size, n_nodes)
            後ろから流れてきた勾配
        Returns
        ----------
        dA : 次の形のndarray, shape (batch_size, n_nodes)
            前に流す勾配
        """
        dA = dZ  *  (1 - self.Z**2)
        return dA

class Softmax:
    """
    ソフトマックス関数
    """
    
    def __init__(self):
        self.Z = 0
    
    def forward(self, A):
        """
        フォワード
        Parameters
        ----------
        A : 次の形のndarray, shape (batch_size, n_nodes)
            入力
        Returns
        ----------
        Z : 次の形のndarray, shape (batch_size, n_nodes)
            出力
        """ 
        
        c = np.max(A)
        A = A - c
        ex = np.exp(A)
        Z = ex / (np.sum(ex, axis=1))[:, np.newaxis]
        self.Z = Z
        return Z
    
    def backward(self, y):
        """
        バックワード
        Parameters
        ----------
        y : 次の形のndarray, shape (batch_size, n_class)
            正解ラベル
        Returns
        ----------
        dA : 次の形のndarray, shape (batch_size, n_class)
            前に流す勾配
        """
        
        dA = self.Z - y
        
        return dA
    
class ReLU:
    """
    ReLU関数
    """
    
    def __init__(self):
        self.Z = None
    
    def forward(self, A):
        """
        フォワード
        Parameters
        ----------
        A : 次の形のndarray, shape (batch_size, n_nodes)
            入力
        Returns
        ----------
        Z : 次の形のndarray, shape (batch_size, n_nodes)
            出力
        """ 
        Z = np.maximum(0, A)
        self.Z = deepcopy(Z)
        return Z
    
    def backward(self, dZ):
        """
        バックワード
        Parameters
        ----------
        dZ : 次の形のndarray, shape (batch_size, n_nodes)
            後ろから流れてきた勾配
        Returns
        ----------
        dA : 次の形のndarray, shape (batch_size, n_nodes)
            前に流す勾配
        """
        
        dA = dZ  *  np.where(self.Z != 0, 1, self.Z)
        
        return dA

In [122]:
# 引数を問題１用に作り変えました
class SimpleRNN:
    """
    RNN
    出力が最終層だけ
    Parameters
    ----------
    initializer : 初期化方法のインスタンス
    optimizer : 最適化手法のインスタンス
    """
    def __init__(self, n_features, n_nodes, initializer, optimizer, activation, w_x, w_h, b):
        self.optimizer = optimizer
        self.activation = activation
        # 初期化
        # initializerのメソッドを使い、self.Wとself.Bを初期化する
        self.WX = w_x
        self.Wh = w_h
        self.B = b
        self.n_nodes = n_nodes
        self.A = None
        self.ht = None
        self.Z = None
        self.dA = None
        self.X = None
        self.X_ar = None
        self.n_features = n_features

    def forward(self, X):
        """
        フォワード
        Parameters
        ----------
        X : 次の形のndarray, shape (batch_size, n_sequences, n_features)
            入力
        Returns
        ----------
        ht : 次の形のndarray, shape (batch_size,n_nodes)
            出力
        """ 
        self.X_ar = X
        m, s, n = X.shape
        ht = np.zeros((m, self.n_nodes))
        A = np.empty((0, m, self.n_nodes))
        for i in range(s):
            ht = np.dot(X[:, i, :].reshape(m, n), self.WX) + np.dot(ht, self.Wh) + self.B
            ht = self.activation.forward(ht)
            A = np.vstack((A, ht[np.newaxis,:])) #shape (シーケンス,　バッチ、n_node)
            
        A = A.transpose(1, 0, 2)
        self.A = A
        return ht
     
    def backward(self, dA ):
        """
        バックワード
        Parameters
        ----------
        dA : 次の形のndarray, shape (batch_size,n_nodes)
            後ろから流れてきた勾配
        Returns
        ----------
        dX : 次の形のndarray, shape (batch_size, n_sequences, n_features)
            前に流す勾配
        """
        #self.dA = deepcopy(dA)
        m, s, n_nodes = self.A.shape
        htd = 0
        dX = np.zeros((s, m, self.n_features))
        for i in reversed(range(s)):
            #da = da + htd
            dA = dA * (1 - self.A[:, i, :]**2)#shape (m,n_nodes)
            self.dA = dA
            self.X = self.X_ar[:, i, :]
            self.ht = self.A[:, i, :]
            self = self.optimizer.update(self)
            dA = np.dot(dA, self.Wh.T) #shape(batch, n_nodes)
            dX[i, :, :] = np.dot(dA, self.WX.T) #dot後のshape (batch, n_features)
            
        dX = dX.transpose(1,0,2)
        return dX

In [185]:
class SimpleRNN2:
    """
    RNN
    出力が最終層だけ
    Parameters
    ----------
    initializer : 初期化方法のインスタンス
    optimizer : 最適化手法のインスタンス
    """
    def __init__(self, n_features, n_nodes, initializer, optimizer, activation):
        self.optimizer = optimizer
        self.activation = activation
        # 初期化
        # initializerのメソッドを使い、self.Wとself.Bを初期化する
        self.WX = initializer.W(n_features, n_nodes)
        self.Wh = initializer.W(n_nodes, n_nodes)
        self.B = initializer.B(1)
        self.n_nodes = n_nodes
        self.A = None
        self.ht = None
        self.Z = None
        self.dA = None
        self.X = None
        self.X_ar = None
        self.n_features = n_features

    def forward(self, X):
        """
        フォワード
        Parameters
        ----------
        X : 次の形のndarray, shape (batch_size, n_sequences, n_features)
            入力
        Returns
        ----------
        ht : 次の形のndarray, shape (batch_size,n_nodes)
            出力
        """ 
        self.X_ar = X
        m, s, n = X.shape
        print(m)
        print(s)
        print(n)
        ht = np.zeros((m, self.n_nodes))
        A = np.empty((0, m, self.n_nodes))
        for i in range(s):
            ht = np.dot(X[:, i, :].reshape(m, n), self.WX) + np.dot(ht, self.Wh) + self.B
            ht = self.activation.forward(ht)
            
            A = np.vstack((A, ht[np.newaxis,:])) #shape (シーケンス,　バッチ、n_node)
            
        print(ht)
        print('----------------')
            
        A = A.transpose(1, 0, 2)
        self.A = A
        return ht
     
    def backward(self, dA ):
        """
        バックワード
        Parameters
        ----------
        dA : 次の形のndarray, shape (batch_size,n_nodes)
            後ろから流れてきた勾配
        Returns
        ----------
        dX : 次の形のndarray, shape (batch_size, n_sequences, n_features)
            前に流す勾配
        """
        #self.dA = deepcopy(dA)
        m, s, n_nodes = self.A.shape
        htd = 0
        dX = np.zeros((s, m, self.n_features))
        for i in reversed(range(s)):
            #da = da + htd
            dA = dA * (1 - self.A[:, i, :]**2)#shape (m,n_nodes)
            self.dA = dA
            self.X = self.X_ar[:, i, :]
            self.ht = self.A[:, i, :]
            self = self.optimizer.update(self)
            dA = np.dot(dA, self.Wh.T) #shape(batch, n_nodes)
            dX[i, :, :] = np.dot(dA, self.WX.T) #dot後のshape (batch, n_features)
            
        dX = dX.transpose(1,0,2)
        return dX

# 【問題2】小さな配列でのフォワードプロパゲーションの実験

In [114]:
# data sample

# trainX, testX
x = np.array([[[1, 2], [2, 3], [3, 4]]])/100
# 
w_x = np.array([[1, 3, 5, 7], [3, 5, 7, 8]])/100
w_h = np.array([[1, 3, 5, 7], [2, 4, 6, 8], [3, 5, 7, 8], [4, 6, 8, 10]])/100
batch_size = x.shape[0] # 1
n_sequences = x.shape[1] # 3
n_features = x.shape[2] # 2
n_nodes = w_x.shape[1] # 4
h = np.zeros((batch_size, n_nodes))
b = np.array([1])
b

array([1])

In [57]:
n_features = x.shape[2] 
n_features

2

In [58]:
x.shape

(1, 3, 2)

In [66]:
rnn = SimpleRNN(2, 4, SimpleInitializer(), SGDrnn(lr=0.1), Tanh(), w_x, w_h, b)

In [67]:
h = rnn.forward(x)

In [68]:
h

array([[0.79494228, 0.81839002, 0.83939649, 0.85584174]])

# 映画レビューの分類

In [75]:
from importlib import reload
import sys
from imp import reload
import warnings
warnings.filterwarnings('ignore')
if sys.version[0] == '2':
    reload(sys)
    sys.setdefaultencoding("utf-8")

In [76]:
import pandas as pd

df1 = pd.read_csv('./input/word2vec-nlp-tutorial/labeledTrainData.tsv', delimiter="\t")
df1 = df1.drop(['id'], axis=1)
df1.head()

Unnamed: 0,sentiment,review
0,1,With all this stuff going down at the moment w...
1,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,0,The film starts with a manager (Nicholas Bell)...
3,0,It must be assumed that those who praised this...
4,1,Superbly trashy and wondrously unpretentious 8...


In [77]:
df2 = pd.read_csv('./input/imdb-review-dataset/imdb_master.csv',encoding="latin-1")
df2.head()

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [78]:
df = pd.concat([df1, df2]).reset_index(drop=True)
df.head()

Unnamed: 0.1,Unnamed: 0,file,label,review,sentiment,type
0,,,,With all this stuff going down at the moment w...,1.0,
1,,,,"\The Classic War of the Worlds\"" by Timothy Hi...",1.0,
2,,,,The film starts with a manager (Nicholas Bell)...,0.0,
3,,,,It must be assumed that those who praised this...,0.0,
4,,,,Superbly trashy and wondrously unpretentious 8...,1.0,


In [79]:
import re

import nltk
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english")) 
lemmatizer = WordNetLemmatizer()


def clean_text(text):
    text = re.sub(r'[^\w\s]','',text, re.UNICODE)
    text = re.sub(r'<br>','',text, re.UNICODE)
    text = re.sub(r'</br>','',text, re.UNICODE)
    text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    text = [word for word in text if not word in stop_words]
    text = " ".join(text)
    return text

df['Processed_Reviews'] = df.review.apply(lambda x: clean_text(x))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/KawakamiYohei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/KawakamiYohei/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [80]:
df1.shape

(25000, 2)

In [81]:
df2.shape

(100000, 5)

In [82]:
df.shape

(125000, 7)

In [83]:
c_documents = df['Processed_Reviews'].values

In [84]:
from janome.tokenizer import Tokenizer
t = Tokenizer()
corpus = []
for i in range(len(c_documents)):
  a = t.tokenize(c_documents[i], wakati=True)
  corpus += [a]

In [85]:
corpus[9]

['br',
 ' ',
 'br',
 ' ',
 'movie',
 ' ',
 'full',
 ' ',
 'reference',
 ' ',
 'like',
 ' ',
 'mad',
 ' ',
 'max',
 ' ',
 'ii',
 ' ',
 'wild',
 ' ',
 'one',
 ' ',
 'many',
 ' ',
 'others',
 ' ',
 'ladybug',
 ' ',
 'face',
 ' ',
 'clear',
 ' ',
 'reference',
 ' ',
 'tribute',
 ' ',
 'peter',
 ' ',
 'lorre',
 ' ',
 'movie',
 ' ',
 'masterpiece',
 ' ',
 'well',
 ' ',
 'talk',
 ' ',
 'much',
 ' ',
 'future']

In [141]:
from gensim.models import FastText
# from gensim.models import word2vec

model_ted = FastText(corpus, size=100, window=5,workers=4)  
# min_count=5,

In [142]:
model_ted.save("fasttext.model")

In [143]:
mex = 0
sample = 50
for i in range(sample):
    lenge = len(corpus[i])
    if mex < lenge:
        mex = lenge
print(mex) #文章の最大のシーケン数
s = mex

777


In [144]:
#分散表現のアレーを作成
#先頭シーケンス足りないところををゼロで埋める
X_train = np.zeros((sample, s, 100))
for i in range(sample):
    textlists = corpus[i]
    lenge = s - len(textlists)
    for j, text in enumerate(textlists):
        X_train[i, lenge+j, :] = model_ted.wv[text].reshape(-1)

In [145]:
y_train = df['sentiment'].values[:sample]

In [146]:
from sklearn.metrics import accuracy_score
def _cross_entropy_loss(z, y):
    z += 1e-7
    return - sum(sum(y * np.log(z))) / len(y)

def accuracy(y, y_pred):
    # accuracyを計算して返す
    return accuracy_score(y, y_pred)

In [186]:
rnn = SimpleRNN2(100,50,SimpleInitializer(), SGDrnn(lr=0.1), ReLU())
fc = FC(50, 1,SimpleInitializer(), SGD(lr=0.1))
sm = Softmax()

In [174]:
X_train.shape

(50, 777, 100)

In [188]:
from tqdm import tqdm

#学習
epoch = 100
for i in tqdm(range(epoch)):
    #forward
#     print(X_train.shape)
#     print(X_train)
    X = rnn.forward(X_train)
    print(X.shape)
    print(X)
    X = fc.forward(X)
#     print(X.shape)
#     print(X)
    pred = sm.forward(X)
    print(pred.shape)
    print(pred)
    y_pred = np.argmax(pred, axis=1)
    print(y_pred.shape)
    print(y_pred)
    print(str(i+1) + " :epoch")
    print("loss")
    print(_cross_entropy_loss(pred, y_train))
    print("accuracy")
    print(accuracy(y_train,y_pred))
    #back
    print(y_train.shape)
    d = sm.backward(y_pred[:, np.newaxis])
    print(d.shape)
    d = fc.backward(d)
    print(d.shape)
    d = rnn.backward(d)
    print(d.shape)

















  0%|          | 0/100 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

50
777
100


















  1%|          | 1/100 [00:02<04:49,  2.92s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

[[0.00000000e+00 0.00000000e+00 8.87632691e-01 ... 0.00000000e+00
  4.07412433e-01 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 6.15868117e-01 ... 0.00000000e+00
  1.38985081e-01 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 8.86789289e-01 ... 0.00000000e+00
  3.99801046e-01 0.00000000e+00]
 ...
 [0.00000000e+00 1.24906243e-02 2.43741365e-01 ... 0.00000000e+00
  1.30872808e-01 2.26033592e-01]
 [0.00000000e+00 0.00000000e+00 6.14131501e-01 ... 0.00000000e+00
  1.38802971e-01 0.00000000e+00]
 [2.36829874e-04 2.47940042e-03 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 4.93101266e-03]]
----------------
(50, 50)
[[0.00000000e+00 0.00000000e+00 8.87632691e-01 ... 0.00000000e+00
  4.07412433e-01 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 6.15868117e-01 ... 0.00000000e+00
  1.38985081e-01 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 8.86789289e-01 ... 0.00000000e+00
  3.99801046e-01 0.00000000e+00]
 ...
 [0.00000000e+00 1.24906243e-02 2.43741365e-01 ... 0.00000000e+00
  1.30

















  2%|▏         | 2/100 [00:05<04:48,  2.94s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

[[0.00000000e+00 0.00000000e+00 3.19816310e+00 ... 0.00000000e+00
  1.20944665e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 2.54707261e+00 ... 0.00000000e+00
  6.60918493e-01 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 3.19843237e+00 ... 0.00000000e+00
  1.20320580e+00 0.00000000e+00]
 ...
 [0.00000000e+00 0.00000000e+00 2.37952020e-01 ... 0.00000000e+00
  1.34557118e-01 3.03624764e-01]
 [0.00000000e+00 0.00000000e+00 2.54901067e+00 ... 0.00000000e+00
  6.60672660e-01 0.00000000e+00]
 [1.62166682e-03 2.82264009e-03 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 7.02261661e-03]]
----------------
(50, 50)
[[0.00000000e+00 0.00000000e+00 3.19816310e+00 ... 0.00000000e+00
  1.20944665e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 2.54707261e+00 ... 0.00000000e+00
  6.60918493e-01 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 3.19843237e+00 ... 0.00000000e+00
  1.20320580e+00 0.00000000e+00]
 ...
 [0.00000000e+00 0.00000000e+00 2.37952020e-01 ... 0.00000000e+00
  1.34

















  3%|▎         | 3/100 [00:08<04:48,  2.97s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]
----------------
(50, 50)
[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]
(50, 1)
[[nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]]
(50,)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
3 :epoch
loss
nan
accuracy
0.54
(50,)
(50, 1)
(50, 50)
(50, 777, 100)
50
777
100


















  4%|▍         | 4/100 [00:11<04:46,  2.98s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]
----------------
(50, 50)
[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]
(50, 1)
[[nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]]
(50,)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
4 :epoch
loss
nan
accuracy
0.54
(50,)
(50, 1)
(50, 50)
(50, 777, 100)
50
777
100


















  5%|▌         | 5/100 [00:14<04:43,  2.98s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]
----------------
(50, 50)
[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]
(50, 1)
[[nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]]
(50,)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
5 :epoch
loss
nan
accuracy
0.54
(50,)
(50, 1)
(50, 50)
(50, 777, 100)
50
777
100


















  6%|▌         | 6/100 [00:18<04:49,  3.08s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]
----------------
(50, 50)
[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]
(50, 1)
[[nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]]
(50,)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
6 :epoch
loss
nan
accuracy
0.54
(50,)
(50, 1)
(50, 50)
(50, 777, 100)
50
777
100


















  7%|▋         | 7/100 [00:21<04:49,  3.11s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]
----------------
(50, 50)
[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]
(50, 1)
[[nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]]
(50,)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
7 :epoch
loss
nan
accuracy
0.54
(50,)
(50, 1)
(50, 50)
(50, 777, 100)
50
777
100


















  8%|▊         | 8/100 [00:24<04:45,  3.10s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]
----------------
(50, 50)
[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]
(50, 1)
[[nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]]
(50,)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
8 :epoch
loss
nan
accuracy
0.54
(50,)
(50, 1)
(50, 50)
(50, 777, 100)
50
777
100


















  9%|▉         | 9/100 [00:27<04:46,  3.15s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]
----------------
(50, 50)
[[nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 ...
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]
 [nan nan nan ... nan nan nan]]
(50, 1)
[[nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]
 [nan]]
(50,)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
9 :epoch
loss
nan
accuracy
0.54
(50,)
(50, 1)
(50, 50)
(50, 777, 100)
50
777
100


KeyboardInterrupt: 

In [None]:
# # one hot
# y_train = targets[:sample]
# y_train_hot = np.zeros((len(y_train), 9))
# for i in range(len(y_train)):
#     y_train_hot[i, y_train[i]] = 1