# 52. GPU 지원

- 병렬 계산에는 GPU가 훨씬 뛰어나므로 이번 단계에서는 GPU에서 구동하기 위한 구조를 만들 것임

## 52.1 쿠파이 설치 및 사용 방법

- 쿠파이는 GPU를 활용하여 병렬 계산을 해주는 라이브러리
- \$ pip install cupy
- DeZero에서 넘파이를 사용하는 부분을 쿠파이로 바꾸면 됨
    - ```import numpy as np```
    - ```import cupy as cp```

## 52.2 쿠다 모듈

- 쿠파이 관련 함수는 cuda.py에 모아둠

In [None]:
import numpy as np
# Assume CuPy is usable until the import below proves otherwise.
gpu_enable = True
try:
    import cupy as cp
    # Expose the module under the name `cupy` in addition to the alias `cp`.
    cupy = cp
except ImportError:
    # CuPy is not installed: `cp` and `cupy` stay unbound in this module, so
    # callers must consult `gpu_enable` before touching them.
    gpu_enable = False
from dezero import Variable

In [None]:
def get_array_module(x):
    """Pick the array library that matches `x`.

    Args:
        x (dezero.Variable or numpy.ndarray or cupy.ndarray): Value whose
            backing array decides between NumPy and CuPy. A `Variable` is
            unwrapped to its `data` attribute first.

    Returns:
        module: `numpy` when CuPy could not be imported; otherwise whatever
            `cp.get_array_module` reports for the (unwrapped) value.
    """
    data = x.data if isinstance(x, Variable) else x
    # Without CuPy there is only one possible answer.
    return cp.get_array_module(data) if gpu_enable else np


def as_numpy(x):
    """Convert `x` to a `numpy.ndarray`.

    Args:
        x (dezero.Variable, scalar, `numpy.ndarray` or `cupy.ndarray`):
            Arbitrary object that can be converted to `numpy.ndarray`.
            A `Variable` is unwrapped to its `data` attribute first.

    Returns:
        `numpy.ndarray`: A new 0-d array for scalars, `x` itself when it is
            already a NumPy array, otherwise the converted array.
    """
    if isinstance(x, Variable):
        x = x.data

    if np.isscalar(x):
        return np.array(x)
    if isinstance(x, np.ndarray):
        return x
    if not gpu_enable:
        # `cp` is unbound when the CuPy import failed, so calling
        # `cp.asnumpy` here would raise NameError. Nothing can live on the
        # GPU in that case; let NumPy perform the conversion itself.
        return np.asarray(x)
    return cp.asnumpy(x)


def as_cupy(x):
    """Convert `x` to a `cupy.ndarray`.

    Args:
        x (dezero.Variable, `numpy.ndarray` or `cupy.ndarray`): Arbitrary
            object that can be converted to `cupy.ndarray`. A `Variable` is
            unwrapped to its `data` attribute first.

    Returns:
        `cupy.ndarray`: Converted array.

    Raises:
        Exception: If CuPy could not be imported.
    """
    data = x.data if isinstance(x, Variable) else x
    if gpu_enable:
        return cp.asarray(data)
    raise Exception('CuPy cannot be loaded. Install CuPy!')

## 52.3 Variable/Layer/DataLoader 클래스 추가 구현

- DeZero의 다른 클래스들에 GPU 대응 기능을 추가함

In [None]:
...
# Array classes accepted as `Variable` data. Always a tuple, so both branches
# yield the same kind of object for `isinstance`, iteration and concatenation.
try:
    import cupy
    array_types = (np.ndarray, cupy.ndarray)
except ImportError:
    # The trailing comma matters: `(np.ndarray)` is the bare class, not a tuple.
    array_types = (np.ndarray,)  # (1)


class Variable:
    # NOTE(review): presumably raises Variable's precedence over ndarray in
    # mixed binary operations — confirm against NumPy's __array_priority__ docs.
    __array_priority__ = 200

    def __init__(self, data, name=None):
        # None is accepted as-is; anything else must be an instance of one of
        # the classes in `array_types`, otherwise TypeError is raised.
        if data is not None:
            if not isinstance(data, array_types):  # (1)
                raise TypeError('{} is not supported'.format(type(data)))
...
    def backward(self, retain_grad=False, create_graph=False):
        # When no gradient is set yet, seed it with ones shaped like `data`,
        # created by the array module that `get_array_module` returns for
        # `data`.
        if self.grad is None:
            xp = dezero.cuda.get_array_module(self.data)  # (2)
            self.grad = Variable(xp.ones_like(self.data))

In [None]:
class Variable:
    ...

    def to_cpu(self):
        """Replace `data` with the result of `dezero.cuda.as_numpy`.

        A `None` payload is left untouched.
        """
        if self.data is None:
            return
        self.data = dezero.cuda.as_numpy(self.data)

    def to_gpu(self):
        """Replace `data` with the result of `dezero.cuda.as_cupy`.

        A `None` payload is left untouched.
        """
        if self.data is None:
            return
        self.data = dezero.cuda.as_cupy(self.data)

In [None]:
class Layer:
    ...

    def to_cpu(self):
        """Call `to_cpu()` on every object yielded by `self.params()`."""
        parameters = self.params()
        for p in parameters:
            p.to_cpu()

    def to_gpu(self):
        """Call `to_gpu()` on every object yielded by `self.params()`."""
        parameters = self.params()
        for p in parameters:
            p.to_gpu()

In [None]:
...
import numpy as np
from dezero import cuda

class DataLoader:
    """Serve mini-batches from an indexable dataset as NumPy or CuPy arrays.

    Args:
        dataset: Object supporting `len()` and integer indexing; each item is
            indexed with `[0]` (input) and `[1]` (target).
        batch_size (int): Number of examples per batch.
        shuffle (bool): Stored on the instance; not read in this excerpt.
        gpu (bool): When true, batches are built with `cuda.cupy` instead of
            NumPy.

    Note:
        `reset()` is not part of this excerpt; it is expected to initialise
        `self.iteration` and `self.index`, which `__next__` reads.
    """

    def __init__(self, dataset, batch_size, shuffle=True, gpu=False):
        self.dataset = dataset
        self.data_size = len(dataset)
        self.batch_size = batch_size
        # The last batch may be smaller, hence the ceiling.
        self.max_iter = math.ceil(self.data_size / batch_size)
        self.shuffle = shuffle
        self.gpu = gpu

        self.reset()

    def __next__(self):
        """Return the next `(x, t)` batch, or reset and stop when exhausted."""
        if self.iteration >= self.max_iter:
            self.reset()
            raise StopIteration

        start = self.iteration * self.batch_size
        stop = start + self.batch_size
        samples = [self.dataset[idx] for idx in self.index[start:stop]]

        # Build the batch arrays with the library matching the current mode.
        xp = np
        if self.gpu:
            xp = cuda.cupy
        x = xp.array([sample[0] for sample in samples])
        t = xp.array([sample[1] for sample in samples])

        self.iteration += 1
        return x, t

    def to_cpu(self):
        """Build subsequent batches with NumPy."""
        self.gpu = False

    def to_gpu(self):
        """Build subsequent batches with `cuda.cupy`."""
        self.gpu = True

## 52.4 함수 추가 구현

- GPU 대응과 관련하여 함수를 수정함

In [None]:
from dezero import cuda

class Sin(Function):
    """Sine function whose forward pass uses the array module of its input."""

    def forward(self, x):
        # Use the `sin` of whichever module `get_array_module` returns for x.
        return cuda.get_array_module(x).sin(x)

    def backward(self, gy):
        # Exactly one input is expected; unpacking enforces that.
        (x,) = self.inputs
        return gy * cos(x)

In [None]:
def as_array(x, array_module=np):
    """Wrap a scalar in an array; pass everything else through unchanged.

    Args:
        x: Value to normalise.
        array_module: Module providing `array` (NumPy by default); only used
            when `x` is a scalar according to `np.isscalar`.

    Returns:
        `array_module.array(x)` for scalars, otherwise `x` itself.
    """
    return array_module.array(x) if np.isscalar(x) else x

def add(x0, x1):
    """Apply `Add` to `x0` and `x1`, converting a scalar `x1` to an array first.

    The array module used for the conversion is the one
    `dezero.cuda.get_array_module` returns for `x0.data`.
    """
    xp = dezero.cuda.get_array_module(x0.data)
    rhs = as_array(x1, xp)
    return Add()(x0, rhs)

def mul(x0, x1):
    """Apply `Mul` to `x0` and `x1`, converting a scalar `x1` to an array first.

    The array module used for the conversion is the one
    `dezero.cuda.get_array_module` returns for `x0.data`.
    """
    xp = dezero.cuda.get_array_module(x0.data)
    rhs = as_array(x1, xp)
    return Mul()(x0, rhs)

## 52.5 GPU로 MNIST 학습하기

- MNIST 학습 코드를 GPU에서 실행해 봅시다. 

In [None]:
# Train an MLP on MNIST, moving the data loader and the model to the GPU when
# CuPy is available, and report the average loss and wall-clock time per epoch.
if '__file__' in globals():
    # Running as a script: make the parent directory importable.
    import os, sys
    sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
import time
import dezero
import dezero.functions as F
from dezero import optimizers
from dezero import DataLoader
from dezero.models import MLP


max_epoch = 5
batch_size = 100

train_set = dezero.datasets.MNIST(train=True)
train_loader = DataLoader(train_set, batch_size)
model = MLP((1000, 10))
optimizer = optimizers.SGD().setup(model)

# GPU mode
if dezero.cuda.gpu_enable:
    train_loader.to_gpu()
    model.to_gpu()

for epoch in range(max_epoch):
    start = time.time()
    sum_loss = 0

    for x, t in train_loader:
        y = model(x)
        loss = F.softmax_cross_entropy(y, t)
        # Clear gradients before backprop, then apply the optimizer update.
        model.cleargrads()
        loss.backward()
        optimizer.update()
        # Weight the batch loss by batch size so the epoch figure below is a
        # per-example average even if batches differ in size.
        sum_loss += float(loss.data) * len(t)

    elapsed_time = time.time() - start
    print('epoch: {}, loss: {:.4f}, time: {:.4f}[sec]'.format(
        epoch + 1, sum_loss / len(train_set), elapsed_time))




<img src="image/그림52-1.png" width="50%" height="50%"></img>  

# 53. 모델 저장 및 읽어오기
- 

## 53.1 넘파이의 save 함수와 load 함수
- 

## 53.2 Layer 클래스의 매개변수를 평평하게
- 

<img src="image/그림53-1.png" width="50%" height="50%"></img>  

## 53.3 Layer 클래스의 save 함수와 load 함수
- 

In [None]:
# Train an MLP on MNIST, resuming from 'my_mlp.npz' when that file exists and
# writing the weights back to the same file after training.

# `os` is needed unconditionally for the checkpoint check below, so it is
# imported outside the `__file__` guard. The guard is false in a notebook or
# REPL, which would otherwise leave `os` undefined and raise NameError at
# `os.path.exists`.
import os
import sys

if '__file__' in globals():
    # Running as a script: make the parent directory importable.
    sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
import dezero
import dezero.functions as F
from dezero import optimizers
from dezero import DataLoader
from dezero.models import MLP


max_epoch = 3
batch_size = 100

train_set = dezero.datasets.MNIST(train=True)
train_loader = DataLoader(train_set, batch_size)
model = MLP((1000, 10))
optimizer = optimizers.SGD().setup(model)

# Resume from previously saved weights when a checkpoint is present.
if os.path.exists('my_mlp.npz'):
    model.load_weights('my_mlp.npz')

for epoch in range(max_epoch):
    sum_loss = 0

    for x, t in train_loader:
        y = model(x)
        loss = F.softmax_cross_entropy(y, t)
        # Clear gradients before backprop, then apply the optimizer update.
        model.cleargrads()
        loss.backward()
        optimizer.update()
        # Weight the batch loss by batch size so the epoch figure below is a
        # per-example average even if batches differ in size.
        sum_loss += float(loss.data) * len(t)

    print('epoch: {}, loss: {:.4f}'.format(
        epoch + 1, sum_loss / len(train_set)))

model.save_weights('my_mlp.npz')


# 54. 드롭아웃과 테스트 모드
- 

## 54.1  드롭아웃이란
- 


<img src="image/그림54-1.png" width="50%" height="50%"></img>  

## 54.2 역 드롭아웃
- 

## 54.3 테스트 모드 추가
- 

## 54.4 드롭아웃 구현
- 

In [None]:
# Compare the output of F.dropout in the default mode and inside `test_mode()`.
if '__file__' in globals():
    # Running as a script: make the parent directory importable.
    import os, sys
    sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
import numpy as np
from dezero import test_mode
import dezero.functions as F

x = np.ones(5)
print(x)

# When training
y = F.dropout(x)
print(y)

# When testing (predicting)
with test_mode():
    y = F.dropout(x)
    print(y)

# 55. CNN 메커니즘(1)
- 

## 55.1 CNN 신경망의 구조
- 

<img src="image/그림55-1.png" width="50%" height="50%"></img>  

## 55.2 합성곱 연산
- 


<img src="image/그림55-2.png" width="50%" height="50%"></img>  

<img src="image/그림55-3.png" width="50%" height="50%"></img>  

<img src="image/그림55-4.png" width="50%" height="50%"></img>  

## 55.3 패딩
- 


<img src="image/그림55-5.png" width="50%" height="50%"></img>  

## 55.4 스트라이드
- 


<img src="image/그림55-6.png" width="50%" height="50%"></img>  

## 55.5 출력 크기 계산 방법
- 

In [None]:
def get_conv_outsize(input_size, kernel_size, stride, pad):
    """Return the output length of a convolution along one axis.

    Args:
        input_size: Length of the input along the axis.
        kernel_size: Length of the kernel along the axis.
        stride: Step between consecutive kernel positions.
        pad: Padding added to each side of the input.

    Returns:
        `(input_size + 2 * pad - kernel_size) // stride + 1`.
    """
    padded_size = input_size + 2 * pad
    return (padded_size - kernel_size) // stride + 1


# Height/width pairs for a square 4x4 input and 3x3 kernel.
H = W = 4    # Input size
KH = KW = 3  # Kernel size
SH = SW = 1  # Kernel stride
PH = PW = 1  # Padding size

# Height and width are computed independently with the same formula.
OH = get_conv_outsize(input_size=H, kernel_size=KH, stride=SH, pad=PH)
OW = get_conv_outsize(input_size=W, kernel_size=KW, stride=SW, pad=PW)
print(OH, OW)


# 56. CNN 메커니즘(2)
- 

## 56.1 3차원 텐서
- 

<img src="image/그림56-1.png" width="50%" height="50%"></img>  

## 56.2 블록으로 생각하기
- 

<img src="image/그림56-2.png" width="50%" height="50%"></img>  

<img src="image/그림56-3.png" width="50%" height="50%"></img>  

<img src="image/그림56-4.png" width="50%" height="50%"></img>  

## 56.3 미니배치 처리
- 

<img src="image/그림56-5.png" width="50%" height="50%"></img>  

## 56.4 풀링 층

- 

<img src="image/그림56-6.png" width="50%" height="50%"></img>  

<img src="image/그림56-7.png" width="50%" height="50%"></img>  

<img src="image/그림56-8.png" width="50%" height="50%"></img>  

# 57. conv2d 함수와 pooling 함수

- 

## 57.1 im2col에 의한 전개
- 

<img src="image/그림57-1.png" width="50%" height="50%"></img>  

<img src="image/그림57-2.png" width="50%" height="50%"></img>  

## 57.2 conv2d 함수 구현

- 

<img src="image/표57-1.png" width="50%" height="50%"></img>  

<img src="image/그림57-3.png" width="50%" height="50%"></img>  

## 57.3 Conv2d 계층 구현
- 

<img src="image/표57-2.png" width="50%" height="50%"></img>  

## 57.4 pooling 함수 구현

- 

<img src="image/그림57-4.png" width="50%" height="50%"></img>  

<img src="image/그림57-5.png" width="50%" height="50%"></img>  

In [None]:
# Shape checks for F.im2col and F.conv2d_simple on random inputs.
if '__file__' in globals():
    # Running as a script: make the parent directory importable.
    import os, sys
    sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
import numpy as np
from dezero import Variable
import dezero.functions as F


# im2col
x1 = np.random.rand(1, 3, 7, 7)
col1 = F.im2col(x1, kernel_size=5, stride=1, pad=0, to_matrix=True)
print(col1.shape)  # (9, 75)

x2 = np.random.rand(10, 3, 7, 7)  # 10 data items
kernel_size = (5, 5)
stride = (1, 1)
pad = (0, 0)
col2 = F.im2col(x2, kernel_size, stride, pad, to_matrix=True)
print(col2.shape)  # (90, 75)


# conv2d
N, C, H, W = 1, 5, 15, 15
OC, (KH, KW) = 8, (3, 3)
x = Variable(np.random.randn(N, C, H, W))
# NOTE: this rebinds W (the input width above) to the weight array.
W = np.random.randn(OC, C, KH, KW)
y = F.conv2d_simple(x, W, b=None, stride=1, pad=1)
y.backward()
print(y.shape)  # (1, 8, 15, 15)
print(x.grad.shape)  # (1, 5, 15, 15)
