# Red convolucional con mecanismos atencionales

## Espaciales y de canales

In [1]:
import torch
import math
import torch.nn as nn
from torch.autograd import Variable
import torch.utils.data as Data

In [2]:
# Fix the seed of PyTorch's default random number generator.
torch.manual_seed(1)

<torch._C.Generator at 0x7fbbcc08c910>

Definición de algunas variables

In [3]:
# Hyperparameters used by the cells below.
epochs = 50        # number of full passes over the training set
batch_size = 100   # samples per mini-batch
lr = 1e-3          # learning rate

Carga de datos del dataset CIFAR10

In [4]:
import torchvision

# CIFAR-10 training and test splits, in that order; every image goes through
# ToTensor() when it is read.
# NOTE: the cache directory is called './mnist/' although it holds CIFAR-10.
train_data, test_data = (
    torchvision.datasets.CIFAR10(
        root='./mnist/',
        train=is_train,
        transform=torchvision.transforms.ToTensor(),
        download=True,
    )
    for is_train in (True, False)
)

Files already downloaded and verified
Files already downloaded and verified


In [5]:
# Sanity check: pull the first sample of each split, then report the split
# sizes and the image tensor shapes (one value per output line).
train_img, train_label = train_data[0]
test_img, test_label = test_data[0]

print(
    len(train_data),
    train_img.size(),
    len(test_data),
    test_img.size(),
    sep='\n',
)

50000
torch.Size([3, 32, 32])
10000
torch.Size([3, 32, 32])


In [6]:
# Mini-batch iterators over both splits: training batches are shuffled,
# test batches keep the dataset order.
train_loader = Data.DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = Data.DataLoader(
    test_data,
    batch_size=batch_size,
    shuffle=False,
)

In [7]:
import torch.nn.functional as F

class BasicConv(nn.Module):
    """2-D convolution optionally followed by batch normalisation and ReLU.

    Args:
        in_planes: number of input channels.
        out_planes: number of output channels (also kept as ``out_channels``).
        kernel_size, stride, padding, dilation, groups, bias: forwarded
            unchanged to ``nn.Conv2d``.
        relu: when true, an ``nn.ReLU`` is applied last.
        bn: when true, an ``nn.BatchNorm2d`` is applied right after the
            convolution.
    """

    def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0,
                 dilation=1, groups=1, relu=True, bn=True, bias=False):
        super().__init__()
        self.out_channels = out_planes
        self.conv = nn.Conv2d(
            in_planes,
            out_planes,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=groups,
            bias=bias,
        )
        # A disabled stage is stored as None so that forward() can skip it.
        if bn:
            self.bn = nn.BatchNorm2d(out_planes, eps=1e-5, momentum=0.01, affine=True)
        else:
            self.bn = None
        if relu:
            self.relu = nn.ReLU()
        else:
            self.relu = None

    def forward(self, x):
        """Apply the convolution, then whichever optional stages are enabled."""
        out = self.conv(x)
        for stage in (self.bn, self.relu):
            if stage is not None:
                out = stage(out)
        return out

class Flatten(nn.Module):
    """Collapse every dimension after the first into a single one."""

    def forward(self, x):
        """Return ``x`` viewed as ``(x.size(0), -1)``."""
        leading = x.shape[0]
        return x.view(leading, -1)

class ChannelGate(nn.Module):
    """Channel attention: rescale each channel by a learned gate in (0, 1).

    The input is globally average-pooled and max-pooled to one value per
    channel; both descriptors go through the same two-layer bottleneck made
    of 1x1 convolutions, the two results are summed, squashed with a sigmoid
    and multiplied back onto the input.

    Args:
        in_planes: number of channels of the input feature map.
        reduction_ratio: factor by which the bottleneck shrinks the channel
            count. The bottleneck width is ``in_planes // reduction_ratio``
            but never less than 1.
    """

    def __init__(self, in_planes, reduction_ratio=16):
        super(ChannelGate, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)

        # Integer division yields 0 whenever in_planes < reduction_ratio,
        # which would create an empty bottleneck; keep at least one unit.
        hidden_planes = max(1, in_planes // reduction_ratio)
        self.fc = nn.Sequential(
            nn.Conv2d(in_planes, hidden_planes, 1, bias=False),
            nn.ReLU(),
            nn.Conv2d(hidden_planes, in_planes, 1, bias=False),
        )
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return ``x`` scaled channel-wise by the attention gate."""
        avg_out = self.fc(self.avg_pool(x))
        max_out = self.fc(self.max_pool(x))
        out = avg_out + max_out
        # The gate has one value per channel and broadcasts over H and W.
        return self.sigmoid(out) * x

def logsumexp_2d(tensor):
    """Log-sum-exp over everything past the first two dimensions.

    The trailing dimensions are merged into one, and the per-row maximum is
    subtracted before exponentiating so that large inputs do not overflow.

    Returns:
        A tensor of shape ``(tensor.size(0), tensor.size(1), 1)``.
    """
    flat = tensor.view(tensor.size(0), tensor.size(1), -1)
    peak = torch.max(flat, dim=2, keepdim=True)[0]
    shifted_sum = torch.exp(flat - peak).sum(dim=2, keepdim=True)
    return peak + torch.log(shifted_sum)

class ChannelPool(nn.Module):
    """Reduce dimension 1 to two entries: its maximum and its mean."""

    def forward(self, x):
        """Return the max map and the mean map stacked along dimension 1."""
        peak = x.max(dim=1, keepdim=True)[0]
        average = x.mean(dim=1, keepdim=True)
        return torch.cat((peak, average), dim=1)

class SpatialGate(nn.Module):
    """Spatial attention: rescale every position by a learned gate in (0, 1).

    The input is reduced along the channel dimension by ``ChannelPool``
    (max and mean), a single ``BasicConv`` (convolution + batch norm, no
    ReLU) turns those two maps into one, and its sigmoid is multiplied onto
    the input.

    Args:
        kernel_size: side of the square convolution kernel. Must be a
            positive odd number so that the padding ``(kernel_size - 1) // 2``
            keeps the spatial size and the gate can be multiplied with the
            input.

    Raises:
        ValueError: if ``kernel_size`` is not a positive odd integer.
    """

    def __init__(self, kernel_size=7):
        super(SpatialGate, self).__init__()
        if kernel_size < 1 or kernel_size % 2 == 0:
            raise ValueError(
                "kernel_size must be a positive odd integer, got %r" % (kernel_size,)
            )
        self.compress = ChannelPool()
        self.spatial = BasicConv(
            2, 1, kernel_size, stride=1, padding=(kernel_size - 1) // 2, relu=False
        )

    def forward(self, x):
        """Return ``x`` scaled position-wise by the attention gate."""
        x_compress = self.compress(x)
        x_out = self.spatial(x_compress)
        # torch.sigmoid replaces F.sigmoid, which older PyTorch releases
        # flagged as deprecated.
        scale = torch.sigmoid(x_out)
        # scale has a single channel and broadcasts over the channels of x.
        return x * scale

class CBAM(nn.Module):
    """Attention block chaining an optional channel gate and spatial gate.

    Args:
        gate_channels: channel count handed to ``ChannelGate``.
        reduction_ratio: reduction ratio handed to ``ChannelGate``.
        spatial: build and apply the ``SpatialGate``.
        channel: build and apply the ``ChannelGate``.

    With both flags off the block returns its input unchanged.
    """

    def __init__(self, gate_channels, reduction_ratio=16, spatial=True, channel=True):
        super().__init__()
        self.spatial = spatial
        self.channel = channel

        # Only the requested gates are created, channel gate first.
        if self.channel:
            self.ChannelGate = ChannelGate(gate_channels, reduction_ratio=reduction_ratio)
        if self.spatial:
            self.SpatialGate = SpatialGate()

    def forward(self, x):
        """Apply the channel gate (if enabled), then the spatial gate (if enabled)."""
        out = self.ChannelGate(x) if self.channel else x
        return self.SpatialGate(out) if self.spatial else out
    
class Convolutional(nn.Module):
    """Two 5x5 convolutions (6 then 16 channels), each followed by ReLU and
    a 2x2 max pooling with stride 2.

    Args:
        in_size: number of channels of the input feature map.
    """

    def __init__(self, in_size):
        super().__init__()
        self.conv1 = nn.Conv2d(in_size, 6, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5)

    def forward(self, x):
        """Run both conv -> ReLU -> pool stages and return the feature map."""
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))
        return x
    
class model(nn.Module):
    """Small image classifier with an optional CBAM attention block.

    Pipeline: a 1x1 convolution lifts the 3 input channels to 32, an optional
    ``CBAM`` block re-weights that feature map, ``Convolutional`` extracts
    features and a single linear layer produces the class scores.

    Args:
        spatial_: enable the spatial gate inside the CBAM block.
        channel_: enable the channel gate inside the CBAM block.
        num_classes: number of output scores (defaults to 10).

    The attention block is only created when at least one gate is enabled.
    The linear layer expects 400 input features, i.e. the 16 x 5 x 5 output
    that ``Convolutional`` produces for 32 x 32 images.
    """

    def __init__(self, spatial_=True, channel_=True, num_classes=10):
        # Zero-argument super() does not look the class up through the
        # global name ``model``, so construction keeps working even after
        # that name has been rebound (the notebook later assigns an instance
        # to ``model``).
        super().__init__()

        self.att = channel_ or spatial_

        self.conv_ini = nn.Conv2d(3, 32, 1)

        if self.att:
            self.layer1 = CBAM(32, spatial=spatial_, channel=channel_, reduction_ratio=16)
        self.layer2 = Convolutional(32)

        self.fc = nn.Linear(400, num_classes)

    def forward(self, x):
        """Return the class scores for a batch of images."""
        x = self.conv_ini(x)

        if self.att:
            x = self.layer1(x)
        x = self.layer2(x)
        # Flatten everything except the batch dimension for the linear layer.
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x
    
# One network per attention configuration, each moved to the GPU.
modelNONE = model(spatial_=False, channel_=False).cuda()  # no attention block
modelS = model(spatial_=True, channel_=False).cuda()      # spatial gate only
modelC = model(spatial_=False, channel_=True).cuda()      # channel gate only
modelSC = model(spatial_=True, channel_=True).cuda()      # channel gate, then spatial gate

In [8]:
def get_accuracy(model, dataloader):
    """Return the fraction of samples in ``dataloader`` classified correctly.

    Args:
        model: module mapping a batch of inputs to per-class scores laid out
            along dimension 1.
        dataloader: iterable of ``(inputs, labels)`` batches.

    Returns:
        ``correct / total`` as a float.

    Raises:
        ZeroDivisionError: if ``dataloader`` yields no samples.

    The model is put in eval mode for the measurement and returned to the
    mode it was in before the call. Batches are moved to the device of the
    model's parameters, and no autograd graph is recorded.
    """
    try:
        device = next(model.parameters()).device
    except StopIteration:
        # Parameter-free model: keep using the GPU when there is one.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    was_training = model.training
    correct = 0
    total = 0
    model.eval()
    try:
        # Gradients are never needed here; skipping them saves memory.
        with torch.no_grad():
            for imgs, labels in dataloader:
                output = model(imgs.to(device))
                labels = labels.to(device)
                pred = output.max(1, keepdim=True)[1]  # index of the max logit
                correct += pred.eq(labels.view_as(pred)).sum().item()
                total += imgs.shape[0]
    finally:
        # Restore the caller's mode even if a batch raised.
        model.train(was_training)
    return correct / total

In [9]:
# Network variant to train. NOTE: this rebinds the name ``model``, which until
# now referred to the class, to one of its instances.
model = modelC

# Pass the notebook's learning rate explicitly instead of silently relying on
# Adam's default value.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_func = nn.CrossEntropyLoss().cuda()

for epoch in range(epochs):
    # ---- training pass -------------------------------------------------
    model.train()
    lossestr = []
    for x, y in train_loader:
        # Plain tensors are enough; the deprecated Variable wrapper is a no-op.
        x = x.cuda()
        y = y.cuda()

        out = model(x)
        loss = loss_func(out, y)
        lossestr.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # ---- evaluation after every epoch ------------------------------------
    model.eval()
    with torch.no_grad():
        losseste = []
        for x, y in test_loader:
            x = x.cuda()
            y = y.cuda()
            out = model(x)
            loss = loss_func(out, y)
            losseste.append(loss.item())
    model.train()
    acctr = get_accuracy(model, train_loader)
    accte = get_accuracy(model, test_loader)

    # Losses are averaged over the mini-batches of the epoch.
    print('Epoch: ', epoch, '| train loss: %.4f' % (sum(lossestr)/len(lossestr)),
                            '| test loss:  %.4f' % (sum(losseste)/len(losseste)),
                            '| train acc:  %.4f' % acctr,
                            '| test acc:   %.4f' % accte)

RuntimeError: Given groups=1, weight of size [6, 3, 5, 5], expected input[100, 32, 32, 32] to have 3 channels, but got 32 channels instead

In [None]:
Epoch:  0 | train loss: 1.8325 | test loss:  1.6151 | train acc:  0.4251 | test acc:   0.4238
Epoch:  1 | train loss: 1.5622 | test loss:  1.5016 | train acc:  0.4664 | test acc:   0.4625
Epoch:  2 | train loss: 1.4625 | test loss:  1.4221 | train acc:  0.4975 | test acc:   0.4903
Epoch:  3 | train loss: 1.3947 | test loss:  1.3782 | train acc:  0.5152 | test acc:   0.5101

Referencia (implementación original de los módulos de atención BAM/CBAM): https://github.com/Jongchan/attention-module