<a href="https://colab.research.google.com/github/achanhon/AdversarialModel/blob/master/Untitled20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##quantification
L'objectif de ce notebook est d'apprendre un réseau qui pourra être utilisé en int8.

### speed up
Regardons d'abord quelle est l'accélération qu'on peut espérer.

In [1]:
import torch
import time
import torchvision
import matplotlib.pyplot as plt

In [2]:
with torch.no_grad():
    T = torch.zeros(100,5)
    for i in range(100):
        A = torch.randint(-64, 64, (2000,1, 128), dtype=torch.int8).cuda()
        B = torch.randint(-64, 64, (1,1000, 128), dtype=torch.int8).cuda()

        t0 = time.time()
        C = torch.matmul(A[:,0,:].float(),B[0].float().t()).to(dtype=torch.int8)
        tf = time.time()
        T[i][0] = tf-t0

        t0 = time.time()
        C = torch._int_mm(A[:,0,:].to(dtype=torch.int8),B[0].t().to(dtype=torch.int8)).to(dtype=torch.int8)
        tf = time.time()
        T[i][1] = tf-t0

        t0 = time.time()
        C = (A.float()*B.float()).sum(-1).to(dtype=torch.int8)
        tf = time.time()
        T[i][2] = tf-t0

        t0 = time.time()
        C = (A.half()*B.half()).sum(-1,dtype=torch.half).to(dtype=torch.int8)
        tf = time.time()
        T[i][3] = tf-t0

        t0 = time.time()
        C = (A*B).sum(-1,dtype=torch.int8)
        tf = time.time()
        T[i][4] = tf-t0


print(T.mean(0))

tensor([4.7651e-04, 7.7317e-05, 2.4205e-04, 7.0946e-05, 3.1669e-05])



on observe que le gain à espérer n'est pas du tout impressionnant - pire, la variance est très forte (ça semble dépendre du contenu de la matrice).
néanmoins, cela testons quand même.

### architecture

Le point central du réseau sera que
- si x est de dimension 8 avec ||x||_oo <= 5
- si ||w||_oo<=3
- alors |wx| <= 3 x 5 x 8 = 120
- donc activation(x)//16 est toujours un vecteur de dimension 8 avec ||x||_oo <= 5

on peut donc construire le module "MyLinear"

In [3]:
def activation(x):
    if x.dtype==torch.int8:
        return torch.clamp(x,-17,81)//16
    else:
        return torch.clamp(x,-17,81)/16

class MyLinear(torch.nn.Module):
    def __init__(self,size):
        super(MyLinear, self).__init__()
        assert size%8==0
        self.size=size
        self.data = torch.rand(1,size//8,8,8).cuda() * 6 - 3
        self.bias = torch.rand(1,size).cuda() * 6 - 3
        self.data = torch.nn.Parameter(self.data)
        self.bias = torch.nn.Parameter(self.bias)
        self.floatmode = True

    def clamp(self):
        with torch.no_grad():
            torch.clamp(self.data,-3,3)
            torch.clamp(self.bias,-3,3)

    def goINT(self):
        if self.floatmode:
            self.clamp()
            with torch.no_grad():
                tmp = torch.round(self.data.clone()).to(dtype=torch.int8)
                tmpbis = torch.round(self.bias.clone()).to(dtype=torch.int8)
                del self.data, self.bias
                self.data,self.bias = tmp,tmpbis
                self.floatmode = False

    def forward(self,x):
        assert not (self.floatmode and x.dtype == torch.int8)
        assert not (not self.floatmode and x.dtype != torch.int8)
        assert x.shape[1]== self.size

        x = x.view(x.shape[0],self.size//8,1,8)
        x = (x*self.data).sum(-1,dtype=x.dtype)
        x = activation(x.view(x.shape[0],self.size)+self.bias)
        return x

with torch.no_grad():
    layer = MyLinear(32)
    x = torch.rand(128,32).cuda()*10-5
    t0=time.time()
    y = layer(x)
    t1=time.time()-t0
    layer.goINT()
    t0=time.time()
    z = layer(x.to(dtype=torch.int8)).float()
    t2 = time.time()-t0

print((y-z).abs().sum(),t1,t2)

tensor(2195.5115, device='cuda:0') 0.025692224502563477 0.013092994689941406


bon comme on a la flemme de faire une myconvolution, ben on va faire du transformer...
(de toute façon, la différence de durée d'inférence ne motive pas à se casser les pieds !)

### baseline
construisons maintenant un petit réseau qui pourrait absorber des poids (en partie) int8

In [4]:
def channelPool(x):
    return torch.max(x[:,::2],x[:,1::2])

class SwinLike(torch.nn.Module):
    def __init__(self):
        super(SwinLike, self).__init__()
        self.proj = torch.nn.Conv2d(3,64,kernel_size=5,stride=3).cuda()

        self.m1 = MyLinear(64*10*10)
        self.m2 = MyLinear(32*10*10)
        self.m3 = MyLinear(16*10*10)
        self.m4 = MyLinear(8*10*10)
        self.m5 = MyLinear(8*10*10)

        self.final = torch.nn.Linear(800,10).cuda()
        self.floatmode = True

    def normalize(self):
        self.m1.clamp()
        self.m2.clamp()
        self.m3.clamp()
        self.m4.clamp()
        self.m5.clamp()

    def goINT(self):
        if self.floatmode:
            self.m1.goINT()
            self.m2.goINT()
            self.m3.goINT()
            self.m4.goINT()
            self.m5.goINT()
            self.floatmode = False

    def forward(self,x):
        x = activation(self.proj(x))
        if not self.floatmode:
            x = x.to(dtype=torch.int8)

        x = channelPool(self.m1(x.flatten(1)))
        x = channelPool(self.m2(x))
        x = channelPool(self.m3(x))
        x = self.m4(x)
        x = self.m5(x)

        return self.final(x.float())

In [5]:
trainset = torchvision.datasets.CIFAR10(
    root="build",
    train=True,
    download=True,
    transform=torchvision.transforms.ToTensor(),
)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=128, shuffle=True, num_workers=2
)

testset = torchvision.datasets.CIFAR10(
    root="build",
    train=False,
    download=True,
    transform=torchvision.transforms.ToTensor(),
)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=128, shuffle=True, num_workers=2
)


Files already downloaded and verified
Files already downloaded and verified


In [6]:
net = SwinLike()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)
meanloss = []
nb,nbOK = 0,0
for i in range(60):
    print("######",i,"######")
    for x,y in trainloader:
        z = net(x.cuda())
        loss = criterion(z,y.cuda().long())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        net.normalize()

        floss = float(loss)
        meanloss.append(floss)
        _,z = z.max(1)
        good = (y.cuda()==z).float()
        nb+=good.shape[0]
        nbOK+=good.sum().cpu().numpy()
        if len(meanloss)==100:
            print(sum(meanloss)/100, nbOK/nb)
            meanloss=[]
            # print([param for param in net.parameters()])

    with torch.no_grad():
        for x,y in trainloader:
            z = net(x.cuda())
            _,z = z.max(1)
            good = (y.cuda()==z).float()
            nb+=good.shape[0]
            nbOK+=good.sum().cpu().numpy()
        print("eval :",nbOK/nb)

###### 0 ######
2.3081105279922487 0.09453125
2.307591047286987 0.095390625
2.3060428500175476 0.09908854166666667
eval : 0.09957
###### 1 ######
2.3076293110847472 0.0995432616260677
2.295934431552887 0.10269236169615277
2.2595479559898375 0.10799040646301439
2.182074670791626 0.11569880761293282
eval : 0.14946
###### 2 ######
2.113828160762787 0.150402364757988
2.0561926758289335 0.15622675542993156
2.0177841234207152 0.16281416736871665
1.9733559489250183 0.1693573850039883
eval : 0.19551333333333334
###### 3 ######
1.9472794234752655 0.19678964989982073
1.9261636424064636 0.2015866892643934
1.9133905482292175 0.20591935719148108
1.8913186490535736 0.21034295141814097
eval : 0.2284325
###### 4 ######
1.8658569514751435 0.2296469669408415
1.8625450932979584 0.2331915056731064
1.846966073513031 0.23646003793513834
1.8310892140865327 0.23969318838485987
eval : 0.252622
###### 5 ######
1.8143472981452942 0.25381010756089845
1.7987209010124205 0.2565045510644863
1.7949659717082977 0.2592

KeyboardInterrupt: ignored

In [7]:
t0 = time.time()
with torch.no_grad():
    for x,y in trainloader:
        z = net(x.cuda())
        _,z = z.max(1)
        good = (y.cuda()==z).float()
        nb+=good.shape[0]
        nbOK+=good.sum().cpu().numpy()
    print("eval :",nbOK/nb,time.time()-t0)

eval : 0.3250900011377119 8.143792867660522


In [None]:
t0 = time.time()
net.goINT()
with torch.no_grad():
    for x,y in trainloader:
        z = net(x.cuda())
        _,z = z.max(1)
        good = (y.cuda()==z).float()
        nb+=good.shape[0]
        nbOK+=good.sum().cpu().numpy()
    print("eval :",nbOK/nb,time.time()-t0)

ben c'est vraiment pas terrible :-( et c'est pas plus rapide ?????