## 20->40, 40->80, 80->160

In [1]:
import torch

import numpy as np
import pickle
import matplotlib.pyplot as plt
import sys
import time
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Categorical

import time

import gym

In [2]:
def ob2torch(observation):
    return torch.tensor(observation.copy().reshape(3, observation.shape[0], observation.shape[1])).float()

In [3]:
env = gym.make("Pong-v0")
observation = env.reset()
print(observation.shape)
observations = []

(210, 160, 3)


In [4]:
batch_size = 1
n_channels = 3
n_dim_x = observation.shape[0]
n_dim_y = observation.shape[1]

In [5]:
activation = {}
def get_activation(name):
    def hook(module, input, output):
        activation[name] = output.detach()
    return hook

In [6]:
class Conv2D(nn.Module):
    def __init__(self, n_out_channels_1=20, n_out_channels_2=40):
        super(Conv2D, self).__init__()
        self.conv1 = nn.Conv2d(3, n_out_channels_1, kernel_size=3)
        self.conv2 = nn.Conv2d(n_out_channels_1, n_out_channels_2, kernel_size=3)

    def forward(self, x):    
        model = torch.nn.Sequential(
            self.conv1,
            self.conv2
        )
        return model(x)

In [7]:
class Conv2D_Chunk(nn.Module):
    def __init__(self, n_out_channels_1=20, n_out_channels_2=40):
        super(Conv2D_Chunk, self).__init__()
        self.kernel_size = 3
        self.conv1 = nn.Conv2d(3, n_out_channels_1, kernel_size=self.kernel_size) # did this to avoid padding problems when redoing indexes
        self.part_conv1 = nn.Conv2d(3, n_out_channels_1, kernel_size=self.kernel_size) # don't overwrite forward hooks
        self.conv2 = nn.Conv2d(n_out_channels_1, n_out_channels_2, kernel_size=self.kernel_size) # did this to avoid padding problems when redoing indexes
        self.part_conv2 = nn.Conv2d(n_out_channels_1, n_out_channels_2, kernel_size=self.kernel_size) # don't overwrite forward hooks
        self.activations = {}
        self.x_prev = None
        
    def forward(self, x):
        ## Get difference of frames
        if self.x_prev is not None:
            x_diff = (self.x_prev - x) #.view(1,1,n_dim_x,n_dim_y)
            out = activation['conv2']
        else:
            out = self.conv2(self.conv1(x))
            self.x_prev = x
            return out
            
        ## Get indices to redo
        redo_idx = x_diff.nonzero()
        if redo_idx.nelement() == 0:
            out = activation['conv2']
            return out

        min_idx_x = redo_idx.min(-2)[0][2].item()
        min_idx_y = redo_idx.min(-2)[0][3].item()
        max_idx_x = redo_idx.max(-2)[0][2].item()
        max_idx_y = redo_idx.max(-2)[0][3].item()
        #print(min_idx_x, min_idx_y, max_idx_x, max_idx_y)
        
        ## Fix indices on the edge since padding is not currently supported
        if min_idx_x < self.kernel_size - 1:
            min_idx_x = self.kernel_size - 1
        if min_idx_y < self.kernel_size - 1:
            min_idx_y = self.kernel_size - 1
        if max_idx_x >= n_dim_x - (self.kernel_size - 1):
            max_idx_x = n_dim_x - self.kernel_size
        if max_idx_y >= n_dim_y - (self.kernel_size - 1):
            max_idx_y = n_dim_y - self.kernel_size

        #print(min_idx_x, max_idx_x, min_idx_y, max_idx_y)
        ## Redo indices
        r_x1 = min_idx_x - (self.kernel_size - 1)
        r_x2 = max_idx_x + self.kernel_size
        r_y1 = min_idx_y - (self.kernel_size - 1)
        r_y2 = max_idx_y + self.kernel_size
        
        redo_area = self.part_conv2(self.part_conv1(x[:,:,r_x1:r_x2,r_y1:r_y2]))
        
        out[:,:,r_x1:r_x1+redo_area.shape[2],r_y1:r_y1+redo_area.shape[3]] = redo_area
        activation['conv2'][:,:,r_x1:r_x1+redo_area.shape[2],r_y1:r_y1+redo_area.shape[3]] = redo_area
            
        self.x_prev = x
        return out

In [8]:
#conv_model = Conv2D_Chunk()
#conv_model.conv1.register_forward_hook(get_activation('conv1'))

In [9]:
#y = conv_model(t.reshape(batch_size,n_channels,n_dim_x,n_dim_y))

## Compare Conv_Block with Conv2D for 1000 frames, with no backprop

#### Conv2D v.s. Chunk
 - Use the same observations to make sure results are the same
 - Lots of overhead, but still worth it if using a lot of filters (e.g. 200)

In [10]:
env = gym.make("Pong-v0")
observation = env.reset()
observations = []

In [11]:
## Gather observations
n_steps=1000
for i in range(n_steps):
    observation,_,done,_ = env.step(np.random.choice(range(env.action_space.n)))
    observations.append(observation.copy())
    if done:
        env.reset()

In [12]:
n_out_c1 = 80 
n_out_c2 = 160 
n_tests = 20
times_old = []
times_new = []

## Test old/standard version

In [13]:
for i in range(n_tests):
    conv_old = Conv2D(n_out_c1, n_out_c2)
    start = time.time()
    for i in range(n_steps):
        y = conv_old(ob2torch(observations[i]).reshape(batch_size,n_channels,n_dim_x,n_dim_y))        
    end = time.time()
    times_old.append(end - start)

KeyboardInterrupt: 

In [None]:
text_file = open("test_standard_conv_{}_{}_out.txt".format(n_out_c1, n_out_c2), "w")
text_file.write("Mean time: {0:.2f}\n".format(np.mean(times_old)))
for i in range(n_tests):
    text_file.write("Run {0:d} - time: {1:.2f}\n".format(i+1, times_old[i]))
text_file.close()

## Test new version

In [13]:
conv_model = Conv2D_Chunk(n_out_c1, n_out_c2)
conv_model.conv2.register_forward_hook(get_activation('conv2'))

<torch.utils.hooks.RemovableHandle at 0x7f35b710b7b8>

In [13]:
for i in range(n_tests):
    conv_model = Conv2D_Chunk(n_out_c1, n_out_c2)
    conv_model.conv2.register_forward_hook(get_activation('conv2'))
    start = time.time()
    for i in range(n_steps):
        y = conv_model(ob2torch(observations[i]).reshape(batch_size,n_channels,n_dim_x,n_dim_y))        
    end = time.time()
    times_new.append(end - start)

In [14]:
text_file = open("test_new_conv_{}_{}_out.txt".format(n_out_c1, n_out_c2), "w")
text_file.write("Mean time: {0:.2f}\n".format(np.mean(times_new)))
for i in range(n_tests):
    text_file.write("Run {0:d} - time: {1:.2f}\n".format(i+1, times_new[i]))
text_file.close()

## Try backprop

In [14]:
n_out_c1 = 80

In [15]:
n_out_c2 = 80

In [50]:
conv_model = Conv2D_Chunk(n_out_c1, n_out_c2)
conv_model.conv2.register_forward_hook(get_activation('conv2'))
y = conv_model(ob2torch(observations[0]).reshape(batch_size,n_channels,n_dim_x,n_dim_y))  
optimizer = optim.SGD(conv_model.parameters(), lr=1e-3, momentum=1e-3)
target = torch.tensor(np.zeros((y.shape[1], y.shape[2], y.shape[3]))).float()

In [51]:
np.random.choice(range(100,500))

458

In [74]:
results = []
n_steps=100
seed = np.random.choice(range(100,500))
start = time.time()
conv_model.x_prev = None ## Necessary to reset graph, so don't have carryover between backward passes
for i in range(n_steps):
    print('a')
    optimizer.zero_grad()
    
    y = conv_model(ob2torch(observations[i+seed]).reshape(batch_size,n_channels,n_dim_x,n_dim_y))  
    
    loss = F.l1_loss(y.reshape((y.shape[1], y.shape[2], y.shape[3])), target)
    
    #if i != n_steps - 1:
    #    loss.backward(retain_graph=True)
    #else:
    #    loss.backward()
    results.append(y.detach().numpy().copy())

end = time.time()
print(end - start)

a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
2.2929999828338623


#### Test against old model

In [72]:
conv_model_o = Conv2D(n_out_c1, n_out_c2)
optimizer = optim.SGD(conv_model_o.parameters(), lr=1e-3, momentum=1e-3)
target = torch.tensor(np.zeros((y.shape[1], y.shape[2], y.shape[3]))).float()

In [73]:
results_o = []
n_steps=100
start = time.time()
for i in range(n_steps):
    print('a')
    optimizer.zero_grad()
    
    y = conv_model_o(ob2torch(observations[i+seed]).reshape(batch_size,n_channels,n_dim_x,n_dim_y))  
    
    loss = F.l1_loss(y.reshape((y.shape[1], y.shape[2], y.shape[3])), target)
    
    #loss.backward()
    
    results_o.append(y.detach().numpy().copy())
    if done:
        env.reset()
end = time.time()
print(end - start)

a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
a
6.840091705322266


## Try backprop: after 1k steps
The computational graph of the reuse model is much larger than that of the base model, so if there is a target or reward signal that we can backpropagate on every step or every few steps, then training on the reuse model is worth it. Otherwise, it just eats up too much memory resource
 - At 10 steps, using the reuse model is much faster
 - At 100 steps, they are about the same
 - At 1000 steps, using the base model is about twice as fast

In [25]:
conv_model_o = Conv2D(n_out_c1, n_out_c2)
optimizer = optim.SGD(conv_model_o.parameters(), lr=1e-3, momentum=1e-3)
target = torch.tensor(np.zeros((y.shape[1], y.shape[2], y.shape[3]))).float()

In [26]:
conv_model = Conv2D_Chunk(n_out_c1, n_out_c2)
conv_model.conv2.register_forward_hook(get_activation('conv2'))

<torch.utils.hooks.RemovableHandle at 0x7f9a58b870f0>

In [27]:
n_steps=100
optimizer = optim.SGD(conv_model.parameters(), lr=1e-3, momentum=1e-3)

start = time.time()
conv_model.x_prev = None ## Necessary to reset graph, so don't have carryover between backward passes
for i in range(n_steps):
    #print('a')
    #optimizer.zero_grad()
    
    y = conv_model(ob2torch(observations[i+100]).reshape(batch_size,n_channels,n_dim_x,n_dim_y))  
    
#loss = F.l1_loss(y.reshape((y.shape[1], y.shape[2], y.shape[3])), target)

#loss.backward()


#results.append(y.detach().numpy().copy())

end = time.time()
print(end - start)

1.0881381034851074


In [28]:
conv_model = Conv2D(n_out_c1, n_out_c2)

#results_o = []
optimizer = optim.SGD(conv_model.parameters(), lr=1e-3, momentum=1e-3)
start = time.time()
for i in range(n_steps):
    
    y = conv_model(ob2torch(observations[i+100]).reshape(batch_size,n_channels,n_dim_x,n_dim_y))  
    
#loss = F.l1_loss(y.reshape((y.shape[1], y.shape[2], y.shape[3])), target)

#loss.backward()

#results_o.append(y.detach().numpy().copy())

end = time.time()
print(end - start)

3.17580246925354


## Use profiler
#### Note: profiler affects timing, so if you use time.time() it will not match those found in profile

In [83]:
conv_model = Conv2D_Chunk(n_out_c1, n_out_c2)
conv_model.conv2.register_forward_hook(get_activation('conv2'))

<torch.utils.hooks.RemovableHandle at 0x7f9a58b537f0>

In [80]:
n_steps=15
optimizer = optim.SGD(conv_model.parameters(), lr=1e-3, momentum=1e-3)


conv_model.x_prev = None ## Necessary to reset graph, so don't have carryover between backward passes
start = time.time()
with torch.autograd.profiler.profile() as prof:
    for i in range(n_steps):
        y = conv_model(ob2torch(observations[i]).reshape(batch_size,n_channels,n_dim_x,n_dim_y))
    #loss = F.l1_loss(y.reshape((y.shape[1], y.shape[2], y.shape[3])), target)
    #loss.backward()
end = time.time()
print(end - start)

0.7746920585632324


In [85]:
n_steps=15
optimizer = optim.SGD(conv_model.parameters(), lr=1e-3, momentum=1e-3)


conv_model.x_prev = None ## Necessary to reset graph, so don't have carryover between backward passes
start = time.time()

for i in range(n_steps):
    y = conv_model(ob2torch(observations[i+100]).reshape(batch_size,n_channels,n_dim_x,n_dim_y))
    #loss = F.l1_loss(y.reshape((y.shape[1], y.shape[2], y.shape[3])), target)
    #loss.backward()
end = time.time()
print(end - start)

0.23400497436523438


In [81]:
print(prof.key_averages().table(sort_by="cpu_time_total"))

------------------------  ---------------  ---------------  ---------------  ---------------  ---------------
Name                             CPU time        CUDA time            Calls        CPU total       CUDA total
------------------------  ---------------  ---------------  ---------------  ---------------  ---------------
is_floating_point                 0.307us          0.000us               48         14.715us          0.000us
expand                            1.692us          0.000us               24         40.598us          0.000us
select                            1.060us          0.000us               48         50.884us          0.000us
view                              2.441us          0.000us               24         58.594us          0.000us
reshape                           3.909us          0.000us               15         58.639us          0.000us
as_strided                        1.996us          0.000us               30         59.882us          0.000us
tensor    

In [86]:
conv_model = Conv2D(n_out_c1, n_out_c2)
optimizer = optim.SGD(conv_model.parameters(), lr=1e-3, momentum=1e-3)

start = time.time()
#with torch.autograd.profiler.profile() as prof:
for i in range(n_steps):
    y = conv_model(ob2torch(observations[i+100]).reshape(batch_size,n_channels,n_dim_x,n_dim_y))
    #loss = F.l1_loss(y.reshape((y.shape[1], y.shape[2], y.shape[3])), target)
    #loss.backward()
end = time.time()
print(end - start)

0.3894834518432617


In [66]:
print(prof.key_averages().table(sort_by="cpu_time_total"))

------------------------  ---------------  ---------------  ---------------  ---------------  ---------------
Name                             CPU time        CUDA time            Calls        CPU total       CUDA total
------------------------  ---------------  ---------------  ---------------  ---------------  ---------------
reshape                           7.099us          0.000us               15        106.488us          0.000us
as_strided                        3.674us          0.000us               30        110.207us          0.000us
tensor                            1.974us          0.000us               60        118.458us          0.000us
_cast_Float                      92.760us          0.000us               15       1391.398us          0.000us
thnn_conv2d_forward           13127.880us          0.000us               30     393836.386us          0.000us
thnn_conv2d                   13129.270us          0.000us               30     393878.114us          0.000us
_convoluti

In [15]:
conv_model = Conv2D_Chunk(60)
#conv_model.conv1.load_state_dict(conv_old.conv1.state_dict())
#conv_model.part_conv.load_state_dict(conv_old.conv1.state_dict())
conv_model.conv1.register_forward_hook(get_activation('conv1'))

<torch.utils.hooks.RemovableHandle at 0x7f792a4de2b0>

In [16]:
results = []

start = time.time()
for i in range(n_steps):
    y = conv_model(ob2torch(observations[i]).reshape(batch_size,n_channels,n_dim_x,n_dim_y))    
    #results.append(y.detach().numpy().copy())
end = time.time()
print(end - start)

1.6274147033691406


In [86]:
for i in range(n_steps):
    if not np.allclose(results_o[i].detach().numpy(), results[i], atol=1e-4):
        print(i)

AttributeError: 'numpy.ndarray' object has no attribute 'detach'

## Testing with different number of filters

In [10]:
## Gather observations
n_steps=1000
for i in range(n_steps):
    observation,_,done,_ = env.step(np.random.choice(range(env.action_space.n)))
    observations.append(observation.copy())
    if done:
        env.reset()

In [14]:
n_tests_per_setting = 5
out_channels_to_test = [20, 50, 100, 200]
times_old = {}
times_new = {}

In [15]:
for oc in out_channels_to_test:
    times_old[oc] = []
    times_new[oc] = []
    for i in range(n_tests_per_setting):
        ## Time old
        conv_old = Conv2D(oc)
        
        results_o = []

        start = time.time()
        for i in range(n_steps):
            y = conv_old(ob2torch(observations[i]).reshape(batch_size,n_channels,n_dim_x,n_dim_y))    
            results_o.append(y)
        end = time.time()
        times_old[oc].append(end - start)
        
        ## Time new
        conv_model = Conv2D_Chunk(oc)
        conv_model.conv1.load_state_dict(conv_old.conv1.state_dict())
        conv_model.part_conv.load_state_dict(conv_old.conv1.state_dict())
        conv_model.conv1.register_forward_hook(get_activation('conv1'))
        
        results = []

        start = time.time()
        for i in range(n_steps):
            y = conv_model(ob2torch(observations[i]).reshape(batch_size,n_channels,n_dim_x,n_dim_y))    
            results.append(y.detach().numpy().copy())
        end = time.time()
        times_new[oc].append(end - start)
        
        ## Confirm results are the same
        for i in range(n_steps):
            if not np.allclose(results_o[i].detach().numpy(), results[i], atol=1e-4):
                print(i)
    print("----------------------------------------")
    print("Number of output channels: {}".format(oc))
    print("----------------------------------------")
    print("Mean time to completion using old method was: {0:.2f}".format(np.mean(times_old[oc])))
    print("Median time to completion using old method was: {0:.2f}".format(np.median(times_old[oc])))
    print("Standard deviation time to completion using old method was: {0:.2f}".format(np.std(times_old[oc])))
    print()
    print("Mean time to completion using new method was: {0:.2f}".format(np.mean(times_new[oc])))
    print("Median time to completion using new method was: {0:.2f}".format(np.median(times_new[oc])))
    print("Standard deviation time to completion using new method was: {0:.2f}".format(np.std(times_new[oc])))

----------------------------------------
Number of output channels: 20
----------------------------------------
Mean time to completion using old method was: 1.24
Median time to completion using old method was: 1.24
Standard deviation time to completion using old method was: 0.01

Mean time to completion using old method was: 1.12
Median time to completion using old method was: 1.12
Standard deviation time to completion using old method was: 0.00
----------------------------------------
Number of output channels: 50
----------------------------------------
Mean time to completion using old method was: 2.41
Median time to completion using old method was: 2.28
Standard deviation time to completion using old method was: 0.17

Mean time to completion using old method was: 2.25
Median time to completion using old method was: 1.99
Standard deviation time to completion using old method was: 0.38
----------------------------------------
Number of output channels: 100
--------------------------

In [None]:
for oc in out_channels_to_test:
    print("----------------------------------------")
    print("Number of output channels: {}".format(oc))
    print("----------------------------------------")
    print("Mean time to completion using old method was: {0:.2f}".format(np.mean(times_old[oc])))
    print("Median time to completion using old method was: {0:.2f}".format(np.median(times_old[oc])))
    print("Standard deviation time to completion using old method was: {0:.2f}".format(np.std(times_old[oc])))
    print()
    print("Mean time to completion using old method was: {0:.2f}".format(np.mean(times_new[oc])))
    print("Median time to completion using old method was: {0:.2f}".format(np.median(times_new[oc])))
    print("Standard deviation time to completion using old method was: {0:.2f}".std(np.mean(times_new[oc])))
    

## Try backprop? Old

In [35]:
target = torch.tensor(np.zeros((results[0].shape[1], results[0].shape[2], results[0].shape[3]))).float()
optimizer = optim.SGD(conv_old.parameters(), lr=1e-3, momentum=1e-3)

In [38]:
results_o = []
n_steps=100
start = time.time()
for i in range(n_steps):
     
    optimizer.zero_grad()
    observation,_,done,_ = env.step(np.random.choice(range(env.action_space.n)))
    t = ob2torch(observation)
    y = conv_old(t.reshape(batch_size,n_channels,n_dim_x,n_dim_y))   
    
    loss = F.l1_loss(y.reshape((y.shape[1], y.shape[2], y.shape[3])), target)
    loss.backward()
    optimizer.step()
    
    results_o.append(y)
    observations.append(observation.copy())
    
    if done:
        env.reset()
end = time.time()
print(end - start)

0.5996429920196533


## Try backprop? New

In [13]:
optimizer = optim.SGD(conv_model.parameters(), lr=1e-3, momentum=1e-3)
target = torch.tensor(np.zeros((y.shape[1], y.shape[2], y.shape[3]))).float()

In [16]:
results = []
n_steps=10
start = time.time()
for i in range(n_steps):
    print('a')
    optimizer.zero_grad()
    
    y = conv_model(ob2torch(observations[i]).reshape(batch_size,n_channels,n_dim_x,n_dim_y))  
    
    loss = F.l1_loss(y.reshape((y.shape[1], y.shape[2], y.shape[3])), target)
    #loss.backward(retain_graph=True)
    if i != n_steps - 1:
        loss.backward(retain_graph=True)
    else:
        loss.backward()
    results.append(y.detach().numpy().copy())
    if done:
        env.reset()
end = time.time()
print(end - start)
conv_model.x_prev = None ## Necessary to reset graph, so don't have carryover between backward passes


a
a
a
a
a
a
a
a
a
a
0.09183168411254883


## Testing with different number of filters - BACKPROP

In [48]:
n_tests_per_setting = 20
out_channels_to_test = [20, 50, 100, 200]

In [None]:
for i in range()