In [19]:
import torch
from torch import nn

# Example of *inconsistency* in the behavior between `reshape` and `view`

Provided by Andrea Gasparin.

We have a 2x3 matrix; we transpose it and wish to flatten the result into a vector of size 6.

In [20]:
z = torch.tensor([[1, 2, 3], [4, 5, 6]])
y = z.t()  # transpose of z: a 3x2 matrix
try:
    y.view(6)  # I wish to flatten the transposed matrix y into a vector of size 6 - this gives an error
except RuntimeError as err:
    # The exception *is* the demonstration: display it without raising,
    # so that "Restart Kernel -> Run All" does not stop at this cell.
    print(f"{type(err).__name__}: {err}")

RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.

In [21]:
# Same setup as the failing cell: build the 2x3 matrix and transpose it.
z = torch.tensor([[1, 2, 3], [4, 5, 6]])
y = z.t()
# Unlike `view`, `reshape` succeeds on the transposed matrix - this instead is safe.
yy = torch.reshape(y, (6,))
print(yy)

tensor([1, 4, 2, 5, 3, 6])


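The reason for the error in the first cell lies in the way PyTorch stores the tensor in memory. In more detail, `view` never copies data: it only reinterprets the existing memory, so it requires the tensor's memory layout to be compatible with the requested shape (which is always the case when the memory is contiguous), but the transposition `.t()` made the tensor non-contiguous.
This is because `.t()` does not move any data in the underlying memory: it only returns a view of the same storage with the strides swapped. `reshape`, instead, returns a view when possible and falls back to copying the data otherwise, which is why it works here.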

A detailed, yet easy-to-grasp explanation is given [here](https://discuss.pytorch.org/t/contigious-vs-non-contigious-tensor/30107/2) by Piotr Bialecki, one of the top PyTorch developers.

Let's look at the memory management in more depth:

In [25]:
# yy has its own memory, distinct from the one of y and z:
# writing into yy leaves both of them untouched.
# NOTE: this cell carries a later execution count than the two cells below it,
# so the y and z shown in its saved output already contain the 100 and -4
# written by those cells.
yy[2] = 100
print(yy, y, z, sep="\n")

tensor([  1,   4, 100,   5,   3,   6])
tensor([[ -4, 100],
        [  2,   5],
        [  3,   6]])
tensor([[ -4,   2,   3],
        [100,   5,   6]])


In [23]:
# y and z, on the other hand, share the same memory:
# an element written through y shows up in z as well
y[0, 1] = 100
print(y, z, sep="\n")

tensor([[  1, 100],
        [  2,   5],
        [  3,   6]])
tensor([[  1,   2,   3],
        [100,   5,   6]])


In [24]:
# the .T attribute behaves the same way: a write through zz is visible in y
zz = y.T
zz[0, 0] = -4
print(zz, y, sep="\n")

tensor([[ -4,   2,   3],
        [100,   5,   6]])
tensor([[ -4, 100],
        [  2,   5],
        [  3,   6]])


In [26]:
# .clone() is what gives us a deep copy of the tensor
# (NB: the counterpart for ndarrays is .copy())
zz = y.T.clone()
zz[1, 1] = 36
print(zz, "the update we made on zz does not propagate on to y", y, sep="\n")

tensor([[ -4,   2,   3],
        [100,  36,   6]])
the update we made on zz does not propagate on to y
tensor([[ -4, 100],
        [  2,   5],
        [  3,   6]])


# Concerning the behavior of `torch.manual_seed()`

Suppose we have two MLPs with identical structure and we wish to ensure they have the same parameters at initialization.

In [27]:
class NN1(nn.Module):
    """Tiny model made of a single fully connected layer (2 inputs, 5 outputs)."""

    def __init__(self):
        super().__init__()
        # The layer parameters are randomly initialized when the layer is created.
        self.layer = nn.Linear(in_features=2, out_features=5)

    def forward(self, X):
        """Return the output of the linear layer applied to ``X``."""
        output = self.layer(X)
        return output

class NN2(nn.Module):
    """Second model with the very same structure: one linear layer, 2 inputs, 5 outputs."""

    def __init__(self):
        super().__init__()
        # The layer parameters are randomly initialized when the layer is created.
        self.layer = nn.Linear(in_features=2, out_features=5)

    def forward(self, X):
        """Return the output of the linear layer applied to ``X``."""
        output = self.layer(X)
        return output

In [28]:
# fix RNG once, then build the two models one after the other
torch.manual_seed(123)
nn1, nn2 = NN1(), NN2()

Let us print the weights of the layer

In [29]:
# weight matrix of the linear layer of each model, one after the other
print(nn1.state_dict()["layer.weight"], nn2.state_dict()["layer.weight"], sep="\n")

tensor([[-0.2883,  0.0234],
        [-0.3512,  0.2667],
        [-0.6025,  0.5183],
        [-0.5140, -0.5622],
        [-0.4468,  0.3202]])
tensor([[-0.1390, -0.5394],
        [ 0.4630, -0.1668],
        [ 0.2270,  0.5000],
        [ 0.1317,  0.1934],
        [ 0.6825, -0.3189]])


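We see that they're different even though we fixed the seed above: the seed only fixes the *sequence* of random numbers, and the second model is initialized with the numbers that come after those already consumed by the first one.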

They're different even if we instantiate the same class twice:

In [30]:
# fix RNG once, then instantiate the same class twice
torch.manual_seed(123)
nn1, nn1_copy = NN1(), NN1()

In [31]:
# weight matrix of the linear layer of each instance, one after the other
print(nn1.state_dict()["layer.weight"], nn1_copy.state_dict()["layer.weight"], sep="\n")

tensor([[-0.2883,  0.0234],
        [-0.3512,  0.2667],
        [-0.6025,  0.5183],
        [-0.5140, -0.5622],
        [-0.4468,  0.3202]])
tensor([[-0.1390, -0.5394],
        [ 0.4630, -0.1668],
        [ 0.2270,  0.5000],
        [ 0.1317,  0.1934],
        [ 0.6825, -0.3189]])


If we want to ensure equal initialization between the two classes, we need to fix the seed _inside_ the class constructor:

In [32]:
class NN1(nn.Module):
    """One-linear-layer model (2 inputs, 5 outputs) that seeds the RNG in its constructor."""

    def __init__(self):
        # Reseed on every construction, before the layer is created, so that
        # each instance starts from the same parameters.
        # Note that this reseeds torch's global RNG as a side effect.
        torch.manual_seed(1)
        super().__init__()
        self.layer = nn.Linear(in_features=2, out_features=5)

    def forward(self, X):
        """Return the output of the linear layer applied to ``X``."""
        output = self.layer(X)
        return output

class NN2(nn.Module):
    """Same structure as NN1 (one linear layer, 2 inputs, 5 outputs), seeded in the constructor."""

    def __init__(self):
        # Reseed on every construction, before the layer is created, so that
        # each instance starts from the same parameters.
        # Note that this reseeds torch's global RNG as a side effect.
        torch.manual_seed(1)
        super().__init__()
        self.layer = nn.Linear(in_features=2, out_features=5)

    def forward(self, X):
        """Return the output of the linear layer applied to ``X``."""
        output = self.layer(X)
        return output

# each constructor reseeds the RNG on its own, so no seeding is needed here
nn1, nn2 = NN1(), NN2()

print(nn1.state_dict()["layer.weight"], nn2.state_dict()["layer.weight"], sep="\n")

tensor([[ 0.3643, -0.3121],
        [-0.1371,  0.3319],
        [-0.6657,  0.4241],
        [-0.1455,  0.3597],
        [ 0.0983, -0.0866]])
tensor([[ 0.3643, -0.3121],
        [-0.1371,  0.3319],
        [-0.6657,  0.4241],
        [-0.1455,  0.3597],
        [ 0.0983, -0.0866]])


Now they're the same.

They are the same even if the models have structural differences, as long as the first set of parameters (the weights of the first linear layer) has the same size.

For instance, let us create and instantiate a model that is identical to NN1, except that its linear layer has no bias:

In [33]:
class NN1_nobias(nn.Module):
    """Variant of NN1 whose linear layer (2 inputs, 5 outputs) has no bias term."""

    def __init__(self):
        # Same seed as NN1, set before the layer is created.
        # Note that this reseeds torch's global RNG as a side effect.
        torch.manual_seed(1)
        super().__init__()
        self.layer = nn.Linear(in_features=2, out_features=5, bias=False)

    def forward(self, X):
        """Return the output of the bias-free linear layer applied to ``X``."""
        output = self.layer(X)
        return output

# compare the regular model with its bias-free variant
nn1, nn2 = NN1(), NN1_nobias()

print(nn1.state_dict()["layer.weight"], nn2.state_dict()["layer.weight"], sep="\n")

tensor([[ 0.3643, -0.3121],
        [-0.1371,  0.3319],
        [-0.6657,  0.4241],
        [-0.1455,  0.3597],
        [ 0.0983, -0.0866]])
tensor([[ 0.3643, -0.3121],
        [-0.1371,  0.3319],
        [-0.6657,  0.4241],
        [-0.1455,  0.3597],
        [ 0.0983, -0.0866]])


They're the same again.

If we wish to opt for an _elegant_ solution which lets us force deterministic initialization **only when we want it**, we can do something like this.

In [34]:
class NN1(nn.Module):
    """One-linear-layer model (2 inputs, 5 outputs) with opt-in deterministic initialization.

    Args:
        manual_seed: when not ``None``, it is passed to ``torch.manual_seed``
            before the layer is created; when ``None`` (the default) no seed
            is set by the constructor.
    """

    def __init__(self, manual_seed=None):
        seed_requested = manual_seed is not None
        if seed_requested:
            # Only seed on explicit request; note that this reseeds torch's global RNG.
            torch.manual_seed(manual_seed)

        super().__init__()
        self.layer = nn.Linear(in_features=2, out_features=5)

    def forward(self, X):
        """Return the output of the linear layer applied to ``X``."""
        output = self.layer(X)
        return output