In [1]:
from cml.dataloader import UserItemDataset
from cml import config

In [2]:
%load_ext autoreload

In [3]:
%autoreload 2

In [4]:
cd ..

/data/machinelearning/cml


## Sampler実験

In [5]:
import numpy as np
import torch

from torch.utils.data import DataLoader
from torch.utils.data.sampler import BatchSampler


class BalancedBatchSampler(BatchSampler):
    """
    Batch sampler that yields class-balanced batches of dataset indices.

    For each batch, ``n_classes`` distinct labels are drawn at random (without
    replacement) and ``n_samples`` indices are taken for each drawn label, so a
    batch holds ``n_classes * n_samples`` indices.

    Args:
        dataset: map-style dataset whose items are ``(data, label)`` pairs; each
            label must be convertible to ``int``.
        n_classes: number of distinct labels per batch.
        n_samples: number of indices taken per selected label.

    Raises:
        ValueError: if ``n_classes`` or ``n_samples`` is smaller than 1.

    Note:
        ``BatchSampler.__init__`` is not called; all state used by ``__iter__``
        and ``__len__`` is set up here.
    """

    def __init__(self, dataset, n_classes, n_samples):
        if n_classes < 1 or n_samples < 1:
            # A zero batch size would make __iter__ loop forever and __len__ divide by zero.
            raise ValueError("n_classes and n_samples must both be >= 1")

        # One full pass over the dataset (DataLoader's default batch size of 1)
        # to collect every label.
        loader = DataLoader(dataset)
        self.labels_list = []
        for _, label in loader:
            self.labels_list.append(label)
        # Each collected label is a one-element batch; int() unwraps it so the
        # result is a flat 1-D tensor of labels.
        self.labels = torch.tensor([int(label) for label in self.labels_list], dtype=torch.long)
        labels_np = self.labels.numpy()
        self.labels_set = list(set(labels_np))
        # label -> array of dataset indices carrying that label
        self.label_to_indices = {label: np.where(labels_np == label)[0]
                                 for label in self.labels_set}
        for l in self.labels_set:
            np.random.shuffle(self.label_to_indices[l])
        # label -> how many of its (shuffled) indices have been handed out so far
        self.used_label_indices_count = {label: 0 for label in self.labels_set}
        self.count = 0
        self.n_classes = n_classes
        self.n_samples = n_samples
        self.dataset = dataset
        self.batch_size = self.n_samples * self.n_classes

    def __iter__(self):
        self.count = 0
        # `<=` (not `<`) so the number of yielded batches equals __len__ even
        # when len(dataset) is an exact multiple of batch_size.
        while self.count + self.batch_size <= len(self.dataset):
            classes = np.random.choice(self.labels_set, self.n_classes, replace=False)
            indices = []
            for class_ in classes:
                start = self.used_label_indices_count[class_]
                indices.extend(self.label_to_indices[class_][start:start + self.n_samples])
                self.used_label_indices_count[class_] += self.n_samples
                # Not enough unused indices left for another draw: reshuffle and restart.
                if self.used_label_indices_count[class_] + self.n_samples > len(self.label_to_indices[class_]):
                    np.random.shuffle(self.label_to_indices[class_])
                    self.used_label_indices_count[class_] = 0
            # Keep the most recent batch around for inspection.
            self.indices = indices
            yield indices
            self.count += self.batch_size

    def __len__(self):
        return len(self.dataset) // self.batch_size

In [6]:
import torch
import torchvision
import torchvision.transforms as transforms
from torchvision import datasets
import numpy as np
import matplotlib.pyplot as plt

# Each batch will contain n_classes * n_samples images.
n_classes = 5
n_samples = 8

mnist_train = torchvision.datasets.MNIST(root="mnist/mnist_train", train=True, download=True, transform=transforms.Compose([transforms.ToTensor(),]))

balanced_batch_sampler = BalancedBatchSampler(mnist_train, n_classes, n_samples)

dataloader = torch.utils.data.DataLoader(mnist_train, batch_sampler=balanced_batch_sampler)
my_testiter = iter(dataloader)
# Use the built-in next(): the `.next()` method is a Python 2 idiom that
# Python 3 iterators are not required to provide.
images, target = next(my_testiter)


def imshow(img):
    """Display an image tensor with matplotlib.

    The tensor is converted to a NumPy array and its axes are reordered with
    ``(1, 2, 0)``, i.e. the first axis is moved to the end, before plotting.
    """
    channels_last = np.transpose(img.numpy(), (1, 2, 0))
    plt.imshow(channels_last)

imshow(torchvision.utils.make_grid(images))

In [7]:
len(balanced_batch_sampler.indices)

40

In [8]:
images.shape, 

(torch.Size([40, 1, 28, 28]),)

In [9]:
target.shape

torch.Size([40])

## 実際にDataloader実行

In [10]:
data_set = UserItemDataset()

4676 features over tag_occurence_thres (10)


In [11]:
data_set.user_item_matrix.shape

(7947, 25975)

In [12]:
a = 3
np.random.choice(a, [2, 3])

array([[1, 1, 0],
       [1, 0, 0]])

In [13]:
np.random.permutation(np.arange(data_set.neg_data_num))
#[:self.data_num * self.neg_rate

array([138286891,  63879782,   5574163, ..., 109979572,  48128150,
       116053616])

In [14]:
data_set.user_item_matrix.sum(axis=1).getA().reshape(-1)

array([ 4, 24, 23, ..., 71, 10,  4], dtype=int64)

In [15]:
# Print the first ten shuffled batches. `c` (batch counter) and `i` (last batch
# seen) stay at notebook scope after the loop.
dataloader = torch.utils.data.DataLoader(data_set, batch_size=2, shuffle=True)

c = 0
for c, i in enumerate(dataloader, start=1):
    print(i)
    if c >= 10:
        break

tensor([[ 6213, 14595,  6213, 21965],
        [  352, 12922,   352, 16095]])
tensor([[ 5241, 11304,  5241,  9575],
        [ 7903, 15137,  7903, 17091]])
tensor([[  484, 25873,   484, 18267],
        [ 3386, 21528,  3386, 17523]])
tensor([[ 5570, 17357,  5570,  3848],
        [ 2100, 16942,  2100,  5780]])
tensor([[ 5570,  1915,  5570, 16733],
        [ 1112,   747,  1112,  4805]])
tensor([[  638,  7648,   638, 15755],
        [  366, 16771,   366,  3019]])
tensor([[ 2100, 20336,  2100, 18373],
        [  401,  7892,   401,  9078]])
tensor([[ 2248, 18187,  2248,  4703],
        [ 5730, 10211,  5730,  8813]])
tensor([[ 3479, 24888,  3479, 20243],
        [ 2441, 16771,  2441,  3291]])
tensor([[ 2775,  7239,  2775, 10307],
        [ 5374, 20571,  5374,  2218]])


## sample Dataloader

In [16]:
import torch
class MyDataset(torch.utils.data.Dataset):
    """Toy dataset of the integers ``0 .. data_num - 1`` labelled by parity.

    Args:
        data_num: number of items in the dataset.
        transform: optional callable applied to the value (not the label)
            when an item is fetched.
    """

    def __init__(self, data_num, transform=None):
        self.transform = transform
        self.data_num = data_num
        # Values 0 .. data_num - 1.
        self.data = list(range(data_num))
        # True for even values, False for odd ones.
        self.label = [value % 2 == 0 for value in self.data]

    def __len__(self):
        return self.data_num

    def __getitem__(self, idx):
        """Return ``(value, label)`` for ``idx``, with ``transform`` applied to the value if set."""
        sample = self.data[idx]
        target = self.label[idx]
        if self.transform:
            sample = self.transform(sample)
        return sample, target

In [17]:
class Square:
    """Callable transform that returns its input raised to the power of two."""

    def __call__(self, sample):
        squared = sample ** 2
        return squared


transform = Square()

In [18]:
#sample_data_set = MyDataset(10, transform=Square())
#dataloader = torch.utils.data.DataLoader(sample_data_set, batch_size=2, shuffle=True)


## Model

In [19]:
from cml.model import CML, CMLLoss

In [20]:
_mm = CML()

In [21]:
loss = CMLLoss()

In [22]:
x = _mm(i)

In [23]:
result = loss(x.reshape(2, 20 ,3))

In [24]:
_mm.zero_grad()

In [25]:
result.backward()

In [26]:
from torch import nn, optim
# Despite its name, `params` is the Adam optimizer built over the model's
# parameters, not the parameters themselves; `params.step()` applies an update.
params = optim.Adam(_mm.parameters(),
    lr=0.0002, betas=(0.5, 0.999))

In [27]:
params.step()

In [28]:
torch.cat([_mm.user, _mm.pos, _mm.neg]).reshape(2, 20, 3)[:, :, 0]

tensor([[ 0.9952, -0.4716, -1.4834,  0.3695, -0.9039,  0.6469,  0.0485, -1.6525,
          0.8413, -1.1711, -1.2909,  0.0210, -0.1428, -0.9309, -0.3986, -0.9815,
          0.5958,  1.5354, -2.4230, -0.8629],
        [-0.0535, -0.6980, -0.5668, -1.1298,  0.0255,  0.9324,  1.8599, -0.0089,
          0.8309, -0.8600,  0.1652, -0.6625,  1.6645,  0.5967,  0.6169, -0.1157,
          0.5032,  0.5428,  0.8342, -1.1091]], grad_fn=<SelectBackward>)

In [29]:
# NOTE(review): `i` is the last batch left over from the earlier DataLoader
# loop, so this cell depends on that loop having been run first.
batch_size = 2
# First `batch_size` columns of the batch (the bound is applied to columns, not rows).
pos = i[:, 0:batch_size]
# Full slice of the same batch (every row and column).
neg = i[:]

In [30]:
i[:, 0]

tensor([2775, 5374])

In [31]:
import torch.nn as nn
class DebugModel(nn.Module):
    """Embedding lookup model for (user, positive item, negative item) batches.

    Args:
        user_size: number of rows in the user embedding table.
        item_size: number of rows in the item embedding table.
        embed_dim: dimensionality of every embedding vector.

    NOTE(review): both tables use ``padding_idx=0``, so index 0 is treated as
    padding by ``nn.Embedding`` — confirm that id 0 is not a real user/item.
    """

    def __init__(self, user_size=7947, item_size=25975, embed_dim=20):
        # Initialise nn.Module for *this* class; naming another class in
        # super() raises TypeError because DebugModel does not derive from it.
        super().__init__()
        self.user_size = user_size
        self.item_size = item_size
        self.embed_dim = embed_dim

        self.user_embedding = nn.Embedding(user_size, embed_dim, padding_idx=0)
        self.item_embedding = nn.Embedding(item_size, embed_dim, padding_idx=0)

    def forward(self, data):
        """Look up embeddings for one batch.

        Args:
            data: 2-D index tensor; column 0 is used as the user index,
                column 1 as the positive item index and column 3 as the
                negative item index.

        Returns:
            The user, positive and negative embeddings concatenated along
            dimension 0 (in that order).
        """
        # Read from the `data` argument rather than notebook-level globals so
        # the model works on whatever batch it is given.
        self.user = self.user_embedding(data[:, 0])
        self.pos = self.item_embedding(data[:, 1])
        self.neg = self.item_embedding(data[:, 3])
        # TODO(review): original author questioned whether concatenating along
        # dim 0 is the right output layout.
        return torch.cat([self.user, self.pos, self.neg])

In [32]:
def train():
    """Run a single pass of optimisation over ``UserItemDataset``.

    Builds the dataset, a shuffled DataLoader with batch size 2, a ``CML``
    model, a ``CMLLoss`` criterion and an Adam optimizer, then performs one
    gradient step per batch. Returns nothing.

    NOTE(review): an earlier cell reshapes the model output before passing it
    to the loss, while this loop passes it unchanged — confirm which form
    ``CMLLoss`` expects.
    """
    dataset = UserItemDataset()
    batches = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True)
    net = CML()
    criterion = CMLLoss()
    optimizer = optim.Adam(net.parameters(), lr=0.0002, betas=(0.5, 0.999))

    for batch in batches:
        # Clear gradients accumulated by the previous step.
        net.zero_grad()
        batch_loss = criterion(net(batch))
        batch_loss.backward()
        optimizer.step()

In [33]:
train()

4676 features over tag_occurence_thres (10)
