In [7]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import math
from sklearn.model_selection import train_test_split

In [8]:
class CustomDataset(Dataset):
    def __init__(self, name, root, filename, transform=None):
        self.filename = filename
        self.root = root
        self.name = name
        xy = np.loadtxt('{}/{}'.format(root, filename), delimiter=',', dtype=np.float32, skiprows=1)
        self.x = torch.from_numpy(xy[:, 1:])
        self.y = torch.from_numpy(xy[:, 0])
        self.n_samples = xy.shape[0]
        self.transform = transform
        
    def __repr__(self):
        transform_repr = repr(self.transform) if self.transform else 'None'
        return (f"Dataset {self.name}\n"
                f"    Number of datapoints: {self.n_samples}\n"
                f"    Root location: {self.root}\n"
                f"    Filename: {self.filename}\n"
                f"    Transform: {transform_repr}")
        
    def __getitem__(self, index):
        return self.x[index], self.y[index]

    def __len__(self):
        return self.n_samples   
    
    def train_test_split(self, test_size=.2):
        return train_test_split(self.x, self.y, test_size=test_size)        

In [9]:
dataset = CustomDataset(name='wine', root='./data', filename='wine.csv')
feature, label = dataset[0]
len(dataset), feature, label

(178,
 tensor([1.4230e+01, 1.7100e+00, 2.4300e+00, 1.5600e+01, 1.2700e+02, 2.8000e+00,
         3.0600e+00, 2.8000e-01, 2.2900e+00, 5.6400e+00, 1.0400e+00, 3.9200e+00,
         1.0650e+03]),
 tensor(1.))

In [10]:
test_train_data = dataset.train_test_split(test_size=0.1)
X_train, X_test, y_train, y_test = test_train_data

In [11]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

(torch.Size([160, 13]),
 torch.Size([160]),
 torch.Size([18, 13]),
 torch.Size([18]))

In [12]:
train_data = CustomDataset(name='wine', root='./data', filename='wine.csv')
test_data = CustomDataset(name='wine', root='./data', filename='wine.csv')

In [13]:
#following chatgpt's suggestion to just overwrite the internal data
#scroll down for a better solution
train_data.x = X_train; train_data.y = y_train
train_data.n_samples = X_train.shape[0]

test_data.x = X_test; test_data.y = y_test
test_data.n_samples = X_test.shape[0]

In [14]:
train_data

Dataset wine
    Number of datapoints: 160
    Root location: ./data
    Filename: wine.csv
    Transform: None

In [15]:
test_data

Dataset wine
    Number of datapoints: 18
    Root location: ./data
    Filename: wine.csv
    Transform: None

In [16]:
dataloader = DataLoader(train_data, batch_size=4, shuffle=True)

In [17]:
dataiter = iter(dataloader)
data = next(dataiter)
data

[tensor([[1.3780e+01, 2.7600e+00, 2.3000e+00, 2.2000e+01, 9.0000e+01, 1.3500e+00,
          6.8000e-01, 4.1000e-01, 1.0300e+00, 9.5800e+00, 7.0000e-01, 1.6800e+00,
          6.1500e+02],
         [1.2470e+01, 1.5200e+00, 2.2000e+00, 1.9000e+01, 1.6200e+02, 2.5000e+00,
          2.2700e+00, 3.2000e-01, 3.2800e+00, 2.6000e+00, 1.1600e+00, 2.6300e+00,
          9.3700e+02],
         [1.2600e+01, 2.4600e+00, 2.2000e+00, 1.8500e+01, 9.4000e+01, 1.6200e+00,
          6.6000e-01, 6.3000e-01, 9.4000e-01, 7.1000e+00, 7.3000e-01, 1.5800e+00,
          6.9500e+02],
         [1.2040e+01, 4.3000e+00, 2.3800e+00, 2.2000e+01, 8.0000e+01, 2.1000e+00,
          1.7500e+00, 4.2000e-01, 1.3500e+00, 2.6000e+00, 7.9000e-01, 2.5700e+00,
          5.8000e+02]]),
 tensor([3., 2., 3., 2.])]

In [18]:
num_epochs = 2
total_samples = len(dataset)
n_iterations = math.ceil(total_samples/4)
total_samples, n_iterations, len(list(dataloader))

(178, 45, 40)

In [19]:
for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(dataloader):
        if (i+1) % 5 == 0:
            print('{}/{}, step {}/{}, input.shape = {}'.format(epoch+1, num_epochs, i+1,
                                                               n_iterations, inputs.shape[0]))

1/2, step 5/45, input.shape = 4
1/2, step 10/45, input.shape = 4
1/2, step 15/45, input.shape = 4
1/2, step 20/45, input.shape = 4
1/2, step 25/45, input.shape = 4
1/2, step 30/45, input.shape = 4
1/2, step 35/45, input.shape = 4
1/2, step 40/45, input.shape = 4
2/2, step 5/45, input.shape = 4
2/2, step 10/45, input.shape = 4
2/2, step 15/45, input.shape = 4
2/2, step 20/45, input.shape = 4
2/2, step 25/45, input.shape = 4
2/2, step 30/45, input.shape = 4
2/2, step 35/45, input.shape = 4
2/2, step 40/45, input.shape = 4


In [20]:
import torchvision.datasets as datasets
import torchvision.transforms as transforms

In [21]:
train_dataset = datasets.MNIST(root='/data', train=True, download=True,
                               transform=transforms.ToTensor())

In [22]:
train_dataset

Dataset MNIST
    Number of datapoints: 60000
    Root location: /data
    Split: Train
    StandardTransform
Transform: ToTensor()

In [23]:
dataset

Dataset wine
    Number of datapoints: 178
    Root location: ./data
    Filename: wine.csv
    Transform: None

In [24]:
train_dataset[0][1]

5

In [25]:
dataloader2 = DataLoader(train_dataset, batch_size=4, shuffle=True)

In [26]:
class CustomDataset(Dataset):
    def __init__(self, name, root, filename, train, transform=None, test_size=.2):
        self.filename = filename
        self.root = root
        self.name = name
        xy = np.loadtxt('{}/{}'.format(root, filename), delimiter=',', dtype=np.float32, skiprows=1)
        self.xtemp = torch.from_numpy(xy[:, 1:])
        self.ytemp = torch.from_numpy(xy[:, 0])
        self.transform = transform
        self.train = train
        self.test_size = test_size
        self.x, self.y = self.split()
        self.n_samples = self.x.shape[0]
        
        
    def split(self):
        X_train, X_test, y_train, y_test = train_test_split(self.xtemp,
                                                            self.ytemp, 
                                                            test_size=self.test_size,
                                                            random_state=42)
        if self.train:
            return X_train, y_train
        else:
            return X_test, y_test
    
    def __repr__(self):
        transform_repr = repr(self.transform) if self.transform else 'None'
        return (f"Dataset {self.name}\n"
                f"    Number of datapoints: {self.n_samples}\n"
                f"    Root location: {self.root}\n"
                f"    Filename: {self.filename}\n"
                f"    Transform: {transform_repr}")

    #this should do
    def __getitem__(self, index):
        x, y = self.x[index], self.y[index]
        if self.transform:
            x = self.transform(x)
        return x, y

    def __len__(self):
        return self.n_samples   

In [27]:
#the random_state parameter will ensure that the same split is happening for both class instances. Right?
train_dataset = CustomDataset(name='wine', root='./data', filename='wine.csv', train=True, test_size=.1)
test_dataset = CustomDataset(name='wine', root='./data', filename='wine.csv', train=False, test_size=.1)

In [28]:
train_dataset

Dataset wine
    Number of datapoints: 160
    Root location: ./data
    Filename: wine.csv
    Transform: None

In [29]:
test_dataset

Dataset wine
    Number of datapoints: 18
    Root location: ./data
    Filename: wine.csv
    Transform: None

In [30]:
dataloader3 = DataLoader(train_dataset, batch_size=4, shuffle=True)

In [31]:
for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(dataloader3):
        if (i+1) % 5 == 0:
            print('{}/{}, step {}/{}, input.shape = {}'.format(epoch+1,
                                                               num_epochs,
                                                               i+1,
                                                               n_iterations,
                                                               inputs.shape[0]))

1/2, step 5/45, input.shape = 4
1/2, step 10/45, input.shape = 4
1/2, step 15/45, input.shape = 4
1/2, step 20/45, input.shape = 4
1/2, step 25/45, input.shape = 4
1/2, step 30/45, input.shape = 4
1/2, step 35/45, input.shape = 4
1/2, step 40/45, input.shape = 4
2/2, step 5/45, input.shape = 4
2/2, step 10/45, input.shape = 4
2/2, step 15/45, input.shape = 4
2/2, step 20/45, input.shape = 4
2/2, step 25/45, input.shape = 4
2/2, step 30/45, input.shape = 4
2/2, step 35/45, input.shape = 4
2/2, step 40/45, input.shape = 4
