In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import math
from sklearn.model_selection import train_test_split

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset backed by a single comma-separated file.

    The file is read once in ``__init__`` with ``np.loadtxt`` (the first row
    is skipped). Column 0 becomes the target tensor ``y`` and the remaining
    columns become the feature tensor ``x``.

    Args:
        name: Label shown in ``repr``.
        root: Directory that contains ``filename``.
        filename: Name of the CSV file inside ``root``.
        transform: Optional callable applied to the feature tensor of a
            sample every time it is fetched through ``__getitem__``.
    """

    def __init__(self, name, root, filename, transform=None):
        self.filename = filename
        self.root = root
        self.name = name
        xy = np.loadtxt('{}/{}'.format(root, filename), delimiter=',', dtype=np.float32, skiprows=1)
        self.x = torch.from_numpy(xy[:, 1:])
        self.y = torch.from_numpy(xy[:, 0])
        self.n_samples = xy.shape[0]
        self.transform = transform

    def __repr__(self):
        # ``!r`` renders None as 'None', and also shows transforms that happen
        # to be falsy (e.g. an empty container) instead of hiding them.
        return (f"Dataset {self.name}\n"
                f"    Number of datapoints: {self.n_samples}\n"
                f"    Root location: {self.root}\n"
                f"    Filename: {self.filename}\n"
                f"    Transform: {self.transform!r}")

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair at ``index``.

        The transform, when one was given, is applied to the features only.
        """
        x, y = self.x[index], self.y[index]
        if self.transform is not None:
            x = self.transform(x)
        return x, y

    def __len__(self):
        """Return the number of samples currently held by the dataset."""
        return self.n_samples

    def train_test_split(self, test_size=.2, random_state=None):
        """Split the stored features and targets into train and test parts.

        Args:
            test_size: Forwarded to the module-level ``train_test_split``.
            random_state: Forwarded as well; pass an int to get the same
                split on every run. The default ``None`` keeps the previous
                unseeded behaviour.

        Returns:
            The four-element result of the module-level ``train_test_split``
            called on ``(self.x, self.y)``.
        """
        return train_test_split(self.x, self.y, test_size=test_size, random_state=random_state)

In [None]:
# Load ./data/wine.csv into a CustomDataset.
dataset = CustomDataset(name='wine', root='./data', filename='wine.csv')
# Indexing the dataset yields one (features, label) pair.
feature, label = dataset[0]
# Show the dataset size next to the first sample.
len(dataset), feature, label

In [None]:
# Hold out 10% of the samples; keep the raw 4-tuple and its unpacked parts.
X_train, X_test, y_train, y_test = test_train_data = dataset.train_test_split(test_size=0.1)

In [None]:
# Sanity check: shapes of the four pieces produced by the split.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
# Two independent dataset objects, both loaded from the same file; they act as
# containers for the train and test portions.
train_data, test_data = (
    CustomDataset(name='wine', root='./data', filename='wine.csv')
    for _ in range(2)
)

In [None]:
# Workaround (suggested by ChatGPT): overwrite each instance's tensors and
# sample count with one side of the split. A cleaner solution appears further
# down in the notebook.
train_data.x, train_data.y = X_train, y_train
train_data.n_samples = len(X_train)

test_data.x, test_data.y = X_test, y_test
test_data.n_samples = len(X_test)

In [None]:
# Display the repr of the training portion (sample count reflects the override above).
train_data

In [None]:
# Display the repr of the held-out portion.
test_data

In [None]:
# Batch the training portion: 4 samples per batch, with shuffling enabled.
dataloader = DataLoader(train_data, batch_size=4, shuffle=True)

In [None]:
# Pull a single batch from the loader by hand and display it.
dataiter = iter(dataloader)
data = next(dataiter)
data

In [None]:
# Bookkeeping for the training-style loops below.
num_epochs = 2
# `dataloader` iterates train_data, not the full `dataset`, so count those samples.
total_samples = len(train_data)
# Derive the batch count from the loader's own batch size instead of a literal 4.
n_iterations = math.ceil(total_samples / dataloader.batch_size)
# len(dataloader) reports the real batch count without materialising every batch;
# it should equal the hand-computed n_iterations.
total_samples, n_iterations, len(dataloader)

In [None]:
# Iterate the loader for a few epochs and log every 5th step.
# The step total is read from the loader itself so it always matches the
# dataset and batch size actually being iterated.
steps_per_epoch = len(dataloader)
for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(dataloader):
        if (i+1) % 5 == 0:
            print('{}/{}, step {}/{}, input.shape = {}'.format(epoch+1, num_epochs, i+1,
                                                               steps_per_epoch, inputs.shape[0]))

In [None]:
import torchvision.datasets as datasets
import torchvision.transforms as transforms

In [None]:
# Fetch the MNIST training split (downloaded on first use) as tensors.
# Stored under the same relative ./data directory as the wine CSV rather than
# an absolute /data at the filesystem root.
train_dataset = datasets.MNIST(root='./data', train=True, download=True,
                               transform=transforms.ToTensor())

In [None]:
# Display the repr of the torchvision MNIST dataset.
train_dataset

In [None]:
# Display the repr of the wine CustomDataset, for comparison with the MNIST repr.
dataset

In [None]:
# Second element of the first MNIST sample (presumably its class label; confirm).
train_dataset[0][1]

In [None]:
# Batch the MNIST dataset: 4 samples per batch, with shuffling enabled.
dataloader2 = DataLoader(train_dataset, batch_size=4, shuffle=True)

In [None]:
class CustomDataset(Dataset):
    """CSV-backed dataset that exposes either the train or the test portion.

    The whole file is read in ``__init__`` with ``np.loadtxt`` (the first row
    is skipped). Column 0 is the target, the remaining columns are the
    features. The data is then split, and only the requested side is served
    through ``__getitem__`` / ``__len__``.

    Args:
        name: Label shown in ``repr``.
        root: Directory that contains ``filename``.
        filename: Name of the CSV file inside ``root``.
        train: If true, serve the train side of the split; otherwise serve
            the test side.
        transform: Optional callable applied to a sample's features on access.
        test_size: Forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split``. The train and
            test instances must be built with the same value (and the same
            file and ``test_size``) for their portions to be complementary;
            the default of 42 preserves the previous fixed seed.
    """

    def __init__(self, name, root, filename, train, transform=None, test_size=.2,
                 random_state=42):
        self.filename = filename
        self.root = root
        self.name = name
        xy = np.loadtxt('{}/{}'.format(root, filename), delimiter=',', dtype=np.float32, skiprows=1)
        # Full, unsplit data; kept on the instance so split() can be re-run.
        self.xtemp = torch.from_numpy(xy[:, 1:])
        self.ytemp = torch.from_numpy(xy[:, 0])
        self.transform = transform
        self.train = train
        self.test_size = test_size
        self.random_state = random_state
        self.x, self.y = self.split()
        self.n_samples = self.x.shape[0]

    def split(self):
        """Split the full data and return the ``(features, targets)`` side
        selected by ``self.train``."""
        X_train, X_test, y_train, y_test = train_test_split(self.xtemp,
                                                            self.ytemp,
                                                            test_size=self.test_size,
                                                            random_state=self.random_state)
        if self.train:
            return X_train, y_train
        return X_test, y_test

    def __repr__(self):
        # ``!r`` renders None as 'None' and any other transform via its repr.
        return (f"Dataset {self.name}\n"
                f"    Number of datapoints: {self.n_samples}\n"
                f"    Root location: {self.root}\n"
                f"    Filename: {self.filename}\n"
                f"    Transform: {self.transform!r}")

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair at ``index``; the transform,
        when one was given, is applied to the features only."""
        x, y = self.x[index], self.y[index]
        if self.transform is not None:
            x = self.transform(x)
        return x, y

    def __len__(self):
        """Return the number of samples in the selected portion."""
        return self.n_samples

In [None]:
# Yes: split() passes a fixed random_state to train_test_split, and both
# instances read the same file with the same test_size, so they compute the
# same partition. train=True keeps the train side, train=False the test side.
train_dataset, test_dataset = (
    CustomDataset(name='wine', root='./data', filename='wine.csv', train=is_train, test_size=.1)
    for is_train in (True, False)
)

In [None]:
# Display the repr of the train-side dataset.
train_dataset

In [None]:
# Display the repr of the test-side dataset.
test_dataset

In [None]:
# Batch the train-side dataset: 3 samples per batch, with shuffling enabled.
dataloader3 = DataLoader(train_dataset, batch_size=3, shuffle=True)

In [None]:
# Iterate dataloader3 for a few epochs and log every 5th step.
# The step total comes from this loader itself: the earlier n_iterations was
# computed for a different dataset and a batch size of 4, so it does not apply.
steps_per_epoch3 = len(dataloader3)
for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(dataloader3):
        if (i+1) % 5 == 0:
            print('{}/{}, step {}/{}, input.shape = {}'.format(epoch+1,
                                                               num_epochs,
                                                               i+1,
                                                               steps_per_epoch3,
                                                               inputs.shape[0]))