#### This notebook provides several classes which all inherit torch.utils.data.Dataset and use different means to get your data right

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import math
from sklearn.model_selection import train_test_split

In [3]:
class CustomDataset(Dataset):
    def __init__(self, name, root, filename, transform=None):
        self.filename = filename
        self.root = root
        self.name = name
        xy = np.loadtxt('{}/{}'.format(root, filename), delimiter=',', dtype=np.float32, skiprows=1)
        self.x = torch.from_numpy(xy[:, 1:])
        self.y = torch.from_numpy(xy[:, 0])
        self.n_samples = xy.shape[0]
        self.transform = transform
        
    def __repr__(self):
        transform_repr = repr(self.transform) if self.transform else 'None'
        return (f"Dataset {self.name}\n"
                f"    Number of datapoints: {self.n_samples}\n"
                f"    Root location: {self.root}\n"
                f"    Filename: {self.filename}\n"
                f"    Transform: {transform_repr}")
        
    def __getitem__(self, index):
        return self.x[index], self.y[index]

    def __len__(self):
        return self.n_samples   
    
    def train_test_split(self, test_size=.2):
        return train_test_split(self.x, self.y, test_size=test_size)        

In [4]:
#here we create an instance of the class above and look at x_1 and corresponding y_1
dataset = CustomDataset(name='wine', root='./data', filename='wine.csv')
feature, label = dataset[0]
len(dataset), feature, label

(178,
 tensor([1.4230e+01, 1.7100e+00, 2.4300e+00, 1.5600e+01, 1.2700e+02, 2.8000e+00,
         3.0600e+00, 2.8000e-01, 2.2900e+00, 5.6400e+00, 1.0400e+00, 3.9200e+00,
         1.0650e+03]),
 tensor(1.))

In [5]:
#here we are using the method train_test_split to split or data into a train and test dataset.
test_train_data = dataset.train_test_split(test_size=0.1)
X_train, X_test, y_train, y_test = test_train_data

In [6]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

(torch.Size([160, 13]),
 torch.Size([160]),
 torch.Size([18, 13]),
 torch.Size([18]))

In [7]:
#but we need the data to be inside the class instances so we create our to instances and overwrite the internal data.
train_data = CustomDataset(name='wine', root='./data', filename='wine.csv')
test_data = CustomDataset(name='wine', root='./data', filename='wine.csv')

In [8]:
#following chatgpt's suggestion to just overwrite the internal data
#scroll down for a better solution
train_data.x = X_train; train_data.y = y_train
train_data.n_samples = X_train.shape[0]

test_data.x = X_test; test_data.y = y_test
test_data.n_samples = X_test.shape[0]

In [9]:
train_data

Dataset wine
    Number of datapoints: 160
    Root location: ./data
    Filename: wine.csv
    Transform: None

In [10]:
test_data

Dataset wine
    Number of datapoints: 18
    Root location: ./data
    Filename: wine.csv
    Transform: None

In [11]:
#the purpose of the class which inherits torch.utils.data.Dataset is that it now is in 
#the right format to be passed into the torch.utils.data.DataLoader function.
#This function will feed data into the neural_networks
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

In [12]:
dataiter = iter(dataloader)
data = next(dataiter)
data

[tensor([[1.2080e+01, 1.1300e+00, 2.5100e+00, 2.4000e+01, 7.8000e+01, 2.0000e+00,
          1.5800e+00, 4.0000e-01, 1.4000e+00, 2.2000e+00, 1.3100e+00, 2.7200e+00,
          6.3000e+02],
         [1.4100e+01, 2.0200e+00, 2.4000e+00, 1.8800e+01, 1.0300e+02, 2.7500e+00,
          2.9200e+00, 3.2000e-01, 2.3800e+00, 6.2000e+00, 1.0700e+00, 2.7500e+00,
          1.0600e+03],
         [1.4020e+01, 1.6800e+00, 2.2100e+00, 1.6000e+01, 9.6000e+01, 2.6500e+00,
          2.3300e+00, 2.6000e-01, 1.9800e+00, 4.7000e+00, 1.0400e+00, 3.5900e+00,
          1.0350e+03],
         [1.3160e+01, 2.3600e+00, 2.6700e+00, 1.8600e+01, 1.0100e+02, 2.8000e+00,
          3.2400e+00, 3.0000e-01, 2.8100e+00, 5.6800e+00, 1.0300e+00, 3.1700e+00,
          1.1850e+03]]),
 tensor([2., 1., 1., 1.])]

In [13]:
num_epochs = 2
total_samples = len(dataset)
n_iterations = math.ceil(total_samples/4)
total_samples, n_iterations, len(list(dataloader))

(178, 45, 45)

In [14]:
for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(dataloader):
        if (i+1) % 5 == 0:
            print('{}/{}, step {}/{}, input.shape = {}'.format(epoch+1, num_epochs, i+1,
                                                               n_iterations, inputs.shape[0]))

1/2, step 5/45, input.shape = 4
1/2, step 10/45, input.shape = 4
1/2, step 15/45, input.shape = 4
1/2, step 20/45, input.shape = 4
1/2, step 25/45, input.shape = 4
1/2, step 30/45, input.shape = 4
1/2, step 35/45, input.shape = 4
1/2, step 40/45, input.shape = 4
1/2, step 45/45, input.shape = 2
2/2, step 5/45, input.shape = 4
2/2, step 10/45, input.shape = 4
2/2, step 15/45, input.shape = 4
2/2, step 20/45, input.shape = 4
2/2, step 25/45, input.shape = 4
2/2, step 30/45, input.shape = 4
2/2, step 35/45, input.shape = 4
2/2, step 40/45, input.shape = 4
2/2, step 45/45, input.shape = 2


In [15]:
import torchvision.datasets as datasets
import torchvision.transforms as transforms

In [16]:
train_dataset = datasets.MNIST(root='/data', train=True, download=True,
                               transform=transforms.ToTensor())

In [17]:
train_dataset

Dataset MNIST
    Number of datapoints: 60000
    Root location: /data
    Split: Train
    StandardTransform
Transform: ToTensor()

In [18]:
dataset

Dataset wine
    Number of datapoints: 178
    Root location: ./data
    Filename: wine.csv
    Transform: None

In [19]:
train_dataset[0][1]

5

In [20]:
dataloader2 = DataLoader(train_dataset, batch_size=4, shuffle=True)

In [21]:
#Dataset where numpy is used to read a csv file, where the first row are the labels and the first column 
#is goint go be y. It should be easy to change your y or scroll down for a class which gets a pandas df as input
class CustomDataset(Dataset):
    def __init__(self, name, root, filename, train, transform=None, test_size=.2):
        self.filename = filename
        self.root = root
        self.name = name
        xy = np.loadtxt('{}/{}'.format(root, filename), delimiter=',', dtype=np.float32, skiprows=1)
        self.xtemp = torch.from_numpy(xy[:, 1:])
        self.ytemp = torch.from_numpy(xy[:, 0])
        self.transform = transform
        self.train = train
        self.test_size = test_size
        self.x, self.y = self.split()
        self.n_samples = self.x.shape[0]
        
    def split(self):
        X_train, X_test, y_train, y_test = train_test_split(self.xtemp,
                                                            self.ytemp, 
                                                            test_size=self.test_size,
                                                            random_state=42)
        if self.train:
            return X_train, y_train
        else:
            return X_test, y_test
    
    def __repr__(self):
        transform_repr = repr(self.transform) if self.transform else 'None'
        return (f"Dataset {self.name}\n"
                f"    Number of datapoints: {self.n_samples}\n"
                f"    Root location: {self.root}\n"
                f"    Filename: {self.filename}\n"
                f"    Transform: {transform_repr}")

    #this should do
    def __getitem__(self, index):
        x, y = self.x[index], self.y[index]
        if self.transform:
            x = self.transform(x)
        return x, y

    def __len__(self):
        return self.n_samples   

In [22]:
#the random_state parameter will ensure that the same split is happening for both class instances. Right?
train_dataset = CustomDataset(name='wine', root='./data', filename='wine.csv', train=True, test_size=.1)
test_dataset = CustomDataset(name='wine', root='./data', filename='wine.csv', train=False, test_size=.1)

In [23]:
train_dataset

Dataset wine
    Number of datapoints: 160
    Root location: ./data
    Filename: wine.csv
    Transform: None

In [24]:
train_dataset[0]

(tensor([1.3860e+01, 1.3500e+00, 2.2700e+00, 1.6000e+01, 9.8000e+01, 2.9800e+00,
         3.1500e+00, 2.2000e-01, 1.8500e+00, 7.2200e+00, 1.0100e+00, 3.5500e+00,
         1.0450e+03]),
 tensor(1.))

In [25]:
test_dataset

Dataset wine
    Number of datapoints: 18
    Root location: ./data
    Filename: wine.csv
    Transform: None

In [26]:
dataloader3 = DataLoader(train_dataset, batch_size=4, shuffle=True)

In [27]:
for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(dataloader3):
        if (i+1) % 5 == 0:
            print('{}/{}, step {}/{}, input.shape = {}'.format(epoch+1,
                                                               num_epochs,
                                                               i+1,
                                                               n_iterations,
                                                               inputs.shape[0]))

1/2, step 5/45, input.shape = 4
1/2, step 10/45, input.shape = 4
1/2, step 15/45, input.shape = 4
1/2, step 20/45, input.shape = 4
1/2, step 25/45, input.shape = 4
1/2, step 30/45, input.shape = 4
1/2, step 35/45, input.shape = 4
1/2, step 40/45, input.shape = 4
2/2, step 5/45, input.shape = 4
2/2, step 10/45, input.shape = 4
2/2, step 15/45, input.shape = 4
2/2, step 20/45, input.shape = 4
2/2, step 25/45, input.shape = 4
2/2, step 30/45, input.shape = 4
2/2, step 35/45, input.shape = 4
2/2, step 40/45, input.shape = 4


In [28]:
#now we need a transform function, because tansforming your data is a good and important practice.
normalize = lambda X: (X - X.mean(0)) / X.std(0)
normalize.__name__ = 'normalize'

In [29]:
test_dataset = CustomDataset(name='wine',
                             root='./data',
                             filename='wine.csv', 
                             train=False, test_size=.1,
                             transform=normalize)

In [30]:
test_dataset

Dataset wine
    Number of datapoints: 18
    Root location: ./data
    Filename: wine.csv
    Transform: <function <lambda> at 0x000001CFA52D8860>

In [31]:
#note that the internal data doesn't change and your transform is only applied as you iterate through the dataset
for x, y in test_dataset:
    print(x.mean().long(), x.std())

tensor(0) tensor(1.)
tensor(0) tensor(1.0000)
tensor(0) tensor(1.)
tensor(0) tensor(1.0000)
tensor(0) tensor(1.0000)
tensor(0) tensor(1.0000)
tensor(0) tensor(1.)
tensor(0) tensor(1.0000)
tensor(0) tensor(1.0000)
tensor(0) tensor(1.)
tensor(0) tensor(1.0000)
tensor(0) tensor(1.)
tensor(0) tensor(1.)
tensor(0) tensor(1.)
tensor(0) tensor(1.)
tensor(0) tensor(1.)
tensor(0) tensor(1.)
tensor(0) tensor(1.)


In [32]:
#next up is going to be the same class where its input will be a pandas DataFrame
import pandas as pd
df = pd.read_csv('data/wine.csv')

In [33]:
df

Unnamed: 0,Wine,Alcohol,Malic.acid,Ash,Acl,Mg,Phenols,Flavanoids,Nonflavanoid.phenols,Proanth,Color.int,Hue,OD,Proline
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


In [34]:
#next up is the class where its input gonna be a pandas DataFrame
class CustomDataset(Dataset):
    def __init__(self, name, df, train, transform=None, test_size=.2):
        self.filename = filename
        self.root = root
        self.name = name
        #xy = np.loadtxt('{}/{}'.format(root, filename), delimiter=',', dtype=np.float32, skiprows=1)
        self.df = df
        self.xy = df.values
        self.xtemp = torch.from_numpy(xy[:, 1:])
        self.ytemp = torch.from_numpy(xy[:, 0]) 
        self.transform = transform
        self.train = train
        self.test_size = test_size
        self.x, self.y = self.split()
        self.n_samples = self.x.shape[0]
        
    def split(self):
        X_train, X_test, y_train, y_test = train_test_split(self.xtemp,
                                                            self.ytemp, 
                                                            test_size=self.test_size,
                                                            random_state=42)
        if self.train:
            return X_train, y_train
        else:
            return X_test, y_test
    
    def __repr__(self):
        transform_repr = repr(self.transform) if self.transform else 'None'
        return (f"Dataset {self.name}\n"
                f"    Number of datapoints: {self.n_samples}\n"
                f"    Transform: {transform_repr}")

    #this should do
    def __getitem__(self, index):
        x, y = self.x[index], self.y[index]
        if self.transform:
            x = self.transform(x)
        return x, y

    def __len__(self):
        return self.n_samples   