- Tabular data (CSV files, spreadsheets) is heterogeneous: different columns can have different types
- Tensors, however, are homogeneous: every element has the same type

https://github.com/awesomedata/awesome-public-datasets

In [2]:
# Example: load the wine quality dataset; the `quality` column is the output value, and the remaining columns are the inputs
import csv
import numpy as np

In [3]:
wine_path = 'winequality-white.csv'

# Parse the semicolon-delimited file into a float32 array.
# The first line holds the column names rather than values, so it is skipped.
wineq_numpy = np.loadtxt(
    wine_path,
    dtype=np.float32,
    delimiter=";",
    skiprows=1,
)
wineq_numpy

array([[ 7.  ,  0.27,  0.36, ...,  0.45,  8.8 ,  6.  ],
       [ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  6.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  6.  ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  0.46,  9.4 ,  6.  ],
       [ 5.5 ,  0.29,  0.3 , ...,  0.38, 12.8 ,  7.  ],
       [ 6.  ,  0.21,  0.38, ...,  0.32, 11.8 ,  6.  ]], dtype=float32)

In [4]:
# Read only the header row of the CSV to get the column names.
# `next()` pulls a single row from the reader, i.e. the first line of the file.
# The `with` block closes the file handle as soon as the header has been read
# (a bare `open(...)` passed straight to csv.reader is never closed explicitly),
# and newline="" is how the csv module expects its input files to be opened.
with open(wine_path, newline="") as csv_file:
    col_list = next(csv.reader(csv_file, delimiter=";"))

wineq_numpy.shape, col_list

((4898, 12),
 ['fixed acidity',
  'volatile acidity',
  'citric acid',
  'residual sugar',
  'chlorides',
  'free sulfur dioxide',
  'total sulfur dioxide',
  'density',
  'pH',
  'sulphates',
  'alcohol',
  'quality'])

In [5]:
import torch

# Turn the NumPy array into a torch tensor; the displayed dtype and shape
# let us confirm they match those of the source array.
wineq = torch.from_numpy(wineq_numpy)

(wineq, wineq.dtype, wineq.shape)

(tensor([[ 7.0000,  0.2700,  0.3600,  ...,  0.4500,  8.8000,  6.0000],
         [ 6.3000,  0.3000,  0.3400,  ...,  0.4900,  9.5000,  6.0000],
         [ 8.1000,  0.2800,  0.4000,  ...,  0.4400, 10.1000,  6.0000],
         ...,
         [ 6.5000,  0.2400,  0.1900,  ...,  0.4600,  9.4000,  6.0000],
         [ 5.5000,  0.2900,  0.3000,  ...,  0.3800, 12.8000,  7.0000],
         [ 6.0000,  0.2100,  0.3800,  ...,  0.3200, 11.8000,  6.0000]]),
 torch.float32,
 torch.Size([4898, 12]))

In [6]:
# Separate the input columns from the output value column (here: quality).
# Slicing with `:-1` keeps every column except the last one.
data = wineq[:, :-1]

(data, data.shape)

(tensor([[ 7.0000,  0.2700,  0.3600,  ...,  3.0000,  0.4500,  8.8000],
         [ 6.3000,  0.3000,  0.3400,  ...,  3.3000,  0.4900,  9.5000],
         [ 8.1000,  0.2800,  0.4000,  ...,  3.2600,  0.4400, 10.1000],
         ...,
         [ 6.5000,  0.2400,  0.1900,  ...,  2.9900,  0.4600,  9.4000],
         [ 5.5000,  0.2900,  0.3000,  ...,  3.3400,  0.3800, 12.8000],
         [ 6.0000,  0.2100,  0.3800,  ...,  3.2600,  0.3200, 11.8000]]),
 torch.Size([4898, 11]))

In [7]:
# Index -1 picks the last column only, giving a 1-D tensor with one value per row.
target = wineq[:, -1]

(target, target.shape)

(tensor([6., 6., 6.,  ..., 6., 7., 6.]), torch.Size([4898]))

In [10]:
# Turn the float target values into integer labels (float32 -> int64).

target = target.to(torch.int64)
(target, target.dtype)

(tensor([6, 6, 6,  ..., 6, 7, 6]), torch.int64)

In [12]:
# Reduce along dim 0 (the rows) to get one mean and one variance per column.
data_mean = data.mean(dim=0)
data_var = data.var(dim=0)
(data_mean, data_var)

(tensor([6.8548e+00, 2.7824e-01, 3.3419e-01, 6.3914e+00, 4.5772e-02, 3.5308e+01,
         1.3836e+02, 9.9403e-01, 3.1883e+00, 4.8985e-01, 1.0514e+01]),
 tensor([7.1211e-01, 1.0160e-02, 1.4646e-02, 2.5726e+01, 4.7733e-04, 2.8924e+02,
         1.8061e+03, 8.9455e-06, 2.2801e-02, 1.3025e-02, 1.5144e+00]))

In [14]:
# Standardize each column: subtract its mean, then divide by its standard
# deviation (the square root of the variance). The per-column statistics
# broadcast across all rows.
data_std = data_var.sqrt()
data_normalized = (data - data_mean) / data_std
(data_normalized, data_normalized.shape)

(tensor([[ 1.7208e-01, -8.1761e-02,  2.1326e-01,  ..., -1.2468e+00,
          -3.4915e-01, -1.3930e+00],
         [-6.5743e-01,  2.1587e-01,  4.7996e-02,  ...,  7.3995e-01,
           1.3422e-03, -8.2419e-01],
         [ 1.4756e+00,  1.7450e-02,  5.4378e-01,  ...,  4.7505e-01,
          -4.3677e-01, -3.3663e-01],
         ...,
         [-4.2043e-01, -3.7940e-01, -1.1915e+00,  ..., -1.3130e+00,
          -2.6153e-01, -9.0545e-01],
         [-1.6054e+00,  1.1666e-01, -2.8253e-01,  ...,  1.0049e+00,
          -9.6251e-01,  1.8574e+00],
         [-1.0129e+00, -6.7703e-01,  3.7852e-01,  ...,  4.7505e-01,
          -1.4882e+00,  1.0448e+00]]),
 torch.Size([4898, 11]))