In [6]:
import csv
import numpy as np
import torch

In [4]:
# Location of the semicolon-delimited wine-quality table. The first row holds
# the column names, so it is skipped and every remaining field is parsed as a
# 32-bit float.
wine_path = '../data/p1ch4/tabular-wine/winequality-white.csv'
wine_numpy = np.loadtxt(
    wine_path,
    delimiter=";",
    skiprows=1,
    dtype=np.float32,
)
wine_numpy

array([[ 7.  ,  0.27,  0.36, ...,  0.45,  8.8 ,  6.  ],
       [ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  6.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  6.  ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  0.46,  9.4 ,  6.  ],
       [ 5.5 ,  0.29,  0.3 , ...,  0.38, 12.8 ,  7.  ],
       [ 6.  ,  0.21,  0.38, ...,  0.32, 11.8 ,  6.  ]], dtype=float32)

In [5]:
# Read only the header row to recover the column names.
# The context manager closes the file handle as soon as the header has been
# read (a bare open() leaves that to the garbage collector), and newline=''
# is what the csv module asks for so it can do its own newline handling.
with open(wine_path, newline='') as wine_file:
    col_list = next(csv.reader(wine_file, delimiter=';'))
wine_numpy.shape, col_list

((4898, 12),
 ['fixed acidity',
  'volatile acidity',
  'citric acid',
  'residual sugar',
  'chlorides',
  'free sulfur dioxide',
  'total sulfur dioxide',
  'density',
  'pH',
  'sulphates',
  'alcohol',
  'quality'])

In [7]:
# Wrap the NumPy array as a tensor. Per the PyTorch documentation,
# torch.from_numpy does not copy: the tensor and wine_numpy share memory.
wineq = torch.from_numpy(wine_numpy)
wineq.size(), wineq.dtype

(torch.Size([4898, 12]), torch.float32)

In [10]:
# Feature matrix: every row, every column except the last one (the score).
data = wineq[:, 0:-1]
data.size()

torch.Size([4898, 11])

In [11]:
# Last column of the table (the 'quality' score), still in the table's
# floating-point dtype at this point.
target = wineq[:, -1]
target.size()

torch.Size([4898])

In [12]:
# Same column again, converted to 64-bit integers so the scores can be used
# as labels / indices.
target = wineq[:, -1].to(torch.long)
target, target.size(), target.dtype

(tensor([6, 6, 6,  ..., 6, 7, 6]), torch.Size([4898]), torch.int64)

In [15]:
# One-hot encode the scores: start from an all-zero (n_samples, 10) matrix and
# write 1.0 into the column named by each row's score. The scores are used
# directly as column indices, so they must all lie in 0..9.
# scatter_ works in place and returns the same tensor, so the result can be
# bound in a single expression; the index needs a trailing axis to line up
# with dim 1 of the destination.
target_onehot = torch.zeros(target.shape[0], 10).scatter_(
    dim=1,
    index=target[:, None],
    value=1.0,
)
target_onehot.size(), target_onehot

(torch.Size([4898, 10]),
 tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 1., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]))

In [19]:
# Per-column mean: reducing over dim 0 collapses the sample axis and leaves
# one value per feature.
data_mean = data.mean(dim=0)
data_mean.size(), data_mean

(torch.Size([11]),
 tensor([6.8548e+00, 2.7824e-01, 3.3419e-01, 6.3914e+00, 4.5772e-02, 3.5308e+01,
         1.3836e+02, 9.9403e-01, 3.1883e+00, 4.8985e-01, 1.0514e+01]))

In [20]:
# Per-column variance, reduced over the sample axis (dim 0) with the
# library's default correction.
data_var = data.var(dim=0)
data_var.size(), data_var

(torch.Size([11]),
 tensor([7.1211e-01, 1.0160e-02, 1.4646e-02, 2.5726e+01, 4.7733e-04, 2.8924e+02,
         1.8061e+03, 8.9455e-06, 2.2801e-02, 1.3025e-02, 1.5144e+00]))

In [21]:
# Standardize each column: subtract its mean and divide by its standard
# deviation (square root of the variance). The (11,) statistics broadcast
# across all rows of the (n_samples, 11) data matrix.
data_normalized = (data - data_mean) / data_var.sqrt()
data_normalized.size(), data_normalized

(torch.Size([4898, 11]),
 tensor([[ 1.7208e-01, -8.1761e-02,  2.1326e-01,  ..., -1.2468e+00,
          -3.4915e-01, -1.3930e+00],
         [-6.5743e-01,  2.1587e-01,  4.7996e-02,  ...,  7.3995e-01,
           1.3422e-03, -8.2419e-01],
         [ 1.4756e+00,  1.7450e-02,  5.4378e-01,  ...,  4.7505e-01,
          -4.3677e-01, -3.3663e-01],
         ...,
         [-4.2043e-01, -3.7940e-01, -1.1915e+00,  ..., -1.3130e+00,
          -2.6153e-01, -9.0545e-01],
         [-1.6054e+00,  1.1666e-01, -2.8253e-01,  ...,  1.0049e+00,
          -9.6251e-01,  1.8574e+00],
         [-1.0129e+00, -6.7703e-01,  3.7852e-01,  ...,  4.7505e-01,
          -1.4882e+00,  1.0448e+00]]))

In [23]:
# Boolean mask that is True for samples whose score is 3 or lower; summing it
# counts how many such samples there are.
bad_indexes = target.le(3)
bad_indexes.size(), torch.sum(bad_indexes), bad_indexes.dtype

(torch.Size([4898]), tensor(20), torch.bool)

In [24]:
# Boolean-mask indexing keeps only the rows where bad_indexes is True.
bad_data = data[bad_indexes]
bad_data.size()

torch.Size([20, 11])

In [28]:
# Split the samples into three bands by score: <= 3, 4..6, and >= 7.
bad_data = data[target <= 3]
mid_data = data[(target > 3) & (target < 7)]
good_data = data[target >= 7]

# Column-wise mean and variance inside each band.
bad_mean, bad_var = bad_data.mean(dim=0), bad_data.var(dim=0)
mid_mean, mid_var = mid_data.mean(dim=0), mid_data.var(dim=0)
good_mean, good_var = good_data.mean(dim=0), good_data.var(dim=0)

# One output row per feature: index, name, then (mean, var) for each band.
# col_list also carries the trailing 'quality' name, but zip stops at the
# shortest input (the 11 per-feature statistics), so that name is never used.
row_format = '{:2} {:20} {:8.2f} {:8.2f} {:8.2f} {:8.2f} {:8.2f} {:8.2f}'
band_rows = zip(col_list, bad_mean, bad_var, mid_mean, mid_var, good_mean, good_var)
for i, args in enumerate(band_rows):
    print(row_format.format(i, *args))

 0 fixed acidity            7.60     2.97     6.89     0.73     6.73     0.59
 1 volatile acidity         0.33     0.02     0.28     0.01     0.27     0.01
 2 citric acid              0.34     0.01     0.34     0.02     0.33     0.01
 3 residual sugar           6.39    28.27     6.71    27.30     5.26    18.41
 4 chlorides                0.05     0.00     0.05     0.00     0.04     0.00
 5 free sulfur dioxide     53.33  4819.24    35.42   292.41    34.55   190.36
 6 total sulfur dioxide   170.60 11611.86   141.83  1896.91   125.25  1070.92
 7 density                  0.99     0.00     0.99     0.00     0.99     0.00
 8 pH                       3.19     0.04     3.18     0.02     3.22     0.02
 9 sulphates                0.47     0.01     0.49     0.01     0.50     0.02
10 alcohol                 10.34     1.50    10.26     1.21    11.42     1.58


In [29]:
# Threshold on total sulfur dioxide (column 6 of the feature matrix). 141.83
# is the mean of that column for the middle quality band, as printed in the
# per-band statistics table.
total_sulfur_threshold = 141.83
total_sulfur_data = data[:, 6]
# A sample is predicted "good" when its total sulfur dioxide is strictly
# below the threshold.
predicted_indexes = total_sulfur_data < total_sulfur_threshold

predicted_indexes.size(), predicted_indexes.dtype, torch.sum(predicted_indexes)

(torch.Size([4898]), torch.bool, tensor(2727))

In [30]:
# Ground-truth mask: True for samples whose score is strictly above 5.
actual_indexes = target.gt(5)

actual_indexes.size(), actual_indexes.dtype, torch.sum(actual_indexes)

(torch.Size([4898]), torch.bool, tensor(3258))

In [31]:
# Count samples that are both predicted and actually above the score cutoff,
# then the sizes of each set on its own. .item() turns each 0-dim tensor
# into a plain Python number so the ratios below are ordinary divisions.
n_matches = (actual_indexes & predicted_indexes).sum().item()
n_predicted = predicted_indexes.sum().item()
n_actual = actual_indexes.sum().item()

# (hits, hits / predicted, hits / actual): the last two are the fraction of
# predictions that were right and the fraction of actual positives found.
n_matches, n_matches / n_predicted, n_matches / n_actual

(2018, 0.74000733406674, 0.6193984039287906)