<a href="https://colab.research.google.com/github/aakhterov/ML_tools/blob/master/hugging_face_datasets_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [98]:
!pip install datasets



In [2]:
!wget https://archive.ics.uci.edu/static/public/109/wine.zip
!unzip wine.zip

--2023-11-15 14:47:35--  https://archive.ics.uci.edu/static/public/109/wine.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘wine.zip’

wine.zip                [ <=>                ]   5.90K  --.-KB/s    in 0s      

2023-11-15 14:47:36 (89.2 MB/s) - ‘wine.zip’ saved [6038]

Archive:  wine.zip
  inflating: Index                   
  inflating: wine.data               
  inflating: wine.names              


In [19]:
from datasets import load_dataset
import numpy as np
from sklearn.preprocessing import StandardScaler

In [28]:
# Column names applied to wine.data when it is read; the first entry is the
# class label and the remaining thirteen are the measured attributes.
feature_names = [
    "class",
    "alcohol",
    "malic acid",
    "ash",
    "alcalinity of ash",
    "magnesium",
    "total phenols",
    "flavanoids",
    "nonflavanoid phenols",
    "proanthocyanins",
    "color intensity",
    "hue",
    "OD280/OD315 of diluted wines",
    "proline",
]

# Read the comma-separated file with the generic 'csv' loader.
raw_ds = load_dataset(
    'csv',
    data_files='wine.data',
    delimiter=',',
    names=feature_names,
)
raw_ds

DatasetDict({
    train: Dataset({
        features: ['class', 'alcohol', 'malic acid', 'ash', 'alcalinity of ash', 'magnesium', 'total phenols', 'flavanoids', 'nonflavanoid phenols', 'proanthocyanins', 'color intensity', 'hue', 'OD280/OD315 of diluted wines', 'proline'],
        num_rows: 178
    })
})

In [29]:
# Fixed seed so the shuffled split is identical on every Restart & Run All.
SPLIT_SEED = 42

# First cut: 80% of the rows go to train, 20% are held out.
ds = raw_ds["train"].train_test_split(train_size=0.8, seed=SPLIT_SEED)
# Second cut on the held-out rows: 80% of them become the test split and the
# remaining 20% become the validation split.
tmp_ds = ds["test"].train_test_split(train_size=0.8, seed=SPLIT_SEED)
ds["validation"] = tmp_ds["test"]
ds["test"] = tmp_ds["train"]
ds

DatasetDict({
    train: Dataset({
        features: ['class', 'alcohol', 'malic acid', 'ash', 'alcalinity of ash', 'magnesium', 'total phenols', 'flavanoids', 'nonflavanoid phenols', 'proanthocyanins', 'color intensity', 'hue', 'OD280/OD315 of diluted wines', 'proline'],
        num_rows: 142
    })
    test: Dataset({
        features: ['class', 'alcohol', 'malic acid', 'ash', 'alcalinity of ash', 'magnesium', 'total phenols', 'flavanoids', 'nonflavanoid phenols', 'proanthocyanins', 'color intensity', 'hue', 'OD280/OD315 of diluted wines', 'proline'],
        num_rows: 28
    })
    validation: Dataset({
        features: ['class', 'alcohol', 'malic acid', 'ash', 'alcalinity of ash', 'magnesium', 'total phenols', 'flavanoids', 'nonflavanoid phenols', 'proanthocyanins', 'color intensity', 'hue', 'OD280/OD315 of diluted wines', 'proline'],
        num_rows: 8
    })
})

In [30]:
# Peek at the first three training rows (shown as a dict of column -> values).
ds["train"][:3]

{'class': [1, 2, 2],
 'alcohol': [14.19, 12.29, 12.37],
 'malic acid': [1.59, 2.83, 1.63],
 'ash': [2.48, 2.22, 2.3],
 'alcalinity of ash': [16.5, 18.0, 24.5],
 'magnesium': [108, 88, 88],
 'total phenols': [3.3, 2.45, 2.22],
 'flavanoids': [3.93, 2.25, 2.45],
 'nonflavanoid phenols': [0.32, 0.25, 0.4],
 'proanthocyanins': [1.86, 1.99, 1.9],
 'color intensity': [8.7, 2.15, 2.12],
 'hue': [1.23, 1.15, 0.89],
 'OD280/OD315 of diluted wines': [2.82, 3.3, 2.78],
 'proline': [1680, 290, 342]}

In [31]:
# Encode the 'class' column as a class-label feature in every split;
# `ds` is rebound to the encoded result.
ds = ds.class_encode_column('class')

Stringifying the column:   0%|          | 0/142 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/142 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/28 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/28 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/8 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/8 [00:00<?, ? examples/s]

In [32]:
# Inspect the column types of the training split after class encoding.
ds['train'].features

{'class': ClassLabel(names=['1', '2', '3'], id=None),
 'alcohol': Value(dtype='float64', id=None),
 'malic acid': Value(dtype='float64', id=None),
 'ash': Value(dtype='float64', id=None),
 'alcalinity of ash': Value(dtype='float64', id=None),
 'magnesium': Value(dtype='int64', id=None),
 'total phenols': Value(dtype='float64', id=None),
 'flavanoids': Value(dtype='float64', id=None),
 'nonflavanoid phenols': Value(dtype='float64', id=None),
 'proanthocyanins': Value(dtype='float64', id=None),
 'color intensity': Value(dtype='float64', id=None),
 'hue': Value(dtype='float64', id=None),
 'OD280/OD315 of diluted wines': Value(dtype='float64', id=None),
 'proline': Value(dtype='int64', id=None)}

In [94]:
def get_stats(dataset):
  """Print the mean and standard deviation of every non-label column.

  Args:
    dataset: Object exposing `column_names` and column access by name
      (`dataset[name]`), such as one split of the wine dataset.

  Returns:
    None. One line per column is written to stdout, formatted to two
    decimals. The 'class' column is skipped.
  """
  for feature in dataset.column_names:
    if feature == 'class':
      continue
    # Fetch the column once and reuse it for both statistics instead of
    # reading it separately for the mean and for the std.
    values = np.asarray(dataset[feature])
    print(f"Feature - {feature}. mean={np.mean(values):.2f}. std={np.std(values):.2f}")

# Per-feature mean/std of the unscaled training split.
get_stats(ds['train'])

Feature - alcohol. mean=12.98. std=0.80
Feature - malic acid. mean=2.35. std=1.11
Feature - ash. mean=2.36. std=0.27
Feature - alcalinity of ash. mean=19.48. std=3.26
Feature - magnesium. mean=99.39. std=14.17
Feature - total phenols. mean=2.30. std=0.63
Feature - flavanoids. mean=2.02. std=0.97
Feature - nonflavanoid phenols. mean=0.36. std=0.12
Feature - proanthocyanins. mean=1.60. std=0.58
Feature - color intensity. mean=5.02. std=2.37
Feature - hue. mean=0.95. std=0.23
Feature - OD280/OD315 of diluted wines. mean=2.61. std=0.70
Feature - proline. mean=736.06. std=310.40


In [78]:
# One StandardScaler per feature column, each fitted on the training split
# only. The 'class' column is skipped. Every column is reshaped to (n, 1)
# before fitting.
scalers = {
    name: StandardScaler().fit(np.array(ds["train"][name]).reshape(-1, 1))
    for name in ds["train"].column_names
    if name != 'class'
}

In [97]:
def standard_scaler(examples):
  """Scale a batch of examples using the module-level `scalers` mapping.

  Args:
    examples: Mapping of column name -> sequence of values for a batch.

  Returns:
    A dict with the same keys in the same order. 'class' is passed through
    unchanged; every other column is transformed by `scalers[column]` and
    returned as a flat Python list.
  """
  def _scaled(column_name):
    # The scaler works on a 2-D column vector; flatten back to 1-D afterwards.
    column = np.array(examples[column_name]).reshape(-1, 1)
    return scalers[column_name].transform(column).reshape(-1,).tolist()

  return {
      column_name: examples['class'] if column_name == 'class' else _scaled(column_name)
      for column_name in examples.keys()
  }

In [86]:
# First three training rows again, for comparison with the scaled values below.
ds['train'][:3]

{'class': [0, 1, 1],
 'alcohol': [14.19, 12.29, 12.37],
 'malic acid': [1.59, 2.83, 1.63],
 'ash': [2.48, 2.22, 2.3],
 'alcalinity of ash': [16.5, 18.0, 24.5],
 'magnesium': [108, 88, 88],
 'total phenols': [3.3, 2.45, 2.22],
 'flavanoids': [3.93, 2.25, 2.45],
 'nonflavanoid phenols': [0.32, 0.25, 0.4],
 'proanthocyanins': [1.86, 1.99, 1.9],
 'color intensity': [8.7, 2.15, 2.12],
 'hue': [1.23, 1.15, 0.89],
 'OD280/OD315 of diluted wines': [2.82, 3.3, 2.78],
 'proline': [1680, 290, 342]}

In [88]:
# Try the scaling function on a three-row batch before mapping it over the dataset.
standard_scaler(ds['train'][:3])

{'class': [0, 1, 1],
 'alcohol': [1.514147786787722, -0.8662104050041896, -0.7659847969287406],
 'malic acid': [-0.6835946104948895, 0.4372484756019553, -0.6474383819111205],
 'ash': [0.44539703878522374, -0.5179338734555782, -0.22152436199687103],
 'alcalinity of ash': [-0.9152882919629463,
  -0.45450737447746403,
  1.5422099346262923],
 'magnesium': [0.6077927541713186, -0.8035984329476872, -0.8035984329476872],
 'total phenols': [1.5955136031988448,
  0.2458850945009647,
  -0.11930850197022651],
 'flavanoids': [1.9738698130454542, 0.2373710019670769, 0.4440970509049791],
 'nonflavanoid phenols': [-0.3150249807891209,
  -0.8822982251811244,
  0.3332872985160261],
 'proanthocyanins': [0.4402105738245888,
  0.662604859341882,
  0.5086395847529864],
 'color intensity': [1.5486752164664426,
  -1.2101212595862698,
  -1.2227569686368929],
 'hue': [1.2272341086902256, 0.8749477723400723, -0.269982820797925],
 'OD280/OD315 of diluted wines': [0.29789465458118597,
  0.9803988447073604,
  0.24

In [90]:
# Apply standard_scaler batch-wise (batched=True) and keep the result under a new name.
ds_scaled = ds.map(standard_scaler, batched=True)

In [93]:
# First three rows of the scaled training split.
ds_scaled['train'][:3]

{'class': [0, 1, 1],
 'alcohol': [1.514147786787722, -0.8662104050041896, -0.7659847969287406],
 'malic acid': [-0.6835946104948895, 0.4372484756019553, -0.6474383819111205],
 'ash': [0.44539703878522374, -0.5179338734555782, -0.22152436199687103],
 'alcalinity of ash': [-0.9152882919629463,
  -0.45450737447746403,
  1.5422099346262923],
 'magnesium': [0.6077927541713186, -0.8035984329476872, -0.8035984329476872],
 'total phenols': [1.5955136031988448,
  0.2458850945009647,
  -0.11930850197022651],
 'flavanoids': [1.9738698130454542, 0.2373710019670769, 0.4440970509049791],
 'nonflavanoid phenols': [-0.3150249807891209,
  -0.8822982251811244,
  0.3332872985160261],
 'proanthocyanins': [0.4402105738245888,
  0.662604859341882,
  0.5086395847529864],
 'color intensity': [1.5486752164664426,
  -1.2101212595862698,
  -1.2227569686368929],
 'hue': [1.2272341086902256, 0.8749477723400723, -0.269982820797925],
 'OD280/OD315 of diluted wines': [0.29789465458118597,
  0.9803988447073604,
  0.24

In [95]:
# Per-feature mean/std of the scaled training split.
get_stats(ds_scaled['train'])

Feature - alcohol. mean=-0.00. std=1.00
Feature - malic acid. mean=0.00. std=1.00
Feature - ash. mean=-0.00. std=1.00
Feature - alcalinity of ash. mean=-0.00. std=1.00
Feature - magnesium. mean=0.00. std=1.00
Feature - total phenols. mean=0.00. std=1.00
Feature - flavanoids. mean=-0.00. std=1.00
Feature - nonflavanoid phenols. mean=-0.00. std=1.00
Feature - proanthocyanins. mean=0.00. std=1.00
Feature - color intensity. mean=-0.00. std=1.00
Feature - hue. mean=0.00. std=1.00
Feature - OD280/OD315 of diluted wines. mean=-0.00. std=1.00
Feature - proline. mean=0.00. std=1.00


In [96]:
# Per-feature mean/std of the scaled test split (scalers were fitted on train only).
get_stats(ds_scaled['test'])

Feature - alcohol. mean=-0.05. std=1.08
Feature - malic acid. mean=-0.00. std=1.07
Feature - ash. mean=0.10. std=1.14
Feature - alcalinity of ash. mean=0.11. std=1.15
Feature - magnesium. mean=-0.01. std=1.01
Feature - total phenols. mean=-0.11. std=0.87
Feature - flavanoids. mean=-0.02. std=1.14
Feature - nonflavanoid phenols. mean=0.21. std=1.04
Feature - proanthocyanins. mean=-0.14. std=0.83
Feature - color intensity. mean=-0.09. std=0.86
Feature - hue. mean=0.20. std=1.05
Feature - OD280/OD315 of diluted wines. mean=-0.03. std=0.97
Feature - proline. mean=0.01. std=0.94


In [100]:
# Preview the first five scaled training rows as a pandas DataFrame.
# with_format() returns a re-formatted copy instead of changing ds_scaled in
# place, so no other cell sees a pandas-formatted dataset by accident.
ds_scaled['train'].with_format('pandas')[:5]

Unnamed: 0,class,alcohol,malic acid,ash,alcalinity of ash,magnesium,total phenols,flavanoids,nonflavanoid phenols,proanthocyanins,color intensity,hue,OD280/OD315 of diluted wines,proline
0,0,1.514148,-0.683595,0.445397,-0.915288,0.607793,1.595514,1.97387,-0.315025,0.440211,1.548675,1.227234,0.297895,3.041067
1,1,-0.86621,0.437248,-0.517934,-0.454507,-0.803598,0.245885,0.237371,-0.882298,0.662605,-1.210121,0.874948,0.980399,-1.437042
2,1,-0.765985,-0.647438,-0.221524,1.54221,-0.803598,-0.119309,0.444097,0.333287,0.50864,-1.222757,-0.269983,0.241019,-1.269516
3,0,1.301168,-0.602243,-0.554985,-1.068882,-0.239042,0.563445,0.320061,-0.801259,0.645498,-0.136086,0.390554,1.392745,0.963095
4,1,-0.953908,-0.954766,-1.555367,-0.14732,-0.52132,0.102983,0.020309,0.252248,0.81657,-0.978467,-0.40209,0.582271,-1.366166


In [101]:
# Clear any output format previously set on ds_scaled via set_format().
ds_scaled.reset_format()

In [102]:
# Write the scaled dataset to the 'wine_ds' path.
ds_scaled.save_to_disk('wine_ds')

Saving the dataset (0/1 shards):   0%|          | 0/142 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/28 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8 [00:00<?, ? examples/s]