# Data Preparation
- Data Preprocessing
- Data Labeling

### Data Preprocessing

In [1]:
import numpy as np

In [6]:
import sklearn.preprocessing as preprocessing

$$ 
x^{'}=
[x._{1},x._{2},x._{3}]
$$

In [9]:
input_data = np.array([[2.1, -1.9, 5.5],
                      [-1.5, 2.4, 3.5],
                      [0.5, -7.9, 5.6],
                      [5.9, 2.3, -5.8]])

#### Binarization
$$
\text{for } i=1,2,3,4 \quad j=1,2,3 \\
x_{i,j} = 
\begin{cases}
1 & \quad \text{if } x_{ij} > 0.5 \\
0 & \quad \text{if } x_{ij} <= 0.5
\end{cases}
$$

In [10]:
data_binarized = preprocessing.Binarizer(threshold = 0.5).transform(input_data)
print("\nBinarized data:\n", data_binarized)


Binarized data:
 [[1. 0. 1.]
 [0. 1. 1.]
 [0. 0. 1.]
 [1. 1. 0.]]


#### Mean

In [27]:
print("Mean = ", input_data.mean(axis = 0))

Mean =  [ 1.75  -1.275  2.2  ]


#### Standard Deviation

In [12]:
print("Std deviation = ", input_data.std(axis = 0))

Std deviation =  [2.71431391 4.20022321 4.69414529]


#### Scaling to unit length
$$
x^{'}=
\frac{x-\overline{x}}{\sigma}
$$

In [16]:
data_scaled = preprocessing.scale(input_data)
print(data_scaled)

[[ 0.12894603 -0.14880162  0.70300338]
 [-1.19735598  0.8749535   0.27694073]
 [-0.46052153 -1.57729713  0.72430651]
 [ 1.52893149  0.85114524 -1.70425062]]


#### Scaled mean

In [14]:
print("Mean =", data_scaled.mean(axis=0))

Mean = [1.11022302e-16 0.00000000e+00 0.00000000e+00]


#### Scaled Standard Deviation

In [15]:
print("Std deviation =", data_scaled.std(axis = 0))

Std deviation = [1. 1. 1.]


#### Min max scaling
$$
x^{'}=
\frac{x-min(x)}{max(x)-min(x)}
$$

In [31]:
data_scaler_minmax = preprocessing.MinMaxScaler(feature_range=(0,1))

In [20]:
data_scaled_minmax = data_scaler_minmax.fit_transform(input_data)

In [33]:
print ("\nMin max scaled data:\n", data_scaled_minmax)


Min max scaled data:
 [[0.48648649 0.58252427 0.99122807]
 [0.         1.         0.81578947]
 [0.27027027 0.         1.        ]
 [1.         0.99029126 0.        ]]


#### Normalization
$$
x^{'}=\frac{X-\overline{X}}{s}
$$
##### Least Absolute Deviations

In [25]:
data_normalized_l1 = preprocessing.normalize(input_data, norm = 'l1')
print("\nL1 normalized data:\n", data_normalized_l1)


L1 normalized data:
 [[ 0.22105263 -0.2         0.57894737]
 [-0.2027027   0.32432432  0.47297297]
 [ 0.03571429 -0.56428571  0.4       ]
 [ 0.42142857  0.16428571 -0.41428571]]


##### Least squares

In [34]:
# Normalize data
data_normalized_l2 = preprocessing.normalize(input_data, norm = 'l2')
print("\nL2 normalized data:\n", data_normalized_l2)


L2 normalized data:
 [[ 0.33946114 -0.30713151  0.88906489]
 [-0.33325106  0.53320169  0.7775858 ]
 [ 0.05156558 -0.81473612  0.57753446]
 [ 0.68706914  0.26784051 -0.6754239 ]]


### Data labeling

In [35]:
import numpy as np
from sklearn import preprocessing

In [36]:
# Sample input labels
input_labels = ['red','black','red','green','black','yellow','white']

In [37]:
# Creating the label encoder
encoder = preprocessing.LabelEncoder()
encoder.fit(input_labels)

LabelEncoder()

In [38]:
# encoding a set of labels
test_labels = ['green','red','black']
encoded_values = encoder.transform(test_labels)
print("\nLabels =", test_labels)


Labels = ['green', 'red', 'black']


In [39]:
print("Encoded values =", list(encoded_values))

Encoded values = [1, 2, 0]


In [40]:
# decoding a set of values
encoded_values = [3,0,4,1]
decoded_list = encoder.inverse_transform(encoded_values)
print("\nEncoded values =", encoded_values)


Encoded values = [3, 0, 4, 1]


In [43]:
decoded_list = [3, 0, 4, 1]
print("\nDecoded labels =", list(decoded_list))


Decoded labels = [3, 0, 4, 1]



Reference:
- https://www.tutorialspoint.com/artificial_intelligence_with_python/artificial_intelligence_with_python_data_preparation.htm
- https://en.wikipedia.org/wiki/Feature_scaling
- https://scikit-learn.org/stable/modules/classes.html
- https://towardsdatascience.com/preprocessing-with-sklearn-a-complete-and-comprehensive-guide-670cb98fcfb9