### Preprocessing the Data

In [1]:
import numpy as np
from sklearn import preprocessing

In [2]:
# 4x3 float matrix used as the input for every preprocessing example below.
input_data = np.array(
    [
        [2.1, -1.9, 5.5],
        [-1.5, 2.4, 3.5],
        [0.5, -7.9, 5.6],
        [5.9, 2.3, -5.8],
    ]
)

### Techniques for Data Preprocessing

In [3]:
# Binarize: entries strictly greater than the threshold become 1, all others 0
# (so the 0.5 entry in the third row maps to 0).
binarizer = preprocessing.Binarizer(threshold=0.5)
data_binarized = binarizer.transform(input_data)
print("\nBinarized data:\n", data_binarized)


Binarized data:
 [[1. 0. 1.]
 [0. 1. 1.]
 [0. 0. 1.]
 [1. 1. 0.]]


In [4]:
# Per-column statistics of the raw data (axis=0 reduces over the rows).
print("Mean =", np.mean(input_data, axis=0))
print("Std deviation = ", np.std(input_data, axis=0))

Mean = [ 1.75  -1.275  2.2  ]
Std deviation =  [2.71431391 4.20022321 4.69414529]


In [5]:
# Standardize each column, then check that the result has (numerically) zero
# mean and unit standard deviation per column.
data_scaled = preprocessing.scale(input_data)
scaled_mean = data_scaled.mean(axis=0)
scaled_std = data_scaled.std(axis=0)
print("Mean =", scaled_mean)
print("Std deviation =", scaled_std)

Mean = [1.11022302e-16 0.00000000e+00 0.00000000e+00]
Std deviation = [1. 1. 1.]


#### Scaling

In [6]:
# Min-max scaling: linearly rescale every column onto the [0, 1] interval,
# so each column's minimum becomes 0 and its maximum becomes 1.
data_scaler_minmax = preprocessing.MinMaxScaler(feature_range=(0, 1))
data_scaled_minmax = data_scaler_minmax.fit_transform(input_data)
print("\nMin max scaled data:\n", data_scaled_minmax)


Min max scaled data:
 [[0.48648649 0.58252427 0.99122807]
 [0.         1.         0.81578947]
 [0.27027027 0.         1.        ]
 [1.         0.99029126 0.        ]]


#### Normalization - L1

In [7]:
# L1 normalization: rescale each row so that its absolute values sum to 1.
data_normalized_l1 = preprocessing.normalize(input_data, norm="l1")
print("\nL1 normalized data:\n", data_normalized_l1)


L1 normalized data:
 [[ 0.22105263 -0.2         0.57894737]
 [-0.2027027   0.32432432  0.47297297]
 [ 0.03571429 -0.56428571  0.4       ]
 [ 0.42142857  0.16428571 -0.41428571]]


#### Normalization - L2

In [8]:
# L2 normalization: rescale each row to unit Euclidean length.
data_normalized_l2 = preprocessing.normalize(input_data, norm="l2")
print("\nL2 normalized data:\n", data_normalized_l2)


L2 normalized data:
 [[ 0.33946114 -0.30713151  0.88906489]
 [-0.33325106  0.53320169  0.7775858 ]
 [ 0.05156558 -0.81473612  0.57753446]
 [ 0.68706914  0.26784051 -0.6754239 ]]


### Labeling

In [9]:
# Step 1. - import the useful packages

import numpy as np
from sklearn import preprocessing

In [10]:
# Step 2. Define the sample labels

# Sample input labels (some colours appear more than once)
input_labels = [
    "red", "black", "red", "green",
    "black", "yellow", "white",
]

In [11]:
# Step 3. Create and train the label encoder

# Build the encoder, then fit it on the sample labels so that each distinct
# label is assigned an integer code.
encoder = preprocessing.LabelEncoder()
encoder.fit(input_labels)

In [12]:
# Step 4. Check the encoder on labels given in a different order

# Map each label to the integer code learned during fit
test_labels = ["green", "red", "black"]
encoded_values = encoder.transform(test_labels)

print("\nLabels =", test_labels)


Labels = ['green', 'red', 'black']


In [13]:
# NOTE(review): this looks like the printed output of the previous cell pasted
# back in as code; `Labels` is not read by any cell visible here — confirm and
# consider deleting this cell.
Labels = ['green', 'red', 'black']

In [14]:
# Convert to built-in ints before printing: list() over a NumPy array keeps
# NumPy scalar elements, which NumPy >= 2 displays as np.int64(1) rather than 1.
# A comprehension (rather than .tolist()) also works if `encoded_values` is a
# plain Python list.
print("Encoded values =", [int(code) for code in encoded_values])

Encoded values = [1, 2, 0]


In [15]:
# Step 5. Check the encoder by decoding a list of integer codes

# Map integer codes back to their labels.
# NOTE: this rebinds `encoded_values`, which previously held the result of
# encoder.transform(test_labels).
encoded_values = [3, 0, 4, 1]
decoded_list = encoder.inverse_transform(encoded_values)

print("\nEncoded values =", encoded_values)


Encoded values = [3, 0, 4, 1]


In [16]:
# tolist() yields built-in str elements, so the printed list stays
# ['white', ...] on NumPy >= 2 as well, where list(array) would show
# np.str_('white') for each element.
print("\nDecoded labels =", decoded_list.tolist())


Decoded labels = ['white', 'black', 'yellow', 'green']


In [17]:
# Print the author (-a), the GitHub username (-gu) and the versions of the
# imported packages (-iv) for this notebook run.
%reload_ext watermark
%watermark -a "Caique Miranda" -gu "caiquemiranda" -iv

Author: Caique Miranda

Github username: caiquemiranda

sys    : 3.10.5 (tags/v3.10.5:f377153, Jun  6 2022, 16:14:13) [MSC v.1929 64 bit (AMD64)]
numpy  : 1.23.0
sklearn: 1.1.1



### End.