In [1]:
import numpy as np
from sklearn import preprocessing

In [2]:
data = np.array([[3, -1.5,  2, -5.4], [0,  4,  -0.3, 2.1], [1,  3.3, -1.9, -4.3]])

In [3]:
data

array([[ 3. , -1.5,  2. , -5.4],
       [ 0. ,  4. , -0.3,  2.1],
       [ 1. ,  3.3, -1.9, -4.3]])

In [4]:
data.shape

(3, 4)

In [5]:
print("Mean: ",data.mean(axis=0))
print("Standard Deviation: ",data.std(axis=0))

Mean:  [ 1.33333333  1.93333333 -0.06666667 -2.53333333]
Standard Deviation:  [1.24721913 2.44449495 1.60069429 3.30689515]


In [6]:
import math

In [7]:
math.sqrt((math.pow((3-1.33333333  ),2)+math.pow((0-1.33333333  ),2)+math.pow((1-1.33333333  ),2))/3)

1.247219128924647

In [8]:
(3-1.33333333)/1.24721913

1.3363062110825705

In [9]:
data_standardized = preprocessing.scale(data)

In [10]:
print("Mean standardized data: ",data_standardized.mean(axis=0))
print("Standard Deviation standardized data: ",data_standardized.std(axis=0))

Mean standardized data:  [ 5.55111512e-17 -1.11022302e-16 -7.40148683e-17 -7.40148683e-17]
Standard Deviation standardized data:  [1. 1. 1. 1.]


In [11]:
data_standardized

array([[ 1.33630621, -1.40451644,  1.29110641, -0.86687558],
       [-1.06904497,  0.84543708, -0.14577008,  1.40111286],
       [-0.26726124,  0.55907936, -1.14533633, -0.53423728]])

In [12]:
data

array([[ 3. , -1.5,  2. , -5.4],
       [ 0. ,  4. , -0.3,  2.1],
       [ 1. ,  3.3, -1.9, -4.3]])

In [13]:
data_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
data_scaled = data_scaler.fit_transform(data)

In [14]:
data_scaled

array([[1.        , 0.        , 1.        , 0.        ],
       [0.        , 1.        , 0.41025641, 1.        ],
       [0.33333333, 0.87272727, 0.        , 0.14666667]])

In [15]:
print("Min: ",data.min(axis=0))
print("Max: ",data.max(axis=0))

print("Min: ",data_scaled.min(axis=0))
print("Max: ",data_scaled.max(axis=0))

Min:  [ 0.  -1.5 -1.9 -5.4]
Max:  [3.  4.  2.  2.1]
Min:  [0. 0. 0. 0.]
Max:  [1. 1. 1. 1.]


In [16]:
data

array([[ 3. , -1.5,  2. , -5.4],
       [ 0. ,  4. , -0.3,  2.1],
       [ 1. ,  3.3, -1.9, -4.3]])

In [18]:
data_normalized = preprocessing.normalize(data, norm='l1',axis=0)

In [19]:
data_normalized

array([[ 0.75      , -0.17045455,  0.47619048, -0.45762712],
       [ 0.        ,  0.45454545, -0.07142857,  0.1779661 ],
       [ 0.25      ,  0.375     , -0.45238095, -0.36440678]])

In [20]:
data_norm_abs = np.abs(data_normalized)

print(data_norm_abs.sum(axis=0))

[1. 1. 1. 1.]


In [21]:
data_normalized = preprocessing.normalize(data, norm='l2',axis=0)

In [22]:
data_normalized

array([[ 0.9486833 , -0.27787309,  0.72074997, -0.74841361],
       [ 0.        ,  0.7409949 , -0.1081125 ,  0.29104974],
       [ 0.31622777,  0.61132079, -0.68471247, -0.59595899]])

In [23]:
0.31622777*3

0.9486833100000001

In [24]:
(data_normalized*data_normalized).sum(axis=0)

array([1., 1., 1., 1.])

In [25]:
0.9486833*0.9486833+0+0.31622777*0.31622777

1.000000006218063

In [26]:
data

array([[ 3. , -1.5,  2. , -5.4],
       [ 0. ,  4. , -0.3,  2.1],
       [ 1. ,  3.3, -1.9, -4.3]])

In [27]:
data_binarized = preprocessing.Binarizer(threshold=3.4).transform(data)

In [28]:
data_binarized

array([[0., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 0.]])

In [36]:
data = np.array([[1, 1, 2], [0, 2, 3], [1, 0, 1], [0, 1, 0]])
print(data)

[[1 1 2]
 [0 2 3]
 [1 0 1]
 [0 1 0]]


In [37]:
encoder = preprocessing.OneHotEncoder()
encoder.fit(data)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


OneHotEncoder(categorical_features=None, categories=None, drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=True)

In [43]:
encoded_vector = encoder.transform([[1, 0, 3]]).toarray()
print(encoded_vector)

[[0. 1. 1. 0. 0. 0. 0. 0. 1.]]


In [44]:
encoder.transform([[1, 0, 3]])

<1x9 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [45]:
encoded_vector

array([[0., 1., 1., 0., 0., 0., 0., 0., 1.]])

In [46]:
label_encoder = preprocessing.LabelEncoder()

input_classes = ['audi', 'ford', 'audi', 'toyota', 'ford', 'bmw']

In [47]:
input_classes

['audi', 'ford', 'audi', 'toyota', 'ford', 'bmw']

In [48]:
label_encoder.fit(input_classes)
print("Class mapping: ")
for i, item in enumerate(label_encoder.classes_):
    print(item, "-->", i)

Class mapping: 
audi --> 0
bmw --> 1
ford --> 2
toyota --> 3


In [49]:
labels = ['toyota', 'ford', 'audi']
encoded_labels = label_encoder.transform(labels)
print("Labels =", labels)
print("Encoded labels =", list(encoded_labels))

Labels = ['toyota', 'ford', 'audi']
Encoded labels = [3, 2, 0]


In [50]:
encoded_labels = [2, 1, 0, 3, 1]
decoded_labels = label_encoder.inverse_transform(encoded_labels)
print("Encoded labels =", encoded_labels)
print("Decoded labels =", list(decoded_labels))

Encoded labels = [2, 1, 0, 3, 1]
Decoded labels = ['ford', 'bmw', 'audi', 'toyota', 'bmw']


In [51]:
import numpy as np
import pandas as pd

In [52]:
#Importing Dataset
dataset = pd.read_csv(r'data_set\purchased.csv')

In [53]:
dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [54]:
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, 3].values

In [55]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [56]:
Y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [57]:
from sklearn.impute import SimpleImputer 
imputer=SimpleImputer(missing_values=np.nan,strategy='mean')

In [58]:
imputer=imputer.fit(X[:,1:3])
X[:,1:3]=imputer.transform(X[:,1:3])

In [59]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)