In [4]:
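# MinMaxScaler rescales each feature (column) to a specified range (feature_range),
# typically [0, 1] or [-1, 1]. For the default [0, 1] range:
#     x_scaled = (x - column_min) / (column_max - column_min)
# The scaling is feature-wise (column-wise), not per sample (row).
# It is useful when features are measured on very different scales.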

from sklearn.preprocessing import MinMaxScaler

In [None]:
# Toy dataset: each inner list is one sample (row) holding two features (columns).
data = [
    [1, 50],
    [2, 30],
    [3, 40],
]
data

[[1, 50], [2, 30], [3, 40]]

In [None]:
# part 1: min-max scaling of the toy dataset
# Create an unfitted scaler; with no arguments it targets the default range [0, 1].
scaler = MinMaxScaler()

In [7]:
# Learn the per-column minimum and maximum from data.
# The trailing ';' keeps the notebook from echoing the fitted estimator's repr.
scaler.fit(data);

In [8]:
# Check the minimum value of each feature (column) as learned by fit():
# one entry per column, in column order.
scaler.data_min_

array([ 1., 30.])

In [9]:
# Check the maximum value of each feature (column) as learned by fit().
scaler.data_max_

array([ 3., 50.])

In [None]:
# Normalize: rescale each column using the min/max learned by fit().
# Every transformed value lies in [0, 1]; each column's minimum maps to 0 and its maximum to 1.
scaler.transform(data)   

array([[0. , 1. ],
       [0.5, 0. ],
       [1. , 0.5]])

In [None]:
# part 2: norm-based scaling (L1 / L2) with sklearn.preprocessing.normalize
from sklearn.preprocessing import normalize

In [13]:
# Show the toy dataset again before normalizing it (rows = samples, columns = features).
data

[[1, 50], [2, 30], [3, 40]]

In [14]:
# L1 normalization along axis=0 (column-wise): every value is divided by the
# sum of the absolute values in its column, so the absolute values of each
# output column add up to 1.
n1 = normalize(data, norm='l1', axis=0)
n1

array([[0.16666667, 0.41666667],
       [0.33333333, 0.25      ],
       [0.5       , 0.33333333]])

In [15]:
# L2 normalization along axis=0 (column-wise): every value is divided by the
# Euclidean norm of its column (square root of the sum of squared values),
# so each output column has unit length.
n2 = normalize(data, norm='l2', axis=0)
n2

array([[0.26726124, 0.70710678],
       [0.53452248, 0.42426407],
       [0.80178373, 0.56568542]])

In [17]:
import pandas as pd

In [None]:
# part 3: a worked example with two features on very different scales
# (age in tens, salary in millions).
data = {
    'age': [18, 35, 37, 42, 55, 70],
    'salary': [1_000_000, 2_000_000, 2_500_000, 3_000_000, 4_000_000, 7_000_000],
}

In [19]:
# Build a DataFrame from the dict: keys become column names, lists become column values.
df = pd.DataFrame(data=data)
df

Unnamed: 0,age,salary
0,18,1000000
1,35,2000000
2,37,2500000
3,42,3000000
4,55,4000000
5,70,7000000


In [20]:
# Fresh, unfitted scaler for the DataFrame example; this rebinds `scaler`,
# discarding the instance that was fitted on the toy list.
scaler = MinMaxScaler()

In [21]:
# Learn the per-column min/max of df.
# The trailing ';' already suppresses the estimator repr (same pattern as the
# earlier fit cell), so the %%capture cell magic is unnecessary; it would also
# swallow stdout/stderr, hiding any warnings raised while fitting.
scaler.fit(df);

In [22]:
# Per-column minima learned from df, in column order (age, salary).
scaler.data_min_   

array([1.8e+01, 1.0e+06])

In [23]:
# Per-column maxima learned from df, in column order (age, salary).
scaler.data_max_

array([7.e+01, 7.e+06])

In [24]:
# Rescale both columns to [0, 1] so age and salary become directly comparable.
# The result is a NumPy array, not a DataFrame (column labels are not carried over).
scaler.transform(df) 

array([[0.        , 0.        ],
       [0.32692308, 0.16666667],
       [0.36538462, 0.25      ],
       [0.46153846, 0.33333333],
       [0.71153846, 0.5       ],
       [1.        , 1.        ]])

In [25]:
# Column-wise L1 normalization of the DataFrame: each value is divided by the
# sum of absolute values of its column (rebinds n1 from the toy example).
n1 = normalize(df, norm='l1', axis=0)
n1

array([[0.07003891, 0.05128205],
       [0.13618677, 0.1025641 ],
       [0.14396887, 0.12820513],
       [0.16342412, 0.15384615],
       [0.21400778, 0.20512821],
       [0.27237354, 0.35897436]])

In [26]:
# Column-wise L2 normalization of the DataFrame: each value is divided by the
# Euclidean norm of its column (square root of the column's sum of squares).
n2 = normalize(df, norm='l2', axis=0)
n2

array([[0.16031222, 0.10830607],
       [0.31171821, 0.21661214],
       [0.32953067, 0.27076518],
       [0.37406185, 0.32491822],
       [0.48984289, 0.43322429],
       [0.62343641, 0.75814251]])

In [27]:
import pandas as pd
import numpy as np

In [28]:
# Example: groupby-based imputation.
# Goal: fill a NaN with the mean of the category (group) the row belongs to.
# Group B deliberately contains one missing Speed value.
df = pd.DataFrame(
    {
        'Bird': ['A', 'A', 'B', 'B', 'B'],
        'Speed': [380, 370, 24, 26, np.nan],
    }
)
df

Unnamed: 0,Bird,Speed
0,A,380.0
1,A,370.0
2,B,24.0
3,B,26.0
4,B,


In [29]:
# Group the rows by Bird category; the resulting GroupBy object is reused by
# the aggregation cells that follow.
grouped = df.groupby('Bird')

In [30]:
# Number of non-null Speed values per Bird group (the NaN in group B is not counted).
print(grouped.count())

      Speed
Bird       
A         2
B         2


In [31]:
# Per-group total of Speed; the missing value is skipped rather than propagated.
print(grouped.sum())

      Speed
Bird       
A     750.0
B      50.0


In [32]:
# Per-group mean of Speed over the non-missing values; this is the value used
# to fill the gap in the imputation step.
print(grouped.mean())

      Speed
Bird       
A     375.0
B      25.0


In [33]:
# Impute each missing Speed with the mean Speed of its own Bird group.
# transform('mean') uses pandas' built-in aggregation and broadcasts each
# group's mean back onto the original row index, replacing the per-group
# Python lambda; fillna then changes only the rows that are actually missing.
group_mean_speed = df.groupby('Bird')['Speed'].transform('mean')
df['Speed'] = df['Speed'].fillna(group_mean_speed)
df

Unnamed: 0,Bird,Speed
0,A,380.0
1,A,370.0
2,B,24.0
3,B,26.0
4,B,25.0
