In [7]:
import sklearn.datasets as d
import sklearn.preprocessing as prp
import numpy as np

In [5]:
# Load the iris dataset object from sklearn.datasets (imported as `d`).
data = d.load_iris()

In [8]:
# Pull the dataset's `data` and `target` attributes out into the usual X / y names.
X, y = data.data, data.target

## OneHotEncoder (a.k.a. one-of-K encoder)

Expands a matrix of integers into a matrix of 0/1 dummies, such that a categorical column with `n` different levels is expanded into `n` columns with exactly one `1` in every row.

In [44]:
# Build a one-hot encoder configured with sparse=False so that transformed data is
# displayed as a plain dense array.
# NOTE(review): newer scikit-learn releases appear to spell this option `sparse_output`;
# confirm against the installed version before re-running.
ohe = prp.OneHotEncoder(sparse=False)
ohe  # show the encoder's configuration

OneHotEncoder(categorical_features='all', dtype=<class 'float'>,
       handle_unknown='error', n_values='auto', sparse=False)

In [27]:
# Draw 10 outcomes from {1, 2}, each with probability 0.5.
# Arguments are passed by keyword on purpose: the third positional parameter of
# np.random.choice is `replace`, not a probability, so a bare `.5` in that slot would be
# read as a truthy `replace` flag instead of the intended probability.
outcome = np.random.choice([1, 2], size=10, replace=True, p=[0.5, 0.5])
outcome

array([1, 1, 1, 1, 1, 2, 1, 1, 2, 1])

In [45]:
# One-hot encode the outcomes. The 1-D vector is turned into a single column first;
# -1 lets NumPy infer the number of rows instead of hard-coding the sample size.
outcome_transformed = ohe.fit_transform(outcome.reshape(-1, 1))
outcome_transformed

array([[ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 0.,  1.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 0.,  1.],
       [ 1.,  0.]])

<u> Note </u>:  
transforming the vector without first reshaping it into a column leads to undesirable behavior:

In [34]:
# Feed the outcomes as a single 1 x n row to show what happens when the vector is not
# reshaped into a column. The row shape is spelled out explicitly rather than relying on
# how the encoder interprets a bare 1-D array.
# `ohe` is created with sparse=False, so the result is already a dense array; calling
# `.toarray()` on it would raise AttributeError.
ohe.fit_transform(outcome.reshape(1, -1))

array([[ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.]])

Actually this is expected:  
- we have a `1xn` matrix with a single unique value (either `1` or `2`) in every column.   
- each column's single unique value is expanded into one dummy column containing a single `1`, so the result is a `1xn` row of ones.

<u>A final note </u>:  
    
The columns of the transformed variable are linearly dependent (the entries of each row sum to 1). To get rid of one of the columns:

In [37]:
# Keep only the first dummy column (index 0) of the encoded outcomes; the result is 1-D.
outcome_transformed[:,0]

array([ 1.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  0.,  1.])

## LabelEncoder

Sometimes, instead of integers, we might have categorical variables encoded with string labels:

In [38]:
# Example categorical variable whose levels are string labels rather than integers.
levels = [
    'sunny',
    'cloudy',
    'snowy',
    'rainy',
    'foggy',
]

Trying to encode the `levels` object with `OneHotEncoder` will fail. The right way to do it is to pass it through
`LabelEncoder` prior to `OneHotEncoder`:

In [39]:
# Create a LabelEncoder (from sklearn.preprocessing, imported as `prp`) and display it.
le = prp.LabelEncoder()
le

LabelEncoder()

In [42]:
# Fit the encoder on the string labels and convert them to integer codes in one call.
labels_transformed = le.fit_transform(levels)
labels_transformed

array([4, 0, 3, 2, 1])

In [47]:
# One-hot encode the integer label codes. reshape(-1, 1) turns them into a single column
# and lets NumPy infer the number of rows instead of hard-coding the number of labels.
ohe.fit_transform(labels_transformed.reshape(-1, 1))

array([[ 0.,  0.,  0.,  0.,  1.],
       [ 1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  1.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.]])

In [61]:
import pandas as pd
import scipy.stats as st

In [63]:
# Draw 5 random variates from scipy.stats.norm with its default parameters.
st.norm().rvs(5)

array([ 1.72079999,  0.67885469,  1.22265304, -0.22847452,  1.94039691])

In [68]:
# Small demo frame: an integer-coded column, the string labels, and a random numeric column.
df = pd.DataFrame(
    {
        'Temp': [1, 2, 3, 5, 1],
        'Weather': levels,
        'Wind': st.norm().rvs(5),
    }
)
df

Unnamed: 0,Temp,Weather,Wind
0,1,sunny,1.141749
1,2,cloudy,0.790371
2,3,snowy,-1.903393
3,5,rainy,0.818208
4,1,foggy,-0.085473


In [71]:
# Expand only the listed categorical columns into 0/1 dummy columns directly in pandas.
pd.get_dummies(
    df,
    columns=['Temp', 'Weather'],
)

Unnamed: 0,Wind,Temp_1,Temp_2,Temp_3,Temp_5,Weather_cloudy,Weather_foggy,Weather_rainy,Weather_snowy,Weather_sunny
0,1.141749,1,0,0,0,0,0,0,0,1
1,0.790371,0,1,0,0,1,0,0,0,0
2,-1.903393,0,0,1,0,0,0,0,1,0
3,0.818208,0,0,0,1,0,0,1,0,0
4,-0.085473,1,0,0,0,0,1,0,0,0


In [72]:
import os

# Show the current working directory with an explicit call instead of the bare `pwd`
# automagic, which depends on IPython settings and is shadowed by any variable named `pwd`.
os.getcwd()

'/home/sergey/MachineLearning'