In [1]:
import pandas as pd
import numpy as np

### one-hot encoding

In [2]:
edges = pd.DataFrame({'source':[0,1,2],
                     'target':[2,2,3],
                     'weight':[3,4,5],
                     'color':['red','blue','blue']})

edges

Unnamed: 0,source,target,weight,color
0,0,2,3,red
1,1,2,4,blue
2,2,3,5,blue


In [3]:
edges['color']

0     red
1    blue
2    blue
Name: color, dtype: object

In [4]:
pd.get_dummies(edges)

Unnamed: 0,source,target,weight,color_blue,color_red
0,0,2,3,0,1
1,1,2,4,1,0
2,2,3,5,1,0


In [5]:
pd.get_dummies(edges['color'])

Unnamed: 0,blue,red
0,0,1
1,1,0
2,1,0


In [6]:
pd.get_dummies(edges[['color']])

Unnamed: 0,color_blue,color_red
0,0,1
1,1,0
2,1,0


In [7]:
weight_dict = {3:"M", 4:"L", 5:"XL"}
edges["weight_sign"] = edges["weight"].map(weight_dict)
edges

Unnamed: 0,source,target,weight,color,weight_sign
0,0,2,3,red,M
1,1,2,4,blue,L
2,2,3,5,blue,XL


In [8]:
weight_sign = pd.get_dummies(edges['weight_sign'])
weight_sign

Unnamed: 0,L,M,XL
0,0,1,0
1,1,0,0
2,0,0,1


In [9]:
pd.concat([edges,weight_sign], axis=1)

Unnamed: 0,source,target,weight,color,weight_sign,L,M,XL
0,0,2,3,red,M,0,1,0
1,1,2,4,blue,L,1,0,0
2,2,3,5,blue,XL,0,0,1


In [10]:
pd.get_dummies(edges).values

array([[0, 2, 3, 0, 1, 0, 1, 0],
       [1, 2, 4, 1, 0, 1, 0, 0],
       [2, 3, 5, 1, 0, 0, 0, 1]])

### data binning

In [11]:
raw_data = {'regiment':['Nighthawks','Nighthawks','Nighthawks','Nighthawks','Dragoons','Dragoons','Dragoons','Dragoons','Scout','Scout','Scout','Scout'],
           'company':['1st','1st','2nd','2nd','1st','1st','2nd','2nd','1st','1st','2nd','2nd'],
           'name':['Miller','Jacobson','Ali','Milner','Cooze','Jacon','Ryaner','Sone','Sloan','Piger','Riani','Ali'],
           'PreTestScore':[4,24,31,2,3,4,24,31,2,3,2,3],
           'postTestScore':[25,94,57,62,70,25,94,57,62,70,62,40]}
df = pd.DataFrame(raw_data,columns=['regiment','company','name','preTestScore','postTestScore'])
df

Unnamed: 0,regiment,company,name,preTestScore,postTestScore
0,Nighthawks,1st,Miller,,25
1,Nighthawks,1st,Jacobson,,94
2,Nighthawks,2nd,Ali,,57
3,Nighthawks,2nd,Milner,,62
4,Dragoons,1st,Cooze,,70
5,Dragoons,1st,Jacon,,25
6,Dragoons,2nd,Ryaner,,94
7,Dragoons,2nd,Sone,,57
8,Scout,1st,Sloan,,62
9,Scout,1st,Piger,,70


In [12]:
bins = [0,25,50,75,100]
group_names = ['Low','Okay','Good','Great']
categories = pd.cut(df['postTestScore'],bins,labels=group_names)
categories

0       Low
1     Great
2      Good
3      Good
4      Good
5       Low
6     Great
7      Good
8      Good
9      Good
10     Good
11     Okay
Name: postTestScore, dtype: category
Categories (4, object): [Low < Okay < Good < Great]

In [13]:
df['categories'] = pd.cut(df['postTestScore'],bins,labels=group_names)
pd.value_counts(df['categories'])

Good     7
Great    2
Low      2
Okay     1
Name: categories, dtype: int64

In [14]:
df

Unnamed: 0,regiment,company,name,preTestScore,postTestScore,categories
0,Nighthawks,1st,Miller,,25,Low
1,Nighthawks,1st,Jacobson,,94,Great
2,Nighthawks,2nd,Ali,,57,Good
3,Nighthawks,2nd,Milner,,62,Good
4,Dragoons,1st,Cooze,,70,Good
5,Dragoons,1st,Jacon,,25,Low
6,Dragoons,2nd,Ryaner,,94,Great
7,Dragoons,2nd,Sone,,57,Good
8,Scout,1st,Sloan,,62,Good
9,Scout,1st,Piger,,70,Good


In [15]:
pd.get_dummies(df)

Unnamed: 0,postTestScore,regiment_Dragoons,regiment_Nighthawks,regiment_Scout,company_1st,company_2nd,name_Ali,name_Cooze,name_Jacobson,name_Jacon,...,name_Milner,name_Piger,name_Riani,name_Ryaner,name_Sloan,name_Sone,categories_Low,categories_Okay,categories_Good,categories_Great
0,25,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,94,0,1,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,57,0,1,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,62,0,1,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
4,70,1,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
5,25,1,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
6,94,1,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
7,57,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
8,62,0,0,1,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
9,70,0,0,1,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0


### using scikit-learn preprocessing

In [16]:
raw_example = df.values
raw_example[:3]

array([['Nighthawks', '1st', 'Miller', nan, 25, 'Low'],
       ['Nighthawks', '1st', 'Jacobson', nan, 94, 'Great'],
       ['Nighthawks', '2nd', 'Ali', nan, 57, 'Good']], dtype=object)

In [17]:
data = raw_example.copy()

In [18]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(raw_example[:,0])
le.transform(raw_example[:,0])

array([1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 2, 2])

In [19]:
data[:,0] = le.transform(raw_example[:,0])
data[:3]

array([[1, '1st', 'Miller', nan, 25, 'Low'],
       [1, '1st', 'Jacobson', nan, 94, 'Great'],
       [1, '2nd', 'Ali', nan, 57, 'Good']], dtype=object)

In [20]:
label_column = [0,1,2,5]
label_encoder_list = []
for column_index in label_column:
    le = preprocessing.LabelEncoder()
    le.fit(raw_example[:,column_index])
    data[:,column_index] = le.transform(raw_example[:,column_index])
    label_encoder_list.append(le)
    del le
data[:3]

array([[1, 0, 4, nan, 25, 2],
       [1, 0, 2, nan, 94, 1],
       [1, 1, 0, nan, 57, 0]], dtype=object)

In [21]:
label_encoder_list[0].transform(raw_example[:10,0])

array([1, 1, 1, 1, 0, 0, 0, 0, 2, 2])

### one-hot encoding by sklearn

In [22]:
one_hot_enc = preprocessing.OneHotEncoder(categories='auto')
data[:,0].reshape(-1,1)

array([[1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [2],
       [2],
       [2],
       [2]], dtype=object)

In [23]:
one_hot_enc.fit(data[:,0].reshape(-1,1))

OneHotEncoder(categorical_features=None, categories='auto',
       dtype=<class 'numpy.float64'>, handle_unknown='error',
       n_values=None, sparse=True)

In [24]:
onehotlabels = one_hot_enc.transform(data[:,0].reshape(-1,1)).toarray()
onehotlabels

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])