In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
data = {'temperature':['hot','cold','very hot','warm','hot','warm','warm','hot','hot','cold'],
     'color':['red','yellow','blue','blue','red','yellow','red','yellow','yellow','yellow'],
      'target':[1,1,1,0,1,0,1,0,1,1]}

In [3]:
df = pd.DataFrame(data)

In [4]:
df

Unnamed: 0,temperature,color,target
0,hot,red,1
1,cold,yellow,1
2,very hot,blue,1
3,warm,blue,0
4,hot,red,1
5,warm,yellow,0
6,warm,red,1
7,hot,yellow,0
8,hot,yellow,1
9,cold,yellow,1


In [5]:
pd.get_dummies(df,prefix='temp',columns=['temperature'])

Unnamed: 0,color,target,temp_cold,temp_hot,temp_very hot,temp_warm
0,red,1,0,1,0,0
1,yellow,1,1,0,0,0
2,blue,1,0,0,1,0
3,blue,0,0,0,0,1
4,red,1,0,1,0,0
5,yellow,0,0,0,0,1
6,red,1,0,0,0,1
7,yellow,0,0,1,0,0
8,yellow,1,0,1,0,0
9,yellow,1,1,0,0,0


In [6]:
from sklearn.preprocessing import OneHotEncoder

In [7]:
one_hot = OneHotEncoder()

In [17]:
df['temperature'].values.reshape(-1,1)

array([['hot'],
       ['cold'],
       ['very hot'],
       ['warm'],
       ['hot'],
       ['warm'],
       ['warm'],
       ['hot'],
       ['hot'],
       ['cold']], dtype=object)

In [21]:
ohe = one_hot.fit_transform(df['temperature'].values.reshape(-1,1)).toarray()
ohe

array([[0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.]])

In [29]:
temp_df = pd.DataFrame(ohe,columns=['temp_'+c for c in one_hot.categories_[0]])

In [37]:
dfh = pd.concat([df,temp_df],axis=1)

In [39]:
dfh.drop(columns=['temperature'],inplace=True)

In [40]:
dfh

Unnamed: 0,color,target,temp_cold,temp_hot,temp_very hot,temp_warm
0,red,1,0.0,1.0,0.0,0.0
1,yellow,1,1.0,0.0,0.0,0.0
2,blue,1,0.0,0.0,1.0,0.0
3,blue,0,0.0,0.0,0.0,1.0
4,red,1,0.0,1.0,0.0,0.0
5,yellow,0,0.0,0.0,0.0,1.0
6,red,1,0.0,0.0,0.0,1.0
7,yellow,0,0.0,1.0,0.0,0.0
8,yellow,1,0.0,1.0,0.0,0.0
9,yellow,1,1.0,0.0,0.0,0.0


In [41]:
from sklearn.preprocessing import LabelEncoder

In [43]:
lb_encoder = LabelEncoder()
lb_encoder.fit_transform(df.temperature)

array([1, 0, 2, 3, 1, 3, 3, 1, 1, 0])

In [48]:
df['temp_label_encoded'] = lb_encoder.transform(df.temperature)

In [49]:
df

Unnamed: 0,temperature,color,target,temp_label_encoded
0,hot,red,1,1
1,cold,yellow,1,0
2,very hot,blue,1,2
3,warm,blue,0,3
4,hot,red,1,1
5,warm,yellow,0,3
6,warm,red,1,3
7,hot,yellow,0,1
8,hot,yellow,1,1
9,cold,yellow,1,0


In [60]:
pd.factorize(df.temperature)

(array([0, 1, 2, 3, 0, 3, 3, 0, 0, 1]),
 Index(['hot', 'cold', 'very hot', 'warm'], dtype='object'))

In [61]:
temp_dict = {'cold':1, 'warm':2, 'hot':3, 'very hot':4}

In [63]:
df['temp_ordinal'] = df.temperature.map(temp_dict)

In [64]:
df

Unnamed: 0,temperature,color,target,temp_label_encoded,temp_ordinal
0,hot,red,1,1,3
1,cold,yellow,1,0,1
2,very hot,blue,1,2,4
3,warm,blue,0,3,2
4,hot,red,1,1,3
5,warm,yellow,0,3,2
6,warm,red,1,3,2
7,hot,yellow,0,1,3
8,hot,yellow,1,1,3
9,cold,yellow,1,0,1


In [66]:
import category_encoders as ce

In [78]:
encoder = ce.HelmertEncoder(cols=['temperature'],drop_invariant=True)

In [79]:
encoder

HelmertEncoder(cols=['temperature'], drop_invariant=True,
               handle_missing='value', handle_unknown='value', mapping=None,
               return_df=True, verbose=0)

In [80]:
encoder.fit_transform(df['temperature'])

Unnamed: 0,temperature_0,temperature_1,temperature_2
0,-1.0,-1.0,-1.0
1,1.0,-1.0,-1.0
2,0.0,2.0,-1.0
3,0.0,0.0,3.0
4,-1.0,-1.0,-1.0
5,0.0,0.0,3.0
6,0.0,0.0,3.0
7,-1.0,-1.0,-1.0
8,-1.0,-1.0,-1.0
9,1.0,-1.0,-1.0


In [81]:
pd.concat([df.iloc[:,:3],encoder.fit_transform(df['temperature'])],axis=1)

Unnamed: 0,temperature,color,target,intercept,temperature_0,temperature_1,temperature_2
0,hot,red,1,1,-1.0,-1.0,-1.0
1,cold,yellow,1,1,1.0,-1.0,-1.0
2,very hot,blue,1,1,0.0,2.0,-1.0
3,warm,blue,0,1,0.0,0.0,3.0
4,hot,red,1,1,-1.0,-1.0,-1.0
5,warm,yellow,0,1,0.0,0.0,3.0
6,warm,red,1,1,0.0,0.0,3.0
7,hot,yellow,0,1,-1.0,-1.0,-1.0
8,hot,yellow,1,1,-1.0,-1.0,-1.0
9,cold,yellow,1,1,1.0,-1.0,-1.0


In [83]:
target_enc = ce.TargetEncoder()

In [85]:
pd.concat([df,target_enc.fit_transform(df['temperature'],df['target'])],axis=1)


Unnamed: 0,temperature,color,target,temp_label_encoded,temp_ordinal,temperature.1
0,hot,red,1,1,3,0.747629
1,cold,yellow,1,0,1,0.919318
2,very hot,blue,1,2,4,0.7
3,warm,blue,0,3,2,0.377041
4,hot,red,1,1,3,0.747629
5,warm,yellow,0,3,2,0.377041
6,warm,red,1,3,2,0.377041
7,hot,yellow,0,1,3,0.747629
8,hot,yellow,1,1,3,0.747629
9,cold,yellow,1,0,1,0.919318


In [86]:
df

Unnamed: 0,temperature,color,target,temp_label_encoded,temp_ordinal
0,hot,red,1,1,3
1,cold,yellow,1,0,1
2,very hot,blue,1,2,4
3,warm,blue,0,3,2
4,hot,red,1,1,3
5,warm,yellow,0,3,2
6,warm,red,1,3,2
7,hot,yellow,0,1,3
8,hot,yellow,1,1,3
9,cold,yellow,1,0,1


In [107]:
df[['temperature','target']].groupby(by='temperature').sum()

Unnamed: 0_level_0,target
temperature,Unnamed: 1_level_1
cold,2
hot,3
very hot,1
warm,1


In [108]:
df['target'].sum()

7

In [337]:
temp_df = df[['temperature','target']].groupby(by='temperature').mean()#agg(['sum','count']).apply(lambda x: x[0]/x[1], axis=1)

In [338]:
temp_df

Unnamed: 0_level_0,target
temperature,Unnamed: 1_level_1
cold,1.0
hot,0.75
very hot,1.0
warm,0.333333


In [339]:
df.merge(temp_df,how='left',on='temperature')

Unnamed: 0,temperature,color,target_x,temp_label_encoded,temp_ordinal,target_y
0,hot,red,1,1,3,0.75
1,cold,yellow,1,0,1,1.0
2,very hot,blue,1,2,4,1.0
3,warm,blue,0,3,2,0.333333
4,hot,red,1,1,3,0.75
5,warm,yellow,0,3,2,0.333333
6,warm,red,1,3,2,0.333333
7,hot,yellow,0,1,3,0.75
8,hot,yellow,1,1,3,0.75
9,cold,yellow,1,0,1,1.0


In [340]:
df.select_dtypes(exclude=np.number)

Unnamed: 0,temperature,color
0,hot,red
1,cold,yellow
2,very hot,blue
3,warm,blue
4,hot,red
5,warm,yellow
6,warm,red
7,hot,yellow
8,hot,yellow
9,cold,yellow


In [341]:
df.select_dtypes(include=object)

Unnamed: 0,temperature,color
0,hot,red
1,cold,yellow
2,very hot,blue
3,warm,blue
4,hot,red
5,warm,yellow
6,warm,red
7,hot,yellow
8,hot,yellow
9,cold,yellow


In [146]:
data = pd.read_csv('https://stats.idre.ucla.edu/stat/data/hsb2.csv')

In [147]:
data.head()

Unnamed: 0,id,female,race,ses,schtyp,prog,read,write,math,science,socst
0,70,0,4,1,1,1,57,52,41,47,57
1,121,1,4,2,1,3,68,59,53,63,61
2,86,0,4,3,1,1,44,33,54,58,31
3,141,0,4,3,1,3,63,44,47,53,56
4,172,0,4,2,1,2,47,52,57,53,61


In [148]:
dic_map = {1:"Hispanic", 2:"Asian", 3:"African-Am", 4:"Caucasian"}

In [149]:
data.insert(3,'race_cat',data['race'].map(dic_map))

In [150]:
data = data[['race_cat','write']]

In [151]:
(data.groupby('race_cat').sum()/data['write'].sum()).index

Index(['African-Am', 'Asian', 'Caucasian', 'Hispanic'], dtype='object', name='race_cat')

In [152]:
data['write'].sum()

10555

In [153]:
import category_encoders as ce

In [154]:
james = ce.james_stein.JamesSteinEncoder(cols='race_cat', model='independent',sigma=0.5)

In [163]:
c = james.fit_transform(data['race_cat'], data['write']).rename(columns={'race_cat':'race_num'})

In [164]:
c

Unnamed: 0,race_num
0,53.848810
1,53.848810
2,53.848810
3,53.848810
4,53.848810
...,...
195,57.286144
196,53.848810
197,53.848810
198,53.848810


In [166]:
data = pd.concat([data,c], axis=1)

In [167]:
data

Unnamed: 0,race_cat,write,race_num
0,Caucasian,52,53.848810
1,Caucasian,59,53.848810
2,Caucasian,33,53.848810
3,Caucasian,44,53.848810
4,Caucasian,52,53.848810
...,...,...,...
195,Asian,59,57.286144
196,Caucasian,46,53.848810
197,Caucasian,41,53.848810
198,Caucasian,62,53.848810


In [168]:
james.get_params()

{'cols': ['race_cat'],
 'drop_invariant': False,
 'handle_missing': 'value',
 'handle_unknown': 'value',
 'model': 'independent',
 'random_state': None,
 'randomized': False,
 'return_df': True,
 'sigma': 0.5,
 'verbose': 0}