# Encode categorical features as an integer array.

The input to this transformer should be an array-like of integers or strings, denoting the values taken on by categorical (discrete) features. The features are converted to ordinal integers.

# odrinal encoder should be used for input categorical features

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('data/income_evaluation.csv')
df.head()


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
X_train, X_test, y_train, y_test = train_test_split(df.drop([' income'], axis=1), df[' income'],
                                                           test_size=0.2, random_state=0)

In [4]:
X_train[' education'].value_counts()

 HS-grad         8450
 Some-college    5832
 Bachelors       4242
 Masters         1414
 Assoc-voc       1110
 11th             920
 Assoc-acdm       817
 10th             752
 7th-8th          526
 Prof-school      459
 9th              419
 12th             360
 Doctorate        306
 5th-6th          259
 1st-4th          139
 Preschool         43
Name:  education, dtype: int64

# The places where can assign hierchial rank, like here in education, Masters are better han education, bachelors are better than 12th etc. we use ordinal encoder

In [5]:
X_train[' education'].unique()

array([' 11th', ' HS-grad', ' Bachelors', ' Assoc-voc', ' Some-college',
       ' 9th', ' 10th', ' 12th', ' Doctorate', ' Prof-school', ' Masters',
       ' Assoc-acdm', ' 7th-8th', ' 5th-6th', ' Preschool', ' 1st-4th'],
      dtype=object)

# Let's assign them in increasing order

In [6]:
edu = [' Preschool', ' 1st-4th', ' 5th-6th', ' 7th-8th', ' 9th', ' 10th',
       ' 11th', ' 12th', ' HS-grad', ' Prof-school', ' Some-college',
       ' Assoc-acdm', ' Assoc-voc',' Bachelors', ' Masters', ' Doctorate' ]

In [7]:
from sklearn.preprocessing import OrdinalEncoder

In [11]:
# we will pass edu as a list[] here in ordi, in keyword argument cat =[edu]
ordi = OrdinalEncoder(categories=[edu])

In [14]:
ordi.fit(X_train[[' education']])

# if error says expected 2D array, put the col in [[]]

OrdinalEncoder(categories=[[' Preschool', ' 1st-4th', ' 5th-6th', ' 7th-8th',
                            ' 9th', ' 10th', ' 11th', ' 12th', ' HS-grad',
                            ' Prof-school', ' Some-college', ' Assoc-acdm',
                            ' Assoc-voc', ' Bachelors', ' Masters',
                            ' Doctorate']])

In [15]:
ordi.transform(X_train[[' education']])

array([[ 6.],
       [ 8.],
       [13.],
       ...,
       [10.],
       [15.],
       [10.]])

In [17]:
#let's see it in data frames
pd.DataFrame(ordi.transform(X_train[[' education']]))

Unnamed: 0,0
0,6.0
1,8.0
2,13.0
3,8.0
4,12.0
...,...
26043,14.0
26044,5.0
26045,10.0
26046,15.0


In [18]:
# Let's try ordering it on gender as well
X_train[' sex'].unique()

array([' Male', ' Female'], dtype=object)

In [27]:
gender = [' Male',' Female']

# on two categories

In [28]:
ordi = OrdinalEncoder(categories=[edu,gender])

In [29]:
ordi.fit(X_train[[' education',' sex']])

OrdinalEncoder(categories=[[' Preschool', ' 1st-4th', ' 5th-6th', ' 7th-8th',
                            ' 9th', ' 10th', ' 11th', ' 12th', ' HS-grad',
                            ' Prof-school', ' Some-college', ' Assoc-acdm',
                            ' Assoc-voc', ' Bachelors', ' Masters',
                            ' Doctorate'],
                           [' Male', ' Female']])

In [32]:
ordi.transform(X_train[[' education',' sex']])

array([[ 6.,  0.],
       [ 8.,  1.],
       [13.,  1.],
       ...,
       [10.,  1.],
       [15.,  0.],
       [10.,  0.]])

In [33]:
pd.DataFrame(ordi.transform(X_train[[' education',' sex']]))

Unnamed: 0,0,1
0,6.0,0.0
1,8.0,1.0
2,13.0,1.0
3,8.0,1.0
4,12.0,0.0
...,...,...
26043,14.0,0.0
26044,5.0,0.0
26045,10.0,1.0
26046,15.0,0.0


In [34]:
ordi.transform(X_test[[' education',' sex']])

array([[10.,  1.],
       [13.,  1.],
       [11.,  0.],
       ...,
       [13.,  0.],
       [ 8.,  0.],
       [ 8.,  0.]])