In [142]:
import numpy as np
import pandas as pd

### Label Encoder

In [143]:
from sklearn.preprocessing import LabelEncoder # learns mappings from category -> integer

In [144]:
# Toy column with one repeated label, so two rows end up sharing a code.
categories = [
    'Apple',
    'Banana',
    'Apple',
    'Cherry',
]

In [145]:
# A fresh encoder: it holds no mapping until it is fitted.
le = LabelEncoder()
# fit_transform learns the category -> integer mapping from `categories` and
# returns the encoded values as a new array; the input list is not modified in
# place, so the result has to be bound to its own variable.
numerical = le.fit_transform(categories)
# Display the encoded array.
numerical

array([0, 1, 0, 2], dtype=int64)

In [146]:
# classes_ lists the labels this encoder learned; in the outputs shown, a
# label's position in this array is its integer code.
# One encoder object manages the mapping of a single column only.
le.classes_

array(['Apple', 'Banana', 'Cherry'], dtype='<U6')

In [147]:
# le.transform(['Male', 'Female'])
# The call above is left commented out because it raises a ValueError: this encoder was fitted only on ('Apple', 'Banana', 'Cherry') -> (0, 1, 2), and transforming any label it has not seen fails.
# Calling fit_transform on the new column would avoid the error, but it re-fits the encoder, which then forgets the older mapping.

### Alternative to label encoder

In [148]:
# Build the example column directly as a pandas Series.
categorial_col = pd.Series(
    ['apples', 'bananas', 'bananas', 'cherry', 'apples']
)

In [149]:
# Pandas-only alternative: cast the column to the 'category' dtype and read the
# integer code of each row. In the output below the codes follow the sorted
# order of the distinct values (apples -> 0, bananas -> 1, cherry -> 2).
categorial_col.astype('category').cat.codes

0    0
1    1
2    1
3    2
4    0
dtype: int8

## Manipulating categorical datasets 2 ways (LabelEncoding and cat.codes)

In [150]:
# Preparing foodSeries: 100 dish names drawn at random (with replacement)
# from the three options below.
foods = ['Pizza', 'Lasagna', 'Canoli']
# A seeded generator makes the draw identical on every Restart & Run All;
# the unseeded global RNG produced different data on each run.
# Keep this seed distinct from any other seeded draw of the same size,
# otherwise the two columns would pick identical positions.
foodSeries = pd.Series(np.random.default_rng(seed=0).choice(foods, size=100))
foodSeries.head()

0      Pizza
1    Lasagna
2    Lasagna
3     Canoli
4     Canoli
dtype: object

In [151]:
# Preparing Ratings: 100 rating labels drawn at random (with replacement)
# from the three options below.
ratings = ['decent', 'good', 'very good']
# A seeded generator dedicated to this column makes the ratings reproducible
# on every Restart & Run All. Keep this seed distinct from any other seeded
# draw of the same size, otherwise the two columns would pick identical positions.
ratingsSeries = pd.Series(np.random.default_rng(seed=1).choice(ratings, size=100))
ratingsSeries.head()

0    very good
1       decent
2    very good
3    very good
4         good
dtype: object

In [152]:
# Assemble the two generated Series into one frame, one column per Series.
dataDict = dict(foods=foodSeries, ratings=ratingsSeries)
df = pd.DataFrame(data=dataDict)

In [153]:
# Preview the first rows of the combined foods/ratings frame.
df.head()

Unnamed: 0,foods,ratings
0,Pizza,very good
1,Lasagna,decent
2,Lasagna,very good
3,Canoli,very good
4,Canoli,good


## Using LabelEncoding


In [154]:
# Independent deep copy for the LabelEncoder walkthrough, so `df` stays untouched.
dfLE = df.copy(deep=True)

In [155]:
from sklearn.model_selection import train_test_split

# 70/30 split with a fixed seed so the same rows land in each part every run.
# The explicit .copy() makes each part an independent frame: later cells add
# "<col>Numerical" columns to the splits, and on pandas/scikit-learn versions
# that hand back flagged slices this would otherwise emit SettingWithCopyWarning.
X_trainLE, X_testLE = (
    part.copy()
    for part in train_test_split(dfLE, random_state=42, test_size=0.3)
)
# Every row must land in exactly one of the two splits.
assert len(X_trainLE) + len(X_testLE) == len(dfLE)
print(X_trainLE.shape)
print(X_testLE.shape)

(70, 2)
(30, 2)


In [156]:
# Show the training split; its index keeps the row labels of the frame it was split from.
X_trainLE

Unnamed: 0,foods,ratings
11,Pizza,good
47,Canoli,very good
85,Pizza,decent
28,Pizza,decent
93,Pizza,very good
...,...,...
60,Pizza,very good
71,Canoli,good
14,Lasagna,decent
92,Lasagna,decent


In [157]:
# Registry of fitted encoders, keyed by column name; starts out empty.
leDict = dict()

### Label Encoding the training samples

In [158]:
# Column names to encode, taken from dfLE. In the cells shown, the extra
# "<col>Numerical" columns are added to the train/test splits and not to dfLE,
# so this list stays constant across re-runs.
columns = dfLE.columns

In [203]:
# Fit one LabelEncoder per column on the TRAINING rows only and keep it in
# leDict, so the very same mapping can be applied to the test rows later.
# NOTE: scikit-learn documents LabelEncoder as an encoder for target labels (y);
# it is used on feature columns here for demonstration (OrdinalEncoder is the
# feature-side counterpart).
for col in columns:
    encoded_col = f"{col}Numerical"
    # Re-running this cell must not encode a column twice, so skip a column that
    # is already encoded -- but only if its fitted encoder is still in leDict.
    # If leDict was re-created in between, refit so the encoder is available
    # again; refitting on the untouched original column yields the same codes.
    if encoded_col in X_trainLE.columns and col in leDict:
        print(f"{col} is already encoded. Skipping")
        continue
    le = LabelEncoder()
    # The original column is kept; the integer codes go into a new column.
    X_trainLE[encoded_col] = le.fit_transform(X_trainLE[col])
    leDict[col] = le

foods is already encoded. Skipping
ratings is already encoded. Skipping


In [204]:
# Spot-check three random training rows next to their encoded columns.
X_trainLE.sample(3)

Unnamed: 0,foods,ratings,foodsNumerical,ratingsNumerical
71,Canoli,good,0,1
63,Lasagna,decent,1,0
68,Pizza,decent,2,0


In [205]:
# Print the learned labels of every stored encoder, one encoder per line.
for name in leDict:
    print(leDict[name].classes_)

['Canoli' 'Lasagna' 'Pizza']
['decent' 'good' 'very good']


### Using the same leDict label encoders to transform the test samples

In [206]:
# Apply the encoders fitted on the training rows to the test rows: transform
# only, never fit, so both splits share one label -> integer mapping.
# dfLE.columns holds just the original column names (iterating leDict.keys()
# would be an equivalent choice).
for col in dfLE.columns:
    encoded_name = f"{col}Numerical"
    if encoded_name not in X_testLE.columns:
        X_testLE[encoded_name] = leDict[col].transform(X_testLE[col])
    else:
        # Guard against a second run of this cell.
        print(f"{col} is already encoded")

foods is already encoded
ratings is already encoded


In [207]:
# Spot-check three random test rows next to their encoded columns.
X_testLE.sample(3)

Unnamed: 0,foods,ratings,foodsNumerical,ratingsNumerical
45,Pizza,decent,2,0
12,Lasagna,very good,1,2
72,Lasagna,good,1,1


## Using cat.codes

In [208]:
# Independent deep copy for the cat.codes walkthrough, so `df` stays untouched.
dfCatCodes = df.copy(deep=True)

In [209]:
# Snapshot of the column names taken before any "<col>Numerical" column is added.
# NOTE(review): the name `colums` (sic) is what the encoding loop further down
# reads, so renaming it requires changing both cells together.
colums = dfCatCodes.columns

In [166]:
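# No train/test split is needed for this approach: cat.codes has no separate *fit* step, so there is no learned mapping to carry over from a training set to a test set.
# The codes are derived directly from the values present in the column, so they only line up across frames that contain the same set of categories.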

In [211]:
# Add a "<col>Numerical" column holding the pandas category codes of each column.
# Iterate the snapshot `colums` rather than dfCatCodes.columns: on a re-run the
# live column list would also contain the *Numerical columns, which would then
# be encoded a second time.
for col in colums:
    target = f"{col}Numerical"
    already_done = target in dfCatCodes.columns
    if already_done:
        print(f"{col} is already encoded! SKipping")
        continue
    as_category = dfCatCodes[col].astype('category')
    dfCatCodes[target] = as_category.cat.codes

foods is already encoded! SKipping
ratings is already encoded! SKipping


In [212]:
# Preview the original columns side by side with their cat.codes encodings.
dfCatCodes.head()

Unnamed: 0,foods,ratings,foodsNumerical,ratingsNumerical
0,Pizza,very good,2,2
1,Lasagna,decent,1,0
2,Lasagna,very good,1,2
3,Canoli,very good,0,2
4,Canoli,good,0,1
