<a href="https://colab.research.google.com/github/aciofo/AI-Engineering/blob/main/machine-learning-fundamentals/data-preprocessing/featuring_encoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Root URL of the raw dataset files; individual CSV file names are appended to it.
BASE_URL = (
    "https://raw.githubusercontent.com/ProfAI/machine-learning-fondamenti"
    "/refs/heads/main/datasets/"
)

In [None]:
# Load the shirts dataset; the first CSV column becomes the row index.
shirts_url = BASE_URL + "shirts.csv"
df = pd.read_csv(shirts_url, index_col=0)
df.head()

Unnamed: 0,taglia,colore,prezzo
0,S,bianco,4.99
1,M,bianco,19.99
2,XL,bianco,12.49
3,XL,bianco,14.99
4,S,bianco,14.99


## Ordinal Encoding for ordinal variables

### PANDAS

In [None]:
# Ordinal codes follow the natural size order S < M < L < XL.
size_mapping = {"S": 1, "M": 2, "L": 3, "XL": 4}

# Encode into a separate frame instead of overwriting `df`:
#  - the NumPy cell below looks the original string labels up in `size_mapping`
#    and would raise KeyError on already-encoded integers;
#  - re-running this cell on an already-encoded column would map every value to NaN.
df_ordinal = df.assign(taglia=df["taglia"].map(size_mapping))
df_ordinal.head()

Unnamed: 0,taglia,colore,prezzo
0,1,bianco,4.99
1,2,bianco,19.99
2,4,bianco,12.49
3,4,bianco,14.99
4,1,bianco,14.99


### NUMPY

In [None]:
# Raw values of the frame, columns in DataFrame order (size label first).
X = df.values

# Ordinal codes for the size labels. The lookup is a plain dict access, so the
# first column must contain only keys of `size_mapping` (any other value,
# including an already-encoded integer, raises KeyError).
size_mapping = dict(S=1, M=2, L=3, XL=4)
fmap = np.vectorize(size_mapping.__getitem__)

# Replace the size labels in column 0 with their codes and preview the result.
X[:, 0] = fmap(X[:, 0])
X[:5]

array([[1, 'bianco', 4.99],
       [2, 'bianco', 19.99],
       [4, 'bianco', 12.49],
       [4, 'bianco', 14.99],
       [1, 'bianco', 14.99]], dtype=object)

## One-hot Encoding of Categorical Variables

### PANDAS

In [None]:
# Store the one-hot encoded frame under its own name rather than rebinding `df`.
# `df` is converted to an array again in the scikit-learn section, where the
# colour column is addressed by position (index 1); overwriting `df` here would
# shift the columns so that index 1 points at `prezzo` instead of `colore`.
df_onehot = pd.get_dummies(
    df,
    columns=["colore"],
    prefix="color",
    prefix_sep="-",
)
df_onehot.head()

Unnamed: 0,taglia,prezzo,color-bianco,color-rosso,color-verde
0,S,4.99,True,False,False
1,M,19.99,True,False,False
2,XL,12.49,True,False,False
3,XL,14.99,True,False,False
4,S,14.99,True,False,False


### SCIKIT-LEARN

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [None]:
# Toy dataset with a single categorical feature: one colour label per sample.
X = [[color] for color in ("white", "red", "white", "blue", "red", "green")]

# Fit the encoder and encode the samples in one step, then convert the result
# to a dense array so the one-hot rows can be displayed.
ohe = OneHotEncoder()
X_sparse = ohe.fit_transform(X)
X = X_sparse.toarray()
X

array([[0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.]])

In [None]:
# Category labels the fitted encoder learned, one array per encoded feature;
# the position of a label in the array is the index of its one-hot column.
ohe.categories_

[array(['blue', 'green', 'red', 'white'], dtype=object)]

In [None]:
# Convert the current `df` to a NumPy array for the scikit-learn transformer;
# columns keep their DataFrame order, so they can be addressed by position.
X = df.to_numpy()

In [None]:
# One-hot encode only the column at position 1 of the input array; every other
# column is passed through unchanged (remainder="passthrough").
transf = ColumnTransformer(
    transformers=[("ohe", OneHotEncoder(), [1])],
    remainder="passthrough",
)

In [None]:
# Fit the ColumnTransformer on X and overwrite X with the transformed output,
# then display it.
X = transf.fit_transform(X)
X

## Label encoding for the target variable

In [None]:
# Load the variant of the dataset used for the label-encoding example;
# the first CSV column becomes the row index.
shirts_sold_url = BASE_URL + "shirts_sold.csv"
df = pd.read_csv(shirts_sold_url, index_col=0)
df.head()

Unnamed: 0,taglia,colore,prezzo,venduta
0,S,bianco,4.99,NO
1,M,bianco,19.99,SI
2,XL,bianco,12.49,NO
3,XL,bianco,14.99,NO
4,S,bianco,14.99,SI


In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Fit a label encoder on the target column and store the integer class ids
# back into the same column.
le = LabelEncoder()
venduta_codes = le.fit_transform(df['venduta'])
df['venduta'] = venduta_codes
df.head()

Unnamed: 0,taglia,colore,prezzo,venduta
0,S,bianco,4.99,0
1,M,bianco,19.99,1
2,XL,bianco,12.49,0
3,XL,bianco,14.99,0
4,S,bianco,14.99,1


In [None]:
# Original labels known to the fitted encoder; a label's position in this
# array is the integer code it was assigned.
le.classes_

array(['NO', 'SI'], dtype=object)

In [None]:
# Map integer class ids back to the original string labels with the fitted encoder.
y = le.inverse_transform([0, 0, 1, 0, 1])
y

array(['NO', 'NO', 'SI', 'NO', 'SI'], dtype=object)