## Data Encoding
* Nominal / OHE encoding 
* Label and ordinal encoding
* target Guided ordinal encoding

#### Nominal/One Hot encoding

In [2]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [3]:
    #create simple datafram

df = pd.DataFrame({
    'color':['red','green','blue','green','red','blue']
})

In [4]:
df.head()

Unnamed: 0,color
0,red
1,green
2,blue
3,green
4,red


In [5]:
# create an instance of one hot encoder
encoder = OneHotEncoder()

In [6]:
## perform fit and transform
encoded = encoder.fit_transform(df[['color']]).toarray()

In [7]:
encoder_df = pd.DataFrame(encoded,columns = encoder.get_feature_names_out())

In [8]:
encoder_df.head()

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,1.0,0.0
4,0.0,0.0,1.0


In [9]:
## for new data
encoder.transform([['blue']]).toarray()



array([[1., 0., 0.]])

In [10]:
pd.concat([df,encoder_df],axis=1)

Unnamed: 0,color,color_blue,color_green,color_red
0,red,0.0,0.0,1.0
1,green,0.0,1.0,0.0
2,blue,1.0,0.0,0.0
3,green,0.0,1.0,0.0
4,red,0.0,0.0,1.0
5,blue,1.0,0.0,0.0


### Label Encoding


In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
lbl_encoder = LabelEncoder()
lbl_encoder.fit_transform(df[['color']])

  y = column_or_1d(y, warn=True)


array([2, 1, 0, 1, 2, 0])

In [13]:
lbl_encoder.transform([['red']])

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array([2])

In [14]:
lbl_encoder.transform([['blue']])

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array([0])

In [15]:
lbl_encoder.transform([['green']])

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


array([1])

### Ordinal Encoding

In [16]:
from sklearn.preprocessing import OrdinalEncoder

In [17]:
df = pd.DataFrame({
    "size":["small",'medium','large','small','large']
})

In [18]:
df


Unnamed: 0,size
0,small
1,medium
2,large
3,small
4,large


In [19]:
## create an instance of ordinal encoder then perform , fit_transform

encoder = OrdinalEncoder(categories =[['small','medium','large']] )#In specific order

In [20]:
encoder.fit_transform(df[['size']])

array([[0.],
       [1.],
       [2.],
       [0.],
       [2.]])

In [21]:
encoder.transform([['small']])



array([[0.]])

### Target guided ordinal encoding

In [22]:
import pandas as pd

df = pd.DataFrame(
    {
        'city':['New York','London','Paris','Tokyo','New York','Paris'],
        'price':[200,150,300,250,180,320]
    }
)

In [23]:
df

Unnamed: 0,city,price
0,New York,200
1,London,150
2,Paris,300
3,Tokyo,250
4,New York,180
5,Paris,320


In [24]:
mean_price = df.groupby('city')['price'].mean().to_dict()

In [25]:
df['city_encoded'] = df['city'].map(mean_price)

In [26]:
df[['city','city_encoded']]

Unnamed: 0,city,city_encoded
0,New York,190.0
1,London,150.0
2,Paris,310.0
3,Tokyo,250.0
4,New York,190.0
5,Paris,310.0
