# Label Encoding

In [22]:
import pandas as pd

df = pd.DataFrame({
    'department': ['HR', 'HR', 'Account', 'HR', 'Sales', 'Sales', 'Account'],
    'salary': [50000, 45000, 55000, 52000, 48000, 43000, 28000]
})

print(df)

  department  salary
0         HR   50000
1         HR   45000
2    Account   55000
3         HR   52000
4      Sales   48000
5      Sales   43000
6    Account   28000


In [23]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder() # create an instance
df['department_encoded'] = le.fit_transform(df['department'])

print(df)

  department  salary  department_encoded
0         HR   50000                   1
1         HR   45000                   1
2    Account   55000                   0
3         HR   52000                   1
4      Sales   48000                   2
5      Sales   43000                   2
6    Account   28000                   0


In [24]:
# Lets see the label mappings
print("\nclasses:", le.classes_)
print("\nvalues:", le.transform(le.classes_))
print("\nLabel mapping:", dict(zip(le.classes_, le.transform(le.classes_))))


classes: ['Account' 'HR' 'Sales']

values: [0 1 2]

Label mapping: {'Account': 0, 'HR': 1, 'Sales': 2}


# One Hot Encoding

In [25]:
import pandas as pd

# Create sample data
df = pd.DataFrame({
    'empID': [10, 12, 13, 14, 15, 16, 17],
    'department': ['HR', 'HR', 'Account', 'HR', 'Sales', 'Sales', 'Account'],
    'salary': [50000, 45000, 55000, 52000, 48000, 34000, 23000]
})

print(df)

   empID department  salary
0     10         HR   50000
1     12         HR   45000
2     13    Account   55000
3     14         HR   52000
4     15      Sales   48000
5     16      Sales   34000
6     17    Account   23000


In [26]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=False, dtype=int)
encoded = encoder.fit_transform(df[['department']])

# Convert to DataFrame
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(['department']))
print(encoded_df)

   department_Account  department_HR  department_Sales
0                   0              1                 0
1                   0              1                 0
2                   1              0                 0
3                   0              1                 0
4                   0              0                 1
5                   0              0                 1
6                   1              0                 0


In [27]:
# Combine with salary
final_df = pd.concat([encoded_df, df[['empID','salary']]], axis=1)
print(final_df)

   department_Account  department_HR  department_Sales  empID  salary
0                   0              1                 0     10   50000
1                   0              1                 0     12   45000
2                   1              0                 0     13   55000
3                   0              1                 0     14   52000
4                   0              0                 1     15   48000
5                   0              0                 1     16   34000
6                   1              0                 0     17   23000


In [None]:


apple was rotten
apple was fresh
I was disappointed by apple
apple tasted sweet 

vocabulary:
(apple, was, rotten, fresh, tasted, sweet, I, disappointed, by). 


In [13]:
# from sklearn.feature_extraction.text import CountVectorizer

# sentences = [
#     "apple was rotten",
#     "apple was fresh",
#     "I was disappointed by apple",
#     "apple tasted sweet"
# ]

# vectorizer = CountVectorizer(binary=True)

# X = vectorizer.fit_transform(sentences)

# vocab = vectorizer.get_feature_names_out()

# print("Vocabulary:", vocab)
# print("\nOne-hot vectors (sentence level):")
# print(X.toarray())


Vocabulary: ['apple' 'by' 'disappointed' 'fresh' 'rotten' 'sweet' 'tasted' 'was']

One-hot vectors (sentence level):
[[1 0 0 0 1 0 0 1]
 [1 0 0 1 0 0 0 1]
 [1 1 1 0 0 0 0 1]
 [1 0 0 0 0 1 1 0]]


In [21]:
from sklearn.preprocessing import OneHotEncoder

# Fixed vocabulary 
vocab = ['apple','was','rotten','fresh','tasted','sweet','I','disappointed','by']

encoder = OneHotEncoder(
    categories=[vocab],
    sparse_output=False,
    handle_unknown='ignore'
)

sentence = [['apple'], ['tasted'], ['rotten']] # apple was rotten

one_hot = encoder.fit_transform(sentence)
print(one_hot)

[[1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0.]]


# target encoding

In [15]:
import pandas as pd

# Sample data
df = pd.DataFrame({
    'city': ['New York', 'New York', 'Delhi', 'Delhi', 'Delhi', 'Bangalore'],
    'age':  [25, 30, 22, 28, 35, 26],
    'target': [1, 0, 1, 0, 1, 0]
})

print(df)

        city  age  target
0   New York   25       1
1   New York   30       0
2      Delhi   22       1
3      Delhi   28       0
4      Delhi   35       1
5  Bangalore   26       0


In [20]:
# Compute mean of target for each city
target_encoding = df.groupby('city')['target'].mean().to_dict()

# Apply target encoding
df['city_target_encoded'] = df['city'].map(target_encoding)

print(df)
print("\nEncoding mapping:", target_encoding)

        city  age  target  city_target_encoded
0   New York   25       1             0.500000
1   New York   30       0             0.500000
2      Delhi   22       1             0.666667
3      Delhi   28       0             0.666667
4      Delhi   35       1             0.666667
5  Bangalore   26       0             0.000000

Encoding mapping: {'Bangalore': 0.0, 'Delhi': 0.6666666666666666, 'New York': 0.5}


# Frequency Encoding

In [16]:
import pandas as pd

df = pd.DataFrame({
    'city': ['New York', 'New York', 'Delhi', 'Delhi', 'Delhi', 'Bangalore'],
    'age':  [25, 30, 22, 28, 35, 26],
    'target': [1, 0, 1, 0, 1, 0]
})

print(df)

        city  age  target
0   New York   25       1
1   New York   30       0
2      Delhi   22       1
3      Delhi   28       0
4      Delhi   35       1
5  Bangalore   26       0


In [19]:
# Calculate frequency of each city
freq_encoding = df['city'].value_counts().to_dict()
print("\nFrequency mapping:", freq_encoding)


Frequency mapping: {'Delhi': 3, 'New York': 2, 'Bangalore': 1}


In [20]:
# Apply frequency encoding
df['city_freq_encoded'] = df['city'].map(freq_encoding)
print(df)

        city  age  target  city_freq_encoded
0   New York   25       1                  2
1   New York   30       0                  2
2      Delhi   22       1                  3
3      Delhi   28       0                  3
4      Delhi   35       1                  3
5  Bangalore   26       0                  1
