# Encoding Nominal Categorical Features

**The LabelBinarizer**
is used to transform multi-class labels into a one-hot encoded format.

It transforms a list of labels into a matrix of binary values, where each column represents a class and each row represents a sample.

In [None]:
import numpy as np
from sklearn.preprocessing import LabelBinarizer,MultiLabelBinarizer
# One nominal feature stored as a column vector: one row per observation.
states = ["Texas", "California", "Texas", "Delaware", "Texas"]
feature = np.array(states).reshape(-1, 1)

# Fit the binarizer on the observed classes and display the one-hot matrix.
one_hot = LabelBinarizer()
one_hot.fit_transform(feature)

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1]])

In [None]:
# Class labels learned by the fitted binarizer; their order matches the
# columns of the one-hot matrix.
one_hot.classes_

array(['California', 'Delaware', 'Texas'], dtype='<U10')

In [None]:
# Round trip: encode the feature, then decode the one-hot rows back to labels.
encoded = one_hot.transform(feature)
one_hot.inverse_transform(encoded)

array(['Texas', 'California', 'Texas', 'Delaware', 'Texas'], dtype='<U10')

In [None]:
import pandas as pd
# pandas alternative: one indicator column per distinct value of the
# (single-column) feature, flattened to 1-D first.
pd.get_dummies(feature.ravel())


Unnamed: 0,California,Delaware,Texas
0,False,False,True
1,True,False,False
2,False,False,True
3,False,True,False
4,False,False,True


In [None]:
# Each observation carries several labels at once (one tuple per row).
multiclass_feature = [
    ("Texas", "Florida"),
    ("California", "Alabama"),
    ("Texas", "Florida"),
    ("Delaware", "Florida"),
    ("Texas", "Alabama"),
]

# One output column per distinct label; a 1 marks labels present in the row.
one_hot_multiclass = MultiLabelBinarizer()
one_hot_multiclass.fit_transform(multiclass_feature)

array([[0, 0, 0, 1, 1],
       [1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1],
       [0, 0, 1, 1, 0],
       [1, 0, 0, 0, 1]])

In [None]:
# Labels discovered during fitting; their order matches the columns of the
# multi-label indicator matrix.
one_hot_multiclass.classes_


array(['Alabama', 'California', 'Delaware', 'Florida', 'Texas'],
      dtype=object)

# Encoding Ordinal Categorical Features

In [None]:
# Ordinal feature: the categories have a natural order (low < medium < high).
df = pd.DataFrame({"Score": ["low", "low", "medium", "medium", "high"]})

# Integer codes that preserve that order.
scale_mapper = {"low": 1, "medium": 2, "high": 3}

# Series.map is the explicit element-wise lookup. Series.replace on an object
# column relies on implicit downcasting to int, which recent pandas versions
# deprecate (FutureWarning). Note: map yields NaN for any value missing from
# the mapper instead of silently leaving the original string in place.
score_encoded = df["Score"].map(scale_mapper)
score_encoded

0    1
1    1
2    2
3    2
4    3
Name: Score, dtype: int64

# Encoding Dictionaries of Features

In [None]:
from sklearn.feature_extraction import DictVectorizer
# Each observation is a dict of feature name -> value; keys that are absent
# from an observation show up as 0 in the resulting matrix.
data_dict = [
    {"Red": 2, "Blue": 4},
    {"Red": 4, "Blue": 3},
    {"Red": 1, "Yellow": 2},
    {"Red": 2, "Yellow": 2},
]

# sparse=False forces DictVectorizer to output a dense array rather than a
# sparse matrix.
dictvectorizer = DictVectorizer(sparse=False)
dictvectorizer.fit_transform(data_dict)

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

In [None]:
# Feature names in the column order of the matrix produced by the vectorizer.
dictvectorizer.get_feature_names_out()


array(['Blue', 'Red', 'Yellow'], dtype=object)

# Imputing Missing Class Values

In [None]:
from inspect import ClassFoundException
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
# Fully observed rows: column 0 is the class, columns 1-2 are the features.
X = np.array([
    [0, 2.10, 1.45],
    [1, 1.18, 1.33],
    [0, 1.22, 1.27],
    [1, -0.21, -1.19],
])

# Rows whose class (column 0) is missing.
X_with_nan = np.array([
    [np.nan, 0.87, 1.31],
    [np.nan, -0.67, -0.22],
])

# Learn the class from the two feature columns with a distance-weighted 3-NN.
clf = KNeighborsClassifier(3, weights="distance")
trained_model = clf.fit(X[:, 1:], X[:, 0])

# Predict the missing classes and put them back in front of the features.
imputed_values = trained_model.predict(X_with_nan[:, 1:])
X_with_imputed = np.hstack((imputed_values.reshape(-1, 1), X_with_nan[:, 1:]))

# Stack the rows with missing classes on top of their imputed counterparts.
# Fix: this line previously referenced the undefined name `X_with_man`,
# which raised NameError.
np.vstack((X_with_nan, X_with_imputed))

array([[  nan,  0.87,  1.31],
       [  nan, -0.67, -0.22],
       [ 0.  ,  0.87,  1.31],
       [ 1.  , -0.67, -0.22]])

**An alternative solution is to fill in missing values with the feature's most frequent value.**

In [None]:
from sklearn.impute import SimpleImputer
# Put the rows with a missing class on top of the fully observed rows.
X_complete = np.concatenate((X_with_nan, X), axis=0)

# Replace every NaN with the most frequent value of its column.
imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
imputer.fit_transform(X_complete)

array([[ 0.  ,  0.87,  1.31],
       [ 0.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

# Handling Imbalanced Classes
Fisher's Iris dataset. Download link 1: https://www.kaggle.com/uciml/iris

Link 2: https://archive.ics.uci.edu/ml/machine-learning-databases/iris/


In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
# Load Fisher's Iris data.
iris = load_iris()

# Remove the first 40 observations so that class 0 becomes the minority.
features = iris.data[40:, :]
target = iris.target[40:]

# Binary target: class 0 stays 0, every other class becomes 1.
target = np.where(target == 0, 0, 1)
target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [None]:
# Explicit per-class weights: class 0 counts nine times as much as class 1.
weights = {0: 0.9, 1: 0.1}
RandomForestClassifier(class_weight=weights)

In [None]:
# "balanced" asks scikit-learn to derive the class weights from the training
# labels (per its docs: inversely proportional to class frequencies) instead
# of taking them from a hand-written dict.
RandomForestClassifier(class_weight="balanced")

In [None]:
# Row indices of each class in the binary target.
i_class0 = np.where(target == 0)[0]
i_class1 = np.where(target == 1)[0]

# Number of observations per class.
n_class0 = len(i_class0)
n_class1 = len(i_class1)

# Downsample: draw, without replacement, as many class-1 rows as there are
# class-0 rows. A seeded generator makes the draw repeatable across
# "Restart & Run All"; the previous unseeded np.random.choice produced a
# different sample on every run.
downsample_rng = np.random.default_rng(0)
i_class1_downsampled = downsample_rng.choice(i_class1, size=n_class0, replace=False)

# Balanced target vector: all class-0 rows followed by the sampled class-1 rows.
np.hstack((target[i_class0], target[i_class1_downsampled]))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [None]:
# First five rows of the matching feature matrix: class-0 rows first, then
# the downsampled class-1 rows.
np.concatenate((features[i_class0], features[i_class1_downsampled]), axis=0)[:5]

array([[5. , 3.5, 1.3, 0.3],
       [4.5, 2.3, 1.3, 0.3],
       [4.4, 3.2, 1.3, 0.2],
       [5. , 3.5, 1.6, 0.6],
       [5.1, 3.8, 1.9, 0.4]])

In [None]:
# Upsample: draw class-0 rows with replacement until there are as many of
# them as class-1 rows. The generator is seeded so the sample is repeatable;
# the previous unseeded np.random.choice changed on every run.
upsample_rng = np.random.default_rng(0)
i_class0_upsampled = upsample_rng.choice(i_class0, size=n_class1, replace=True)

# Balanced target vector: the upsampled class-0 rows followed by all class-1 rows.
np.concatenate((target[i_class0_upsampled], target[i_class1]))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [None]:
# First five rows of the matching feature matrix: upsampled class-0 rows
# first, then all class-1 rows.
np.concatenate((features[i_class0_upsampled], features[i_class1]), axis=0)[:5]

array([[4.6, 3.2, 1.4, 0.2],
       [5.1, 3.8, 1.9, 0.4],
       [5. , 3.5, 1.6, 0.6],
       [5. , 3.3, 1.4, 0.2],
       [5.1, 3.8, 1.6, 0.2]])

# Using the Iris Dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris

# Load the Iris data once and reuse the result (load_iris() was previously
# called twice, once for the data and once for the column names).
iris = load_iris()
df_iris = pd.DataFrame(iris.data, columns=iris.feature_names)

# Convert DataFrame to NumPy array
iris_array = df_iris.to_numpy()

# Display the first few rows of the DataFrame
print(df_iris.head(2))

# Display only the first rows of the array; printing every row floods the
# notebook output without adding information.
print(iris_array[:5])

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.2]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.6 1.4 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.

In [None]:
# Independent copy of the Iris measurements, so that changes to `features`
# do not write through to iris_array.
features = np.copy(iris_array)
features

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [None]:
import numpy as np
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

# Treat every row of `features` as a collection of labels and binarize it.
# NOTE(review): the rows hold continuous measurements, so each distinct float
# value becomes its own "class" (see multi_label.classes_). This looks like
# an API demonstration rather than a meaningful encoding; confirm intent.
multi_label = MultiLabelBinarizer()
multi_label.fit_transform(features)

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Distinct values found in `features`; each one is a column of the indicator
# matrix produced by the binarizer.
multi_label.classes_

array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6,
       1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9,
       3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2,
       4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5,
       5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8,
       6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.6, 7.7, 7.9], dtype=object)

In [None]:
import pandas as pd
from sklearn.datasets import load_iris

# Load Iris dataset into a DataFrame
iris = load_iris()
df_iris = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df_iris["Species"] = iris.target

# Define mapper from the integer class code to the species name
species_mapper = {
    0: "setosa",
    1: "versicolor",
    2: "virginica",
}

# Replace 'Species' values with mapped values.
# Fix: the mapped Series must be assigned back. The previous code called
# .replace() and discarded the result, so the column still held the integer
# codes. Series.map is used because every code is expected to be in the mapper.
df_iris["Species"] = df_iris["Species"].map(species_mapper)
df_iris["Species"].head(120)

0      0
1      0
2      0
3      0
4      0
      ..
115    2
116    2
117    2
118    2
119    2
Name: Species, Length: 120, dtype: int64

In [None]:

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.feature_extraction import DictVectorizer

# Load Iris dataset into a DataFrame
iris = load_iris()
df_iris = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df_iris["Species"] = iris.target

# Convert DataFrame to a list of dictionaries (one dict per row)
data_dict = df_iris.to_dict(orient="records")

# Create a dictionary vectorizer that returns a dense array
dictvectorizer = DictVectorizer(sparse=False)

# Convert dictionary to feature matrix
features = dictvectorizer.fit_transform(data_dict)

# Show the column order chosen by the vectorizer together with the first
# rows only; printing the whole matrix floods the notebook output and gives
# no hint of which column is which.
print(dictvectorizer.get_feature_names_out())
print(features[:5])

[[0.  1.4 0.2 5.1 3.5]
 [0.  1.4 0.2 4.9 3. ]
 [0.  1.3 0.2 4.7 3.2]
 [0.  1.5 0.2 4.6 3.1]
 [0.  1.4 0.2 5.  3.6]
 [0.  1.7 0.4 5.4 3.9]
 [0.  1.4 0.3 4.6 3.4]
 [0.  1.5 0.2 5.  3.4]
 [0.  1.4 0.2 4.4 2.9]
 [0.  1.5 0.1 4.9 3.1]
 [0.  1.5 0.2 5.4 3.7]
 [0.  1.6 0.2 4.8 3.4]
 [0.  1.4 0.1 4.8 3. ]
 [0.  1.1 0.1 4.3 3. ]
 [0.  1.2 0.2 5.8 4. ]
 [0.  1.5 0.4 5.7 4.4]
 [0.  1.3 0.4 5.4 3.9]
 [0.  1.4 0.3 5.1 3.5]
 [0.  1.7 0.3 5.7 3.8]
 [0.  1.5 0.3 5.1 3.8]
 [0.  1.7 0.2 5.4 3.4]
 [0.  1.5 0.4 5.1 3.7]
 [0.  1.  0.2 4.6 3.6]
 [0.  1.7 0.5 5.1 3.3]
 [0.  1.9 0.2 4.8 3.4]
 [0.  1.6 0.2 5.  3. ]
 [0.  1.6 0.4 5.  3.4]
 [0.  1.5 0.2 5.2 3.5]
 [0.  1.4 0.2 5.2 3.4]
 [0.  1.6 0.2 4.7 3.2]
 [0.  1.6 0.2 4.8 3.1]
 [0.  1.5 0.4 5.4 3.4]
 [0.  1.5 0.1 5.2 4.1]
 [0.  1.4 0.2 5.5 4.2]
 [0.  1.5 0.2 4.9 3.1]
 [0.  1.2 0.2 5.  3.2]
 [0.  1.3 0.2 5.5 3.5]
 [0.  1.4 0.1 4.9 3.6]
 [0.  1.3 0.2 4.4 3. ]
 [0.  1.5 0.2 5.1 3.4]
 [0.  1.3 0.3 5.  3.5]
 [0.  1.3 0.3 4.5 2.3]
 [0.  1.3 0.2 4.4 3.2]
 [0.  1.6 0