# Chapter 5 Handling Categorical Data

In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every bare expression in a cell, not only the last one.
# The cells below rely on this: several of them show multiple results each.
InteractiveShell.ast_node_interactivity = 'all'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn import feature_extraction
from sklearn.neighbors import KNeighborsClassifier
from sklearn import impute
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

## 5.1 Encoding Nominal Categorical Features

In [2]:
# One nominal feature stored as a column vector: one state name per observation.
states = ["Texas", "California", "Texas", "Delaware", "Texas"]
feature = np.array(states).reshape(-1, 1)

# LabelBinarizer produces one indicator column per distinct class.
one_hot = preprocessing.LabelBinarizer()
one_hot.fit_transform(feature)
# The learned classes give the meaning (and order) of the indicator columns.
one_hot.classes_
# Round trip: decode the indicator matrix back into the original labels.
one_hot.inverse_transform(one_hot.fit_transform(feature))
# pandas alternative: dummy columns built from the flattened feature.
pd.get_dummies(feature[:, 0])

# Multi-label case: every observation carries a tuple of classes.
# NOTE(review): "Delware" is spelled exactly as in the original data (likely meant
# "Delaware"); it is kept so the encoded classes stay unchanged.
multiclass_feature = [
    ("Texas", "Florida"),
    ("California", "Alabama"),
    ("Texas", "Florida"),
    ("Delware", "Florida"),
    ("Texas", "Alabama"),
]
# MultiLabelBinarizer sets one indicator per class present in each tuple.
one_hot_multiclass = preprocessing.MultiLabelBinarizer()
one_hot_multiclass.fit_transform(multiclass_feature)
# Column order of the multi-label indicator matrix.
one_hot_multiclass.classes_

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1]])

array(['California', 'Delaware', 'Texas'], dtype='<U10')

array(['Texas', 'California', 'Texas', 'Delaware', 'Texas'], dtype='<U10')

Unnamed: 0,California,Delaware,Texas
0,0,0,1
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


array([[0, 0, 0, 1, 1],
       [1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1],
       [0, 0, 1, 1, 0],
       [1, 0, 0, 0, 1]])

array(['Alabama', 'California', 'Delware', 'Florida', 'Texas'],
      dtype=object)

## 5.2 Encoding Ordinal Categorical Features

In [3]:
df = pd.DataFrame({"Score": ["Low", "Low", "Medium", "Medium", "High"]})
# Ordinal encoding: each category maps to a value that preserves its rank
# (int, float, or str targets all work).
scale_mapper = {"Low": 1, "Medium": 2, "High": 3}
# Use Series.map instead of Series.replace: replacing strings with ints relies on
# implicit object -> int downcasting, which recent pandas deprecates (FutureWarning).
# Caveat: map yields NaN for any category that is missing from scale_mapper.
score_encoded = df["Score"].map(scale_mapper)
score_encoded

0    1
1    1
2    2
3    2
4    3
Name: Score, dtype: int64

## 5.3 Encoding Dictionaries of Features

In [4]:
# Each dict is one observation (feature name -> value); a key that is absent
# from a dict becomes 0 in that observation's row.
data_dict = [{"Red": 2, "Blue": 4},
             {"Red": 4, "Blue": 3},
             {"Red": 1, "Yellow": 2},
             {"Red": 2, "Yellow": 2}]
# sparse=False returns a dense ndarray instead of a sparse matrix
dictvectorize = feature_extraction.DictVectorizer(sparse=False)
feature_matrix = dictvectorize.fit_transform(data_dict)
# Label the columns with the fitted feature_names_ attribute. The former
# get_feature_names() method was removed in scikit-learn 1.2, whereas
# feature_names_ is available on old and new releases alike.
pd.DataFrame(feature_matrix, columns=dictvectorize.feature_names_)

Unnamed: 0,Blue,Red,Yellow
0,4.0,2.0,0.0
1,3.0,4.0,0.0
2,0.0,1.0,2.0
3,0.0,2.0,2.0


## 5.4 Imputing Missing Class Values

In [5]:
# Fully observed rows: column 0 is the class, columns 1-2 are numeric features.
X = np.array([
    [0, 2.10, 1.45],
    [1, 1.18, 1.33],
    [0, 1.22, 1.27],
    [1, -0.21, -1.19],
])
# Rows whose class (column 0) is missing.
X_with_nan = np.array([
    [np.nan, 0.87, 1.31],
    [np.nan, -0.67, -0.22],
])

# Fit a distance-weighted 3-nearest-neighbour classifier: features -> class.
clf = KNeighborsClassifier(n_neighbors=3, weights="distance")
known_features, known_classes = X[:, 1:], X[:, 0]
trained_model = clf.fit(known_features, known_classes)
# Predict the missing class of every incomplete row.
missing_features = X_with_nan[:, 1:]
imputed_values = trained_model.predict(missing_features)
print(imputed_values)
# Put each predicted class back in front of its features ...
X_with_imputed = np.concatenate([imputed_values[:, np.newaxis], missing_features], axis=1)
# ... and stack the completed rows on top of the fully observed ones.
np.concatenate([X_with_imputed, X])

# Alternative: let scikit-learn's imputers fill the NaNs directly.
X_complete = np.concatenate([X_with_nan, X])
simple_imputer = impute.SimpleImputer(strategy="most_frequent")
simple_imputer.fit_transform(X_complete)
knn_imputer = impute.KNNImputer(n_neighbors=3)
knn_imputer.fit_transform(X_complete)

[0. 1.]


array([[ 0.  ,  0.87,  1.31],
       [ 1.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

array([[ 0.  ,  0.87,  1.31],
       [ 0.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

array([[ 0.33333333,  0.87      ,  1.31      ],
       [ 0.66666667, -0.67      , -0.22      ],
       [ 0.        ,  2.1       ,  1.45      ],
       [ 1.        ,  1.18      ,  1.33      ],
       [ 0.        ,  1.22      ,  1.27      ],
       [ 1.        , -0.21      , -1.19      ]])

## 5.5 Handling Imbalanced Classes

In [6]:
iris = load_iris()
# Drop the first 40 observations so the classes become imbalanced
# (10 observations of class 0 remain against 100 of the other classes).
feature = iris.data[40:]
target = iris.target[40:]
# Collapse the labels to binary: 0 stays 0, every other class becomes 1.
is_class0 = target == 0
target = np.where(is_class0, 0, 1)
target
# Explicit per-class weights, used when constructing the weighted classifier.
weight = {0: 0.9, 1: 0.1}


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [7]:
# Create a random forest classifier with explicit per-class weights.
# `weight` is the {0: 0.9, 1: 0.1} dict defined in the previous cell.
# The estimator is only constructed (and its repr displayed) here; it is not fitted.
RandomForestClassifier(class_weight=weight)

RandomForestClassifier(bootstrap=True, class_weight=weight,
                       criterion="gini", max_depth=None, max_features="auto",
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

RandomForestClassifier(class_weight={0: 0.9, 1: 0.1})

RandomForestClassifier(class_weight={0: 0.9, 1: 0.1}, n_estimators=10, n_jobs=1)

In [8]:
# Create a random forest with balanced class weights:
# "balanced" automatically creates weights inversely proportional to class frequencies.
# The estimator is only constructed (and its repr displayed) here; it is not fitted.
RandomForestClassifier(class_weight="balanced")

RandomForestClassifier(bootstrap=True, class_weight="balanced",
                       criterion="gini", max_depth=None, max_features="auto",
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

RandomForestClassifier(class_weight='balanced')

RandomForestClassifier(class_weight='balanced', n_estimators=10, n_jobs=1)

In [9]:
# Balance the classes by resampling: either shrink the majority class
# (downsampling, drawn without replacement) or grow the minority class
# (upsampling, drawn with replacement).
# NOTE: np.random.choice draws from NumPy's global random state, so the sampled
# rows differ between runs unless a seed is set beforehand.

# indices of each class' observations
i_class0 = np.where(target == 0)[0]
i_class1 = np.where(target == 1)[0]

# number of observations in each class
n_class0 = i_class0.shape[0]
n_class1 = i_class1.shape[0]

# downsample the majority class
if n_class0 > n_class1:
    # class 0 is the majority: keep only n_class1 of its observations
    i_class0_downsampled = np.random.choice(i_class0, size=n_class1, replace=False)
    # join the downsampled class 0's target vector with class 1's target vector
    np.hstack((target[i_class0_downsampled], target[i_class1]))
    # matching feature matrix (first five rows)
    np.vstack((feature[i_class0_downsampled, :], feature[i_class1, :]))[0: 5]
else:
    # class 1 is the majority: keep only n_class0 of its observations
    i_class1_downsampled = np.random.choice(i_class1, size=n_class0, replace=False)
    np.hstack((target[i_class0], target[i_class1_downsampled]))
    np.vstack((feature[i_class0, :], feature[i_class1_downsampled, :]))[0: 5]

# upsample the minority class
if n_class0 > n_class1:
    # class 1 is the minority: draw n_class0 indices from it. Sampling must be
    # with replacement, because more indices are requested than exist in i_class1
    # (replace=False would raise ValueError here).
    i_class1_upsampled = np.random.choice(i_class1, size=n_class0, replace=True)
    np.hstack((target[i_class0], target[i_class1_upsampled]))
    np.vstack((feature[i_class0, :], feature[i_class1_upsampled, :]))[0: 5]
else:
    # class 0 is the minority: draw n_class1 indices from it, with replacement
    i_class0_upsampled = np.random.choice(i_class0, size=n_class1, replace=True)
    np.hstack((target[i_class0_upsampled], target[i_class1]))
    np.vstack((feature[i_class0_upsampled, :], feature[i_class1, :]))[0: 5]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

array([[5. , 3.5, 1.3, 0.3],
       [4.5, 2.3, 1.3, 0.3],
       [4.4, 3.2, 1.3, 0.2],
       [5. , 3.5, 1.6, 0.6],
       [5.1, 3.8, 1.9, 0.4]])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

array([[5.3, 3.7, 1.5, 0.2],
       [5. , 3.5, 1.6, 0.6],
       [4.8, 3. , 1.4, 0.3],
       [5.3, 3.7, 1.5, 0.2],
       [5.3, 3.7, 1.5, 0.2]])