# Feature Engineering on Categorical Data

In [1]:
# # Import necessary dependencies and settings

import pandas as pd
import numpy as np

# Transforming Nominal Features

In [1]:
# Video game sales dataset, publicly available on Kaggle:
# https://www.kaggle.com/gregorut/videogamesales
#
# Read the CSV into a dataframe and preview rows 1 through 6 of a handful
# of descriptive columns.
vg_df = pd.read_csv('datasets_module_4/vgsales.csv')
print(
    vg_df.loc[:, ['Name', 'Platform', 'Year', 'Genre', 'Publisher']]
         .iloc[1:7]
)

In [2]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the Genre column and get back one integer code per row.
gle = LabelEncoder()
genre_labels = gle.fit_transform(vg_df['Genre'])

# gle.classes_ lists the distinct genres seen during fitting. Enumerating it
# pairs every genre with its position in that array, which gives a
# {position: genre} lookup table for reading the integer labels back.
genre_mappings = dict(enumerate(gle.classes_))
print(genre_mappings)

In [3]:
# The printed mapping shows which number the LabelEncoder object gle assigned
# to each genre value; the per-row encoded values live in genre_labels.
# Attach them to the original dataframe as a new column and preview the
# result next to the raw Genre text.

vg_df['GenreLabel'] = genre_labels
print(
    vg_df.loc[:, ['Name', 'Platform', 'Year', 'Genre', 'GenreLabel']]
         .iloc[1:7]
)

# Transforming Ordinal Features

In [4]:
# Ordinal features behave like nominal ones, except that the categories
# carry a meaningful order.
#
# Load the Pokemon data and shuffle every row (frac=1) with a fixed seed so
# the shuffled order is reproducible.
poke_df = pd.read_csv('datasets_module_4/Pokemon.csv').sample(frac=1, random_state=1)

# Distinct values present in the Generation column.
print(np.unique(poke_df['Generation']))

In [None]:
# Ordinal lookup table: 'Gen 1' -> 1, 'Gen 2' -> 2, ..., 'Gen 6' -> 6.
gen_ord_map = {'Gen %d' % gen_number: gen_number for gen_number in range(1, 7)}

# Translate each Generation string through the lookup table and keep the
# result in a new column alongside the original text.
poke_df['GenerationLabel'] = poke_df['Generation'].map(gen_ord_map)
print(
    poke_df.loc[:, ['Name', 'Generation', 'GenerationLabel']]
           .iloc[4:10]
)

# Encoding Categorical Features

# One Hot Encoding Scheme

In [5]:
# Given a categorical feature with m distinct labels, the one hot encoding
# scheme transforms it into m binary features, each of which can only hold
# a value of 1 or 0. Every observation therefore becomes a vector of size m
# in which exactly one entry is 1 (the active label).
# Here the scheme is applied to the Generation column of the Pokemon data.

print(poke_df[['Name', 'Generation', 'Legendary']].iloc[4:10])
print("-------------------------------------------------------")

# Pin the dtype explicitly: pandas 2.x defaults get_dummies to bool, which
# would print True/False instead of the 1/0 indicators described above.
# uint8 matches the default of earlier pandas releases.
gen_onehot_features = pd.get_dummies(poke_df['Generation'], dtype=np.uint8)

# Both frames share poke_df's index, so the column-wise concat lines each
# row up with its own indicator vector.
print(pd.concat([poke_df[['Name', 'Generation']], gen_onehot_features], axis=1).iloc[4:10])

# Feature Hashing Scheme

In [15]:
# Sorted array of the distinct values found in the Genre column.
unique_genres = np.unique(vg_df['Genre'])
print("Total game genres:", unique_genres.size)
print(unique_genres)

Total game genres: 12
['Action' 'Adventure' 'Fighting' 'Misc' 'Platform' 'Puzzle' 'Racing'
 'Role-Playing' 'Shooter' 'Simulation' 'Sports' 'Strategy']


The output shows 12 distinct genres, so a one hot encoding scheme on the Genre feature would produce 12 binary features. Instead, we will use a feature hashing scheme by leveraging scikit-learn's FeatureHasher class, which uses a signed 32-bit version of the MurmurHash3 hash function. The following code shows how to apply the feature hashing scheme with the feature vector size pre-set to 6 (6 features instead of 12).

In [None]:
from sklearn.feature_extraction import FeatureHasher

# Hash the Genre feature into a fixed-width vector of 6 columns.
fh = FeatureHasher(n_features=6, input_type='string')

# With input_type='string' every sample must itself be an iterable of string
# tokens. Feeding the raw Series hands FeatureHasher bare strings, which
# older scikit-learn releases silently hash character by character and
# recent releases reject with a ValueError. Wrapping each genre in a
# one-element list hashes the genre as a single token.
genre_tokens = [[genre] for genre in vg_df['Genre']]
hashed_features = fh.fit_transform(genre_tokens).toarray()

# Build the hashed frame on vg_df's own index so the column-wise concat
# pairs every row with its own hash vector.
print(pd.concat([vg_df[['Name', 'Genre']],
                 pd.DataFrame(hashed_features, index=vg_df.index)], axis=1).iloc[1:7])