# Feature Engineering on Numeric Data

In [9]:
##--Import necessary dependencies and settings

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import scipy.stats as spstats

# Re-scan the available style sheets, switch to the 'classic' look, then
# layer this notebook's figure defaults on top of it.
mpl.style.reload_library()
mpl.style.use('classic')
mpl.rcParams.update({
    'figure.facecolor': (1, 1, 1, 0),  # RGBA tuple with alpha 0
    'figure.figsize': [6.0, 4.0],      # width, height
    'figure.dpi': 100,
})

In [1]:
# Let's engineer features on the raw data from the Pokémon dataset.
# Load the CSV into a DataFrame and print its first five rows.
poke_df = pd.read_csv(filepath_or_buffer='datasets_module_4/Pokemon.csv')
print(poke_df.head(n=5))

In [2]:
# Preview only three important columns: each Pokémon's HP (Hit Points),
# Attack, and Defense stats.
print(poke_df.loc[:, ['HP', 'Attack', 'Defense']].head())

In [3]:
# Basic statistical measures (as reported by describe()) for the same
# three stat fields.
print(poke_df.loc[:, ['HP', 'Attack', 'Defense']].describe())

In [4]:
# Switch to a different dataset: a sample from the million-song dataset,
# which records counts (frequencies) of how often songs have been heard
# by various users.
popsong_df = pd.read_csv(filepath_or_buffer='datasets_module_4/song_views.csv')
print(popsong_df.head(n=10))

# Binarization

In [5]:
# Binarize the listen_count field of the song dataset: every count of 1 or
# more becomes 1, any other value is carried over unchanged.
listen_counts = np.asarray(popsong_df['listen_count'])
watched = np.where(listen_counts >= 1, 1, listen_counts)  # new array; source column is untouched
popsong_df['watched'] = watched
print(popsong_df.head(n=10))

# Rounding
Often, when dealing with numeric attributes such as proportions or percentages, we do not need values with a high degree of precision. In those cases it makes sense to round these high-precision values to integers.

In [None]:
items_popularity = pd.read_csv('datasets_module_4/item_popularity.csv')

# Round off the percentages: multiply pop_percent by 10 and by 100, round to
# the nearest whole number, and store each result as an integer column
# (popularity_scale_10 and popularity_scale_100).
for scale in (10, 100):
    rounded = np.round(items_popularity['pop_percent'] * scale)
    items_popularity[f'popularity_scale_{scale}'] = np.array(rounded, dtype='int')

print(items_popularity)

# Binning

In [6]:
# Binning: load the dataset used for the binning examples and preview the
# identifier, employment field, age and income columns.
fcc_survey_df = pd.read_csv(filepath_or_buffer='datasets_module_4/fcc_2016_coder_survey_subset.csv')
print(fcc_survey_df.loc[:, ['ID.x', 'EmploymentField', 'Age', 'Income']].head())

In [7]:
# Fixed-width binning: start by looking at the developer age distribution.
fig, ax = plt.subplots()
# Draw onto the axes created above; the number of bins is left at the default (10).
fcc_survey_df['Age'].hist(ax=ax, color='#A9C5D3')

label_style = {'fontsize': 12}
ax.set_title('Developer Age Histogram', **label_style)
ax.set_xlabel('Age', **label_style)
ax.set_ylabel('Frequency', **label_style)
plt.show()

In [8]:
# ### Binning based on rounding
#
# Each age is divided by 10 and floored, so the bin number is the decade:
#
# ```
# Age Range: Bin
# ---------------
#  0 -  9  : 0
# 10 - 19  : 1
# 20 - 29  : 2
# 30 - 39  : 3
# 40 - 49  : 4
# 50 - 59  : 5
# 60 - 69  : 6
#   ... and so on
# ```

ages = np.array(fcc_survey_df['Age'])
fcc_survey_df['Age_bin_round'] = np.floor(ages / 10.)

# Show a small slice of rows to compare the age with its bin.
fcc_survey_df.loc[:, ['ID.x', 'Age', 'Age_bin_round']].iloc[1071:1076]

In [9]:
# ### Binning based on custom ranges
#
# pd.cut is called with its default settings, so each interval is open on
# the left and closed on the right:
#
# ```
# Age Range  : Bin
# ----------------
# ( 0,  15]  : 1
# (15,  30]  : 2
# (30,  45]  : 3
# (45,  60]  : 4
# (60,  75]  : 5
# (75, 100]  : 6
# ```

bin_ranges = [0, 15, 30, 45, 60, 75, 100]
bin_names = [1, 2, 3, 4, 5, 6]

age_values = np.array(fcc_survey_df['Age'])
# Same bin edges twice: once keeping the interval itself, once mapped to the
# numeric labels in bin_names.
fcc_survey_df['Age_bin_custom_range'] = pd.cut(age_values, bins=bin_ranges)
fcc_survey_df['Age_bin_custom_label'] = pd.cut(age_values, bins=bin_ranges, labels=bin_names)

fcc_survey_df.loc[
    :, ['ID.x', 'Age', 'Age_bin_round', 'Age_bin_custom_range', 'Age_bin_custom_label']
].iloc[1071:1076]

In [10]:
# Quantile-based binning: first inspect the income distribution.

fig, ax = plt.subplots()
fcc_survey_df['Income'].hist(ax=ax, bins=30, color='#A9C5D3')

label_style = {'fontsize': 12}
ax.set_title('Developer Income Histogram', **label_style)
ax.set_xlabel('Developer Income', **label_style)
ax.set_ylabel('Frequency', **label_style)
plt.show()

# Look carefully: the distribution is right-skewed, with fewer developers
# earning more money and more developers earning less.

In [11]:
# Use a 4-quantile (quartile) based adaptive binning scheme: compute the
# income values at the 0, 25, 50, 75 and 100 percent points.

quantile_list = [0, 0.25, 0.5, 0.75, 1.0]
quantiles = fcc_survey_df['Income'].quantile(q=quantile_list)
print(quantiles)

In [12]:
# Plot the income histogram again with the quantile values overlaid.

fig, ax = plt.subplots()
fcc_survey_df['Income'].hist(ax=ax, bins=30, color='#A9C5D3')

# One red vertical line per quantile value; the handle of the last line
# drawn is kept so the legend shows a single 'Quantiles' entry.
for quantile in quantiles:
    qvl = ax.axvline(quantile, color='r')

ax.legend([qvl], ['Quantiles'], fontsize=10)

label_style = {'fontsize': 12}
ax.set_title('Developer Income Histogram with Quantiles', **label_style)
ax.set_xlabel('Developer Income', **label_style)
ax.set_ylabel('Frequency', **label_style)
plt.show()

# Statistical Transformations
Let’s look at a different strategy of feature engineering on numerical data by using statistical or mathematical transformations.

In [14]:
# The log transform belongs to the power transform family of functions.
# It can be defined as y = log_b(x); here the natural logarithm is used.
#
# The income is shifted by +1 before taking the log so that an income of 0
# does not produce log(0). np.log1p(x) computes log(1 + x) directly and
# stays accurate when x is very small, where forming 1 + x first would
# lose precision.
fcc_survey_df['Income_log'] = np.log1p(fcc_survey_df['Income'])

# Show a few rows to compare the raw income with its log-transformed value.
fcc_survey_df[['ID.x', 'Age', 'Income', 'Income_log']].iloc[4:9]

In [15]:
# Plot the distribution of the log-transformed income, marking its mean
# (rounded to two decimals) with a red vertical line.
income_log_mean = np.round(np.mean(fcc_survey_df['Income_log']), 2)

fig, ax = plt.subplots()
fcc_survey_df['Income_log'].hist(ax=ax, bins=30, color='#A9C5D3')
ax.axvline(income_log_mean, color='r')

label_style = {'fontsize': 12}
ax.set_title('Developer Income Histogram after Log Transform', **label_style)
ax.set_xlabel('Developer Income (log scale)', **label_style)
ax.set_ylabel('Frequency', **label_style)

# Hard-coded data coordinates for the mean annotation.
text_x, text_y = 11.5, 450
ax.text(text_x, text_y, r'$\mu$='+str(income_log_mean), fontsize=10)
plt.show()