# 2- Data preprocessing e ingeniería de características

Data preprocessing. Estrategias y métodos a analizar en este notebook:

- Feature selection
- Feature extraction
- Standard scaler
- Min-Max scaler
- Roubst scaler

# Import necessary dependencies and settings

In [3]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
np.set_printoptions(suppress=True)
pt = np.get_printoptions()['threshold']

# Threshold based methods

## Variance based thresholding

In [19]:
pokemon_pop = pd.read_csv('./data/pokemon.csv')
print pokemon_pop.head()
print pokemon_pop.shape

   #                   Name Type 1  Type 2  Total  HP  Attack  Defense  \
0  1              Bulbasaur  Grass  Poison    318  45      49       49   
1  2                Ivysaur  Grass  Poison    405  60      62       63   
2  3               Venusaur  Grass  Poison    525  80      82       83   
3  3  VenusaurMega Venusaur  Grass  Poison    625  80     100      123   
4  4             Charmander   Fire     NaN    309  39      52       43   

   Sp. Atk  Sp. Def  Speed  Generation  Legendary  
0       65       65     45           1      False  
1       80       80     60           1      False  
2      100      100     80           1      False  
3      122      120     80           1      False  
4       60       50     65           1      False  
(800, 13)


### Label encoder, convert text variables

In [22]:
from sklearn.preprocessing import LabelEncoder

numeric_variables = ['Total','HP','Attack','Defense','Sp. Atk','Sp. Def','Speed','Generation']
text_variables = ['Type 1','Type 2','Legendary']

for var in text_variables:
    encoder = LabelEncoder()
    var_name = var+'_enc'
    pokemon_pop[var_name] = encoder.fit_transform(pokemon_pop[var])
    numeric_variables.append(var_name)
    
pokemon_pop.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Type 1_enc,Type 2_enc,Legendary_enc
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False,9,399,0
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False,9,399,0
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False,9,399,0
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False,9,399,0
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False,6,340,0


In [23]:
from sklearn.feature_selection import VarianceThreshold

vt = VarianceThreshold(threshold=.15)

vt.fit(df[numeric_variables])

VarianceThreshold(threshold=0.15)

In [24]:
pd.DataFrame({'variance': vt.variances_,
              'select_feature': vt.get_support()},
            index=numeric_variables).T

Unnamed: 0,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Type 1_enc,Type 2_enc,Legendary_enc
select_feature,True,True,True,True,True,True,True,True,True,True,False
variance,14373.1,651.204,1052.16,971.195,1069.41,773.48,843.455,2.75644,31.1014,16218.9,0.0746484


In [26]:
poke_feat_select = pokemon_pop[numeric_variables].iloc[:,vt.get_support()].head()
poke_feat_select

Unnamed: 0,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Type 1_enc,Type 2_enc
0,318,45,49,49,65,65,45,1,9,399
1,405,60,62,63,80,80,60,1,9,399
2,525,80,82,83,100,100,80,1,9,399
3,625,80,100,123,122,120,80,1,9,399
4,309,39,52,43,60,50,65,1,6,340


# Statistical Methods

In [4]:
from sklearn.datasets import load_breast_cancer

bc_data = load_breast_cancer()
bc_features = pd.DataFrame(bc_data.data, columns=bc_data.feature_names)
bc_classes = pd.DataFrame(bc_data.target, columns=['IsMalignant'])

# build featureset and response class labels 
bc_X = np.array(bc_features)
bc_y = np.array(bc_classes).T[0]
print('Feature set shape:', bc_X.shape)
print('Response class shape:', bc_y.shape)

('Feature set shape:', (569, 30))
('Response class shape:', (569,))


In [5]:
np.set_printoptions(threshold=30)
print('Feature set data [shape: '+str(bc_X.shape)+']')
print(np.round(bc_X, 2))
print('\n')
print('Feature names:')
print(np.array(bc_features.columns), '\n')
print('Predictor Class label data [shape: '+str(bc_y.shape)+']')
print(bc_y, '\n')
print('Predictor name:', np.array(bc_classes.columns))
np.set_printoptions(threshold=pt)

Feature set data [shape: (569, 30)]
[[ 17.99  10.38 122.8  ...   0.27   0.46   0.12]
 [ 20.57  17.77 132.9  ...   0.19   0.28   0.09]
 [ 19.69  21.25 130.   ...   0.24   0.36   0.09]
 ...
 [ 16.6   28.08 108.3  ...   0.14   0.22   0.08]
 [ 20.6   29.33 140.1  ...   0.26   0.41   0.12]
 [  7.76  24.54  47.92 ...   0.     0.29   0.07]]


Feature names:
(array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], 

In [6]:
from sklearn.feature_selection import chi2, SelectKBest
res = chi2(bc_X, bc_y)
print res
skb = SelectKBest(score_func=chi2, k=15)
skb.fit(bc_X, bc_y)

(array([   266.1049172 ,     93.8975081 ,   2011.10286377,  53991.65592375,
            0.14989926,      5.40307549,     19.71235355,     10.54403543,
            0.25737977,      0.00007431,     34.67524723,      0.00979354,
          250.57189636,   8758.50470533,      0.00326621,      0.61378533,
            1.04471761,      0.30523156,      0.00008036,      0.00637137,
          491.68915743,    174.44939961,   3665.03541634, 112598.43156405,
            0.39736569,     19.31492198,     39.51691507,     13.48541948,
            1.2988614 ,      0.23152241]), array([0.        , 0.        , 0.        , 0.        , 0.69863164,
       0.0201013 , 0.000009  , 0.00116564, 0.61192603, 0.99312222,
       0.        , 0.92116819, 0.        , 0.        , 0.95442512,
       0.43336612, 0.30672681, 0.58062114, 0.99284741, 0.93637975,
       0.        , 0.        , 0.        , 0.        , 0.52845287,
       0.00001108, 0.        , 0.00024042, 0.25442131, 0.63039728]))


SelectKBest(k=15, score_func=<function chi2 at 0x7f28094c2a28>)

In [7]:
feature_scores = [(item, score) for item, score in zip(bc_data.feature_names, skb.scores_)]
sorted(feature_scores, key=lambda x: -x[1])[:10]

[('worst area', 112598.43156405371),
 ('mean area', 53991.65592375093),
 ('area error', 8758.504705334482),
 ('worst perimeter', 3665.03541634059),
 ('mean perimeter', 2011.102863767906),
 ('worst radius', 491.6891574333226),
 ('mean radius', 266.1049171951782),
 ('perimeter error', 250.57189635982184),
 ('worst texture', 174.44939960571105),
 ('mean texture', 93.8975080986333)]

In [9]:
select_features_kbest = skb.get_support()
feature_names_kbest = bc_data.feature_names[select_features_kbest]
feature_subset_df = bc_features[feature_names_kbest]
bc_SX = np.array(feature_subset_df)
print(bc_SX.shape)
print(feature_names_kbest)

(569, 15)
['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean concavity' 'radius error' 'perimeter error' 'area error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst compactness' 'worst concavity' 'worst concave points']


In [10]:
np.round(feature_subset_df.iloc[20:25], 2)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean concavity,radius error,perimeter error,area error,worst radius,worst texture,worst perimeter,worst area,worst compactness,worst concavity,worst concave points
20,13.08,15.71,85.63,520.0,0.05,0.19,1.38,14.67,14.5,20.49,96.09,630.5,0.28,0.19,0.07
21,9.5,12.44,60.34,273.9,0.03,0.28,1.91,15.7,10.23,15.66,65.13,314.9,0.11,0.09,0.06
22,15.34,14.26,102.5,704.4,0.21,0.44,3.38,44.91,18.07,19.08,125.1,980.9,0.6,0.63,0.24
23,21.16,23.04,137.2,1404.0,0.11,0.69,4.3,93.99,29.17,35.59,188.0,2615.0,0.26,0.32,0.2
24,16.65,21.38,110.0,904.6,0.15,0.81,5.46,102.6,26.46,31.56,177.0,2215.0,0.36,0.47,0.21


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# build logistic regression model
lr = LogisticRegression()

# evaluating accuracy for model built on full featureset
full_feat_acc = np.average(cross_val_score(lr, bc_X, bc_y, scoring='accuracy', cv=5))
# evaluating accuracy for model built on selected featureset
sel_feat_acc = np.average(cross_val_score(lr, bc_SX, bc_y, scoring='accuracy', cv=5))

print('Model accuracy statistics with 5-fold cross validation')
print('Model accuracy with complete feature set', bc_X.shape, ':', full_feat_acc)
print('Model accuracy with selected feature set', bc_SX.shape, ':', sel_feat_acc)

Model accuracy statistics with 5-fold cross validation
('Model accuracy with complete feature set', (569, 30), ':', 0.9509041939207385)
('Model accuracy with selected feature set', (569, 15), ':', 0.9526433243555215)


# Recursive Feature Elimination

In [12]:
from sklearn.feature_selection import RFE

lr = LogisticRegression()
rfe = RFE(estimator=lr, n_features_to_select=15, step=1)
rfe.fit(bc_X, bc_y)

RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
  n_features_to_select=15, step=1, verbose=0)

In [13]:
select_features_rfe = rfe.get_support()
feature_names_rfe = bc_data.feature_names[select_features_rfe]
print(feature_names_rfe)

['mean radius' 'mean texture' 'mean concavity' 'mean concave points'
 'mean symmetry' 'texture error' 'perimeter error' 'area error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst compactness'
 'worst concavity' 'worst concave points' 'worst fractal dimension']


In [14]:
set(feature_names_kbest) & set(feature_names_rfe)

{'area error',
 'mean concavity',
 'mean radius',
 'mean texture',
 'perimeter error',
 'worst compactness',
 'worst concave points',
 'worst concavity',
 'worst perimeter',
 'worst radius',
 'worst texture'}

# Model based selection

In [15]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(bc_X, bc_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [16]:
importance_scores = rfc.feature_importances_
feature_importances = [(feature, score) for feature, score in zip(bc_data.feature_names, importance_scores)]
sorted(feature_importances, key=lambda x: -x[1])[:10]

[('worst perimeter', 0.1777189006817699),
 ('mean concave points', 0.1451970273094055),
 ('worst concavity', 0.126228543556493),
 ('worst area', 0.09997817036935634),
 ('worst radius', 0.09063922010588626),
 ('area error', 0.0719445536744035),
 ('mean radius', 0.0632511809203489),
 ('perimeter error', 0.04165387775156695),
 ('worst compactness', 0.021172039801131467),
 ('mean concavity', 0.020919770054482655)]

# Feature extraction using dimensionality reduction

In [17]:
# center the feature set
bc_XC = bc_X - bc_X.mean(axis=0)

# decompose using SVD
U, S, VT = np.linalg.svd(bc_XC)

# get principal components
PC = VT.T

# get first 3 principal components
PC3 = PC[:, 0:3]
PC3.shape

(30, 3)

In [18]:
# reduce feature set dimensionality 
np.round(bc_XC.dot(PC3), 2)

array([[-1160.14,  -293.92,   -48.58],
       [-1269.12,    15.63,    35.39],
       [ -995.79,    39.16,     1.71],
       ...,
       [ -314.5 ,    47.55,    10.44],
       [-1124.86,    34.13,    19.74],
       [  771.53,   -88.64,   -23.89]])

In [19]:
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
pca.fit(bc_X)

PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [20]:
pca.explained_variance_ratio_

array([0.98204467, 0.01617649, 0.00155751])

In [21]:
bc_pca = pca.transform(bc_X)
np.round(bc_pca, 2)

array([[1160.14, -293.92,   48.58],
       [1269.12,   15.63,  -35.39],
       [ 995.79,   39.16,   -1.71],
       ...,
       [ 314.5 ,   47.55,  -10.44],
       [1124.86,   34.13,  -19.74],
       [-771.53,  -88.64,   23.89]])

In [22]:
np.average(cross_val_score(lr, bc_pca, bc_y, scoring='accuracy', cv=5))

0.9280800307810695

# Standard Scaler $\frac{x_i - \mu}{\sigma}$

## Load sample data of video views

In [23]:
views = pd.DataFrame([1295., 25., 19000., 5., 1., 300.], columns=['views'])
views

Unnamed: 0,views
0,1295.0
1,25.0
2,19000.0
3,5.0
4,1.0
5,300.0


In [24]:
ss = StandardScaler()
views['zscore'] = ss.fit_transform(views[['views']])
views

Unnamed: 0,views,zscore
0,1295.0,-0.307214
1,25.0,-0.489306
2,19000.0,2.231317
3,5.0,-0.492173
4,1.0,-0.492747
5,300.0,-0.449877


In [25]:
vw = np.array(views['views'])
(vw[0] - np.mean(vw)) / np.std(vw)

-0.30721413311687235

## Min-Max Scaler $\frac{x_i - min(x)}{max(x) - min(x)}$

In [26]:
mms = MinMaxScaler()
views['minmax'] = mms.fit_transform(views[['views']])
views

Unnamed: 0,views,zscore,minmax
0,1295.0,-0.307214,0.068109
1,25.0,-0.489306,0.001263
2,19000.0,2.231317,1.0
3,5.0,-0.492173,0.000211
4,1.0,-0.492747,0.0
5,300.0,-0.449877,0.015738


In [27]:
(vw[0] - np.min(vw)) / (np.max(vw) - np.min(vw))

0.06810884783409653

## Robust Scaler $\frac{x_i - median(x)}{IQR_{(1,3)}(x)}$

In [28]:
rs = RobustScaler()
views['robust'] = rs.fit_transform(views[['views']])
views

Unnamed: 0,views,zscore,minmax,robust
0,1295.0,-0.307214,0.068109,1.092883
1,25.0,-0.489306,0.001263,-0.13269
2,19000.0,2.231317,1.0,18.178528
3,5.0,-0.492173,0.000211,-0.15199
4,1.0,-0.492747,0.0,-0.15585
5,300.0,-0.449877,0.015738,0.13269


In [29]:
quartiles = np.percentile(vw, (25., 75.))
iqr = quartiles[1] - quartiles[0]
(vw[0] - np.median(vw)) / iqr

1.0928829915560916