In [None]:
# Python 2 & 3 Compatibility
from __future__ import print_function, division

In [None]:
import pandas as pd
import numpy as np


In [None]:
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.feature_selection import f_regression, mutual_info_regression

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`. The old
# location is kept only as a fallback for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt

# Feature Scaling

## Standardization:
Standardization of datasets is a common requirement for many machine learning estimators implemented in scikit-learn; they might behave badly if the individual features do not more or less look like standard normally distributed data: Gaussian with zero mean and unit variance.

In practice we often ignore the shape of the distribution and just transform the data to center it by removing the mean value of each feature, then scale it by dividing non-constant features by their standard deviation.

For instance, many elements used in the objective function of a learning algorithm (such as the RBF kernel of Support Vector Machines or the l1 and l2 regularizers of linear models) assume that all features are centered around zero and have variance in the same order. If a feature has a variance that is orders of magnitude larger than others, it might dominate the objective function and make the estimator unable to learn from other features correctly as expected.


### Scaling to a range: MinMax Scaler
The simplest method is to rescale the feature range to [0, 1] or [-1, 1]:  
\begin{equation}
X^\prime = \frac{X - min(X)}{max(X) - min(X)}
\end{equation}

### Scaling to zero mean, unit variance: Standard Scaler 
Calculating the z-score rescales each feature to zero mean and unit variance:
\begin{equation}
X^\prime = \frac{X - \bar{X}}{\sigma}
\end{equation}

### Scaling to unit length: Normalization
Rescale each vector to unit length; this is useful in vector space models:
\begin{equation}
X^\prime = \frac{X}{\left|\left| X \right| \right|}
\end{equation}


Note: keep the scaler object that was fitted on the training set so that the same transformation can later be applied to the test set.


### UCI Wine Dataset

In [None]:
# Load only the first three columns of the wine data: the class label and the
# two features used in the scaling demonstration. The file has no header row.
wine_url = 'https://raw.githubusercontent.com/rasbt/pattern_classification/master/data/wine_data.csv'
df = pd.read_csv(wine_url, header=None, usecols=[0, 1, 2])
df.columns = ['Class label', 'Alcohol', 'Malic acid']

# Preview the first rows (rendered as the cell output).
df.head()

As we can see in the table above, the features **Alcohol (percent/volume)** and **Malic acid (g/l)** are measured on different scales, so **Feature Scaling** is a necessary and important step prior to any comparison or combination of these data.

In [None]:
# Both scalers are fitted on the same two feature columns.
feature_columns = ['Alcohol', 'Malic acid']
wine_features = df[feature_columns]

# Standardization; the fitted scaler stays available as `std_scale`.
std_scale = preprocessing.StandardScaler()
std_scale.fit(wine_features)
df_std = std_scale.transform(wine_features)

# Min-max scaling; the fitted scaler stays available as `minmax_scale`.
minmax_scale = preprocessing.MinMaxScaler()
minmax_scale.fit(wine_features)
df_minmax = minmax_scale.transform(wine_features)

In [None]:
# Column 0 of `df_std` holds Alcohol, column 1 holds Malic acid.
alcohol_std = df_std[:, 0]
malic_std = df_std[:, 1]

mean_report = 'Mean after standardization:\nAlcohol={:.2f}, Malic acid={:.2f}'
std_report = '\nStandard deviation after standardization:\nAlcohol={:.2f}, Malic acid={:.2f}'

print(mean_report.format(alcohol_std.mean(), malic_std.mean()))
print(std_report.format(alcohol_std.std(), malic_std.std()))

In [None]:
def plot():
    """Scatter-plot Alcohol vs. Malic acid on the raw, standardized and min-max scales.

    Reads the module-level ``df``, ``df_std`` and ``df_minmax`` and draws all
    three versions of the data on one new 8x6-inch figure. The figure is left
    as the current pyplot figure; the caller renders it with ``plt.show()``.
    """
    plt.figure(figsize=(8, 6))

    plt.scatter(df['Alcohol'], df['Malic acid'],
                color='green', label='input scale', alpha=0.5)

    # Raw string: the mathtext label contains backslashes (\mu, \;, \sigma)
    # that are invalid escape sequences in an ordinary string literal.
    plt.scatter(df_std[:, 0], df_std[:, 1], color='red',
                label=r'Standardized [$ N  (\mu=0, \; \sigma=1) $]', alpha=0.3)

    plt.scatter(df_minmax[:, 0], df_minmax[:, 1],
                color='blue', label='min-max scaled [min=0, max=1]', alpha=0.3)

    plt.title('Alcohol and Malic Acid content of the wine dataset')
    plt.xlabel('Alcohol')
    plt.ylabel('Malic Acid')
    plt.legend(loc='upper left')
    plt.grid()

    plt.tight_layout()

plot()
plt.show()

#### Dividing the dataset into 70% training and 30% test dataset

In [None]:
# Reload the complete wine dataset (all columns; the file has no header row).
# NOTE: this rebinds `df`, replacing the three-column frame loaded earlier.
df = pd.read_csv(
    'https://raw.githubusercontent.com/rasbt/pattern_classification/master/data/wine_data.csv',
    header=None,
)

# Column 0 is the class label; every remaining column is a feature.
wine_values = df.values
X_wine, y_wine = wine_values[:, 1:], wine_values[:, 0]

# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_wine, y_wine, test_size=0.30, random_state=42)

In [None]:
# Learn the scaling parameters from the training split only, then apply the
# same fitted scaler to both splits.
std_scale = preprocessing.StandardScaler()
std_scale.fit(X_train)
X_train_std, X_test_std = (std_scale.transform(split) for split in (X_train, X_test))

#### Dimensionality reduction via Principal Component Analysis (PCA)
Now, we perform a PCA on the standardized and the non-standardized datasets to transform the dataset onto a 2-dimensional feature subspace.

In [None]:
# NOTE: this cell overwrites X_train/X_test and X_train_std/X_test_std with
# their 2-component projections, so re-running it without first re-running the
# split and scaling cells would fit PCA on already-projected data.

# PCA fitted on the non-standardized training data
pca = PCA(n_components=2)
pca.fit(X_train)
X_train, X_test = pca.transform(X_train), pca.transform(X_test)

# PCA fitted on the standardized training data
pca_std = PCA(n_components=2)
pca_std.fit(X_train_std)
X_train_std, X_test_std = pca_std.transform(X_train_std), pca_std.transform(X_test_std)

In [None]:
# Visualize the first two principal components of both projections side by side.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10, 4))

# (class label, colour, marker) for each of the three wine classes.
class_styles = list(zip(range(1, 4), ('blue', 'red', 'green'), ('^', 's', 'o')))
panels = (
    (ax1, X_train, 'Transformed NON-standardized training dataset after PCA'),
    (ax2, X_train_std, 'Transformed standardized training dataset after PCA'),
)

for ax, projected, title in panels:
    # One scatter series per class so each gets its own legend entry.
    for label, colour, marker in class_styles:
        in_class = y_train == label
        ax.scatter(projected[in_class, 0], projected[in_class, 1],
                   color=colour,
                   label='class %s' % label,
                   alpha=0.5,
                   marker=marker)

    ax.set_title(title)
    ax.set_xlabel('1st principal component')
    ax.set_ylabel('2nd principal component')
    ax.legend(loc='upper right')
    ax.grid()

plt.tight_layout()
plt.show()

# Feature Ranking / Selection

Why Feature Selection?
 * gaining better understanding of data
 * reduce number of features, reduce overfitting
 * reduce model complexity and training time

## Univariate Feature Selection
Univariate feature selection examines each feature individually to determine the strength of its relationship with the response variable. The best features are then selected according to univariate statistical tests.

#### SelectKBest 
    removes all but the k highest scoring features
#### SelectPercentile 
    removes all but a user-specified highest scoring percentage of features

   * For regression: f_regression, mutual_info_regression
   * For classification: chi2, f_classif, mutual_info_classif

##### Pearson's Correlation
\begin{equation}
\rho_{X,Y} = \frac{cov(X,Y)}{\sigma_x \sigma_y}
\end{equation}

In [None]:
from IPython.display import SVG, display

# Fetch the correlation-examples illustration from Wikimedia Commons and render it inline.
correlation_examples = SVG(url='https://upload.wikimedia.org/wikipedia/commons/d/d4/Correlation_examples2.svg')
display(correlation_examples)

##### Mutual information
\begin{equation}
I(X,Y) = \sum_{x} \sum_{y} p(x,y) \log\left(\frac{p(x,y)}{p(x)\,p(y)}\right)
\end{equation}

In [None]:
# Compare univariate F-test statistics with mutual information.
# Three features x_1, x_2, x_3 are drawn uniformly from [0, 1); the target is
#     y = x_1 + sin(6 * pi * x_2) + 0.1 * N(0, 1),
# so the third feature is completely irrelevant.
np.random.seed(0)
n_samples = 1000
X = np.random.rand(n_samples, 3)
noise = 0.1 * np.random.randn(n_samples)
y = X[:, 0] + np.sin(6 * np.pi * X[:, 1]) + noise

# Scale each score vector by its maximum so the two measures are comparable.
f_test, _ = f_regression(X, y)
f_test = f_test / f_test.max()

mi = mutual_info_regression(X, y)
mi = mi / mi.max()

In [None]:
# One panel per feature: the feature plotted against y, titled with its
# normalized F-test and mutual-information scores.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))
for idx, ax in enumerate(axes):
    ax.scatter(X[:, idx], y)
    ax.set_xlabel("$x_{}$".format(idx + 1), fontsize=14)
    if idx == 0:
        # Only the left-most panel carries the y-axis label.
        ax.set_ylabel("$y$", fontsize=14)
    ax.set_title("F-test={:.2f}, MI={:.2f}".format(f_test[idx], mi[idx]),
                 fontsize=16)
plt.show()

As the **F-test** captures only linear dependency, it rates $x_1$ as the most discriminative feature. On the other hand, mutual information can capture any kind of dependency between variables, and it rates $x_2$ as the most discriminative feature, which probably agrees better with our intuitive perception for this example. Both methods correctly mark $x_3$ as irrelevant.

## Lasso
L1 regularization adds a penalty $\alpha \sum_{i=1}^{n} \lvert \beta_i \rvert$ (the L1-norm of the coefficients, scaled by $\alpha$) to the loss function. Since each non-zero coefficient adds to the penalty, it forces weak features to have zero as coefficients. Thus L1 regularization produces sparse solutions, inherently performing feature selection.

Scikit-learn offers Lasso for linear regression and Logistic regression with L1 penalty for classification.

In [None]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

# `load_boston` was removed in scikit-learn 1.2 (importing it raises
# ImportError). Use it when available; otherwise rebuild the same arrays from
# the original StatLib file, following the recipe in scikit-learn's own
# deprecation notice. NOTE(review): scikit-learn dropped this dataset over
# ethical concerns; consider `fetch_california_housing` for new material.
try:
    from sklearn.datasets import load_boston
    boston = load_boston()
except ImportError:
    raw_boston = pd.read_csv("http://lib.stat.cmu.edu/datasets/boston",
                             sep=r"\s+", skiprows=22, header=None)
    boston = {
        # Each record spans two physical lines of the file: 11 values on the
        # first line, then 2 more features and the target on the second.
        "data": np.hstack([raw_boston.values[::2, :], raw_boston.values[1::2, :2]]),
        "target": raw_boston.values[1::2, 2],
        "feature_names": np.array(["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
                                   "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT"]),
    }

# Standardize the features before fitting so that the L1 penalty is applied to
# coefficients on a common scale.
scaler = StandardScaler()
X = scaler.fit_transform(boston["data"])
Y = boston["target"]
names = boston["feature_names"]

lasso = Lasso(alpha=.3)
lasso.fit(X, Y)

In [None]:
# Fitted Lasso coefficients, one per column of X (same order as `names`).
lasso.coef_

We see that a number of features have coefficient 0. If we increase α further, the solution would be sparser and sparser, i.e. more and more features would have 0 as coefficients.

## Tree based feature selection

Tree-based estimators can be used to compute feature importances. 

Every node in the decision trees is a condition on a single feature, designed to split the dataset into two so that similar response values end up in the same set. The measure based on which the (locally) optimal condition is chosen is called impurity. 

For classification, it is typically either **Gini impurity** or **information gain/entropy** and for regression trees it is **variance**. Feature importance is ranked according to this measure.

In [None]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Load the Boston housing dataset as an example.
# `load_boston` was removed in scikit-learn 1.2 (importing it raises
# ImportError). Use it when available; otherwise rebuild the same arrays from
# the original StatLib file, following the recipe in scikit-learn's own
# deprecation notice.
try:
    from sklearn.datasets import load_boston
    boston = load_boston()
except ImportError:
    raw_boston = pd.read_csv("http://lib.stat.cmu.edu/datasets/boston",
                             sep=r"\s+", skiprows=22, header=None)
    boston = {
        # Each record spans two physical lines of the file: 11 values on the
        # first line, then 2 more features and the target on the second.
        "data": np.hstack([raw_boston.values[::2, :], raw_boston.values[1::2, :2]]),
        "target": raw_boston.values[1::2, 2],
        "feature_names": np.array(["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
                                   "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT"]),
    }

# Unscaled features here (unlike the Lasso cell, no StandardScaler is applied).
X = boston["data"]
Y = boston["target"]
names = boston["feature_names"]

In [None]:
# Fix the seed: a random forest is stochastic, so without it the importances
# (and possibly their ranking) change from run to run.
rf = RandomForestRegressor(random_state=0)
rf.fit(X, Y)

# Impurity-based importance of each feature, in the column order of X.
rf.feature_importances_

In [None]:
# Rank the features from most to least important as (importance, name) pairs.
importance_by_feature = zip(rf.feature_importances_, names)
sorted(importance_by_feature, reverse=True)