In [1]:
"""

PREPARE YOUR DATA FOR MACHINE LEARNING

"""
import pandas as pd
import scipy
import numpy
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Binarizer

# Load the red-wine quality dataset; the file uses ';' as its field separator.
data = pd.read_csv('datasets/winequality-red.csv', sep=';')

# Work on the underlying NumPy array rather than on the DataFrame itself.
array = data.values

# Every column except the last goes into X; the last column is kept apart as Y.
X, Y = array[:, :-1], array[:, -1]

# Preview the first five rows of X (shown as the cell's output).
X[:5]

array([[  7.40000000e+00,   7.00000000e-01,   0.00000000e+00,
          1.90000000e+00,   7.60000000e-02,   1.10000000e+01,
          3.40000000e+01,   9.97800000e-01,   3.51000000e+00,
          5.60000000e-01,   9.40000000e+00],
       [  7.80000000e+00,   8.80000000e-01,   0.00000000e+00,
          2.60000000e+00,   9.80000000e-02,   2.50000000e+01,
          6.70000000e+01,   9.96800000e-01,   3.20000000e+00,
          6.80000000e-01,   9.80000000e+00],
       [  7.80000000e+00,   7.60000000e-01,   4.00000000e-02,
          2.30000000e+00,   9.20000000e-02,   1.50000000e+01,
          5.40000000e+01,   9.97000000e-01,   3.26000000e+00,
          6.50000000e-01,   9.80000000e+00],
       [  1.12000000e+01,   2.80000000e-01,   5.60000000e-01,
          1.90000000e+00,   7.50000000e-02,   1.70000000e+01,
          6.00000000e+01,   9.98000000e-01,   3.16000000e+00,
          5.80000000e-01,   9.80000000e+00],
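       [  7.40000000e+00,   7.00000000e-01,   0.00000000e+00,
          1.90000000e+00,   7.60000000e-02,   1.10000000e+01,
          3.40000000e+01,   9.97800000e-01,   3.51000000e+00,
          5.60000000e-01,   9.40000000e+00]])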

## Rescale Data

In [2]:
"""
class sklearn.preprocessing.MinMaxScaler(feature_range=(0, 1), copy=True)
    Transforms features by scaling each feature to a given range.
    
    This estimator scales and translates each feature individually such that it is
    in the given range on the training set, i.e. between zero and one.

Parameters:
    - feature_range : tuple (min, max), default=(0, 1)
        Desired range of transformed data.
    - copy : boolean, optional, default True
        Set to False to perform inplace row normalization and avoid a copy 
        (if the input is already a numpy array).
        
------------------------------------
Methods
    - fit(X[, y]) : Compute the minimum and maximum to be used for later scaling.
    - fit_transform(X[, y]) : Fit to data, then transform it.
    - get_params([deep]) : Get parameters for this estimator.
    - inverse_transform(X) : Undo the scaling of X according to feature_range.
    - partial_fit(X[, y]) : Online computation of min and max on X for later scaling.
    - set_params(**params) : Set the parameters of this estimator.
    - transform(X) : Scaling features of X according to feature_range.

"""

# Fit the scaler on X, then map every feature into the [10, 25] interval.
scaler = MinMaxScaler(feature_range=(10, 25)).fit(X)
rescaledX = scaler.transform(X)

# Print arrays with three decimals (a global NumPy setting), then summarize
# the transformed data by showing its first five rows.
numpy.set_printoptions(precision=3)
print(rescaledX[:5])

[[ 13.717  15.959  10.     11.027  11.603  12.113  11.484  18.513  19.094
   12.066  12.308]
 [ 14.248  17.808  10.     11.747  12.154  15.07   13.233  17.412  15.433
   13.144  13.231]
 [ 14.248  16.575  10.6    11.438  12.003  12.958  12.544  17.632  16.142
   12.874  13.231]
 [ 18.761  11.644  18.4    11.027  11.578  13.38   12.862  18.733  14.961
   12.246  13.231]
 [ 13.717  15.959  10.     11.027  11.603  12.113  11.484  18.513  19.094
   12.066  12.308]]


## Standardize Data

In [3]:
"""
    class sklearn.preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)
    
    Standardize features by removing the mean and scaling to unit variance
    
    Centering and scaling happen independently on each feature by computing the relevant 
    statistics on the samples in the training set. Mean and standard deviation are then stored 
    to be used on later data using the transform method.
    
    Standardization of a dataset is a common requirement for many machine learning estimators: 
    they might behave badly if the individual features do not more or less look like standard 
    normally distributed data (e.g. Gaussian with 0 mean and unit variance).
    
    For instance many elements used in the objective function of a learning algorithm (such as the
    RBF kernel of Support Vector Machines or the L1 and L2 regularizers of linear models) assume that
    all features are centered around 0 and have variance in the same order. If a feature has a variance 
    that is orders of magnitude larger than others, it might dominate the objective function and make 
    the estimator unable to learn from other features correctly as expected.
    
    This scaler can also be applied to sparse CSR or CSC matrices by passing with_mean=False 
    to avoid breaking the sparsity structure of the data.

Parameters:
    - copy : boolean, optional, default True
        If False, try to avoid a copy and do inplace scaling instead. This is not guaranteed to 
        always work inplace; e.g. if the data is not a NumPy array or scipy.sparse CSR matrix, a 
        copy may still be returned.
    - with_mean : boolean, True by default
        If True, center the data before scaling. This does not work (and will raise an exception)
        when attempted on sparse matrices, because centering them entails building a dense matrix which 
        in common use cases is likely to be too large to fit in memory.
    - with_std : boolean, True by default
        If True, scale the data to unit variance (or equivalently, unit standard deviation).
"""

# Fit the scaler on X and transform X with it in a single call, then show
# the first five transformed rows.
scaler = StandardScaler()
rescaled_X = scaler.fit_transform(X)
print(rescaled_X[:5])

[[-0.528  0.962 -1.391 -0.453 -0.244 -0.466 -0.379  0.558  1.289 -0.579
  -0.96 ]
 [-0.299  1.967 -1.391  0.043  0.224  0.873  0.624  0.028 -0.72   0.129
  -0.585]
 [-0.299  1.297 -1.186 -0.169  0.096 -0.084  0.229  0.134 -0.331 -0.048
  -0.585]
 [ 1.655 -1.384  1.484 -0.453 -0.265  0.108  0.412  0.664 -0.979 -0.461
  -0.585]
 [-0.528  0.962 -1.391 -0.453 -0.244 -0.466 -0.379  0.558  1.289 -0.579
  -0.96 ]]


## Binarize Data

In [4]:
"""
class sklearn.preprocessing.Binarizer(threshold=0.0, copy=True)
    
    Binarize data (set feature values to 0 or 1) according to a threshold
 
    Values greater than the threshold map to 1, while values less than or equal to the
    threshold map to 0. With the default threshold of 0, only positive values map to 1.

    Binarization is a common operation on text count data where the analyst can decide to only 
    consider the presence or absence of a feature rather than a quantified number of occurrences for instance.
    
    It can also be used as a pre-processing step for estimators that consider boolean random variables 
    (e.g. modelled using the Bernoulli distribution in a Bayesian setting).

Parameters:
    - threshold : float, optional (0.0 by default)
        Feature values below or equal to this are replaced by 0, above it by 1. Threshold 
        may not be less than 0 for operations on sparse matrices.
    - copy : boolean, optional, default True
        Set to False to perform inplace binarization and avoid a copy (if the input is already a 
        numpy array or a scipy.sparse CSR matrix).
"""

# The same 3.4 threshold is applied to every column of X: values above it
# become 1, all others become 0.
binarizer = Binarizer(threshold=3.4)
binarizer.fit(X)
binaryX = binarizer.transform(X)
print(binaryX[:5])

[[ 1.  0.  0.  0.  0.  1.  1.  0.  1.  0.  1.]
 [ 1.  0.  0.  0.  0.  1.  1.  0.  0.  0.  1.]
 [ 1.  0.  0.  0.  0.  1.  1.  0.  0.  0.  1.]
 [ 1.  0.  0.  0.  0.  1.  1.  0.  0.  0.  1.]
 [ 1.  0.  0.  0.  0.  1.  1.  0.  1.  0.  1.]]
