### scikit-learn toy datasets

In [1]:
from sklearn.datasets import load_boston

boston = load_boston()
X = boston.data
Y = boston.target

print(X.shape, Y.shape)

(506, 13) (506,)



    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size = 0.25,
                                                    random_state = 1_000)

In [None]:
from sklearn.utils import check_random_state

rs = check_random_state(1000)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size = 0.25,
                                                    random_state = rs)

### Managing categorical data

In [None]:
import numpy as np

X = np.random.uniform(0.0, 1.0, 
                      size = (10, 2))

Y = np.random.choice(('Male','Female'), 
                     size = (10))

print(X[0], Y[0])

In [None]:
of an instance array called classes_:
from sklearn.preprocessing import LabelEncoder
>>> le = LabelEncoder()
>>> yt = le.fit_transform(Y)
>>> print(yt)
[0 0 0 1 0 1 1
 >>> le.classes_array(['Female', 'Male'], dtype='|S6')

In [None]:
The inverse transformation can be obtained in this simple way:
>>> output = [1, 0, 1, 1, 0, 0]
>>> decoded_output = [le.classes_[i] for i in output]
['Male', 'Female', 'Male', 'Male', 'Female', 'Female']

In [None]:
from sklearn.preprocessing import LabelBinarizer
>>> lb = LabelBinarizer()
>>> Yb = lb.fit_transform(Y)

In [None]:
lb.inverse_transform(Yb)

In [None]:
import numpy as np
>>> Y = lb.fit_transform(Y)
array([[0, 1, 0, 0, 0],
[0, 0, 0, 1, 0],
[1, 0, 0, 0, 0]])
>>> Yp = model.predict(X[0])
array([[0.002, 0.991, 0.001, 0.005, 0.001]])
>>> Ypr = np.round(Yp)
array([[ 0., 1., 0., 0., 0.]])
>>> lb.inverse_transform(Ypr)
array(['Female'], dtype='|S6')

In [None]:
ata = [
{ 'feature_1': 10.0, 'feature_2': 15.0 },
{ 'feature_1': -5.0, 'feature_3': 22.0 },
{ 'feature_3': -2.0, 'feature_4': 10.0 }
]

In [None]:
from sklearn.feature_extraction import DictVectorizer, FeatureHasher
>>> dv = DictVectorizer()
>>> Y_dict = dv.fit_transform(data)
>>> Y_dict.todense()

In [None]:
>>> dv.vocabulary_

In [None]:
>>> fh = FeatureHasher()
>>> Y_hashed = fh.fit_transform(data)

In [None]:
>>> Y_hashed.todense()

In [None]:
from sklearn.preprocessing import OneHotEncoder
>>> data = [
[0, 10],
[1, 11],
[1, 8],
[0, 12],
[0, 15]
]
>>> oh = OneHotEncoder(categorical_features=[0])
>>> Y_oh = oh.fit_transform(data1)

In [None]:
>>> Y_oh.todense()
matrix([[ 1., 0.,

### Managing missing features

In [None]:
from sklearn.preprocessing import Imputer
>>> data = np.array([[1, np.nan, 2], [2, 3, np.nan], [-1, 4, 2]])
>>> imp = Imputer(strategy='mean')
>>> imp.fit_transform(data)

In [None]:
>>> imp = Imputer(strategy='median')
>>> imp.fit_transform(data)

In [None]:
>>> imp = Imputer(strategy='most_frequent')
>>> imp.fit_transform(data)

### Data scaling and normalization

In [None]:
from sklearn.preprocessing import StandardScaler
>>> ss = StandardScaler()
>>> scaled_data = ss.fit_transform(data)

In [None]:
Here are some examples with different quantiles:
from sklearn.preprocessing import RubustScaler
>>> rb1 = RobustScaler(quantile_range=(15, 85))
>>> scaled_data1 = rb1.fit_transform(data)

In [None]:
>>> rb1 = RobustScaler(quantile_range=(25, 75))
>>> scaled_data1 = rb1.fit_transform(data)

In [None]:
>>> rb2 = RobustScaler(quantile_range=(30, 60))
>>> scaled_data2 = rb2.fit_transform(data)

In [None]:
An example of every normalization is shown next:
from sklearn.preprocessing import Normalizer
>>> data = np.array([1.0, 2.0])

In [None]:
>>> n_max = Normalizer(norm='max')
>>> n_max.fit_transform(data.reshape(1, -1))

In [None]:
>> n_l1 = Normalizer(norm='l1')
>>> n_l1.fit_transform(data.reshape(1, -1))

In [None]:
>>> n_l2 = Normalizer(norm='l2')
>>> n_l2.fit_transform(data.reshape(1, -1))

### Feature selection and filtering

In [None]:
from sklearn.feature_selection import VarianceThreshold
>>> X[0:3, :]

In [None]:
>>> vt = VarianceThreshold(threshold=1.5)
>>> X_t = vt.fit_transform(X)
>>> X_t[0:3, :]

In [None]:
from sklearn.datasets import load_boston, load_iris
from sklearn.feature_selection import SelectKBest, SelectPercentile, chi2, f_regression

In [None]:
>>> regr_data = load_boston()
>>> regr_data.data.shape

In [None]:
>>> kb_regr = SelectKBest(f_regression)
>>> X_b = kb_regr.fit_transform(regr_data.data, regr_data.target)

In [None]:

>>> X_b.shape

In [None]:
>>> kb_regr.scores_

In [None]:
>>> class_data = load_iris()
>>> class_data.data.shape

In [None]:
>>> perc_class = SelectPercentile(chi2, percentile=15)
>>> X_p = perc_class.fit_transform(class_data.data, class_data.target)

In [None]:
>>> X_p.shape

In [None]:
>>> perc_class.scores_

### Principal component analysis

In [None]:
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
>>> digits = load_digits()

In [None]:
>>> pca = PCA(n_components=36, whiten=True)
>>> X_pca = pca.fit_transform(digits.data / 255)

In [None]:
>>> pca.explained_variance_ratio_
array([ 0.14890594, 0.13618771,

In [None]:
X_rebuilt = pca.inverse_transform(X_pca)

### Non-negative matrix factorization

In [None]:

from sklearn.datasets import load_iris
from sklearn.decomposition import NMF
>>> iris = load_iris()
>>> iris.data.shape

In [None]:
>>> nmf = NMF(n_components=3, init='random', l1_ratio=0.1)
>>> Xt = nmf.fit_transform(iris.data)

In [None]:
>>> nmf.reconstruction_err_

In [None]:
>>> iris.data[0]

In [None]:
>>> Xt[0]

In [None]:
>>> nmf.inverse_transform(Xt[0])

### Sparse PCA

In [None]:
from sklearn.decomposition import SparsePCA
>>> spca = SparsePCA(n_components=60, alpha=0.1)
>>> X_spca = spca.fit_transform(digits.data / 255)

In [None]:
>>> spca.components_.shape

### Kernel PCA

In [None]:
Let's consider a dataset made up of a circle with a blob inside:
from sklearn.datasets import make_circles
>>> Xb, Yb = make_circles(n_samples=500, factor=0.1, noise=0.05)

In [None]:
from sklearn.decomposition import KernelPCA
>>> kpca = KernelPCA(n_components=2, kernel='rbf',
fit_inverse_transform=True, gamma=1.0)
>>> X_kpca = kpca.fit_transform(Xb)

### Atom extraction and dictionary learning

In [None]:
from sklearn.decomposition import DictionaryLearning
>>> dl = DictionaryLearning(n_components=36, fit_algorithm='lars',
transform_algorithm='lasso_lars')
>>> X_dict = dl.fit_transform(digits.data)

### End.