# Demonstration of sklearn dataset api
## Loaders
### Loading the iris dataset


In [1]:
from sklearn.datasets import load_iris
# Load the iris dataset; the result is a dictionary-like Bunch object
# whose fields (data, target, feature_names, ...) are inspected below.
data = load_iris()

* It gives you a **Bunch** object (fancy dictionary), which contains several fields like this
```python
{
  'data': [[...], [...], ...],       # Features (numerical, 4 per flower), a numpy array
  'target': [0, 1, 2, ...],          # Labels (what species each flower is), a numpy array
  'target_names': ['setosa', ...],  # Names of the classes
  'feature_names': ['sepal length (cm)', ...],
  'DESCR': 'Massive description text',
  'frame': None                      # A pandas.DataFrame only when loaded with as_frame=True
}
```

In [None]:
# Show the container type returned by load_iris()
type(data)

In [None]:
# Human-readable names of the four feature columns
data.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [None]:
# Names of the classes; the integer labels in data.target index into this array
data.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [None]:
# Integer-encoded class label (0, 1 or 2) for every sample
data.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [None]:
# The feature values are stored as a numpy array
type(data.data)

numpy.ndarray

In [None]:
data.data.shape # (n_samples, n_features): 150 data points, 4 features each

(150, 4)

In [None]:
# Peek at the first five rows of the feature matrix instead of printing all 150
data.data[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [None]:
# With return_X_y=True the loader returns the (feature matrix, label vector)
# pair directly, which is unpacked here instead of going through a Bunch.
feature_matrix, label_vector = load_iris(return_X_y=True)
print("shape of feature matrix: ", feature_matrix.shape)
print("shape of label vector: ", label_vector.shape)

shape of feature matrix:  (150, 4)
shape of label vector:  (150,)


### Loading the diabetes dataset

In [None]:
from sklearn.datasets import load_diabetes
# NOTE: this rebinds `data`, replacing the iris Bunch loaded earlier in the notebook.
data = load_diabetes()

In [None]:
data.data.shape # (n_samples, n_features): 442 samples with 10 features

(442, 10)

## Fetchers

In [None]:
from sklearn.datasets import fetch_california_housing
# Fetch the California housing dataset and keep the returned object for inspection
housing_data = fetch_california_housing()

In [None]:
# (n_samples, n_features) of the housing feature matrix
housing_data.data.shape

(20640, 8)

### `fetch_openml`:

openml.org is a public repository for machine learning data and experiments that allows everybody to upload datasets.

In [None]:
from sklearn.datasets import fetch_openml

In [None]:
# "mnist_784" is the name of the dataset on OpenML; version=1 selects which
# published version of that dataset to fetch.
feature_matrix, label_vector = fetch_openml(
    "mnist_784",
    version=1,
    return_X_y=True,
)
print("shape of feature_matrix: ", feature_matrix.shape)
print("shape of label_vector: ", label_vector.shape)

shape of feature_matrix:  (70000, 784)
shape of label_vector:  (70000,)


## Generators:
### `make_regression`

In [None]:
from sklearn.datasets import make_regression


Let's generate 100 samples with 5 features for a single-target regression problem.

In [None]:
# Single-target regression: y is a 1-D vector with one value per sample.
X, y = make_regression(
    n_samples=100,
    n_features=5,
    n_targets=1,
    shuffle=True,
    random_state=42,  # fixed seed so the generated data is repeatable
)
print(X.shape, y.shape, sep="\n")

(100, 5)
(100,)


Let's generate 100 samples with 5 features for a multi-output regression problem with 5 targets.

In [None]:
# Multi-output regression: y is 2-D, one column per target.
X, y = make_regression(
    n_samples=100,
    n_features=5,
    n_targets=5,
    shuffle=True,
    random_state=42,  # fixed seed so the generated data is repeatable
)
print(X.shape, y.shape, sep="\n")

(100, 5)
(100, 5)


### `make_classification`
* Generates a random n-class classification problem.

In [2]:
from sklearn.datasets import make_classification

# Binary problem: 100 samples, 10 features, a single cluster per class.
X, y = make_classification(
    n_samples=100,
    n_features=10,
    n_classes=2,
    n_clusters_per_class=1,
    random_state=42,  # fixed seed so the generated data is repeatable
)
print("shape of X: ", X.shape)
print("shape of y: ", y.shape)


shape of X:  (100, 10)
shape of y:  (100,)


In [4]:
# First five generated feature rows
X[:5]

array([[ 0.11422765, -1.71016839, -0.06822216, -0.14928517,  0.30780177,
         0.15030176, -0.05694562, -0.22595246, -0.36361221, -0.13818757],
       [ 0.70775194, -1.57022472, -0.23503183, -0.63604713,  0.62180996,
        -0.56246678,  0.97255445, -0.77719676,  0.63240774, -0.47809669],
       [ 0.63859246,  0.04739867,  0.33273433,  1.1046981 , -0.65183611,
        -1.66152006, -1.2110162 ,  1.09821151, -0.0660798 ,  0.68024225],
       [-0.23894805, -0.97755524,  0.0379061 ,  0.19896733,  0.50091719,
        -0.90756366,  0.75539123,  0.12437227, -0.57677133,  0.07871283],
       [-0.59239392, -0.05023811,  0.17573204, -1.43949185,  0.27045683,
        -0.86399077, -0.83095012,  0.60046915,  0.04852163,  0.32557953]])

In [6]:
# Class labels of the first five samples
y[:5]

array([1, 1, 1, 1, 0])

Let's create a classification problem with 3 classes.

In [7]:
# Three-class problem: same setup as the binary example, but with n_classes=3.
X, y = make_classification(
    n_samples=100,
    n_features=10,
    n_classes=3,
    n_clusters_per_class=1,
    random_state=42,  # fixed seed so the generated data is repeatable
)
print("shape of X: ", X.shape)
print("shape of y: ", y.shape)

shape of X:  (100, 10)
shape of y:  (100,)


In [9]:
# First five generated feature rows of the three-class problem
X[:5]

array([[-0.58351628, -1.73833907, -1.37298251, -1.77311485,  0.45918008,
         0.83392215, -1.66096093,  0.20768769, -0.07016571,  0.42961822],
       [-1.0044394 , -1.43862044,  0.47335819, -0.21188291,  0.0125924 ,
         0.22409248, -0.77300978,  0.49799829,  0.0976761 ,  0.02451017],
       [ 0.07740833,  0.19896733,  0.12437227,  0.17738132, -0.97755524,
         0.50091719,  0.75138712,  0.54336019,  0.09933231, -1.66940528],
       [-0.91759569, -0.9609536 ,  1.07746664,  0.4522739 , -0.32138584,
        -0.8254972 , -0.56372455,  0.24368721,  0.41293145, -0.8222204 ],
       [-0.96222828, -0.96090774,  1.21530116,  0.55980482, -1.24778318,
        -0.25256815, -1.43014138,  0.13074058,  1.6324113 , -0.44004449]])

In [11]:
# Class labels of the first five samples
y[:5]

array([2, 0, 1, 0, 0])

### `make_multilabel_classification`
Let's create a multilabel classification problem with 100 samples, 20 features, 5 labels, and on average 2 labels per example.

In [16]:
from sklearn.datasets import make_multilabel_classification

# n_classes is the number of distinct labels and n_labels the average number
# of labels per example; y has one indicator column per class.
# random_state is fixed (as in the other generator examples) so that
# re-running the notebook regenerates exactly the same data.
X, y = make_multilabel_classification(
    n_samples=100,
    n_features=20,
    n_classes=5,
    n_labels=2,
    random_state=42,
)

print("shape of X: ", X.shape)
print("shape of y: ", y.shape)

shape of X:  (100, 20)
shape of y:  (100, 5)


In [17]:
# First five generated feature rows
X[:5]

array([[1., 0., 2., 3., 3., 2., 4., 2., 0., 4., 0., 1., 1., 4., 1., 1.,
        3., 4., 6., 0.],
       [1., 1., 1., 2., 5., 3., 2., 3., 4., 1., 0., 1., 3., 3., 3., 4.,
        1., 2., 1., 0.],
       [3., 0., 6., 7., 3., 0., 4., 0., 2., 1., 4., 0., 0., 3., 4., 1.,
        4., 1., 0., 3.],
       [1., 0., 1., 1., 1., 3., 4., 3., 1., 4., 0., 1., 3., 3., 3., 3.,
        1., 2., 5., 1.],
       [2., 3., 2., 4., 3., 0., 3., 2., 0., 7., 3., 0., 1., 1., 3., 5.,
        2., 3., 4., 3.]])

In [18]:
# First five label rows; each row is a 0/1 indicator vector with one column per class
y[:5]

array([[0, 0, 0, 0, 1],
       [1, 0, 0, 1, 1],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1],
       [1, 0, 0, 0, 1]])

### `make_blobs`
make_blobs helps us generate data for clusters.

In [20]:
from sklearn.datasets import make_blobs

# Ten 2-D points spread over three cluster centers; y holds each point's cluster index.
X, y = make_blobs(
    n_samples=10,
    n_features=2,
    centers=3,
    random_state=42,  # fixed seed so the generated data is repeatable
)
print("shape of X: ", X.shape)
print("shape of y: ", y.shape)

shape of X:  (10, 2)
shape of y:  (10,)


In [21]:
# Cluster index (0, 1 or 2) assigned to each of the 10 generated samples
y

array([2, 2, 1, 2, 0, 0, 0, 1, 1, 0])