# sklearn API Demonstration

## Recall that it has three APIs

      1. Loaders (load_*) - load small standard datasets bundled with sklearn
      2. Fetchers (fetch_*) - fetch large datasets from the internet and load them in memory
      3. Generators (make_*) - generate controlled synthetic datasets

    Loaders and fetchers return a `Bunch` object, and generators return a tuple of feature matrix and label vector

# Loaders

## Loading iris dataset

In [1]:
from sklearn.datasets import load_iris
data = load_iris()

1. This returns a `Bunch` object, which is a dictionary-like object with the following attributes:
  1. `data`, which has the feature matrix
  2. `target`, which is the label vector
  3. `feature_names` contains the names of the features
  4. `target_names` contains the names of the classes
  5. `DESCR` has the full description of the dataset
  6. `filename` has the path to the location of the data

In [2]:
type(data)

sklearn.utils.Bunch

In [3]:
# Examine the dataset:

data.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [5]:
# Examine the labels

data.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [12]:
data.data.shape

(150, 4)

In [8]:
# examine the feature matrix

data.data[:5] # show the first 5 examples (rows of the feature matrix)

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [10]:
data.data # loads all the examples

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [11]:
data.target # examine the labels- total 150 labels

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [13]:
?load_iris

In [15]:
# Loaders in general (load_iris included) can return the feature matrix and
# label vector directly when called with return_X_y=True.

feature_matrix, label_vector = load_iris(return_X_y=True)
print('feature_matrix   ', feature_matrix.shape)
print('label_vector   ', label_vector.shape)

feature_matrix    (150, 4)
label_vector    (150,)


## Loading Diabetes dataset:

In [16]:
from sklearn.datasets import load_diabetes
data = load_diabetes()

In [17]:
?load_diabetes

In [32]:
# list the names of the feature columns held in the diabetes Bunch

data.feature_names

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [34]:
data.target[:5] # displays the first 5 targets, which are real-valued (floats), not integer class labels

array([151.,  75., 141., 206., 135.])

In [37]:
data.data[:5] # displays the first 5 examples (rows of the feature matrix)

array([[ 0.03807591,  0.05068012,  0.06169621,  0.02187235, -0.0442235 ,
        -0.03482076, -0.04340085, -0.00259226,  0.01990842, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, -0.02632783, -0.00844872,
        -0.01916334,  0.07441156, -0.03949338, -0.06832974, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, -0.00567061, -0.04559945,
        -0.03419447, -0.03235593, -0.00259226,  0.00286377, -0.02593034],
       [-0.08906294, -0.04464164, -0.01159501, -0.03665645,  0.01219057,
         0.02499059, -0.03603757,  0.03430886,  0.02269202, -0.00936191],
       [ 0.00538306, -0.04464164, -0.03638469,  0.02187235,  0.00393485,
         0.01559614,  0.00814208, -0.00259226, -0.03199144, -0.04664087]])

In [22]:
data.data.shape # 442 examples and 10 features

(442, 10)

In [25]:
data.DESCR

'.. _diabetes_dataset:\n\nDiabetes dataset\n----------------\n\nTen baseline variables, age, sex, body mass index, average blood\npressure, and six blood serum measurements were obtained for each of n =\n442 diabetes patients, as well as the response of interest, a\nquantitative measure of disease progression one year after baseline.\n\n**Data Set Characteristics:**\n\n  :Number of Instances: 442\n\n  :Number of Attributes: First 10 columns are numeric predictive values\n\n  :Target: Column 11 is a quantitative measure of disease progression one year after baseline\n\n  :Attribute Information:\n      - age     age in years\n      - sex\n      - bmi     body mass index\n      - bp      average blood pressure\n      - s1      tc, total serum cholesterol\n      - s2      ldl, low-density lipoproteins\n      - s3      hdl, high-density lipoproteins\n      - s4      tch, total cholesterol / HDL\n      - s5      ltg, possibly log of serum triglycerides level\n      - s6      glu, blood sugar

## Digits dataset

In [38]:
from sklearn.datasets import load_digits
?load_digits

In [39]:
data = load_digits()

In [40]:
# check the shape of dataset
data.data.shape

(1797, 64)

In [41]:
data.feature_names # one name per pixel, in the form pixel_<row>_<col> (64 names in total)

['pixel_0_0',
 'pixel_0_1',
 'pixel_0_2',
 'pixel_0_3',
 'pixel_0_4',
 'pixel_0_5',
 'pixel_0_6',
 'pixel_0_7',
 'pixel_1_0',
 'pixel_1_1',
 'pixel_1_2',
 'pixel_1_3',
 'pixel_1_4',
 'pixel_1_5',
 'pixel_1_6',
 'pixel_1_7',
 'pixel_2_0',
 'pixel_2_1',
 'pixel_2_2',
 'pixel_2_3',
 'pixel_2_4',
 'pixel_2_5',
 'pixel_2_6',
 'pixel_2_7',
 'pixel_3_0',
 'pixel_3_1',
 'pixel_3_2',
 'pixel_3_3',
 'pixel_3_4',
 'pixel_3_5',
 'pixel_3_6',
 'pixel_3_7',
 'pixel_4_0',
 'pixel_4_1',
 'pixel_4_2',
 'pixel_4_3',
 'pixel_4_4',
 'pixel_4_5',
 'pixel_4_6',
 'pixel_4_7',
 'pixel_5_0',
 'pixel_5_1',
 'pixel_5_2',
 'pixel_5_3',
 'pixel_5_4',
 'pixel_5_5',
 'pixel_5_6',
 'pixel_5_7',
 'pixel_6_0',
 'pixel_6_1',
 'pixel_6_2',
 'pixel_6_3',
 'pixel_6_4',
 'pixel_6_5',
 'pixel_6_6',
 'pixel_6_7',
 'pixel_7_0',
 'pixel_7_1',
 'pixel_7_2',
 'pixel_7_3',
 'pixel_7_4',
 'pixel_7_5',
 'pixel_7_6',
 'pixel_7_7']

In [42]:
data.data[:5] # loading first 5 examples

array([[ 0.,  0.,  5., 13.,  9.,  1.,  0.,  0.,  0.,  0., 13., 15., 10.,
        15.,  5.,  0.,  0.,  3., 15.,  2.,  0., 11.,  8.,  0.,  0.,  4.,
        12.,  0.,  0.,  8.,  8.,  0.,  0.,  5.,  8.,  0.,  0.,  9.,  8.,
         0.,  0.,  4., 11.,  0.,  1., 12.,  7.,  0.,  0.,  2., 14.,  5.,
        10., 12.,  0.,  0.,  0.,  0.,  6., 13., 10.,  0.,  0.,  0.],
       [ 0.,  0.,  0., 12., 13.,  5.,  0.,  0.,  0.,  0.,  0., 11., 16.,
         9.,  0.,  0.,  0.,  0.,  3., 15., 16.,  6.,  0.,  0.,  0.,  7.,
        15., 16., 16.,  2.,  0.,  0.,  0.,  0.,  1., 16., 16.,  3.,  0.,
         0.,  0.,  0.,  1., 16., 16.,  6.,  0.,  0.,  0.,  0.,  1., 16.,
        16.,  6.,  0.,  0.,  0.,  0.,  0., 11., 16., 10.,  0.,  0.],
       [ 0.,  0.,  0.,  4., 15., 12.,  0.,  0.,  0.,  0.,  3., 16., 15.,
        14.,  0.,  0.,  0.,  0.,  8., 13.,  8., 16.,  0.,  0.,  0.,  0.,
         1.,  6., 15., 11.,  0.,  0.,  0.,  1.,  8., 13., 15.,  1.,  0.,
         0.,  0.,  9., 16., 16.,  5.,  0.,  0.,  0.,  0.,  

In [44]:
data.target_names # examining the labels

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [46]:
data.target # all the target labels 

array([0, 1, 2, ..., 8, 9, 8])

In [47]:
# first 10

data.target[:10]

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

## Wine dataset

In [50]:
from sklearn.datasets import load_wine
data = load_wine()

In [52]:
?load_wine

In [53]:
# check the shape

data.data.shape

(178, 13)

In [55]:
# Examine the features

data.feature_names

['alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline']

In [56]:
# check the first few entries

data.data[:6]

array([[1.423e+01, 1.710e+00, 2.430e+00, 1.560e+01, 1.270e+02, 2.800e+00,
        3.060e+00, 2.800e-01, 2.290e+00, 5.640e+00, 1.040e+00, 3.920e+00,
        1.065e+03],
       [1.320e+01, 1.780e+00, 2.140e+00, 1.120e+01, 1.000e+02, 2.650e+00,
        2.760e+00, 2.600e-01, 1.280e+00, 4.380e+00, 1.050e+00, 3.400e+00,
        1.050e+03],
       [1.316e+01, 2.360e+00, 2.670e+00, 1.860e+01, 1.010e+02, 2.800e+00,
        3.240e+00, 3.000e-01, 2.810e+00, 5.680e+00, 1.030e+00, 3.170e+00,
        1.185e+03],
       [1.437e+01, 1.950e+00, 2.500e+00, 1.680e+01, 1.130e+02, 3.850e+00,
        3.490e+00, 2.400e-01, 2.180e+00, 7.800e+00, 8.600e-01, 3.450e+00,
        1.480e+03],
       [1.324e+01, 2.590e+00, 2.870e+00, 2.100e+01, 1.180e+02, 2.800e+00,
        2.690e+00, 3.900e-01, 1.820e+00, 4.320e+00, 1.040e+00, 2.930e+00,
        7.350e+02],
       [1.420e+01, 1.760e+00, 2.450e+00, 1.520e+01, 1.120e+02, 3.270e+00,
        3.390e+00, 3.400e-01, 1.970e+00, 6.750e+00, 1.050e+00, 2.850e+00,
        1.45

In [57]:
# examining the target classes

data.target_names

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

In [58]:
# show the label vector - 178 labels in total
data.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [59]:
data.target.shape

(178,)

In [61]:
# other datasets to examine

from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_linnerud

# Fetchers

## fetch_california_housing

In [66]:
# import the library and access the documentation

from sklearn.datasets import fetch_california_housing
?fetch_california_housing


In [67]:
housing_data = fetch_california_housing()

In [68]:
# note that the fetch_* functions also return a Bunch object
type(housing_data)

sklearn.utils.Bunch

In [69]:
# Examine the bunch object:

housing_data.DESCR

'.. _california_housing_dataset:\n\nCalifornia Housing dataset\n--------------------------\n\n**Data Set Characteristics:**\n\n    :Number of Instances: 20640\n\n    :Number of Attributes: 8 numeric, predictive attributes and the target\n\n    :Attribute Information:\n        - MedInc        median income in block group\n        - HouseAge      median house age in block group\n        - AveRooms      average number of rooms per household\n        - AveBedrms     average number of bedrooms per household\n        - Population    block group population\n        - AveOccup      average number of household members\n        - Latitude      block group latitude\n        - Longitude     block group longitude\n\n    :Missing Attribute Values: None\n\nThis dataset was obtained from the StatLib repository.\nhttps://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html\n\nThe target variable is the median house value for California districts,\nexpressed in hundreds of thousands of dollars ($100,000

In [70]:
# find out the shape of feature matrix

housing_data.data.shape

(20640, 8)

In [71]:
# list of features

housing_data.feature_names

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [72]:
# look at the first five examples

housing_data.data[:5]

array([[ 8.32520000e+00,  4.10000000e+01,  6.98412698e+00,
         1.02380952e+00,  3.22000000e+02,  2.55555556e+00,
         3.78800000e+01, -1.22230000e+02],
       [ 8.30140000e+00,  2.10000000e+01,  6.23813708e+00,
         9.71880492e-01,  2.40100000e+03,  2.10984183e+00,
         3.78600000e+01, -1.22220000e+02],
       [ 7.25740000e+00,  5.20000000e+01,  8.28813559e+00,
         1.07344633e+00,  4.96000000e+02,  2.80225989e+00,
         3.78500000e+01, -1.22240000e+02],
       [ 5.64310000e+00,  5.20000000e+01,  5.81735160e+00,
         1.07305936e+00,  5.58000000e+02,  2.54794521e+00,
         3.78500000e+01, -1.22250000e+02],
       [ 3.84620000e+00,  5.20000000e+01,  6.28185328e+00,
         1.08108108e+00,  5.65000000e+02,  2.18146718e+00,
         3.78500000e+01, -1.22250000e+02]])

In [74]:
# look at the target attribute- single label
housing_data.target_names

['MedHouseVal']

In [76]:
housing_data.target[:5]

array([4.526, 3.585, 3.521, 3.413, 3.422])

In [77]:
housing_data.target.shape

(20640,)

## fetch_openml

In [80]:
# openml.org is a public repository for ML data and experiments that allows everybody to upload open datasets

from sklearn.datasets import fetch_openml
?fetch_openml

In [81]:
# Fetch version 1 of the MNIST dataset ('mnist_784') from OpenML,
# asking for the (feature matrix, label vector) pair directly.

X, y = fetch_openml(
    'mnist_784',
    version=1,
    return_X_y=True,
)
print("feture_matrix  shape", X.shape)
print("target_matrix  shape", y.shape)

feture_matrix  shape (70000, 784)
target_matrix  shape (70000,)


## fetch_20newsgroups

In [83]:
from sklearn.datasets import fetch_20newsgroups
?fetch_20newsgroups

In [89]:
fetched = fetch_20newsgroups(subset = 'train')

In [None]:
# The real data lies in the filenames and target attributes. The target attribute is the integer index of the category:

In [91]:
# check the target classes
fetched.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [94]:
fetched.filenames.shape

(11314,)

In [95]:
fetched.target.shape

(11314,)

In [96]:
fetched.target[:5]

array([ 7,  4,  4,  1, 14])

In [97]:
fetched.filenames[:5]

array(['/root/scikit_learn_data/20news_home/20news-bydate-train/rec.autos/102994',
       '/root/scikit_learn_data/20news_home/20news-bydate-train/comp.sys.mac.hardware/51861',
       '/root/scikit_learn_data/20news_home/20news-bydate-train/comp.sys.mac.hardware/51879',
       '/root/scikit_learn_data/20news_home/20news-bydate-train/comp.graphics/38242',
       '/root/scikit_learn_data/20news_home/20news-bydate-train/sci.space/60880'],
      dtype='<U86')

## fetch_kddcup99

In [98]:
from sklearn.datasets import fetch_kddcup99
?fetch_kddcup99

In [100]:
X, y = fetch_kddcup99(return_X_y = True)

In [101]:
print("feture_matrix  shape", X.shape)
print("target_matrix  shape", y.shape)

feture_matrix  shape (494021, 41)
target_matrix  shape (494021,)


In [103]:
kdd = fetch_kddcup99()

In [104]:
kdd.feature_names

['duration',
 'protocol_type',
 'service',
 'flag',
 'src_bytes',
 'dst_bytes',
 'land',
 'wrong_fragment',
 'urgent',
 'hot',
 'num_failed_logins',
 'logged_in',
 'num_compromised',
 'root_shell',
 'su_attempted',
 'num_root',
 'num_file_creations',
 'num_shells',
 'num_access_files',
 'num_outbound_cmds',
 'is_host_login',
 'is_guest_login',
 'count',
 'srv_count',
 'serror_rate',
 'srv_serror_rate',
 'rerror_rate',
 'srv_rerror_rate',
 'same_srv_rate',
 'diff_srv_rate',
 'srv_diff_host_rate',
 'dst_host_count',
 'dst_host_srv_count',
 'dst_host_same_srv_rate',
 'dst_host_diff_srv_rate',
 'dst_host_same_src_port_rate',
 'dst_host_srv_diff_host_rate',
 'dst_host_serror_rate',
 'dst_host_srv_serror_rate',
 'dst_host_rerror_rate',
 'dst_host_srv_rerror_rate']

In [106]:
kdd.target_names

['labels']

In [108]:
kdd.target[:6]

array([b'normal.', b'normal.', b'normal.', b'normal.', b'normal.',
       b'normal.'], dtype=object)

# Generators

## make_regression

In [111]:
from sklearn.datasets import make_regression
?make_regression

In [112]:
# Let's generate 100 samples with 5 features for a single-target regression problem

X, y = make_regression(
    n_samples=100,
    n_features=5,
    n_targets=1,
    shuffle=True,
    random_state=42,
)

In [113]:
# examine

X.shape

(100, 5)

In [114]:
y.shape

(100,)

In [116]:
# Example 2: a regression problem with 100 samples, 6 features and 5 outputs

X, y = make_regression(
    n_samples=100,
    n_features=6,
    n_targets=5,
    shuffle=True,
    random_state=42,
)

In [117]:
X.shape

(100, 6)

In [118]:
y.shape

(100, 5)

## make_classification

In [119]:
from sklearn.datasets import make_classification
?make_classification

In [120]:
# Binary classification dataset: 100 samples, 10 features, 2 classes,
# with one cluster per class

X, y = make_classification(
    n_samples=100,
    n_features=10,
    n_classes=2,
    n_clusters_per_class=1,
    random_state=42,
)

In [121]:
X.shape

(100, 10)

In [122]:
y.shape

(100,)

In [123]:
X[:5]

array([[ 0.11422765, -1.71016839, -0.06822216, -0.14928517,  0.30780177,
         0.15030176, -0.05694562, -0.22595246, -0.36361221, -0.13818757],
       [ 0.70775194, -1.57022472, -0.23503183, -0.63604713,  0.62180996,
        -0.56246678,  0.97255445, -0.77719676,  0.63240774, -0.47809669],
       [ 0.63859246,  0.04739867,  0.33273433,  1.1046981 , -0.65183611,
        -1.66152006, -1.2110162 ,  1.09821151, -0.0660798 ,  0.68024225],
       [-0.23894805, -0.97755524,  0.0379061 ,  0.19896733,  0.50091719,
        -0.90756366,  0.75539123,  0.12437227, -0.57677133,  0.07871283],
       [-0.59239392, -0.05023811,  0.17573204, -1.43949185,  0.27045683,
        -0.86399077, -0.83095012,  0.60046915,  0.04852163,  0.32557953]])

In [124]:
y[:5]

array([1, 1, 1, 1, 0])

## make_multilabel_classification

In [125]:
from sklearn.datasets import make_multilabel_classification
?make_multilabel_classification

In [127]:
# Let's generate a multilabel classification problem with 100 samples, 10 features,
# 5 possible labels, and on average 2 labels per example.
# random_state is fixed (as in the other generator cells) so that re-running the
# notebook reproduces the same X and y; outputs recorded from an earlier unseeded
# run will not match a fresh run.

X, y = make_multilabel_classification(n_samples = 100,
                                      n_features = 10,
                                      n_classes = 5,
                                      n_labels = 2,
                                      random_state = 42)

In [128]:
X.shape

(100, 10)

In [129]:
y.shape

(100, 5)

In [130]:
X[:5]

array([[ 7.,  6.,  3.,  3.,  7.,  6.,  6.,  4.,  8.,  2.],
       [ 4.,  1.,  2.,  3.,  4.,  4.,  4.,  5.,  4.,  8.],
       [11.,  5.,  3.,  4.,  2.,  5.,  0.,  4.,  5.,  3.],
       [ 7.,  1.,  0.,  1.,  7.,  7.,  2.,  3.,  9.,  7.],
       [ 5.,  2.,  3.,  5.,  2., 10.,  8.,  1.,  2.,  8.]])

In [131]:
y[:5]

array([[0, 0, 1, 0, 0],
       [0, 1, 1, 0, 1],
       [0, 1, 1, 1, 0],
       [0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1]])

## make_blobs

In [133]:
from sklearn.datasets import make_blobs
?make_blobs

In [135]:
# make_blobs lets us generate random data for clustering.
# Here: a data set of 10 samples with 2 features each, spread over 3 centers.

X, y = make_blobs(
    n_samples=10,
    n_features=2,
    centers=3,
    random_state=42,
)

In [136]:
X.shape

(10, 2)

In [137]:
y.shape

(10,)

In [141]:
X

array([[-5.41397842, -7.10588589],
       [-7.42400992, -6.769187  ],
       [ 3.62704772,  2.28741702],
       [-6.81209899, -8.30485778],
       [-2.26723535,  7.10100588],
       [-2.97867201,  9.55684617],
       [-0.92998481,  9.78172086],
       [ 2.914961  ,  1.41088215],
       [ 3.73185476,  0.56086598],
       [-2.97261532,  8.54855637]])

In [139]:
y[:5]

array([2, 2, 1, 2, 0])

In [140]:
y

array([2, 2, 1, 2, 0, 0, 0, 1, 1, 0])