# Classifying Iris Species

#### Meet the Data

In [1]:
from sklearn.datasets import load_iris
iris_dataset = load_iris() # Returns a Bunch object, which is similar to a dictionary with keys and values

In [2]:
# List the fields available on the loaded dataset object
available_keys = iris_dataset.keys()
print("Keys of iris_dataset:\n{}".format(available_keys))

Keys of iris_dataset:
dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])


In [3]:
# DESCR holds a text description of the dataset; show only its first
# 225 characters, followed by an ellipsis line
description_excerpt = iris_dataset.DESCR[:225]
print(description_excerpt + "\n...")

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
...


In [4]:
# target_names lists the species names, i.e. the classes we want to predict
print("Target Names: {}".format(iris_dataset.target_names))

Target Names: ['setosa' 'versicolor' 'virginica']


In [5]:
# feature_names is a list of strings, each describing one feature
print("Feature Names:\n{}".format(iris_dataset.feature_names))

Feature Names:
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [6]:
# The dataset's contents live in two fields: data and target.
# data stores the numeric measurements (sepal length, sepal width,
# petal length and petal width) as a NumPy array
print("Type of Data: {}".format(type(iris_dataset.data)))

Type of Data: <class 'numpy.ndarray'>


In [7]:
# Shape of the data array as (rows, columns)
print("Shape of data: {}".format(iris_dataset.data.shape))

Shape of data: (150, 4)


In [8]:
# Print the first five rows (samples); the columns are, in order:
# ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
print("First five rows of data:\n{}".format(iris_dataset.data[:5]))

First five rows of data:
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]


In [9]:
# target holds the species of each measured flower, also as a NumPy array.
# Species are encoded as integers in the range 0 to 2:
#   0 -> 'setosa', 1 -> 'versicolor', 2 -> 'virginica'
print("Type of target: {}".format(type(iris_dataset.target)))
print("Shape of target: {}".format(iris_dataset.target.shape))
print("Target:\n{}".format(iris_dataset.target))

Type of target: <class 'numpy.ndarray'>
Shape of target: (150,)
Target:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


#### Measuring Success: Training and Testing Data

In [10]:
# Randomly sample the rows into a training set and a test set.
# With no explicit size given, the default is 75% train / 25% test.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    iris_dataset.data,
    iris_dataset.target,
    random_state=42,
)

In [11]:
# Report the shapes of the training and test splits, one per line
print(
    "Shape of X_train: {}".format(X_train.shape),
    "Shape of y_train: {}".format(y_train.shape),
    "Shape of X_test: {}".format(X_test.shape),
    "Shape of y_test: {}".format(y_test.shape),
    sep="\n",
)

Shape of X_train: (112, 4)
Shape of y_train: (112,)
Shape of X_test: (38, 4)
Shape of y_test: (38,)


#### First Things First: Look at Your Data