# Iris dataset data analysis

**Author:** Alex Carneiro

**Course:** Moving2DS - Part 1

In [None]:
# importing libraries

import pandas as pd
import numpy as np

from matplotlib.lines import Line2D
import matplotlib.pyplot as plt
%matplotlib inline

## Data reading and checking

In [None]:
# Load the Iris data file. header=None makes pandas treat the first line as
# data rather than as column names, so the names are assigned explicitly.
column_names = ["sepal_l", "sepal_w", "petal_l", "petal_w", "class"]
dataset = pd.read_csv("../data/iris.data", header=None)
dataset.columns = column_names

# Report the number of rows read, then display five randomly sampled rows.
n_samples = len(dataset)
print("Read %d samples from the dataset" % n_samples)
dataset.sample(5)

In [None]:
# Print pandas' summary statistics for the 'class' column.
class_summary = dataset['class'].describe()
print(class_summary)

In [None]:
# For every distinct class label, count the rows that carry it.
class_column = dataset['class']
for label in class_column.unique():
    n_rows = (class_column == label).sum()
    print("Class", label, "has", n_rows, "samples")

In [None]:
# Names of all columns in the dataset (four measurements plus the label).
columns = ["sepal_l", "sepal_w", "petal_l", "petal_w", "class"]

# Report the dtype of each column.
for name in columns:
    column_dtype = dataset[name].dtype
    print("Data type of column", name, "is", column_dtype)

In [None]:
# One flag per entry of `columns`: True when that column holds at least one
# missing value. bool() keeps the flags as plain Python booleans.
any_null = [bool(dataset[col].isnull().any()) for col in columns]

# Names of the columns whose flag is set, in the same order as `columns`.
null_columns = [col for col, has_null in zip(columns, any_null) if has_null]

if null_columns:
    print("Those columns have NULL values:", null_columns)
else:
    print("There are no NULL values")

## Data visualization

In [None]:
# Draw a 2x3 grid of scatter plots, one per pair of measurement columns,
# with every point coloured by its class.

# sorted names of the classes as list of strings
cls = sorted(dataset['class'].unique())
# class name -> position in `cls`; a dict lookup replaces the linear
# cls.index() scan that would otherwise run once per sample
class_to_int = {name: i for i, name in enumerate(cls)}
# callable that maps a class name to its int value
convert = class_to_int.__getitem__

# separate each dataset column into a variable
data_sepal_l = dataset['sepal_l'].values
data_sepal_w = dataset['sepal_w'].values
data_petal_l = dataset['petal_l'].values
data_petal_w = dataset['petal_w'].values
data_class = dataset['class'].values
data_class_int = dataset['class'].map(convert).values

# plt.cm.get_cmap() was deprecated in Matplotlib 3.7 and removed in 3.9;
# plt.get_cmap() works on both older and current releases.
cmap = plt.get_cmap('Set1')

# The scatter plots pass the int labels 0..len(cls)-1 through the colormap
# after scaling them onto [0, 1], so the legend samples the colormap at the
# same fractions. max(..., 1) avoids a ZeroDivisionError when the dataset
# contains a single class.
color_span = max(len(cls) - 1, 1)
legend_elements = [Line2D([0], [0],
                          marker='o',
                          color=cmap(i / color_span),
                          label=c) for i, c in enumerate(cls)]

# column name -> values, so the panels below can be described by name
data_by_column = {
    "sepal_l": data_sepal_l,
    "sepal_w": data_sepal_w,
    "petal_l": data_petal_l,
    "petal_w": data_petal_w,
}

# (x column, y column, legend `loc` code) for each panel, in subplot order
panels = [
    ("sepal_l", "sepal_w", 1),
    ("sepal_l", "petal_l", 2),
    ("sepal_l", "petal_w", 2),
    ("sepal_w", "petal_l", 5),
    ("sepal_w", "petal_w", 5),
    ("petal_l", "petal_w", 2),
]

plt.figure(figsize=(15, 10))
for position, (x_col, y_col, legend_loc) in enumerate(panels, start=1):
    plt.subplot(2, 3, position)
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.scatter(data_by_column[x_col], data_by_column[y_col],
                c=data_class_int, cmap=cmap)
    plt.legend(handles=legend_elements, loc=legend_loc)
plt.tight_layout()

## Some statistics

In [None]:
# Split the dataset into one DataFrame per class label.
class_labels = dataset['class']
dataset_setosa = dataset.loc[class_labels == 'Iris-setosa']
dataset_versicolor = dataset.loc[class_labels == 'Iris-versicolor']
dataset_virginica = dataset.loc[class_labels == 'Iris-virginica']

In [None]:
# Print the mean and variance of every measurement column, class by class.
columns = ["sepal_l", "sepal_w", "petal_l", "petal_w"]

# One (label, frame) pair per class, so a single loop produces the report
# instead of three copy-pasted ones.
per_class = [
    ("Iris-setosa", dataset_setosa),
    ("Iris-versicolor", dataset_versicolor),
    ("Iris-virginica", dataset_virginica),
]

for index, (label, subset) in enumerate(per_class):
    if index:
        print()  # blank line between consecutive class reports
    print("Mean and variance for class '%s'" % label)
    for c in columns:
        mean = np.mean(subset[c])
        # ddof=1 divides by N - 1 rather than N
        var = np.var(subset[c], ddof=1)

        print("Column:", c)
        print("Mean value =", mean)
        print("Variance value =", var)

In [None]:
# Print pandas' summary statistics for the 'Iris-setosa' rows.
setosa_summary = dataset_setosa.describe()
print(setosa_summary)

In [None]:
# Print pandas' summary statistics for the 'Iris-versicolor' rows.
versicolor_summary = dataset_versicolor.describe()
print(versicolor_summary)

In [None]:
# Print pandas' summary statistics for the 'Iris-virginica' rows.
virginica_summary = dataset_virginica.describe()
print(virginica_summary)