## DataSet - IRIS flowers dataset

This is a Python notebook. It keeps the code organized in small cells and shows each cell's output directly below it.
The name says it all: It's a **Notebook**.

In [16]:
# Import statements
# The dataset can be loaded with pandas' read_csv() or, alternatively, with read_table(); both are used below.
# Pandas is used to read and manipulate data in Python. It is a tool used in data analysis.

import pandas as pd


In [17]:
# Load the Iris data from the local CSV copy and preview the first five rows.
# header=None treats every line as data, so the columns are labelled 0, 1, 2, ...
# A URL can be used in place of the local file (Iris.csv).
# UCI URL(Connectivity issues) : http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
# Kaggle URL(No issue but download required): https://www.kaggle.com/uciml/iris/data

orders = pd.read_table(
    'Iris.csv',
    header=None,
    sep=',',
)
print(orders.head())

   0    1    2    3    4            5
0  1  5.1  3.5  1.4  0.2  Iris-setosa
1  2  4.9  3.0  1.4  0.2  Iris-setosa
2  3  4.7  3.2  1.3  0.2  Iris-setosa
3  4  4.6  3.1  1.5  0.2  Iris-setosa
4  5  5.0  3.6  1.4  0.2  Iris-setosa


In [18]:
# Column labels for the dataset: the four measurements followed by the class label.
# They are handed to the reader through names=iris_cols in the next cell.

iris_cols = [
    'sepal_len',
    'sepal_width',
    'petal_len',
    'petal_width',
    'rclass',
]

In [34]:
# Re-read the dataset, this time labelling the columns with iris_cols.
# The URL to the dataset has connectivity issues, so you might
# prefer saving the dataset to your local machine and referencing it from there.
# sep = separator placed between the fields of each line.
# header=None means the file has no row of column labels; the labels come from names.
# Each line holds one more field than iris_cols has names (the raw preview above
# shows six columns), so the leading field ends up as the row index (1, 2, 3, ...).

orders = pd.read_csv(
    'Iris.csv',
    names=iris_cols,
    header=None,
    sep=',',
)
print(orders.head(10))

    sepal_len  sepal_width  petal_len  petal_width       rclass
1         5.1          3.5        1.4          0.2  Iris-setosa
2         4.9          3.0        1.4          0.2  Iris-setosa
3         4.7          3.2        1.3          0.2  Iris-setosa
4         4.6          3.1        1.5          0.2  Iris-setosa
5         5.0          3.6        1.4          0.2  Iris-setosa
6         5.4          3.9        1.7          0.4  Iris-setosa
7         4.6          3.4        1.4          0.3  Iris-setosa
8         5.0          3.4        1.5          0.2  Iris-setosa
9         4.4          2.9        1.4          0.2  Iris-setosa
10        4.9          3.1        1.5          0.1  Iris-setosa


In [20]:
# Preview three of the columns (first five rows).
# Selecting with a list of labels gives back a DataFrame rather than a single Series.
print(orders.loc[:, ['sepal_len', 'sepal_width', 'petal_len']].head())

   sepal_len  sepal_width  petal_len
1        5.1          3.5        1.4
2        4.9          3.0        1.4
3        4.7          3.2        1.3
4        4.6          3.1        1.5
5        5.0          3.6        1.4


In [21]:
# Getting the type of the 'orders' variable - DataFrame
print (type(orders))

# Getting the type of a single column, orders['sepal_len'] - Series
print (type(orders['sepal_len']))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [22]:
# Getting the size of the dataset as a (rows, columns) tuple
# 150 rows and 5 columns
print (orders.shape)

(150, 5)


In [23]:
# Summary statistics for the numeric columns: count, mean, std, min, quartiles and max.
# The text column rclass does not appear in the result.
orders.describe()

Unnamed: 0,sepal_len,sepal_width,petal_len,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [24]:
# The dtypes attribute lists the data type of each column: float64, int64, object (text), etc.
orders.dtypes

sepal_len      float64
sepal_width    float64
petal_len      float64
petal_width    float64
rclass          object
dtype: object

In [25]:
# count() reports, for each column, how many non-missing values it holds.
# A count equal to the number of rows means that column has no missing values.
orders.count()

sepal_len      150
sepal_width    150
petal_len      150
petal_width    150
rclass         150
dtype: int64

In [26]:
# Add a derived column, sepal_details, holding sepal_width + sepal_len for each row.
# The column is assigned into orders itself, so it is also present in later cells.
orders['sepal_details'] = orders.sepal_width + orders.sepal_len
print(orders.head())

   sepal_len  sepal_width  petal_len  petal_width       rclass  sepal_details
1        5.1          3.5        1.4          0.2  Iris-setosa            8.6
2        4.9          3.0        1.4          0.2  Iris-setosa            7.9
3        4.7          3.2        1.3          0.2  Iris-setosa            7.9
4        4.6          3.1        1.5          0.2  Iris-setosa            7.7
5        5.0          3.6        1.4          0.2  Iris-setosa            8.6


In [27]:
# Build a boolean mask: True for every row whose petal length is greater than 1.5.
# orders_length holds that True/False Series, not the matching rows themselves.
orders_length = orders['petal_len'] > 1.5

In [28]:
# Petal widths of the rows where petal_len > 1.5 or sepal_width > 3.4 (first five shown).
# The row condition and the column label are given to .loc in a single step.

orders.loc[
    (orders['petal_len'] > 1.5) | (orders['sepal_width'] > 3.4),
    'petal_width',
].head()

1     0.2
5     0.2
6     0.4
11    0.2
12    0.2
Name: petal_width, dtype: float64

In [29]:
# Preview the frame with two columns given longer names.
# The renamed result is only displayed, not assigned back to orders.
orders.rename(
    columns=dict(sepal_len='sepal_length', petal_len='petal_length')
).head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,rclass,sepal_details
1,5.1,3.5,1.4,0.2,Iris-setosa,8.6
2,4.9,3.0,1.4,0.2,Iris-setosa,7.9
3,4.7,3.2,1.3,0.2,Iris-setosa,7.9
4,4.6,3.1,1.5,0.2,Iris-setosa,7.7
5,5.0,3.6,1.4,0.2,Iris-setosa,8.6


In [30]:
# Mean petal width over the whole dataset
orders['petal_width'].mean()

1.1986666666666668

In [31]:
# Mean petal width within each class (one value per distinct rclass)
orders.groupby('rclass')['petal_width'].mean()

rclass
Iris-setosa        0.244
Iris-versicolor    1.326
Iris-virginica     2.026
Name: petal_width, dtype: float64