In [2]:
# first importing the following libraries
import numpy as np 
import pandas as pd  
import matplotlib.pyplot as plt 
import seaborn as sns

In [3]:
# save link to data and reference the link 
csv_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'

# the data read in does not have any column names. 
# Specify header = None to avoid reading the first row of data as a header or column name

iris = pd.read_csv(csv_url, header = None)

#iris = pd.read_csv('iris_data.csv', header =  None)

# using the attribute information as the column names
col_names = ['Sepal_Length_cm','Sepal_Width_cm','Petal_Length_cm','Petal_Width_cm','Class']

iris =  pd.read_csv(csv_url, names = col_names)

In [4]:
# look at the top 5 observations
iris.head()

Unnamed: 0,Sepal_Length_cm,Sepal_Width_cm,Petal_Length_cm,Petal_Width_cm,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
# look at the bottom 5 observations
iris.tail()

Unnamed: 0,Sepal_Length_cm,Sepal_Width_cm,Petal_Length_cm,Petal_Width_cm,Class
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica
149,5.9,3.0,5.1,1.8,Iris-virginica


In [33]:
# How many rows in the iris DataFrame
len(iris)

150

In [35]:
# the shape or dimensions of the dataset
iris.shape

(150, 5)

The Data Frame has 5 columns, with the first 4 being the attributes or features of the data set. 
The last column is the class or type of iris plant each observation belongs to.
Each row correspond to an individual observation of an iris plant


In [37]:
species_type = iris['Class'].unique()
species_type

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [6]:
#The DataFrame has an index which was automatically assigned when the DataFrame was created on reading in the csv file. 
#The index is a range from 0 to 150
iris.index

RangeIndex(start=0, stop=150, step=1)

In [7]:
# column names of the data
iris.columns

Index(['Sepal_Length_cm', 'Sepal_Width_cm', 'Petal_Length_cm',
       'Petal_Width_cm', 'Class'],
      dtype='object')

In [8]:
iris.head()

Unnamed: 0,Sepal_Length_cm,Sepal_Width_cm,Petal_Length_cm,Petal_Width_cm,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


##  Indexing and Filtering data

making a separate dataframe for each class or species.
This might make it easier for plotting and getting statistics.
Trying out the methods and functions from the pandas.pydate website.
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-and-selecting-data
I want to select all the columns with only rows that belong to one class or species of the iris plant.
You may access an index on a Series, column on a DataFrame, and an item on a Panel directly as an attribute:

In [9]:
# this gets a column of data based on attribute.
iris.Sepal_Length_cm.head()

0    5.1
1    4.9
2    4.7
3    4.6
4    5.0
Name: Sepal_Length_cm, dtype: float64

In [13]:
# this gets the row of data corresponding to index 0 (the first row)
iris.iloc[[0]]

Unnamed: 0,Sepal_Length_cm,Sepal_Width_cm,Petal_Length_cm,Petal_Width_cm,Class
0,5.1,3.5,1.4,0.2,Iris-setosa


In [14]:
# can get a slice of data using slicing inside of []
#first 5 rows of data
iris[0:5]

Unnamed: 0,Sepal_Length_cm,Sepal_Width_cm,Petal_Length_cm,Petal_Width_cm,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [15]:
# slice using label. (using the loc attribute)
iris.loc[0:5] # the index is labelled o to 150

Unnamed: 0,Sepal_Length_cm,Sepal_Width_cm,Petal_Length_cm,Petal_Width_cm,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa


In [19]:
# getting values with a boolean array. This is used for checking a condition in a row is met
iris.loc[0:5] > 5.1

Unnamed: 0,Sepal_Length_cm,Sepal_Width_cm,Petal_Length_cm,Petal_Width_cm,Class
0,False,False,False,False,True
1,False,False,False,False,True
2,False,False,False,False,True
3,False,False,False,False,True
4,False,False,False,False,True
5,True,False,False,False,True


In [25]:
species_type =iris['Class'].unique()
species_type

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

You may select rows from a DataFrame using a boolean vector the same length as the DataFrame’s index (for example, something derived from one of the columns of the DataFrame)

In [49]:
# select from the iris DataFrame only the rows where the Class equals the string "Iris-setosa"
iris[iris['Class'] == "Iris-setosa"].head()

Unnamed: 0,Sepal_Length_cm,Sepal_Width_cm,Petal_Length_cm,Petal_Width_cm,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [None]:
#http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#boolean-indexing

In [21]:
# select from the iris DataFrame only the rows where the Class equals the string "Iris-setosa"
# save to a new DataFrame
iris_setosa = iris[iris['Class'] == "Iris-setosa"]

iris_setosa.head()

Unnamed: 0,Sepal_Length_cm,Sepal_Width_cm,Petal_Length_cm,Petal_Width_cm,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [27]:
# how many setosas is there.
iris_setosa.count()

Sepal_Length_cm    50
Sepal_Width_cm     50
Petal_Length_cm    50
Petal_Width_cm     50
Class              50
dtype: int64

In [28]:
# a quick summary statistics for the Setosa variation only
iris_setosa.describe()

Unnamed: 0,Sepal_Length_cm,Sepal_Width_cm,Petal_Length_cm,Petal_Width_cm
count,50.0,50.0,50.0,50.0
mean,5.006,3.418,1.464,0.244
std,0.35249,0.381024,0.173511,0.10721
min,4.3,2.3,1.0,0.1
25%,4.8,3.125,1.4,0.2
50%,5.0,3.4,1.5,0.2
75%,5.2,3.675,1.575,0.3
max,5.8,4.4,1.9,0.6


In [29]:
# select from the iris DataFrame only the rows where the Class equals the string "Iris-setosa"
# save to a new DataFrame
iris_versicolor = iris[iris['Class'] == "Iris-versicolor"]
iris_versicolor.head()

Unnamed: 0,Sepal_Length_cm,Sepal_Width_cm,Petal_Length_cm,Petal_Width_cm,Class
50,7.0,3.2,4.7,1.4,Iris-versicolor
51,6.4,3.2,4.5,1.5,Iris-versicolor
52,6.9,3.1,4.9,1.5,Iris-versicolor
53,5.5,2.3,4.0,1.3,Iris-versicolor
54,6.5,2.8,4.6,1.5,Iris-versicolor


In [30]:
iris_versicolor.describe()

Unnamed: 0,Sepal_Length_cm,Sepal_Width_cm,Petal_Length_cm,Petal_Width_cm
count,50.0,50.0,50.0,50.0
mean,5.936,2.77,4.26,1.326
std,0.516171,0.313798,0.469911,0.197753
min,4.9,2.0,3.0,1.0
25%,5.6,2.525,4.0,1.2
50%,5.9,2.8,4.35,1.3
75%,6.3,3.0,4.6,1.5
max,7.0,3.4,5.1,1.8


In [32]:
# select from the iris DataFrame only the rows where the Class equals the string "Iris-virginica"
# save to a new DataFrame
iris_virginica = iris[iris['Class'] == "Iris-virginica"]
iris_virginica.head()

Unnamed: 0,Sepal_Length_cm,Sepal_Width_cm,Petal_Length_cm,Petal_Width_cm,Class
100,6.3,3.3,6.0,2.5,Iris-virginica
101,5.8,2.7,5.1,1.9,Iris-virginica
102,7.1,3.0,5.9,2.1,Iris-virginica
103,6.3,2.9,5.6,1.8,Iris-virginica
104,6.5,3.0,5.8,2.2,Iris-virginica


DataFrame also has an isin() method. When calling isin, pass a set of values as either an array or dict. If values is an array, isin returns a DataFrame of booleans that is the same shape as the original DataFrame, with True wherever the element is in the sequence of values.

Oftentimes you’ll want to match certain values with certain columns. Just make values a dict where the key is the column, and the value is a list of items you want to check for.

Combine DataFrame’s `isin` with the `any()` and `all()` methods to quickly select subsets of your data that meet a given criteria. To select a row where each column meets its own criterion:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing-with-isin

In [58]:
# Here subsetting the data to meet a given criteria
values =  {'Class': ['Iris-versicolor', 'Iris-virginica']}
row_mask = iris.isin(values).any(1)
iris[row_mask]

Unnamed: 0,Sepal_Length_cm,Sepal_Width_cm,Petal_Length_cm,Petal_Width_cm,Class
50,7.0,3.2,4.7,1.4,Iris-versicolor
51,6.4,3.2,4.5,1.5,Iris-versicolor
52,6.9,3.1,4.9,1.5,Iris-versicolor
53,5.5,2.3,4.0,1.3,Iris-versicolor
54,6.5,2.8,4.6,1.5,Iris-versicolor
55,5.7,2.8,4.5,1.3,Iris-versicolor
56,6.3,3.3,4.7,1.6,Iris-versicolor
57,4.9,2.4,3.3,1.0,Iris-versicolor
58,6.6,2.9,4.6,1.3,Iris-versicolor
59,5.2,2.7,3.9,1.4,Iris-versicolor


In [None]:

## Missing Values

## check for any missing values using pandas.isnull() or the opposite using pandas.notnull()
print(pd.isnull(iris).sum())

print(pd.notnull(iris).sum())


# by default index.col is set to a range from 0 to the number of rows. 
# This is fine for here. I would prefer to have the row number starting from 1 and the last observation 150

# can write the DataFrame to a comma separated file to save any changes including column names added


iris.to_csv('iris_data.csv')

# Detecting and Filtering outliers

# can see the sumamry statistics of the dataset using the pandas.describe() function.
# can then look at observations that have values exceeding soem statistic values using boolean.

# basic descriptive statistics for each column of the data in the Iris DataFrame
print(iris.describe())

#  how many rows in the iris DataFrame?
print(len(iris))

# The shape of the dataset
print(iris.shape)

print(iris.columns)
# can retrieve a column of data from the iris DataFrame using dict-like notation
print(iris['Sepal_Length_cm'].head())

# can find how many different plants of each class or species using unique

species_type =iris['Class'].unique()
print("The following are the three class or species types of iris in the data set \n",*species_type, sep = " ")
# unpack the list and print the sequence without brackets. (https://stackoverflow.com/a/35119046)

# separate the different classes into different dataFrames.

iris_setosa = iris[iris.Class =='Iris-setosa']
print(iris_setosa)

# Selecting and Filtering 

# can index using the square brackets and this will return a Series corresponding to the column name.
# can retrieve a column of data from the iris DataFrame using dict-like notation or by attribute:
print(iris.Petal_Width_cm.head())

# rows of the iris DataFrame can be retrieved by position name or using the loc attribute.
# The index operators can be used to select a subset or rows and columns.
# loc for axis labels or iloc for integers
# retrieve first observation retrieved as a DataFrame
# The index for the iris DataFrame at the moment is just a range of integers from 0 to 150 
iris.loc[[0]] 
# retrieve as a Series
iris.loc[0]

# I could add a new column by assigning a column that doesn't exist already.
# I might add a new column of labels from 1 to 150 and maybe combined with the species type.
# I dont like the index starting from 0 up to 149! 

# can index into the dataframe to retrieve one or more columns either with a single value or a sequence
print(iris[0:5])

# Boolean Indexing
# can use Boolean operators to select rows that meet certain conditions.

iris[iris.Sepal_Length_cm > 7]

iris.iloc[:,4].head()

iris_setosa = iris[Class =="Iris-setosa"]
# can sort the DataFrame by one or more of the columns.
# put the columns in the order to sort by
iris.sort_values(by =['Petal_Width_cm','Class'])
iris.sort_values(by =['Class','Petal_Length_cm'])
iris.sort_values(by =['Class','Sepal_Length_cm'])
iris.sort_values(by =['Class','Sepal_Width_cm'])

iris.sort_values(by =['Class','Petal_Length_cm'])


## Summary statistics

# Can look at summary statistics for the overall data set.
iris.describe()

# can use boolean indexing to look at the data.
# maybe look at 

# sl_mean =iris.Sepal_Length_cm.mean()
# print(sl_mean)
# print(iris[iris.Sepal_Length_cm] > sl_mean)

iris2 = iris.copy()
print(iris2.describe())


iris.cov()
iris.corr()


## Data Cleaning and Transformation


# the duplicated method can be used to return a boolean series indicating 
# whether each row is a duplicate of another row or not.

#plt.figure()

# Visualising the data set

# can 
# plt.figure()
# iris.plot.hist(bins =50)
# plt.suptitle('iris histograms',fontsize = 14)
# plt.show()

# plt.figure()
# plt.suptitle('iris histograms of measurements',fontsize = 14)
# iris['Class'].plot.hist(bins = 30)
# plt.show()

# plt.scatter(