# Iris dataset and Pandas

## Loading data 

In [10]:
# Import for pandas
import pandas as pd

# Location of the iris CSV file in the GitHub repo
IRIS_CSV_URL = "https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv"

# Read the CSV at that URL into a DataFrame
df = pd.read_csv(IRIS_CSV_URL)

# Display df to show the dataset has been read in successfully
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [17]:
# Select every row, but only the columns "sepal_width" and "species",
# by passing a list of column names inside the indexing brackets.
# The index numbers shown on the left are auto-generated.
df[["sepal_width", "species"]]

Unnamed: 0,sepal_width,species
0,3.5,setosa
1,3.0,setosa
2,3.2,setosa
3,3.1,setosa
4,3.6,setosa
...,...,...
145,3.0,virginica
146,2.5,virginica
147,3.0,virginica
148,3.4,virginica


In [21]:
# Subscripting the DataFrame with a start:stop slice returns rows by position.
# This instance returns the fourth row up to and including the seventh row:
# positions 3, 4, 5 and 6 (counting starts at 0 and the stop value 7 is excluded).
df[3:7]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa


In [24]:
# Uses both previous subscripts at once: first select the two columns,
# then slice the rows at positions 3 to 6 from that intermediate result.
# NOT BEST TO USE: this chains two separate indexing steps; a single
# .loc / .iloc call can select rows and columns in one step instead.
df[["sepal_width", "species"]][3:7]

Unnamed: 0,sepal_width,species
3,3.1,setosa
4,3.6,setosa
5,3.9,setosa
6,3.4,setosa


In [31]:
# .loc is a label-based indexer (used with square brackets, not called like a function):
# it returns rows and columns based on the index labels passed to it.
# Here it returns the rows labelled 3 up to AND INCLUDING the row labelled 7.
# This is not the same as df[3:7] used earlier, which selects rows by position
# and excludes the stop value.
df.loc[3:7]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa


In [32]:
# The first slot of .loc selects rows (":" means every row) and the second
# selects columns, so this returns every row but only the column "species".
df.loc[:, "species"]

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 150, dtype: object

In [35]:
# .iloc returns data based on the position of a row in the DataFrame,
# not the value of its index label. Position 2 is the third row (counting starts at 0).
df.iloc[2]

sepal_length       4.7
sepal_width        3.2
petal_length       1.3
petal_width        0.2
species         setosa
Name: 2, dtype: object

In [43]:
# .at returns the single value stored at one row label / column label pair.
# Here: what is in the row labelled 3, column "species".
df.at[3, "species"]

'setosa'

In [45]:
# return all rows starting at the index 1 up to (not including) 10 in increments of 2
df.iloc[1:10:2]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
1,4.9,3.0,1.4,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
7,5.0,3.4,1.5,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


# Boolean Select

In [47]:
# Compare every entry of the "species" column with the given string.
# The result holds True where the species is "setosa" and False elsewhere.
df["species"] == "setosa"

0       True
1       True
2       True
3       True
4       True
       ...  
145    False
146    False
147    False
148    False
149    False
Name: species, Length: 150, dtype: bool

In [48]:
# Boolean selection: build a True/False mask with one entry per row, then pass
# it to .loc to keep only the rows where it is True, i.e. all the versicolor rows.
is_versicolor = df["species"] == "versicolor"
df.loc[is_versicolor]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
50,7.0,3.2,4.7,1.4,versicolor
51,6.4,3.2,4.5,1.5,versicolor
52,6.9,3.1,4.9,1.5,versicolor
53,5.5,2.3,4.0,1.3,versicolor
54,6.5,2.8,4.6,1.5,versicolor
55,5.7,2.8,4.5,1.3,versicolor
56,6.3,3.3,4.7,1.6,versicolor
57,4.9,2.4,3.3,1.0,versicolor
58,6.6,2.9,4.6,1.3,versicolor
59,5.2,2.7,3.9,1.4,versicolor


In [49]:
# Store all the versicolor rows in x.
# x is a subset of the overall DataFrame.
x = df.loc[df["species"].eq("versicolor")]

In [55]:
# .loc looks rows up by index label, so on x it only works for labels
# that are present in the subset created in the cell above.
# 0 is not an index label contained in the subset x, so the lookup below cannot succeed
# (presumably left commented out so that the notebook still runs from top to bottom).
# x.loc[0]

In [54]:
# Use .iloc to get rows of the new subset by their position instead.
# Position 0 of x is the row that carries the index label 50.
x.iloc[0] # same row as x.loc[50]

sepal_length             7
sepal_width            3.2
petal_length           4.7
petal_width            1.4
species         versicolor
Name: 50, dtype: object

# Summary statistics

In [59]:
# head() returns the first 5 rows by default.
# Only the last expression of a cell is rendered automatically, so this first
# result is shown explicitly with display() (available by default in Jupyter/IPython).
display(df.head())
# The size of the "head" can be specified; as the last expression of the cell
# this result is rendered automatically.
df.head(25)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


In [61]:
# tail() returns the last 5 rows by default.
# Only the last expression of a cell is rendered automatically, so this first
# result is shown explicitly with display() (available by default in Jupyter/IPython).
display(df.tail())
# The size of the "tail" can be specified; as the last expression of the cell
# this result is rendered automatically.
df.tail(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
140,6.7,3.1,5.6,2.4,virginica
141,6.9,3.1,5.1,2.3,virginica
142,5.8,2.7,5.1,1.9,virginica
143,6.8,3.2,5.9,2.3,virginica
144,6.7,3.3,5.7,2.5,virginica
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


In [58]:
# Summary statistics for the numeric fields in the DataFrame:
# count, mean, standard deviation, min, the 25% / 50% / 75% quartiles and max.
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [66]:
# The same statistics for a specified subset of the DataFrame: only the versicolor rows.
# Equivalent to x.describe(), because x holds this same selection.
versicolor_rows = df.loc[df["species"] == "versicolor"]
versicolor_rows.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,50.0,50.0,50.0,50.0
mean,5.936,2.77,4.26,1.326
std,0.516171,0.313798,0.469911,0.197753
min,4.9,2.0,3.0,1.0
25%,5.6,2.525,4.0,1.2
50%,5.9,2.8,4.35,1.3
75%,6.3,3.0,4.6,1.5
max,7.0,3.4,5.1,1.8


In [73]:
# Summary statistics for the setosa rows only
setosa_rows = df.loc[df["species"] == "setosa"]
setosa_rows.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,50.0,50.0,50.0,50.0
mean,5.006,3.418,1.464,0.244
std,0.35249,0.381024,0.173511,0.10721
min,4.3,2.3,1.0,0.1
25%,4.8,3.125,1.4,0.2
50%,5.0,3.4,1.5,0.2
75%,5.2,3.675,1.575,0.3
max,5.8,4.4,1.9,0.6


In [71]:
# Summary statistics for the virginica rows only
virginica_rows = df.loc[df["species"] == "virginica"]
virginica_rows.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,50.0,50.0,50.0,50.0
mean,6.588,2.974,5.552,2.026
std,0.63588,0.322497,0.551895,0.27465
min,4.9,2.2,4.5,1.4
25%,6.225,2.8,5.1,1.8
50%,6.5,3.0,5.55,2.0
75%,6.9,3.175,5.875,2.3
max,7.9,3.8,6.9,2.5


In [74]:
# Returns the average (mean) of every numeric column.
# numeric_only=True skips the text column "species": since pandas 2.0,
# DataFrame.mean() no longer drops non-numeric columns silently and
# raises a TypeError on them instead.
df.mean(numeric_only=True)

sepal_length    5.843333
sepal_width     3.054000
petal_length    3.758667
petal_width     1.198667
dtype: float64

# Plots

In [87]:
# used for plotting information in dataframes to graphs, works well with pandas
import seaborn as sns

In [None]:
# Grid of pairwise plots of the numeric columns, coloured by the "species" column.
# The trailing semicolon stops the cell from echoing the returned PairGrid object,
# so only the figure itself is shown.
sns.pairplot(df, hue="species");