# Initializing and Reading in Data
------

## 1. Read in the iris dataset

In [1]:
import csv # Remember to import csv before you start
with open('../data/iris.csv','rU') as d:
    data = []
    reader = csv.reader(d)
    for row in reader:
        data.append(row)
print(data[:5]) # Check to see if this looks right

[['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species'], ['5.1', '3.5', '1.4', '0.2', 'setosa'], ['4.9', '3', '1.4', '0.2', 'setosa'], ['4.7', '3.2', '1.3', '0.2', 'setosa'], ['4.6', '3.1', '1.5', '0.2', 'setosa']]


## 2. Cut out the header row to focus on your data

In [2]:
header = data[:1] # Keep the header row in case you need to refer to it
data = data[1:] # Replace the whole data set with the whole dataset minus header row
# Remember to only do this once or you'll cut away your data one piece at a time
print header # Check-check-check it out to make sure it's there
print data[:5]

[['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']]
[['5.1', '3.5', '1.4', '0.2', 'setosa'], ['4.9', '3', '1.4', '0.2', 'setosa'], ['4.7', '3.2', '1.3', '0.2', 'setosa'], ['4.6', '3.1', '1.5', '0.2', 'setosa'], ['5', '3.6', '1.4', '0.2', 'setosa']]


## 3. Identify the total number of observations

In [3]:
total_num_obs = len(data) # Since we cut out header, the # of lists
print(total_num_obs)      # is the number of observations

150


## 4. Species in row four is...

In [4]:
print(header) # Which entry is "species" in again?

[['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']]


Species is at `index = 4` for each list

In [5]:
print(data[3][4]) # Look in the fourth row (index = 3)
                  # and the species observation (index = 4)

setosa


## 5. Average sepal length

I see from Q4 that sepal length is in the `index = 0` element of each row. Nice.

In [6]:
sepal_length = [float(i[0]) for i in data] # Pull out the first index in each
# make sure you float this to convert from strings
print(sepal_length[:5]) # Make sure it's right
print(data[:5])         # by checking each of the data observations

[5.1, 4.9, 4.7, 4.6, 5.0]
[['5.1', '3.5', '1.4', '0.2', 'setosa'], ['4.9', '3', '1.4', '0.2', 'setosa'], ['4.7', '3.2', '1.3', '0.2', 'setosa'], ['4.6', '3.1', '1.5', '0.2', 'setosa'], ['5', '3.6', '1.4', '0.2', 'setosa']]


In [7]:
avg_sepal_length = sum(sepal_length)/(len(sepal_length))
print(avg_sepal_length)

5.84333333333


## 6. Average sepal length of setosa flowers

Here we're going to conduct a similar process as we did for all flowers, but limit to just those in species 'setosa'. I'll use the index to refer to both of those.

To get this we want to pull from the 'sepal length' in `index = 0` for each entry, and from the 'species' column at `index =4`. I'll pull it as a string first, then convert to a float.

In [8]:
setosa_length_str = [i[0] for i in data if i[4]=='setosa'] # Does this count as a loop?
print(setosa_length_str[:5])
print(data[:5]) # I know setosas are listed first, so I printed the head.

['5.1', '4.9', '4.7', '4.6', '5']
[['5.1', '3.5', '1.4', '0.2', 'setosa'], ['4.9', '3', '1.4', '0.2', 'setosa'], ['4.7', '3.2', '1.3', '0.2', 'setosa'], ['4.6', '3.1', '1.5', '0.2', 'setosa'], ['5', '3.6', '1.4', '0.2', 'setosa']]


In [9]:
setosa_length = [float(i) for i in setosa_length_str] # Convert to float
avg_setosa_length = (sum(setosa_length))/(len(setosa_length))
print('Average setosa length:'),
print(avg_setosa_length)

Average setosa length: 5.006


## Average sepal length for all flowers

In [19]:
def avg_sepal(species):
    species_list = [a for a in data if a[4] == species] # Limit data to just species in quest
    sepal_lengths = [float(i[0]) for i in species_list] # Floatify the lengths
    total_sepal = sum(sepal_lengths)
    sepal_avg = total_sepal/len(species_list)
    print species+":", # Printed the species for clarity
    return sepal_avg # Return output is a num, can be used for other things

print(avg_sepal('setosa'))
print(avg_sepal('versicolor'))

setosa: 5.006
versicolor: 5.936


And in case you wondered what the species of flowers are, they are...

In [13]:
flower_types = {} # Creates a dictionary to count how many obs per species
for i in data:
    species = i[4] # Makes this easier to work with below
    if species in flower_types: # Checks to see if it's in dict already
        flower_types[species] += 1 # If it is we tick up the total
    else:
        flower_types[species] = 1 # If not set it at 1
        
print(flower_types)

{'setosa': 50, 'versicolor': 50, 'virginica': 50}


-------
Below, I tried to do this in a complicated way. The *vision* was to create a dictionary that held all the flowers, then call the dictionary key based on what was typed into species. 

I wasn't able to debug this, mostly because it seemed easier to just put it into a more straightforward function that just cut down the list.

In [12]:
def avg_sepal_length(species):
    sepal_length = {}
    flowers = [] # This will hold types of flowers
    for a in data:
        if a[4] not in flowers: # Remember species is at index 4
            flowers += a # Adding to the list of flower types
    for species in flowers:
        sepal_lengths = [float(i[0]) for i in data if i[4]==species]
        sepal_total = sum(sepal_lengths)
        sepal_avg = (sepal_total)/(len(sepal_lengths))
        sepal_length[species] = sepal_avg
    return species,":",sepal_avg

avg_sepal_length('setosa')
# This gave me an error about division by zero
# I think that was from not filling the species in in that loop
# Dictionary was probably too ambitious
# I'm going to simplify

ZeroDivisionError: integer division or modulo by zero