# DataFrames exercises

In [1]:
import pandas as pd
from pydataset import data
import numpy as np

When the instructions say to load a dataset, you can pass the name of the dataset as a string to the data function to load the dataset. You can also view the documentation for the data set by passing the show_doc keyword argument.

```python
# data('mpg', show_doc=True) # view the documentation for the dataset
mpg = data('mpg') # load the dataset and store it in a variable
```

All the datasets loaded from the pydataset library will be pandas dataframes.

## 1.Copy the code from the lesson to create a dataframe full of student grades.

In [2]:
np.random.seed(123)

students = ['Sally', 'Jane', 'Suzie', 'Billy', 'Ada', 'John', 'Thomas',
            'Marie', 'Albert', 'Richard', 'Isaac', 'Alan']

# randomly generate scores for each student for each subject
# note that all the values need to have the same length here
math_grades = np.random.randint(low=60, high=100, size=len(students))
english_grades = np.random.randint(low=60, high=100, size=len(students))
reading_grades = np.random.randint(low=60, high=100, size=len(students))

df = pd.DataFrame({'name': students,
                   'math': math_grades,
                   'english': english_grades,
                   'reading': reading_grades})

type(df)

pandas.core.frame.DataFrame

### 1a. Create a column named passing_english that indicates whether each student has a passing grade in english.

In [3]:
# METHOD ONE: USING df['column_name'] = value
df['passing_english'] = df.english >= 70 
df
# This method alters the original dataframe

Unnamed: 0,name,math,english,reading,passing_english
0,Sally,62,85,80,True
1,Jane,88,79,67,True
2,Suzie,94,74,95,True
3,Billy,98,96,88,True
4,Ada,77,92,98,True
5,John,79,76,93,True
6,Thomas,82,64,81,False
7,Marie,93,63,90,False
8,Albert,92,62,87,False
9,Richard,69,80,94,True


In [4]:
# METHOD TWO: USING df.assign(column_name=value)
df.assign(passing_reading = df.reading >= 70)
# This method creates a new object

Unnamed: 0,name,math,english,reading,passing_english,passing_reading
0,Sally,62,85,80,True,True
1,Jane,88,79,67,True,False
2,Suzie,94,74,95,True,True
3,Billy,98,96,88,True,True
4,Ada,77,92,98,True,True
5,John,79,76,93,True,True
6,Thomas,82,64,81,False,True
7,Marie,93,63,90,False,True
8,Albert,92,62,87,False,True
9,Richard,69,80,94,True,True


In [5]:
df # The additional column for passing_reading is not there when the original dataframe is called

Unnamed: 0,name,math,english,reading,passing_english
0,Sally,62,85,80,True
1,Jane,88,79,67,True
2,Suzie,94,74,95,True
3,Billy,98,96,88,True
4,Ada,77,92,98,True
5,John,79,76,93,True
6,Thomas,82,64,81,False
7,Marie,93,63,90,False
8,Albert,92,62,87,False
9,Richard,69,80,94,True


### 1b. Sort the english grades by the passing_english column. How are duplicates handled?

In [6]:
print(df.sort_values(by='passing_english'))
# Duplicates are ordered by their index number in ascending order
# We have not changed the original dataframe, if we wanted to permanently alter the order
# We would use inplace=True (ex: df.sort_values(by='passing_english', inplace=True))

       name  math  english  reading  passing_english
6    Thomas    82       64       81            False
7     Marie    93       63       90            False
8    Albert    92       62       87            False
11     Alan    92       62       72            False
0     Sally    62       85       80             True
1      Jane    88       79       67             True
2     Suzie    94       74       95             True
3     Billy    98       96       88             True
4       Ada    77       92       98             True
5      John    79       76       93             True
9   Richard    69       80       94             True
10    Isaac    92       99       93             True


### 1c. Sort the english grades first by passing_english and then by student name. 
All the students that are failing english should be first, and within the students that are failing english they should be ordered alphabetically. The same should be true for the students passing english. (Hint: you can pass a list to the .sort_values method)

In [8]:
print(df.sort_values(by=['passing_english', 'name'])) #We can pass multiple arguments into sort_values

       name  math  english  reading  passing_english
11     Alan    92       62       72            False
8    Albert    92       62       87            False
7     Marie    93       63       90            False
6    Thomas    82       64       81            False
4       Ada    77       92       98             True
3     Billy    98       96       88             True
10    Isaac    92       99       93             True
1      Jane    88       79       67             True
5      John    79       76       93             True
9   Richard    69       80       94             True
0     Sally    62       85       80             True
2     Suzie    94       74       95             True


### 1d. Sort the english grades first by passing_english, and then by the actual english grade, similar to how we did in the last step.

In [9]:
print(df.sort_values(by=['passing_english', 'english'])) #Numbers are sorted in ascending order by default
# We can sort in descending order by adding ascending=False
# ex: df.sort_values(by['passing_english', 'english'], ascending=False)

       name  math  english  reading  passing_english
8    Albert    92       62       87            False
11     Alan    92       62       72            False
7     Marie    93       63       90            False
6    Thomas    82       64       81            False
2     Suzie    94       74       95             True
5      John    79       76       93             True
1      Jane    88       79       67             True
9   Richard    69       80       94             True
0     Sally    62       85       80             True
4       Ada    77       92       98             True
3     Billy    98       96       88             True
10    Isaac    92       99       93             True


### 1e. Calculate each students overall grade and add it as a column on the dataframe. The overall grade is the average of the math, english, and reading grades.

In [10]:
df['overall_grade'] = (df['math'] + df['english'] + df['reading']) / 3 
# The value assigned to a column can be based off the values contained in other columns
df

Unnamed: 0,name,math,english,reading,passing_english,overall_grade
0,Sally,62,85,80,True,75.666667
1,Jane,88,79,67,True,78.0
2,Suzie,94,74,95,True,87.666667
3,Billy,98,96,88,True,94.0
4,Ada,77,92,98,True,89.0
5,John,79,76,93,True,82.666667
6,Thomas,82,64,81,False,75.666667
7,Marie,93,63,90,False,82.0
8,Albert,92,62,87,False,80.333333
9,Richard,69,80,94,True,81.0


## 2. Load the mpg dataset. Read the documentation for the dataset and use it for the following questions:

In [11]:
mpg = data('mpg') # load the dataset and store it in a variable

In [12]:
data('mpg', show_doc=True) # view the documentation for the dataset

mpg

PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

## Fuel economy data from 1999 and 2008 for 38 popular models of car

### Description

This dataset contains a subset of the fuel economy data that the EPA makes
available on http://fueleconomy.gov. It contains only models which had a new
release every year between 1999 and 2008 - this was used as a proxy for the
popularity of the car.

### Usage

    data(mpg)

### Format

A data frame with 234 rows and 11 variables

### Details

  * manufacturer. 

  * model. 

  * displ. engine displacement, in litres 

  * year. 

  * cyl. number of cylinders 

  * trans. type of transmission 

  * drv. f = front-wheel drive, r = rear wheel drive, 4 = 4wd 

  * cty. city miles per gallon 

  * hwy. highway miles per gallon 

  * fl. 

  * class. 




### 2a. How many rows and columns are there?

In [15]:
mpg
#Printing the dataframe automatically prints the number of rows and columns

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact
...,...,...,...,...,...,...,...,...,...,...,...
230,volkswagen,passat,2.0,2008,4,auto(s6),f,19,28,p,midsize
231,volkswagen,passat,2.0,2008,4,manual(m6),f,21,29,p,midsize
232,volkswagen,passat,2.8,1999,6,auto(l5),f,16,26,p,midsize
233,volkswagen,passat,2.8,1999,6,manual(m5),f,18,26,p,midsize


In [16]:
mpg.shape
# We can also investigate the shape of the dataframe
# The .shape method will print a tuple (rows, columns)

(234, 11)

In [18]:
len(mpg.columns)
# We can find the columns by using the .columns method to create a list of column headers
# And then using the len() function to count the elements in that list

11

In [19]:
mpg.index
#The .index method will show us a list of indices for the datafram as well as its length

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            225, 226, 227, 228, 229, 230, 231, 232, 233, 234],
           dtype='int64', length=234)

In [20]:
len(mpg.index)
# We can also confirm the length using the len() function on the list of indices

234

### 2b. What are the data types of each column?

In [21]:
mpg.dtypes 
# This returns a Series with the data type of each column. 
# The result’s index is the original DataFrame’s columns. 
# Columns with mixed types are stored with the object dtype.

manufacturer     object
model            object
displ           float64
year              int64
cyl               int64
trans            object
drv              object
cty               int64
hwy               int64
fl               object
class            object
dtype: object

### 2c. Summarize the dataframe with .info and .describe

In [23]:
mpg.info()
# Of particular value here is the non-null count. 
# We can use this to determine if there are null values that need to be cleaned up
# By default, not-null count is shown only if the DataFrame is smaller than pandas.options.display.max_info_rows and pandas.options.display.max_info_columns. 
# A value of True always shows the counts, and False never shows the counts.
# ex: mpg.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 234 entries, 1 to 234
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   manufacturer  234 non-null    object 
 1   model         234 non-null    object 
 2   displ         234 non-null    float64
 3   year          234 non-null    int64  
 4   cyl           234 non-null    int64  
 5   trans         234 non-null    object 
 6   drv           234 non-null    object 
 7   cty           234 non-null    int64  
 8   hwy           234 non-null    int64  
 9   fl            234 non-null    object 
 10  class         234 non-null    object 
dtypes: float64(1), int64(4), object(6)
memory usage: 21.9+ KB


In [24]:
mpg.describe() # Nifty little table with output for columns of numeric dtype

Unnamed: 0,displ,year,cyl,cty,hwy
count,234.0,234.0,234.0,234.0,234.0
mean,3.471795,2003.5,5.888889,16.858974,23.440171
std,1.291959,4.509646,1.611534,4.255946,5.954643
min,1.6,1999.0,4.0,9.0,12.0
25%,2.4,1999.0,4.0,14.0,18.0
50%,3.3,2003.5,6.0,17.0,24.0
75%,4.6,2008.0,8.0,19.0,27.0
max,7.0,2008.0,8.0,35.0,44.0


### 2d. Rename the cty column to city.

In [25]:
#METHOD ONE: Using .rename method allows us to rename a single or multiple designated columns
mpg.rename(columns={'cty': 'city'}, inplace = True)
#multiple columns ex: mpg.rename(columns={'cty': 'city', 'hwy': 'highway'}, inplace = True)
mpg.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


### 2e. Rename the hwy column to highway.

In [28]:
#METHOD TWO: renaming all of the columns in the table and changing the one that you prefer
mpg.columns =['manufacturer', 'model', 'displ', 'year', 'cyl', 'trans', 'drv', 'city', 'highway', 'fl', 'class']
mpg.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


### 2f. Do any cars have better city mileage than highway mileage?

In [30]:
#We can create a series of booleans that have checked each element
mpg['city'] > mpg['highway']

1      False
2      False
3      False
4      False
5      False
       ...  
230    False
231    False
232    False
233    False
234    False
Length: 234, dtype: bool

In [29]:
#Then we can pass that boolean mask as an index for the dataframe
mpg[mpg['city'] > mpg['highway']]
#There are no rows shown, meaning that the entire dataset returned False

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class


### 2g. Create a column named mileage_difference this column should contain the difference between highway and city mileage for each car.

In [31]:
mpg = mpg.assign(mileage_difference=(mpg['highway'] - mpg['city']))
mpg.head()
#If we use the .assign method, we need to set the results equal to a variable to store our output
#We can reuse mpg this way to permanently change our dataset

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact,11
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact,8
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact,11
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact,9
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact,10


### 2h. Which car (or cars) has the highest mileage difference?

In [32]:
#METHOD ONE: Using .max() and boolean masks 
#Lets find the maximum value for mileage_difference
mpg['mileage_difference'].max()

12

In [33]:
#We can then create a boolean mask that returns true if a car has the highest mileage difference
#This can be useful if there is a tie
mpg['mileage_difference'] == 12

1      False
2      False
3      False
4      False
5      False
       ...  
230    False
231    False
232    False
233    False
234    False
Name: mileage_difference, Length: 234, dtype: bool

In [34]:
#We can pass that boolean mask as an index to return a filtered output
mpg[mpg['mileage_difference'] == 12]

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference
107,honda,civic,1.8,2008,4,auto(l5),f,24,36,c,subcompact,12
223,volkswagen,new beetle,1.9,1999,4,auto(l4),f,29,41,d,subcompact,12


In [35]:
#METHOD TWO: Using nlargest()
mpg['mileage_difference'].nlargest(1, keep='all')
#Similar to .max() we've identified the highest value, and we can also see all indices that match that value
#We accomplished this by choosing n=1 in .nlargest (returns top value)
#and by using keep='all', which will return all additional values that duplicate the top value

107    12
223    12
Name: mileage_difference, dtype: int64

In [40]:
mpg.loc[[107, 223]]
# We can use .loc to hard code the indices that our previous code revealed

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference
107,honda,civic,1.8,2008,4,auto(l5),f,24,36,c,subcompact,12
223,volkswagen,new beetle,1.9,1999,4,auto(l4),f,29,41,d,subcompact,12


In [42]:
#or if we don't want to hard code those values, we can pass in a list of indices
mpg['mileage_difference'].nlargest(1, keep='all').index

Int64Index([107, 223], dtype='int64')

In [43]:
mpg.loc[mpg['mileage_difference'].nlargest(1, keep='all').index]
# We could have assigned mpg['mileage_difference'].nlargest(1, keep='all').index 
# to a variable and passed in that variable to mpg.loc[] as well 

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference
107,honda,civic,1.8,2008,4,auto(l5),f,24,36,c,subcompact,12
223,volkswagen,new beetle,1.9,1999,4,auto(l4),f,29,41,d,subcompact,12


### 2i. Which compact class car has the lowest highway mileage? The best?

In [44]:
#We can create a boolean mask that filters for compact cars
mpg['class'] == 'compact'

1       True
2       True
3       True
4       True
5       True
       ...  
230    False
231    False
232    False
233    False
234    False
Name: class, Length: 234, dtype: bool

In [45]:
#We can pass that boolean mask into the index for our dataframe to get only the rows with compact cars
mpg[mpg['class'] == 'compact']
#This dataset is small enough to see the entire output, but if the dataset is too large
#We can use .head() to limit the output

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact,11
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact,8
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact,11
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact,9
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact,10
6,audi,a4,2.8,1999,6,manual(m5),f,18,26,p,compact,8
7,audi,a4,3.1,2008,6,auto(av),f,18,27,p,compact,9
8,audi,a4 quattro,1.8,1999,4,manual(m5),4,18,26,p,compact,8
9,audi,a4 quattro,1.8,1999,4,auto(l5),4,16,25,p,compact,9
10,audi,a4 quattro,2.0,2008,4,manual(m6),4,20,28,p,compact,8


In [46]:
#Adding ['highway'] on the end of our compact car dataframe limits the output to only 
#The single 'highway' column
mpg[mpg['class'] == 'compact']['highway']

1      29
2      29
3      31
4      30
5      26
6      26
7      27
8      26
9      25
10     28
11     27
12     25
13     25
14     25
15     25
142    29
143    27
170    25
171    27
172    25
173    27
187    27
188    29
189    31
190    31
191    26
192    26
193    27
194    30
195    33
196    35
197    37
198    35
208    29
209    26
210    29
211    29
212    24
213    44
214    29
215    26
216    29
217    29
218    29
219    29
220    23
221    24
Name: highway, dtype: int64

In [47]:
#We can then use .min() on this series to find the smallest value
lowest_highway_compact = mpg[mpg['class'] == 'compact']['highway'].min()
lowest_highway_compact

23

In [48]:
#.min() only returns one value, so we don't know if there are ties using this method
#We could use .nsmallest(1, keep='all') to check for ties, but we can still find ties
#with what we have so far
#Lets make our code easier to write by saving our dataframe of only compact cars to a 
#variable called mpg_compact
mpg_compact = mpg[mpg['class'] == 'compact']
mpg_compact

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact,11
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact,8
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact,11
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact,9
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact,10
6,audi,a4,2.8,1999,6,manual(m5),f,18,26,p,compact,8
7,audi,a4,3.1,2008,6,auto(av),f,18,27,p,compact,9
8,audi,a4 quattro,1.8,1999,4,manual(m5),4,18,26,p,compact,8
9,audi,a4 quattro,1.8,1999,4,auto(l5),4,16,25,p,compact,9
10,audi,a4 quattro,2.0,2008,4,manual(m6),4,20,28,p,compact,8


In [49]:
#Then we can create a boolean mask searching for rows where the highway value
# is equal to the lowest highway value we discovered with .min()
mpg_compact['highway'] == lowest_highway_compact

1      False
2      False
3      False
4      False
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
142    False
143    False
170    False
171    False
172    False
173    False
187    False
188    False
189    False
190    False
191    False
192    False
193    False
194    False
195    False
196    False
197    False
198    False
208    False
209    False
210    False
211    False
212    False
213    False
214    False
215    False
216    False
217    False
218    False
219    False
220     True
221    False
Name: highway, dtype: bool

In [50]:
#We can apply that boolean mask to our compact car dataframe to find the compact cars
#with the lowest highway values
#WARNING: Make sure that you keep track of which dataframe you are referring to in each step
mpg_compact[mpg_compact['highway'] == lowest_highway_compact]

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference
220,volkswagen,jetta,2.8,1999,6,auto(l4),f,16,23,r,compact,7


In [60]:
#For example, if we accidentally passed out boolean mask to the entire dataframe mpg
#We would create an error because the boolean mask length doesnt match the index length
try:
    mpg[mpg_compact['highway'] == lowest_highway_compact]
except TypeError:
    print('Your dataframe index and your boolean mask are not the same length!')

  after removing the cwd from sys.path.


IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).

### 2j. Create a column named average_mileage that is the mean of the city and highway mileage.

In [None]:
mpg['average_mileage'] = (mpg['city'] + mpg['highway']) / 2
mpg

### 2k. Which dodge car has the best average mileage? The worst?

In [None]:
mpg.head()

In [None]:
mpg_dodge = mpg[mpg['manufacturer'] == 'dodge']
mpg_dodge.head()

In [None]:
max_avg_mileage_mpg_dodge = mpg_dodge['average_mileage'].max()
max_avg_mileage_mpg_dodge

In [None]:
mpg_dodge['average_mileage'] == max_avg_mileage_mpg_dodge

In [None]:
mpg_dodge[mpg_dodge['average_mileage'] == max_avg_mileage_mpg_dodge]

In [None]:
min_avg_mileage_mpg_dodge = mpg_dodge['average_mileage'].min()
min_avg_mileage_mpg_dodge

In [None]:
mpg_dodge['average_mileage'] == min_avg_mileage_mpg_dodge

In [None]:
mpg_dodge[mpg_dodge['average_mileage'] == min_avg_mileage_mpg_dodge]

## 3. Load the Mammals dataset. Read the documentation for it, and use the data to answer these questions

In [None]:
Mammals = data('Mammals') # load the dataset and store it in a variable

In [None]:
data('Mammals', show_doc=True) # view the documentation for the dataset

In [None]:
Mammals.head()

### 3a. How many rows and columns are there?

In [None]:
Mammals.shape #107 rows 4 columns with default index

### 3b. What are the data types?

In [None]:
Mammals.dtypes

### 3c. Summarize the dataframe with .info and .describe

In [None]:
Mammals.info()

In [None]:
Mammals.describe()

### 3d. What is the the weight of the fastest animal?

In [None]:
Mammals['speed'].max()

In [None]:
Mammals['speed'] == Mammals['speed'].max()

In [None]:
Mammals[Mammals['speed'] == Mammals['speed'].max()]

In [None]:
Mammals[Mammals['speed'] == Mammals['speed'].max()]['weight']

In [None]:
Mammals[Mammals['speed'] == Mammals['speed'].max()]['weight'].iloc[0]

In [None]:
print(f"The weight of the fastest animal is {Mammals[Mammals['speed'] == Mammals['speed'].max()]['weight'].iloc[0]}.")

### 3e. What is the overall percentage of specials?

In [None]:
Mammals['specials'].count()

In [None]:
Mammals['specials'] == True

In [None]:
Mammals[Mammals['specials'] == True]['specials'].count()

In [None]:
percentage = (Mammals[Mammals['specials'] == True]['specials'].count()) / Mammals['specials'].count() * 100
percentage = round

### 3f. How many animals are hoppers that are above the median speed?

Question is ambigious. Median speed may be referring to median speed of the all animals in the dataframe or just the median speed hoppers in the dataframe. We will interpret the question to be referring to the median speed of all animals. 

In [None]:
Mammals['speed'].median()

In [None]:
Mammals['speed'] > Mammals['speed'].median()

In [None]:
Mammals_speed_above_median = Mammals[Mammals['speed'] > Mammals['speed'].median()]
Mammals_speed_above_median

In [None]:
Mammals_speed_above_median['hoppers'] == True

In [None]:
Mammals_speed_above_median[Mammals_speed_above_median['hoppers'] == True]

In [None]:
Mammals_speed_above_median[Mammals_speed_above_median['hoppers'] == True].index

In [None]:
number_hoppers_speed_above_median = len(Mammals_speed_above_median[Mammals_speed_above_median['hoppers'] == True].index)
number_hoppers_speed_above_median

### What percentage is this?

Question is ambiguous. Can be interpreted in several ways:
1. Within all animals, what percentage are hoppers that have a speed above the median
2. Within hoppers, what percentage have a speed above the median

Furthermore, which median are we referring to? The median of all animals or just hoppers?

We will interpret the question as follows:
Within all animals, what percentage are hoppers that have a speed above the median speed of all animals. 

In [None]:
Mammals.index

In [None]:
len(Mammals.index)

In [None]:
percentage_hoppers_above_median_speed = number_hoppers_speed_above_median / len(Mammals.index) * 100
percentage_hoppers_above_median_speed

# END