In [1]:
import numpy as np
import pandas as pd
from pydataset import data

#### 1. Copy the code from the lesson to create a dataframe full of student grades.

In [2]:
np.random.seed(123)

students = ['Sally', 'Jane', 'Suzie', 'Billy', 'Ada', 'John', 'Thomas',
            'Marie', 'Albert', 'Richard', 'Isaac', 'Alan']

# randomly generate scores for each student for each subject
# note that all the values need to have the same length here
math_grades = np.random.randint(low=60, high=100, size=len(students))
english_grades = np.random.randint(low=60, high=100, size=len(students))
reading_grades = np.random.randint(low=60, high=100, size=len(students))

df = pd.DataFrame({'name': students,
                   'math': math_grades,
                   'english': english_grades,
                   'reading': reading_grades})

In [3]:
df.head()

Unnamed: 0,name,math,english,reading
0,Sally,62,85,80
1,Jane,88,79,67
2,Suzie,94,74,95
3,Billy,98,96,88
4,Ada,77,92,98


#### 1a. Create a column named passing_english that indicates whether each student has a passing grade in english.

In [4]:
df['passing_english'] = df.math > 70

#### 1b. Sort the english grades by the passing_english column.

In [5]:
df.sort_values(by='passing_english')

Unnamed: 0,name,math,english,reading,passing_english
0,Sally,62,85,80,False
9,Richard,69,80,94,False
1,Jane,88,79,67,True
2,Suzie,94,74,95,True
3,Billy,98,96,88,True
4,Ada,77,92,98,True
5,John,79,76,93,True
6,Thomas,82,64,81,True
7,Marie,93,63,90,True
8,Albert,92,62,87,True


#### 1c. Sort the english grades first by passing_english and then by student name. All the students that are failing english should be first, and within the students that are failing english they should be ordered alphabetically. The same should be true for the students passing english. (Hint: you can pass a list to the .sort_values method)

In [6]:
df.sort_values(by=['passing_english', 'name'])

Unnamed: 0,name,math,english,reading,passing_english
9,Richard,69,80,94,False
0,Sally,62,85,80,False
4,Ada,77,92,98,True
11,Alan,92,62,72,True
8,Albert,92,62,87,True
3,Billy,98,96,88,True
10,Isaac,92,99,93,True
1,Jane,88,79,67,True
5,John,79,76,93,True
7,Marie,93,63,90,True


#### 1d. Sort the english grades first by passing_english, and then by the actual english grade, similar to how we did in the last step.

In [7]:
df.sort_values(by=['passing_english', 'english'])

Unnamed: 0,name,math,english,reading,passing_english
9,Richard,69,80,94,False
0,Sally,62,85,80,False
8,Albert,92,62,87,True
11,Alan,92,62,72,True
7,Marie,93,63,90,True
6,Thomas,82,64,81,True
2,Suzie,94,74,95,True
5,John,79,76,93,True
1,Jane,88,79,67,True
4,Ada,77,92,98,True


#### 1e. Calculate each students overall grade and add it as a column on the dataframe. The overall grade is the average of the math, english, and reading grades.

In [8]:
df['overall_grade'] = df[['math', 'english', 'reading']].mean(axis=1).round(2)

#### 2. Load the mpg dataset. 

In [9]:
mpg = data('mpg')

#### How many rows and columns are there?

In [10]:
mpg.shape

(234, 11)

#### What are the data types of each column?


In [11]:
mpg.dtypes

manufacturer     object
model            object
displ           float64
year              int64
cyl               int64
trans            object
drv              object
cty               int64
hwy               int64
fl               object
class            object
dtype: object

#### Summarize the dataframe with .info and .describe


In [12]:
mpg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 234 entries, 1 to 234
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   manufacturer  234 non-null    object 
 1   model         234 non-null    object 
 2   displ         234 non-null    float64
 3   year          234 non-null    int64  
 4   cyl           234 non-null    int64  
 5   trans         234 non-null    object 
 6   drv           234 non-null    object 
 7   cty           234 non-null    int64  
 8   hwy           234 non-null    int64  
 9   fl            234 non-null    object 
 10  class         234 non-null    object 
dtypes: float64(1), int64(4), object(6)
memory usage: 21.9+ KB


In [13]:
mpg.describe()

Unnamed: 0,displ,year,cyl,cty,hwy
count,234.0,234.0,234.0,234.0,234.0
mean,3.471795,2003.5,5.888889,16.858974,23.440171
std,1.291959,4.509646,1.611534,4.255946,5.954643
min,1.6,1999.0,4.0,9.0,12.0
25%,2.4,1999.0,4.0,14.0,18.0
50%,3.3,2003.5,6.0,17.0,24.0
75%,4.6,2008.0,8.0,19.0,27.0
max,7.0,2008.0,8.0,35.0,44.0


#### Rename the cty column to city.


In [14]:
mpg = mpg.rename(columns={'cty': 'city'})

#### Rename the hwy column to highway.


In [15]:
mpg = mpg.rename(columns={'hwy': 'highway'})

#### Do any cars have better city mileage than highway mileage?


In [16]:
mpg[mpg.city > mpg.highway]
# returns a df with 0 rows, therefore, there are no cars that meet this criteria

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class


#### Create a column named mileage_difference this column should contain the difference between highway and city mileage for each car.


In [17]:
mpg['mileage_difference'] = mpg.highway - mpg.city 

#### Which car (or cars) has the highest mileage difference?


In [18]:
mpg[mpg.mileage_difference == mpg.mileage_difference.max()]

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference
107,honda,civic,1.8,2008,4,auto(l5),f,24,36,c,subcompact,12
223,volkswagen,new beetle,1.9,1999,4,auto(l4),f,29,41,d,subcompact,12


#### Which compact class car has the lowest highway mileage?


In [19]:
compacts = mpg[mpg['class'] == 'compact']

In [20]:
compacts[compacts.highway == compacts.highway.min()]

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference
220,volkswagen,jetta,2.8,1999,6,auto(l4),f,16,23,r,compact,7


#### The best?

In [21]:
compacts[compacts.highway == compacts.highway.max()]

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference
213,volkswagen,jetta,1.9,1999,4,manual(m5),f,33,44,d,compact,11


#### Create a column named average_mileage that is the mean of the city and highway mileage.


In [22]:
mpg['average_mileage'] = mpg[['city', 'highway']].mean(axis=1)

#### Which dodge car has the best average mileage? The worst?


In [23]:
dodges = mpg[mpg.manufacturer == 'dodge']

In [24]:
dodges[dodges.average_mileage == dodges.average_mileage.max()]

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,city,highway,fl,class,mileage_difference,average_mileage
38,dodge,caravan 2wd,2.4,1999,4,auto(l3),f,18,24,r,minivan,6,21.0


#### Load the Mammals dataset.


In [25]:
mammals = data('Mammals')

####    How many rows and columns are there?


In [26]:
mammals.shape

(107, 4)

#### What are the data types?


In [27]:
mammals.dtypes

weight      float64
speed       float64
hoppers        bool
specials       bool
dtype: object

#### Summarize the dataframe with .info and .describe


In [28]:
mammals.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 107 entries, 1 to 107
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   weight    107 non-null    float64
 1   speed     107 non-null    float64
 2   hoppers   107 non-null    bool   
 3   specials  107 non-null    bool   
dtypes: bool(2), float64(2)
memory usage: 2.7 KB


In [29]:
mammals.describe()

Unnamed: 0,weight,speed
count,107.0,107.0
mean,278.688178,46.208411
std,839.608269,26.716778
min,0.016,1.6
25%,1.7,22.5
50%,34.0,48.0
75%,142.5,65.0
max,6000.0,110.0


#### What is the the weight of the fastest animal?


In [30]:
mammals.weight[mammals.speed == mammals.speed.max()]

53    55.0
Name: weight, dtype: float64

#### What is the overall percentage of specials?


In [31]:
f'{round(mammals.specials.mean() * 100, 2)}%'

'9.35%'

#### How many animals are hoppers that are above the median speed? 


In [32]:
mammals[mammals.speed > mammals.speed.median()].hoppers.sum()

7

#### What percentage is this?

In [33]:
total_hoppers = mammals.hoppers.sum()
total_above_median_speed = len(mammals[mammals.speed > mammals.speed.median()])
hoppers_above_median_speed = mammals[mammals.speed > mammals.speed.median()].hoppers.sum()

In [34]:
# percentage (proportion) of hoppers that are above the median speed out of all mammals
hoppers_above_median_speed / len(mammals)

0.06542056074766354

In [35]:
# percentage (proportion) of hoppers that are above the median speed out of all mammals that are hoppers
hoppers_above_median_speed / total_hoppers

0.6363636363636364

In [36]:
# percentage (proportion) of hoppers that are above the median speed out of all mammas that are above the median speed
hoppers_above_median_speed / total_above_median_speed

0.1320754716981132