In [11]:
import pandas as pd
import numpy as np
from pydataset import data

In [2]:
users = pd.DataFrame({
    'id': [1,2,3,4,5,6],
    'name': ['Bob', 'Joe', 'Adam', 'Sally', 'Jorge', 'Mike'],
    'role_id': [1, 2, 3, 3, np.nan, np.nan]

})
users

Unnamed: 0,id,name,role_id
0,1,Bob,1.0
1,2,Joe,2.0
2,3,Adam,3.0
3,4,Sally,3.0
4,5,Jorge,
5,6,Mike,


In [3]:
roles = pd.DataFrame({
    'id': [1,2,3,4],
    'name': ['admin', 'author', 'reviewer', 'commenter']
    
})
roles

Unnamed: 0,id,name
0,1,admin
1,2,author
2,3,reviewer
3,4,commenter


In [4]:
users.merge(roles, 
            left_on = 'role_id', 
            right_on = 'id', 
            how = 'right', 
            indicator = True)

# It only shows values that have a role due to the right join

Unnamed: 0,id_x,name_x,role_id,id_y,name_y,_merge
0,1.0,Bob,1.0,1,admin,both
1,2.0,Joe,2.0,2,author,both
2,3.0,Adam,3.0,3,reviewer,both
3,4.0,Sally,3.0,3,reviewer,both
4,,,,4,commenter,right_only


In [5]:
users.merge(roles, 
            left_on = 'role_id', 
            right_on = 'id', 
            how = 'outer', 
            indicator = True)

# An outer join will show everything

Unnamed: 0,id_x,name_x,role_id,id_y,name_y,_merge
0,1.0,Bob,1.0,1.0,admin,both
1,2.0,Joe,2.0,2.0,author,both
2,3.0,Adam,3.0,3.0,reviewer,both
3,4.0,Sally,3.0,3.0,reviewer,both
4,5.0,Jorge,,,,left_only
5,6.0,Mike,,,,left_only
6,,,,4.0,commenter,right_only


In [9]:
users.drop(columns = 'role_id')

Unnamed: 0,id,name
0,1,Bob
1,2,Joe
2,3,Adam
3,4,Sally
4,5,Jorge
5,6,Mike


In [10]:
users.merge(roles)

# It will no longer merge due to having no foreign keys

Unnamed: 0,id,name,role_id


In [12]:
mpg = data('mpg')

In [15]:
data('mpg', show_doc = True)

mpg

PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

## Fuel economy data from 1999 and 2008 for 38 popular models of car

### Description

This dataset contains a subset of the fuel economy data that the EPA makes
available on http://fueleconomy.gov. It contains only models which had a new
release every year between 1999 and 2008 - this was used as a proxy for the
popularity of the car.

### Usage

    data(mpg)

### Format

A data frame with 234 rows and 11 variables

### Details

  * manufacturer. 

  * model. 

  * displ. engine displacement, in litres 

  * year. 

  * cyl. number of cylinders 

  * trans. type of transmission 

  * drv. f = front-wheel drive, r = rear wheel drive, 4 = 4wd 

  * cty. city miles per gallon 

  * hwy. highway miles per gallon 

  * fl. 

  * class. 




In [16]:
mpg.shape

#234 rows 11 columns

(234, 11)

In [17]:
mpg.columns

Index(['manufacturer', 'model', 'displ', 'year', 'cyl', 'trans', 'drv', 'cty',
       'hwy', 'fl', 'class'],
      dtype='object')

In [20]:
mpg.describe()

Unnamed: 0,displ,year,cyl,cty,hwy
count,234.0,234.0,234.0,234.0,234.0
mean,3.471795,2003.5,5.888889,16.858974,23.440171
std,1.291959,4.509646,1.611534,4.255946,5.954643
min,1.6,1999.0,4.0,9.0,12.0
25%,2.4,1999.0,4.0,14.0,18.0
50%,3.3,2003.5,6.0,17.0,24.0
75%,4.6,2008.0,8.0,19.0,27.0
max,7.0,2008.0,8.0,35.0,44.0


In [52]:
len(mpg.manufacturer.unique())

15

In [53]:
len(mpg.model.unique())

38

In [72]:
mpg['mileage_difference'] = mpg.hwy - mpg.cty

In [73]:
mpg['mileage_average'] = (mpg.hwy + mpg.cty)/2

In [79]:
mpg['is_automatic'] = np.where(mpg.trans.str.contains('auto'), 'automatic', 'manual')

In [82]:
(
    
    mpg[['manufacturer', 'mileage_average']]
    .groupby('manufacturer')
    .agg('mean')
    .sort_values('mileage_average', ascending = False)
    .head(1)

)

# Honda has the best mpg

Unnamed: 0_level_0,mileage_average
manufacturer,Unnamed: 1_level_1
honda,28.5


In [81]:
mpg[['is_automatic', 'mileage_average']].groupby('is_automatic').agg('mean').sort_values('mileage_average')

# Manual has the best mpg

Unnamed: 0_level_0,mileage_average
is_automatic,Unnamed: 1_level_1
automatic,19.130573
manual,22.227273
