In [97]:
from pydataset import data

import pandas as pd
import numpy as np

When the instructions say to load a dataset, pass the name of the dataset as a string to the `data` function (for example, `data('mpg')`). To view the documentation for a dataset, also pass the keyword argument `show_doc=True`.


In [98]:
# Load the 'mpg' dataset through pydataset's data() and keep it in `mpg` for the cells below.
mpg = data('mpg') # load the dataset and store it in a variable
# The same call with show_doc=True displays the dataset's documentation instead:
# data('mpg', show_doc=True) # view the documentation for the dataset

In [99]:
#1. Load the mpg dataset. Read the documentation for it, and use the data to answer these questions:

# show_doc=True displays the dataset's documentation (description, format, column notes).
data('mpg', show_doc=True)

# `mpg` is the frame loaded by the earlier data('mpg') cell.
mpg

PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

## Fuel economy data from 1999 and 2008 for 38 popular models of car

### Description

This dataset contains a subset of the fuel economy data that the EPA makes
available on http://fueleconomy.gov. It contains only models which had a new
release every year between 1999 and 2008 - this was used as a proxy for the
popularity of the car.

### Usage

    data(mpg)

### Format

A data frame with 234 rows and 11 variables

### Details

  * manufacturer. 

  * model. 

  * displ. engine displacement, in litres 

  * year. 

  * cyl. number of cylinders 

  * trans. type of transmission 

  * drv. f = front-wheel drive, r = rear wheel drive, 4 = 4wd 

  * cty. city miles per gallon 

  * hwy. highway miles per gallon 

  * fl. 

  * class. 




In [100]:
# Display the loaded frame; the notebook renders a truncated preview (first and last rows).
mpg

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact
...,...,...,...,...,...,...,...,...,...,...,...
230,volkswagen,passat,2.0,2008,4,auto(s6),f,19,28,p,midsize
231,volkswagen,passat,2.0,2008,4,manual(m6),f,21,29,p,midsize
232,volkswagen,passat,2.8,1999,6,auto(l5),f,16,26,p,midsize
233,volkswagen,passat,2.8,1999,6,manual(m5),f,18,26,p,midsize


In [127]:
#1a. On average, which manufacturer has the best miles per gallon?
# Per-car figure: the midpoint of the city and highway ratings. It is stored on
# `mpg` itself because later cells read the `avg_mpg` column.
mpg["avg_mpg"] = mpg["cty"].add(mpg["hwy"]).div(2)

# Mean of that figure per manufacturer; idxmax returns the label with the largest mean.
mpg.groupby("manufacturer")["avg_mpg"].mean().idxmax()


'honda'

In [123]:
#1b. How many different manufacturers are there?
# Number of distinct values in the manufacturer column.
mpg["manufacturer"].nunique()


15

In [103]:
#1c. How many different models are there?
# Number of distinct values in the model column.
mpg["model"].nunique()

38

In [133]:
#1d. Do automatic or manual cars have better miles per gallon?
# `trans` values look like 'auto(l5)' or 'manual(m5)', so the prefix identifies
# the transmission type. Relies on the `avg_mpg` column added in 1a.
automatics = mpg[mpg.trans.str.startswith('auto')]
manuals = mpg[mpg.trans.str.startswith('manual')]

auto_avg = automatics.avg_mpg.mean()
manual_avg = manuals.avg_mpg.mean()
print(auto_avg)
print(manual_avg)

# Derive the conclusion from the computed means rather than hard-coding it, so
# the printed sentence cannot drift out of sync with the numbers above.
better = 'Manual' if manual_avg > auto_avg else 'Automatic'
print(f'{better} cars have better average gas mileage.')

19.130573248407643
22.227272727272727
Manual cars have better average gas mileage.


In [105]:
#2. Joining and Merging
# Recreate the `users` frame from the lesson examples. The last two users have
# no role, so their `role_id` is NaN (which makes the whole column float).
users = pd.DataFrame(
    dict(
        id=[1, 2, 3, 4, 5, 6],
        name=['bob', 'joe', 'sally', 'adam', 'jane', 'mike'],
        role_id=[1, 2, 3, 3, np.nan, np.nan],
    )
)
users

Unnamed: 0,id,name,role_id
0,1,bob,1.0
1,2,joe,2.0
2,3,sally,3.0
3,4,adam,3.0
4,5,jane,
5,6,mike,


In [106]:
# Lookup table of roles; `id` is the key that users.role_id refers to.
roles = pd.DataFrame(
    dict(
        id=[1, 2, 3, 4],
        name=['admin', 'author', 'reviewer', 'commenter'],
    )
)
roles

Unnamed: 0,id,name
0,1,admin
1,2,author
2,3,reviewer
3,4,commenter


In [134]:
# What do you think a right join would look like?
# how='right' keeps every row of `roles` (the right frame); a role with no
# matching user gets missing values in the user columns.
right_join = users.merge(roles, how='right', left_on='role_id', right_on='id')
right_join

Unnamed: 0,id_x,name_x,role_id,id_y,name_y
0,1.0,bob,1.0,1,admin
1,2.0,joe,2.0,2,author
2,3.0,sally,3.0,3,reviewer
3,4.0,adam,3.0,3,reviewer
4,,,,4,commenter


In [135]:
# An outer join?
# how='outer' keeps the rows of both frames, filling the side without a match
# with missing values.
outer_join = users.merge(roles, how='outer', left_on='role_id', right_on='id')
outer_join

Unnamed: 0,id_x,name_x,role_id,id_y,name_y
0,1.0,bob,1.0,1.0,admin
1,2.0,joe,2.0,2.0,author
2,3.0,sally,3.0,3.0,reviewer
3,4.0,adam,3.0,3.0,reviewer
4,5.0,jane,,,
5,6.0,mike,,,
6,,,,4.0,commenter


In [148]:
# What happens if you drop the foreign keys from the dataframes and try to merge them?
users_test = users.drop('role_id', axis='columns')
roles_test = roles.drop('id', axis='columns')

# With no key arguments, merge joins on the columns the two frames share -- here
# only `name`. No user name equals a role name, so the result keeps the columns
# id and name but does not include any rows.
dropped_fks = users_test.merge(roles_test)
dropped_fks


Unnamed: 0,id,name


In [139]:
#3. Getting data from SQL databases
from env import host, user, password


In [140]:
#3a. Create a function named get_db_url. 
# It should accept a username, hostname, password, and database name and return a url formatted like in the examples in this lesson.
def get_db_url(user, host, password, db):
    """Build a SQLAlchemy connection URL for a MySQL database using pymysql.

    Args:
        user: Database username.
        host: Hostname (optionally ``host:port``) of the database server.
        password: Password for ``user``.
        db: Name of the database to connect to.

    Returns:
        A string of the form ``mysql+pymysql://user:password@host/db``. The
        user and password are percent-encoded so that characters such as
        ``@``, ``/`` or ``:`` in a credential cannot be mistaken for URL
        delimiters. Credentials without such characters come out unchanged.
    """
    # Local import keeps the definition self-contained in its cell.
    from urllib.parse import quote

    # safe="" also encodes "/", which quote() would otherwise leave untouched.
    safe_user = quote(str(user), safe="")
    safe_password = quote(str(password), safe="")
    return f'mysql+pymysql://{safe_user}:{safe_password}@{host}/{db}'

In [142]:
#3b. Use your function to obtain a connection to the employees database.
# The query text can also be passed inline:
#   pd.read_sql('select * from employees', get_db_url(user, host, password, 'employees'))
sql = """
select *
from employees
"""

pd.read_sql(sql=sql, con=get_db_url(user=user, host=host, password=password, db='employees'))

Unnamed: 0,emp_no,birth_date,first_name,last_name,gender,hire_date
0,10001,1953-09-02,Georgi,Facello,M,1986-06-26
1,10002,1964-06-02,Bezalel,Simmel,F,1985-11-21
2,10003,1959-12-03,Parto,Bamford,M,1986-08-28
3,10004,1954-05-01,Chirstian,Koblick,M,1986-12-01
4,10005,1955-01-21,Kyoichi,Maliniak,M,1989-09-12
...,...,...,...,...,...,...
300019,499995,1958-09-24,Dekang,Lichtner,F,1993-01-12
300020,499996,1953-03-07,Zito,Baaz,M,1990-09-27
300021,499997,1961-08-03,Berhard,Lenart,M,1986-04-21
300022,499998,1956-09-05,Patricia,Breugel,M,1993-10-13


In [146]:
#3c. Once you have successfully run a query:
# Intentionally make a typo in the database url. What kind of error message do you see?
# 'emp' is the deliberately wrong database name; the query is the same one that ran in 3b.
sql = """
select *
from employees
"""

pd.read_sql(sql=sql, con=get_db_url(user=user, host=host, password=password, db='emp'))

OperationalError: (pymysql.err.OperationalError) (1044, "Access denied for user 'easley_1260'@'%' to database 'emp'")
(Background on this error at: http://sqlalche.me/e/13/e3q8)

In [147]:
# Intentionally make an error in your SQL query. What does the error message look like?
# `form` (instead of `from`) is the deliberate mistake; the database name is correct.
sql = """
select *
form emp
"""

pd.read_sql(sql=sql, con=get_db_url(user=user, host=host, password=password, db='employees'))

ProgrammingError: (pymysql.err.ProgrammingError) (1064, "You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near 'form emp' at line 2")
[SQL: 
select *
form emp
]
(Background on this error at: http://sqlalche.me/e/13/f405)

In [155]:
#3d. Read the employees and titles tables into two separate dataframes
# Pull the whole employees table into a frame, then preview its first rows.
sql_employees = """
select * from employees
"""
employees = pd.read_sql(
    sql=sql_employees,
    con=get_db_url(user=user, host=host, password=password, db='employees'),
)
employees.head()
                      

Unnamed: 0,emp_no,birth_date,first_name,last_name,gender,hire_date
0,10001,1953-09-02,Georgi,Facello,M,1986-06-26
1,10002,1964-06-02,Bezalel,Simmel,F,1985-11-21
2,10003,1959-12-03,Parto,Bamford,M,1986-08-28
3,10004,1954-05-01,Chirstian,Koblick,M,1986-12-01
4,10005,1955-01-21,Kyoichi,Maliniak,M,1989-09-12


In [156]:
# Pull the whole titles table (same database) into a frame, then preview its first rows.
sql_titles = """
select * from titles
"""
titles = pd.read_sql(
    sql=sql_titles,
    con=get_db_url(user=user, host=host, password=password, db='employees'),
)
titles.head()

Unnamed: 0,emp_no,title,from_date,to_date
0,10001,Senior Engineer,1986-06-26,9999-01-01
1,10002,Staff,1996-08-03,9999-01-01
2,10003,Senior Engineer,1995-12-03,9999-01-01
3,10004,Engineer,1986-12-01,1995-12-01
4,10004,Senior Engineer,1995-12-01,9999-01-01


In [158]:
#3e. Visualize the number of employees with each title.
# NOTE(review): this cell only prints the column/dtype summary of both frames;
# the visualization asked for above has not been written yet.
for frame in (employees, titles):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300024 entries, 0 to 300023
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   emp_no      300024 non-null  int64 
 1   birth_date  300024 non-null  object
 2   first_name  300024 non-null  object
 3   last_name   300024 non-null  object
 4   gender      300024 non-null  object
 5   hire_date   300024 non-null  object
dtypes: int64(1), object(5)
memory usage: 13.7+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 443308 entries, 0 to 443307
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   emp_no     443308 non-null  int64 
 1   title      443308 non-null  object
 2   from_date  443308 non-null  object
 3   to_date    443308 non-null  object
dtypes: int64(1), object(3)
memory usage: 13.5+ MB


In [164]:
#3f. Join the employees and titles dataframes together.
# Match rows on the shared `emp_no` column (default inner join); an employee
# with several titles appears once per title.
emp_and_titles = pd.merge(employees, titles, on='emp_no')
emp_and_titles

Unnamed: 0,emp_no,birth_date,first_name,last_name,gender,hire_date,title,from_date,to_date
0,10001,1953-09-02,Georgi,Facello,M,1986-06-26,Senior Engineer,1986-06-26,9999-01-01
1,10002,1964-06-02,Bezalel,Simmel,F,1985-11-21,Staff,1996-08-03,9999-01-01
2,10003,1959-12-03,Parto,Bamford,M,1986-08-28,Senior Engineer,1995-12-03,9999-01-01
3,10004,1954-05-01,Chirstian,Koblick,M,1986-12-01,Engineer,1986-12-01,1995-12-01
4,10004,1954-05-01,Chirstian,Koblick,M,1986-12-01,Senior Engineer,1995-12-01,9999-01-01
...,...,...,...,...,...,...,...,...,...
443303,499997,1961-08-03,Berhard,Lenart,M,1986-04-21,Engineer,1987-08-30,1992-08-29
443304,499997,1961-08-03,Berhard,Lenart,M,1986-04-21,Senior Engineer,1992-08-29,9999-01-01
443305,499998,1956-09-05,Patricia,Breugel,M,1993-10-13,Senior Staff,1998-12-27,9999-01-01
443306,499998,1956-09-05,Patricia,Breugel,M,1993-10-13,Staff,1993-12-27,1998-12-27


In [166]:
#3g. Visualize how frequently employees change titles.
# Step 1: rows per employee in the joined frame, i.e. how many titles each has held.
title_changes = emp_and_titles['emp_no'].value_counts()
# Step 2: how many employees share each of those per-employee counts.
title_changes.value_counts()

1    159754
2    137256
3      3014
Name: emp_no, dtype: int64

In [168]:
#3h. For each title, find the hire date of the employee that was hired most recently with that title.
# Largest hire_date within each title group.
emp_and_titles.groupby('title')['hire_date'].max()

# equivalent attribute-access spelling:
# emp_and_titles.groupby('title').hire_date.max()

title
Assistant Engineer    1999-12-24
Engineer              2000-01-28
Manager               1992-02-05
Senior Engineer       2000-01-01
Senior Staff          2000-01-13
Staff                 2000-01-12
Technique Leader      1999-12-31
Name: hire_date, dtype: object

In [None]:
#3i. Write the code necessary to create a cross tabulation of the number of titles by department. 
# (Hint: this will involve a combination of SQL and python/pandas code)
# NOTE(review): unfinished -- the line below only references the function object
# and never calls it, so no cross tabulation is produced yet.
# TODO: query the department and title data, then pass the two columns to pd.crosstab.
pd.crosstab

In [None]:
#4. Use your get_db_url function to help you explore the data from the chipotle database. 
# Use the data to answer the following questions



In [None]:
#4a. What is the total price for each order?


In [None]:
#4b. What are the most popular 3 items?


In [12]:
#4c. Which item has produced the most revenue?


Extra Pandas Exercises and Resources

https://www.w3resource.com/python-exercises/pandas/index.php

https://towardsdatascience.com/20-pandas-functions-that-will-boost-your-data-analysis-process-f5dfdb2f9e05

https://github.com/guipsamora/pandas_exercises

https://github.com/ajcr/100-pandas-puzzles


More Practice!
For even more practice with pandas, you can redo the exercises from the SQL module, but instead of using SQL to do the aggregation, sorting, joining, etc., use pandas. That is, read the data from each of the tables into a pandas dataframe and manipulate the dataframes.