In [1]:
import pandas as pd
import numpy as np
from pydataset import data
from env import host, user, password

## Exercises 1

#### 3. Create a function named get_db_url. It should accept a username, hostname, password, and database name and return a url connection string formatted like in the example at the start of this lesson.

In [2]:
def get_db_url(host, username, password, database):
    return f'mysql+pymysql://{user}:{password}@{host}/{database}'

#### 4. Use your function to obtain a connection to the employees database.


In [3]:
url = get_db_url(host, user, password, 'employees')

#### 5. Successfully run a query

In [4]:
sql = 'SELECT * FROM employees LIMIT 10'
pd.read_sql(sql, url)

Unnamed: 0,emp_no,birth_date,first_name,last_name,gender,hire_date
0,10001,1953-09-02,Georgi,Facello,M,1986-06-26
1,10002,1964-06-02,Bezalel,Simmel,F,1985-11-21
2,10003,1959-12-03,Parto,Bamford,M,1986-08-28
3,10004,1954-05-01,Chirstian,Koblick,M,1986-12-01
4,10005,1955-01-21,Kyoichi,Maliniak,M,1989-09-12
5,10006,1953-04-20,Anneke,Preusig,F,1989-06-02
6,10007,1957-05-23,Tzvetan,Zielinski,F,1989-02-10
7,10008,1958-02-19,Saniya,Kalloufi,M,1994-09-15
8,10009,1952-04-19,Sumant,Peac,F,1985-02-18
9,10010,1963-06-01,Duangkaew,Piveteau,F,1989-08-24


##### 5a. Intentionally make a typo in the database url. What kind of error message do you see?

In [None]:
typo_url = f'myseequill+pymysql://{user}:{password}@{host}/employees'
# sql >> seequill
    
pd.read_sql(sql, typo_url)

# returns a NoSuchModuleError

In [None]:
typo_url2 = f'mysql+pymysql{user}:{password}@{host}/employees'
# missing ://

pd.read_sql(sql, typo_url2)

# returns an ArgumentError

In [None]:
typo_url3 = f'mysql+pymysql://{user}:{password}@{host}/emplyees'
# typo in db name

pd.read_sql(sql, typo_url3)
# returns operational error

#### 5b. Intentionally make an error in your SQL query. What does the error message look like?

In [None]:
typo_sql = 'ELECT * FROM employees LIMIT 10'

pd.read_sql(typo_sql, url)

# Returns a ProgrammingError that contains the SQL syntax error message

#### 6. Read the employees and titles tables into two separate DataFrames.

In [5]:
sql = 'SELECT * FROM employees'
employees_df = pd.DataFrame(pd.read_sql(sql, url))

In [6]:
sql = 'SELECT * FROM titles'
titles_df = pd.DataFrame(pd.read_sql(sql, url))

#### 7. How many rows and columns do you have in each DataFrame? Is that what you expected?

In [7]:
employees_df.shape
# yes, as expected

(300024, 6)

In [8]:
titles_df.shape
# yes, as expected

(443308, 4)

#### 8. Display the summary statistics for each DataFrame.


In [9]:
employees_df.describe()

Unnamed: 0,emp_no
count,300024.0
mean,253321.763392
std,161828.23554
min,10001.0
25%,85006.75
50%,249987.5
75%,424993.25
max,499999.0


In [10]:
titles_df.describe()

Unnamed: 0,emp_no
count,443308.0
mean,253075.03443
std,161853.292613
min,10001.0
25%,84855.75
50%,249847.5
75%,424891.25
max,499999.0


#### 9. How many unique titles are in the titles DataFrame?


In [11]:
titles_df.title.unique().size

7

#### 10. What is the oldest date in the to_date column?

In [12]:
titles_df.to_date.min()

datetime.date(1985, 3, 1)

#### 11. What is the most recent date in the to_date column?


In [13]:
titles_df.to_date.max()

datetime.date(9999, 1, 1)

In [14]:
# excluding the 'current' date
current = titles_df.to_date.max()
titles_df.to_date[titles_df.to_date != current].max()

datetime.date(2002, 8, 1)

## Exercises 2

#### 1. Copy the users and roles DataFrames from the examples.


In [15]:
users = pd.DataFrame({
    'id': [1, 2, 3, 4, 5, 6],
    'name': ['bob', 'joe', 'sally', 'adam', 'jane', 'mike'],
    'role_id': [1, 2, 3, 3, np.nan, np.nan]
})

In [16]:
roles = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'name': ['admin', 'author', 'reviewer', 'commenter']
})

In [17]:
users

Unnamed: 0,id,name,role_id
0,1,bob,1.0
1,2,joe,2.0
2,3,sally,3.0
3,4,adam,3.0
4,5,jane,
5,6,mike,


In [18]:
roles

Unnamed: 0,id,name
0,1,admin
1,2,author
2,3,reviewer
3,4,commenter


#### 2. What is the result of using a right join on the DataFrames?


In [19]:
users.merge(roles, how='right')

Unnamed: 0,id,name,role_id
0,1,admin,
1,2,author,
2,3,reviewer,
3,4,commenter,


#### 3. What is the result of using an outer join on the DataFrames?


In [20]:
users.merge(roles, how='outer', left_on='role_id', right_on='id')

Unnamed: 0,id_x,name_x,role_id,id_y,name_y
0,1.0,bob,1.0,1.0,admin
1,2.0,joe,2.0,2.0,author
2,3.0,sally,3.0,3.0,reviewer
3,4.0,adam,3.0,3.0,reviewer
4,5.0,jane,,,
5,6.0,mike,,,
6,,,,4.0,commenter


#### 4. What happens if you drop the foreign keys from the DataFrames and try to merge them?


In [21]:
new_users = users.drop(columns='role_id')

In [22]:
new_users.merge(roles, how='outer')
# no longer able to join on the proper columns since the foreign key doesn't exist

Unnamed: 0,id,name
0,1,bob
1,2,joe
2,3,sally
3,4,adam
4,5,jane
5,6,mike
6,1,admin
7,2,author
8,3,reviewer
9,4,commenter
