# Advanced Dataframes

## Example problems from the curriculum

In [1]:
from urllib.parse import quote

import numpy as np
import pandas as pd
from pydataset import data

Format for connection string: protocol://[user[:password]@]hostname/[database_name]

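Example of what one might look like: mysql+pymysql://codeup:p%40assw0rd@123.123.123.123/some_db (here the password is `p@assw0rd`; special characters such as `@` must be percent-encoded inside the URL, otherwise they are read as delimiters)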


In [2]:
# import necessary info from env.py file

from env import host, user, password

In [3]:
# create url
# the credentials are percent-encoded so that characters such as '@', ':' or '/'
# in a username or password are not parsed as URL delimiters

url = f'mysql+pymysql://{quote(user, safe="")}:{quote(password, safe="")}@{host}/employees'

In [4]:
# read in some data using this url
# format is pd.read_sql(query, url)
# ORDER BY pins down which rows the LIMIT/OFFSET window selects; without it
# the server is free to return rows in any order

pd.read_sql('SELECT * FROM employees ORDER BY emp_no LIMIT 5 OFFSET 50', url)

Unnamed: 0,emp_no,birth_date,first_name,last_name,gender,hire_date
0,10051,1953-07-28,Hidefumi,Caine,M,1992-10-15
1,10052,1961-02-26,Heping,Nitsch,M,1988-05-21
2,10053,1954-09-13,Sanjiv,Zschoche,F,1986-02-04
3,10054,1957-04-04,Mayumi,Schueller,M,1995-03-13
4,10055,1956-06-06,Georgy,Dredge,M,1992-04-27


In [5]:
# Query text: id and name columns for up to 100 employees whose gender is 'F'.
sql = '''
SELECT
    emp_no,
    first_name,
    last_name
FROM employees
WHERE gender = 'F'
LIMIT 100
'''

# Run the query over the connection URL and preview the first five rows.
employees = pd.read_sql(sql=sql, con=url)
employees.head(5)

Unnamed: 0,emp_no,first_name,last_name
0,10002,Bezalel,Simmel
1,10006,Anneke,Preusig
2,10007,Tzvetan,Zielinski
3,10009,Sumant,Peac
4,10010,Duangkaew,Piveteau


In [6]:
# Query text: pair each title with a department name by joining
# titles -> dept_emp (on emp_no) -> departments (on dept_no), capped at 100 rows.
query = '''
SELECT
    t.title as title,
    d.dept_name as dept_name
FROM titles t
JOIN dept_emp USING (emp_no)
JOIN departments d USING (dept_no)
LIMIT 100
'''

# Run the query over the connection URL and preview the first five rows.
title_dept = pd.read_sql(sql=query, con=url)
title_dept.head(5)

Unnamed: 0,title,dept_name
0,Staff,Customer Service
1,Senior Staff,Customer Service
2,Staff,Customer Service
3,Senior Staff,Customer Service
4,Staff,Customer Service


## Exercises I

#### 1. Create a function named get_db_url. It should accept a username, hostname, password, and database name and return a url connection string formatted like in the example at the start of this lesson.

In [7]:
def get_db_url(user, host, password, database):
    """Build a ``mysql+pymysql`` connection URL.

    Parameters
    ----------
    user : str
        Database username (given as plain text, not already percent-encoded).
    host : str
        Hostname or IP address of the database server.
    password : str
        Password for ``user`` (given as plain text, not already percent-encoded).
    database : str
        Name of the database to connect to.

    Returns
    -------
    str
        URL of the form ``mysql+pymysql://user:password@host/database``.
    """
    # Percent-encode the credentials so characters such as '@', ':' or '/'
    # cannot be mistaken for the URL's own delimiters.
    safe_user = quote(str(user), safe='')
    safe_password = quote(str(password), safe='')
    return f'mysql+pymysql://{safe_user}:{safe_password}@{host}/{database}'

#### 2. Use your function to obtain a connection to the employees database.

In [8]:
pd.read_sql('SELECT * FROM employees LIMIT 5', get_db_url(user, host, password, 'employees'))

Unnamed: 0,emp_no,birth_date,first_name,last_name,gender,hire_date
0,10001,1953-09-02,Georgi,Facello,M,1986-06-26
1,10002,1964-06-02,Bezalel,Simmel,F,1985-11-21
2,10003,1959-12-03,Parto,Bamford,M,1986-08-28
3,10004,1954-05-01,Chirstian,Koblick,M,1986-12-01
4,10005,1955-01-21,Kyoichi,Maliniak,M,1989-09-12


#### 3. Once you have successfully run a query:


##### a. Intentionally make a typo in the database url. What kind of error message do you see?

In [9]:
#pd.read_sql('SELECT * FROM employees LIMIT 5', get_db_url(user, host, password, 'employeesss'))

# OperationalError: (pymysql.err.OperationalError) (1044, "Access denied for user 'germain_1478'@'%' to database 'employeesss'")
# (Background on this error at: http://sqlalche.me/e/14/e3q8)

##### b. Intentionally make an error in your SQL query. What does the error message look like?

In [10]:
# pd.read_sql('SELECT FROM employees LIMIT 5', get_db_url(user, host, password, 'employees'))

# ProgrammingError: (pymysql.err.ProgrammingError) (1064, "You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near 'FROM employees LIMIT 5' at line 1")
# [SQL: SELECT FROM employees LIMIT 5]
# (Background on this error at: http://sqlalche.me/e/14/f405)

#### 4. Read the employees and titles tables into two separate DataFrames.

In [11]:
employees = pd.read_sql('SELECT * FROM employees', get_db_url(user, host, password, 'employees'))
employees.head()

Unnamed: 0,emp_no,birth_date,first_name,last_name,gender,hire_date
0,10001,1953-09-02,Georgi,Facello,M,1986-06-26
1,10002,1964-06-02,Bezalel,Simmel,F,1985-11-21
2,10003,1959-12-03,Parto,Bamford,M,1986-08-28
3,10004,1954-05-01,Chirstian,Koblick,M,1986-12-01
4,10005,1955-01-21,Kyoichi,Maliniak,M,1989-09-12


In [12]:
titles = pd.read_sql('SELECT * FROM titles', get_db_url(user, host, password, 'employees'))
titles.head()

Unnamed: 0,emp_no,title,from_date,to_date
0,10001,Senior Engineer,1986-06-26,9999-01-01
1,10002,Staff,1996-08-03,9999-01-01
2,10003,Senior Engineer,1995-12-03,9999-01-01
3,10004,Engineer,1986-12-01,1995-12-01
4,10004,Senior Engineer,1995-12-01,9999-01-01


#### 5. How many rows and columns do you have in each DataFrame? Is that what you expected?

In [13]:
employees.shape
# yes, this is what I expected (same dimensions as table in SQL Ace)

(300024, 6)

In [14]:
titles.shape
# yes, this is what I expected (same dimensions as table in SQL Ace)

(443308, 4)

#### 6. Display the summary statistics for each DataFrame.

In [15]:
employees.describe()
# not super useful since the emp_no column is the only numerical column in this df

Unnamed: 0,emp_no
count,300024.0
mean,253321.763392
std,161828.23554
min,10001.0
25%,85006.75
50%,249987.5
75%,424993.25
max,499999.0


In [16]:
employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300024 entries, 0 to 300023
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   emp_no      300024 non-null  int64 
 1   birth_date  300024 non-null  object
 2   first_name  300024 non-null  object
 3   last_name   300024 non-null  object
 4   gender      300024 non-null  object
 5   hire_date   300024 non-null  object
dtypes: int64(1), object(5)
memory usage: 13.7+ MB


In [17]:
titles.describe()
# not super useful since the emp_no column is the only numerical column in this df

Unnamed: 0,emp_no
count,443308.0
mean,253075.03443
std,161853.292613
min,10001.0
25%,84855.75
50%,249847.5
75%,424891.25
max,499999.0


In [18]:
titles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 443308 entries, 0 to 443307
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   emp_no     443308 non-null  int64 
 1   title      443308 non-null  object
 2   from_date  443308 non-null  object
 3   to_date    443308 non-null  object
dtypes: int64(1), object(3)
memory usage: 13.5+ MB


#### 7. How many unique titles are in the titles DataFrame?

In [19]:
# nunique() returns the number of distinct non-null values in the column
titles.title.nunique()

7

#### 8. What is the oldest date in the to_date column?

In [20]:
titles.to_date.min()

datetime.date(1985, 3, 1)

In [21]:
titles.sort_values(by='to_date').head(1)

Unnamed: 0,emp_no,title,from_date,to_date
16064,20869,Engineer,1985-02-17,1985-03-01


#### 9. What is the most recent date in the to_date column?

In [22]:
titles.to_date.max()
# this just means currently employed
# would be more useful to find most recent to date that is not this value

datetime.date(9999, 1, 1)

In [23]:
titles.sort_values(by='to_date', ascending=False).head(1)

Unnamed: 0,emp_no,title,from_date,to_date
0,10001,Senior Engineer,1986-06-26,9999-01-01


In [30]:
# titles_cleaned is otherwise only created in a later cell, so build it here
# to keep this cell runnable top-to-bottom on a fresh kernel
titles_cleaned = titles.astype({'from_date': 'datetime64[ns]'})
titles_cleaned.nlargest(1, 'from_date', keep='all')
# I can get my method to work for from_date column since there aren't any "current" dates here

Unnamed: 0,emp_no,title,from_date,to_date
17374,21763,Senior Staff,2002-08-01,9999-01-01
23635,26000,Senior Staff,2002-08-01,9999-01-01
67904,55876,Senior Staff,2002-08-01,9999-01-01
80194,64174,Senior Engineer,2002-08-01,9999-01-01
116305,88539,Senior Staff,2002-08-01,9999-01-01
118696,90134,Senior Staff,2002-08-01,9999-01-01
119479,90666,Senior Engineer,2002-08-01,9999-01-01
128289,96599,Senior Engineer,2002-08-01,9999-01-01
135653,101563,Senior Staff,2002-08-01,9999-01-01
143272,106707,Senior Staff,2002-08-01,9999-01-01


In [26]:
# spell out the nanosecond unit: recent pandas versions reject the unit-less 'datetime64' dtype in astype()
titles_cleaned = titles.astype({'from_date': 'datetime64[ns]'})

In [27]:
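# Haven't been able to convert the to_date column to a datetime dtype: the 9999-01-01
# placeholder used for current titles is outside the range a nanosecond-resolution
# timestamp can represent (pd.Timestamp.max falls in the year 2262), so the conversion raises.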

In [33]:
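# let's try the to_datetime() method

#pd.to_datetime(titles.to_date)
# same error: OutOfBoundsDatetime: Out of bounds nanosecond timestamp: 9999-01-01 00:00:00
# (passing errors='coerce' would turn the out-of-range placeholder into NaT instead of raising)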