# SQL Part 2

## Import pandas and SQLAlchemy

In [1]:
from sqlalchemy import create_engine
import pandas as pd

## Create the connection to MySQL database

- Define the connection to the MySQL database (mysql://user:password@host/database). More detail: http://zetcode.com/db/sqlalchemy/

In [2]:
# Build a SQLAlchemy engine for the PHC7065 database on the local MySQL
# server and open one connection that the following cells reuse.
# NOTE(review): the root password is hard-coded in the URL -- consider reading
# it from an environment variable or config file before sharing this notebook.
eng = create_engine("mysql://root:spr18@localhost/PHC7065")
con = eng.connect()

## Create some tables

In [3]:
# Remove any existing Subject table so the CREATE TABLE that follows starts
# clean and the notebook can be re-run from the top.
sqlq = 'DROP TABLE IF EXISTS Subject;'
con.execute(sqlq)

<sqlalchemy.engine.result.ResultProxy at 0x7fb5e10b55d0>

In [4]:
# DDL for the Subject table: an auto-increment primary key, the subject's
# name, integer-coded gender/age/race, and county_id/state_id columns that the
# JOIN examples later match against County and State.
# This cell only stores the statement in sqlq; the next cell executes it.
sqlq = '''        
        CREATE TABLE Subject(
        id INT NOT NULL AUTO_INCREMENT,
        name VARCHAR(20) NOT NULL,
        gender INT NOT NULL,
        age INT NOT NULL,
        race INT NOT NULL,
        county_id INT NOT NULL,
        state_id INT NOT NULL,
        PRIMARY KEY (id)
        );
'''

In [5]:
# Execute the CREATE TABLE Subject statement currently held in sqlq.
con.execute(sqlq)

<sqlalchemy.engine.result.ResultProxy at 0x7fb5e10b5410>

In [6]:
# Remove any existing County table before it is re-created below.
sqlq = 'DROP TABLE IF EXISTS County;'
con.execute(sqlq)

<sqlalchemy.engine.result.ResultProxy at 0x7fb5e10b5ed0>

In [7]:
# DDL for the County table. County ids are only unique within a state (the
# sample data reuses id 1 and 2 for every state), so the primary key is the
# composite (id, state_id).
# This cell only stores the statement in sqlq; the next cell executes it.
sqlq = '''
        CREATE TABLE County(
        id INT NOT NULL,
        state_id INT NOT NULL,
        name VARCHAR(20) NOT NULL,
        income INT NOT NULL,
        PRIMARY KEY (id,state_id)
        );
'''

In [8]:
# Execute the CREATE TABLE County statement currently held in sqlq.
con.execute(sqlq)

<sqlalchemy.engine.result.ResultProxy at 0x7fb5e10b58d0>

In [9]:
# Remove any existing State table before it is re-created below.
sqlq = 'DROP TABLE IF EXISTS State;'
con.execute(sqlq)

<sqlalchemy.engine.result.ResultProxy at 0x7fb5e21e5510>

In [10]:
# DDL for the State table: auto-increment primary key, state name, an
# integer policy flag, and a date column. `date` is the only nullable column;
# the sample data inserts NULL there for a state whose policy is 0.
# This cell only stores the statement in sqlq; the next cell executes it.
sqlq = '''
        CREATE TABLE State(
        id INT NOT NULL AUTO_INCREMENT,
        name VARCHAR(20) NOT NULL,
        policy INT NOT NULL,
        date DATE,
        PRIMARY KEY (id)
        );
'''

In [11]:
# Execute the CREATE TABLE State statement currently held in sqlq.
con.execute(sqlq)

<sqlalchemy.engine.result.ResultProxy at 0x7fb60c423950>

## Insert some data

In [12]:
# Six sample subjects, two per state (state_id 1-3), one in each of that
# state's two counties. `id` is omitted so AUTO_INCREMENT assigns it.
# This cell only stores the statement in sqlq; the next cell executes it.
sqlq = '''
        INSERT INTO Subject(name,gender,age,race,county_id,state_id)
        VALUES  ('John',1,5,1,1,1),
                ('Mary',0,7,2,2,1),
                ('Mike',1,6,3,1,2),
                ('Linda',0,5,1,2,2),
                ('Lucas',1,4,1,1,3),
                ('Aiden',1,10,4,2,3);
'''

In [13]:
# Execute the INSERT INTO Subject statement currently held in sqlq.
con.execute(sqlq)

<sqlalchemy.engine.result.ResultProxy at 0x7fb5e0e49250>

In [14]:
# Read the whole Subject table back into a DataFrame to confirm the insert.
pd.read_sql(sql="SELECT * FROM Subject",con=con)

Unnamed: 0,id,name,gender,age,race,county_id,state_id
0,1,John,1,5,1,1,1
1,2,Mary,0,7,2,2,1
2,3,Mike,1,6,3,1,2
3,4,Linda,0,5,1,2,2
4,5,Lucas,1,4,1,1,3
5,6,Aiden,1,10,4,2,3


In [15]:
# Two counties per state; the (id, state_id) pair identifies each county.
# This cell only stores the statement in sqlq; the next cell executes it.
sqlq = '''
        INSERT INTO County(id,state_id,name,income)
        VALUES  (1,1,'Alachua',78987),
                (2,1,'Orange',87689),
                (1,2,'Newton',56765),
                (2,2,'Burke',67890),
                (1,3,'Glenn',98678),
                (2,3,'Kings',87908);
'''

In [16]:
# Execute the INSERT INTO County statement currently held in sqlq.
con.execute(sqlq)

<sqlalchemy.engine.result.ResultProxy at 0x7fb5e10b5e90>

In [17]:
# Read the whole County table back into a DataFrame to confirm the insert.
pd.read_sql(sql="SELECT * FROM County",con=con)

Unnamed: 0,id,state_id,name,income
0,1,1,Alachua,78987
1,1,2,Newton,56765
2,1,3,Glenn,98678
3,2,1,Orange,87689
4,2,2,Burke,67890
5,2,3,Kings,87908


In [18]:
# Three sample states. `id` is omitted so AUTO_INCREMENT assigns 1-3 in
# insertion order, matching the state_id values used in Subject and County.
# Georgia has policy 0 and a NULL date.
# This cell only stores the statement in sqlq; the next cell executes it.
sqlq = '''
        INSERT INTO State(name,policy,date)
        VALUES  ('Florida',1,'2002-11-23'),
                ('Georgia',0,NULL),
                ('California',1,'2004-12-23');
'''

In [19]:
# Execute the INSERT INTO State statement currently held in sqlq.
con.execute(sqlq)

<sqlalchemy.engine.result.ResultProxy at 0x7fb5e1b4b810>

In [20]:
# Read the whole State table back into a DataFrame to confirm the insert.
pd.read_sql(sql="SELECT * FROM State",con=con)

Unnamed: 0,id,name,policy,date
0,1,Florida,1,2002-11-23
1,2,Georgia,0,
2,3,California,1,2004-12-23


## JOIN

In [21]:
# Inner join of Subject to State on the state key, returning each subject's
# name together with the name and policy flag of the subject's state.
# State.name is aliased to state_name so it does not clash with Subject.name.
# This cell only stores the query in sqlq; the next cell runs it.
sqlq = '''
        SELECT Subject.name, State.name as state_name, State.policy
        FROM Subject
        JOIN State ON Subject.state_id=State.id
        ;
'''

In [22]:
# Run the query currently held in sqlq and show the result as a DataFrame.
pd.read_sql(sql=sqlq,con=con)

Unnamed: 0,name,state_name,policy
0,John,Florida,1
1,Mary,Florida,1
2,Mike,Georgia,0
3,Linda,Georgia,0
4,Lucas,California,1
5,Aiden,California,1


In [23]:
# Three-way inner join: for every subject, return the subject's name, the
# name of the subject's state, and the income of the subject's county.
# Each JOIN carries its own ON clause (standard SQL) instead of relying on
# MySQL accepting a JOIN with no condition and filtering afterwards.
# A county is identified by (id, state_id), so the County join matches on
# both columns.
# This cell only stores the query in sqlq; the next cell runs it.
sqlq = '''
        SELECT Subject.name, State.name as state_name, County.income
        FROM Subject
        JOIN State
            ON Subject.state_id=State.id
        JOIN County
            ON Subject.state_id=County.state_id
            AND Subject.county_id=County.id
        ;
'''

In [24]:
# Run the query currently held in sqlq and show the result as a DataFrame.
pd.read_sql(sql=sqlq,con=con)

Unnamed: 0,name,state_name,income
0,John,Florida,78987
1,Mary,Florida,87689
2,Mike,Georgia,56765
3,Linda,Georgia,67890
4,Lucas,California,98678
5,Aiden,California,87908


## Aggregate Function

In [25]:
# COUNT over the whole table: total number of subjects, returned as a
# single row with column n.
sqlq = '''
        SELECT COUNT(id) as n
        FROM Subject
        ;
'''
pd.read_sql(sql=sqlq,con=con)

Unnamed: 0,n
0,6


In [26]:
# COUNT combined with GROUP BY: number of subjects in each state, one row
# per distinct state_id.
sqlq = '''
        SELECT COUNT(id) as n, state_id
        FROM Subject
        GROUP BY state_id
        ;
'''
pd.read_sql(sql=sqlq,con=con)

Unnamed: 0,n,state_id
0,2,1
1,2,2
2,2,3


## DISTINCT

In [27]:
# COUNT(DISTINCT ...): number of different states that appear in Subject,
# as opposed to the number of subject rows.
sqlq = '''
        SELECT COUNT(DISTINCT state_id) as nState
        FROM Subject
        ;
'''
pd.read_sql(sql=sqlq,con=con)

Unnamed: 0,nState
0,3


## HAVING

In [28]:
# HAVING filters groups after aggregation: compute the highest county income
# per state, then keep only the states whose maximum exceeds 90000.
sqlq = '''
        SELECT MAX(income) as maxIncome, state_id
        FROM County
        GROUP BY state_id
        HAVING MAX(income)>90000
        ;
'''
pd.read_sql(sql=sqlq,con=con)

Unnamed: 0,maxIncome,state_id
0,98678,3


## CASE

In [29]:
# CASE expression used to recode age into bands: 1 for ages 1-4, 2 for ages
# 5-8, 3 for ages above 8. Any other value (for example 0 or a negative age)
# falls through to the ELSE branch and becomes NULL.
sqlq = '''
        SELECT name,
               gender,
               CASE WHEN age BETWEEN 1 AND 4 THEN 1
               WHEN age BETWEEN 5 AND 8 THEN 2
               WHEN age > 8 THEN 3
               ELSE NULL
               END AS recodeAge
        FROM Subject
        ;
'''
pd.read_sql(sql=sqlq,con=con)

Unnamed: 0,name,gender,recodeAge
0,John,1,2
1,Mary,0,2
2,Mike,1,2
3,Linda,0,2
4,Lucas,1,1
5,Aiden,1,3


# NHIS Data

## Import the 2015 Family File and Household File (https://www.cdc.gov/nchs/nhis/nhis_2015_data_release.htm)

In [30]:
# Load the NHIS 2015 family file and household file from the local dat/
# directory (paths are relative to the notebook's working directory).
family = pd.read_csv('dat/familyxx.csv')
household = pd.read_csv('dat/househld.csv')

In [31]:
# Check the data: preview the first rows of the family file.
family.head()

Unnamed: 0,FINT_Y_P,FINT_M_P,FMX,RECTYPE,SRVY_YR,HHX,FM_SIZE,FM_STRCP,FM_TYPE,FM_STRP,...,COVCONF,FHICOST,FMEDBILL,FMEDBPAY,FMEDBNOP,FSAF,FHICOVCT,FHICOVYN,FPRCOOH,FHIEBCCT
0,2015,1,1,60,2015,4,1,11,1,11,...,,0,2,2,,2,1,1,2.0,0.0
1,2015,3,1,60,2015,5,4,41,4,41,...,2.0,3,1,2,2.0,1,4,1,2.0,4.0
2,2015,2,1,60,2015,11,1,11,1,11,...,2.0,2,2,2,,2,1,1,2.0,1.0
3,2015,1,1,60,2015,13,2,21,2,21,...,4.0,4,2,1,,2,2,1,1.0,2.0
4,2015,3,1,60,2015,16,1,11,1,11,...,,4,2,2,,2,1,1,2.0,0.0


In [32]:
# Preview the first rows of the household file.
household.head()

Unnamed: 0,INTV_QRT,INTV_MON,LIVQRT,ACPTCHLD,REJ_PER,REJ_FAM,ACPT_PER,ACPT_FAM,NON_INTV,RECTYPE,SRVY_YR,HHX,REGION,WTIA_HH,WTFA_HH,STRAT_P,PSU_P
0,1,3,1,,,,,,1.0,10,2015,1,2,3052.5,0,108,2
1,1,3,1,,,,,,1.0,10,2015,2,1,2497.5,0,47,2
2,1,3,1,,,,,,2.0,10,2015,3,1,1086.7,0,19,2
3,1,1,1,0.0,0.0,0.0,1.0,1.0,,10,2015,4,3,3330.0,3330,163,2
4,1,3,1,2.0,0.0,0.0,4.0,1.0,,10,2015,5,4,138.7,185,289,2


## Export the data to MySQL database

In [33]:
# Re-create the engine and connection for the NHIS section, using the same
# URL as at the top of the notebook.
# NOTE(review): if the earlier cells were run, the previous `con` is rebound
# here without being closed; closing it first would avoid a stray connection.
# NOTE(review): the root password is hard-coded in the URL.
eng = create_engine("mysql://root:spr18@localhost/PHC7065")
con = eng.connect()

In [34]:
# Write both DataFrames to MySQL as tables Family and Household, replacing
# any table of the same name and inserting the rows in batches of 100.
# The DataFrame index is written as well (to_sql's default), which is why an
# extra `index` column shows up when the tables are read back below.
family.to_sql(name='Family',con=eng,if_exists='replace',chunksize=100)
household.to_sql(name='Household',con=eng,if_exists='replace',chunksize=100)

## Check the exported data

In [35]:
# Sanity check of the export: fetch 10 rows from the Family table.
# No ORDER BY is given, so which 10 rows come back is up to the server.
sqlq = '''
        SELECT *
        FROM Family
        LIMIT 10
        ;
'''
pd.read_sql(sql=sqlq,con=con)

Unnamed: 0,index,FINT_Y_P,FINT_M_P,FMX,RECTYPE,SRVY_YR,HHX,FM_SIZE,FM_STRCP,FM_TYPE,...,COVCONF,FHICOST,FMEDBILL,FMEDBPAY,FMEDBNOP,FSAF,FHICOVCT,FHICOVYN,FPRCOOH,FHIEBCCT
0,0,2015,1,1,60,2015,4,1,11,1,...,,0,2,2,,2,1,1,2.0,0.0
1,1,2015,3,1,60,2015,5,4,41,4,...,2.0,3,1,2,2.0,1,4,1,2.0,4.0
2,2,2015,2,1,60,2015,11,1,11,1,...,2.0,2,2,2,,2,1,1,2.0,1.0
3,3,2015,1,1,60,2015,13,2,21,2,...,4.0,4,2,1,,2,2,1,1.0,2.0
4,4,2015,3,1,60,2015,16,1,11,1,...,,4,2,2,,2,1,1,2.0,0.0
5,5,2015,2,1,60,2015,19,1,11,1,...,4.0,1,2,2,,2,1,1,2.0,1.0
6,6,2015,1,1,60,2015,21,1,11,1,...,3.0,2,2,1,,2,1,1,2.0,1.0
7,7,2015,1,1,60,2015,23,2,21,2,...,1.0,1,2,2,,1,2,1,2.0,2.0
8,8,2015,3,1,60,2015,24,2,21,2,...,,0,2,2,,2,2,1,2.0,0.0
9,9,2015,2,1,60,2015,25,5,41,4,...,4.0,2,2,2,,2,5,1,2.0,5.0


In [36]:
# Sanity check of the export: fetch 10 rows from the Household table.
# No ORDER BY is given, so which 10 rows come back is up to the server.
sqlq = '''
        SELECT *
        FROM Household
        LIMIT 10
        ;
'''
pd.read_sql(sql=sqlq,con=con)

Unnamed: 0,index,INTV_QRT,INTV_MON,LIVQRT,ACPTCHLD,REJ_PER,REJ_FAM,ACPT_PER,ACPT_FAM,NON_INTV,RECTYPE,SRVY_YR,HHX,REGION,WTIA_HH,WTFA_HH,STRAT_P,PSU_P
0,0,1,3,1,,,,,,1.0,10,2015,1,2,3052.5,0,108,2
1,1,1,3,1,,,,,,1.0,10,2015,2,1,2497.5,0,47,2
2,2,1,3,1,,,,,,2.0,10,2015,3,1,1086.7,0,19,2
3,3,1,1,1,0.0,0.0,0.0,1.0,1.0,,10,2015,4,3,3330.0,3330,163,2
4,4,1,3,1,2.0,0.0,0.0,4.0,1.0,,10,2015,5,4,138.7,185,289,2
5,5,1,2,1,,,,,,1.0,10,2015,6,1,668.8,0,45,1
6,6,1,1,1,,,,,,5.0,10,2015,10,1,3330.0,0,31,2
7,7,1,2,1,0.0,0.0,0.0,1.0,1.0,,10,2015,11,4,2200.0,4400,274,2
8,8,1,1,1,0.0,0.0,0.0,2.0,1.0,,10,2015,13,3,3330.0,4171,199,2
9,9,1,2,1,,,,,,2.0,10,2015,14,2,436.3,0,114,1


## Some simple queries

In [37]:
# Which quarter was the interview conducted in? Show frequencies:
# one row per INTV_QRT value with the number of Household rows in that
# quarter (COUNT(INTV_QRT) counts the rows where INTV_QRT is not NULL).
sqlq = '''
        SELECT INTV_QRT, COUNT(INTV_QRT) AS N_INTV
        FROM Household
        GROUP BY INTV_QRT
        ;
'''
pd.read_sql(sql=sqlq,con=con)

Unnamed: 0,INTV_QRT,N_INTV
0,1,14680
1,2,15407
2,3,14707
3,4,14376


In [38]:
# Remove any existing Temp table so it can be re-created by the next cell.
sqlq = 'DROP TABLE IF EXISTS Temp;'
con.execute(sqlq)

<sqlalchemy.engine.result.ResultProxy at 0x7fb5e0e2fa90>

In [39]:
# What's the maximum number of families within a household?
# Step 1 of 2: materialise a helper table Temp with one row per household
# (GROUP BY HHX) holding the number of Family rows whose FMX is not NULL.
# Despite its name, MAX_N_FAM is the per-household family count; the maximum
# over all households is taken by the query in the following cell.
sqlq = '''
        CREATE TABLE Temp(
        SELECT COUNT(FMX) AS MAX_N_FAM
        FROM Family
        GROUP BY HHX
        )
        ;
'''
con.execute(sqlq)

<sqlalchemy.engine.result.ResultProxy at 0x7fb5e0525590>

In [40]:
# Step 2 of 2: take the largest per-household family count stored in Temp.
sqlq = '''
        SELECT MAX(MAX_N_FAM) 
        FROM Temp
        ;
'''
pd.read_sql(sql=sqlq,con=con)

Unnamed: 0,MAX(MAX_N_FAM)
0,6


In [41]:
# return 1) the highest education level (Family.FM_EDUC1) within a household and 2) the housing type (Household.LIVQRT) for each household

In [42]:
# Remove any existing Temp2 table so it can be re-created by the next cell.
sqlq = 'DROP TABLE IF EXISTS Temp2;'
con.execute(sqlq)

<sqlalchemy.engine.result.ResultProxy at 0x7fb5e0578a50>

In [43]:
# Build helper table Temp2 with one row per Family row: the household id
# (HHX), family id (FMX) and a cleaned education level. FM_EDUC1 values 97-99
# are replaced by NULL so they cannot be picked up as the "highest" level by
# a later MAX().
# NOTE(review): 97-99 are presumably the survey's refused / not ascertained /
# don't know codes -- confirm against the NHIS codebook.
sqlq = '''
        CREATE TABLE Temp2(
        SELECT HHX,FMX,
               CASE WHEN FM_EDUC1 BETWEEN 97 AND 99 THEN NULL
               ELSE FM_EDUC1
               END AS recodeFM_EDUC1
        FROM Family)
        ;
'''
con.execute(sqlq)

<sqlalchemy.engine.result.ResultProxy at 0x7fb5e0525910>

In [44]:
# Remove any existing Temp3 table so it can be re-created by the next cell.
sqlq = 'DROP TABLE IF EXISTS Temp3;'
con.execute(sqlq)

<sqlalchemy.engine.result.ResultProxy at 0x7fb5e056a190>

In [45]:
# Build helper table Temp3 with one row per household: the highest recoded
# education level among that household's families. MAX() skips NULLs, so the
# values set to NULL in Temp2 do not affect the result.
sqlq = '''
        CREATE TABLE Temp3(
        SELECT MAX(recodeFM_EDUC1) AS MaxEDUC, HHX
        FROM Temp2
        GROUP BY Temp2.HHX)
        ;
'''
con.execute(sqlq)

<sqlalchemy.engine.result.ResultProxy at 0x7fb5e0e2fa50>

In [46]:
# Final answer: for each household in Temp3, show its highest education
# level together with the housing type (LIVQRT) from the Household table.
# LEFT JOIN keeps every Temp3 household even if no Household row matches.
# ORDER BY makes the 10-row preview deterministic; without it the server may
# return any 10 rows.
sqlq = '''
        SELECT Temp3.HHX,Temp3.MaxEDUC,Household.LIVQRT
        FROM Temp3
        LEFT JOIN Household ON Temp3.HHX=Household.HHX
        ORDER BY Temp3.HHX
        LIMIT 10
        ;
'''
pd.read_sql(sql=sqlq,con=con)

Unnamed: 0,HHX,MaxEDUC,LIVQRT
0,4,6,1
1,5,8,1
2,11,9,1
3,13,4,1
4,16,8,1
5,19,7,1
6,21,2,1
7,23,8,1
8,24,5,5
9,25,2,1
