# General Queries

- Recall that a query is just a string, so a we can first a query and then give it to sqldf


In [10]:
import pandas as pd
from pandasql import sqldf


#Read in the mtcars data set
mtcars = pd.read_csv("Data/mtcars.csv", header=0)
mtcars.head()

Unnamed: 0.1,Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [4]:
query= "SELECT cyl, AVG(mpg) FROM mtcars GROUP BY cyl"
sqldf(query)

Unnamed: 0,cyl,AVG(mpg)
0,4,26.663636
1,6,19.742857
2,8,15.1


Using string formatting we can therefore write general queries

In [81]:
#With WHERE clause
cyl=4
query= "SELECT cyl, mpg FROM mtcars WHERE cyl=%d" %cyl
query

'SELECT cyl, mpg FROM mtcars WHERE cyl=4'

In [5]:
sqldf(query)

Unnamed: 0,cyl,AVG(mpg)
0,4,26.663636
1,6,19.742857
2,8,15.1


In [6]:
#With HAVING clause
threshold =15
query= "SELECT cyl, AVG(mpg) FROM mtcars WHERE mpg>=%d GROUP by cyl " %threshold
sqldf(query)

Unnamed: 0,cyl,AVG(mpg)
0,4,26.663636
1,6,19.742857
2,8,16.477778


In [7]:
#With GROUP BY clause
categories = ["cyl", "am"]
query= "SELECT %s , AVG(mpg) FROM mtcars  GROUP by %s" %(categories[0], categories[1])
sqldf(query)

Unnamed: 0,cyl,AVG(mpg)
0,8,17.147368
1,4,24.392308


In [9]:
#In for loop
cylNum = [4,6,8]
for cyl in cylNum:
    query = "SELECT cyl, AVG(mpg) FROM mtcars WHERE cyl = %d" %cyl
    print(sqldf(query))


   cyl   AVG(mpg)
0    4  26.663636
   cyl   AVG(mpg)
0    6  19.742857
   cyl  AVG(mpg)
0    8      15.1


# Joins

Joins let us combine multiple data frames.

Here is the general use:

$$
"\text{SELECT A.columnNames, B.columnNames FROM dfName1 AS A JOIN dfName2 AS B On A.Column = B.Column}"
$$

Let's see some examples in action

In [18]:


#Create doctor df

doctors = pd.DataFrame({"doctor_id":[210,211,212,213],"doc_name":["Jon", "Peter", "Ke", "Pat"],\
                       "degree":["MD","MBBS", "MD", "MD"]})

#Create visits
visits = pd.DataFrame({"doctor_id":[210,214,215,212,212], "patient_name": ["Julia", "TJ", "John", "James", "Jason"],\
                      "year":[2015,2014, 2015, 2016, 2012]})

doctors

Unnamed: 0,degree,doc_name,doctor_id
0,MD,Jon,210
1,MBBS,Peter,211
2,MD,Ke,212
3,MD,Pat,213


In [19]:
visits

Unnamed: 0,doctor_id,patient_name,year
0,210,Julia,2015
1,214,TJ,2014
2,215,John,2015
3,212,James,2016
4,212,Jason,2012


For every visit, I would like to create a table with the doctor that saw the patient (If one exists).

In [24]:
result = sqldf("SELECT A.patient_name,B.doc_name, A.year\
            FROM visits AS A \
            LEFT JOIN doctors AS B ON A.doctor_id = B.doctor_id")
result

Unnamed: 0,patient_name,doc_name,year
0,Julia,Jon,2015
1,TJ,,2014
2,John,,2015
3,James,Ke,2016
4,Jason,Ke,2012


In [12]:
# Full outer join.  We get every pair of rows for the two dfs.
#I always start at the outer join and then filter
sqldf("SELECT * FROM visits JOIN doctors")

Unnamed: 0,doctor_id,patient_name,year,degree,doc_name,doctor_id.1
0,210,Julia,2015,MD,Jon,210
1,210,Julia,2015,MBBS,Peter,211
2,210,Julia,2015,MD,Ke,212
3,210,Julia,2015,MD,Pat,213
4,214,TJ,2014,MD,Jon,210
5,214,TJ,2014,MBBS,Peter,211
6,214,TJ,2014,MD,Ke,212
7,214,TJ,2014,MD,Pat,213
8,215,John,2015,MD,Jon,210
9,215,John,2015,MBBS,Peter,211


In [13]:
#Matching patients to the doctors they see (INNER JOIN)
#Note how we reference columns in different tables
sqldf("SELECT * FROM visits AS V JOIN doctors AS D ON V.doctor_id = D.doctor_id ")

Unnamed: 0,doctor_id,patient_name,year,degree,doc_name,doctor_id.1
0,210,Julia,2015,MD,Jon,210
1,212,James,2016,MD,Ke,212
2,212,Jason,2012,MD,Ke,212


In [14]:
#Same thing but getting rid of redundant columns
sqldf("SELECT D.doctor_id, D.doc_name, V.patient_name, V.year FROM visits AS V JOIN doctors AS D ON V.doctor_id = D.doctor_id ")

Unnamed: 0,doctor_id,doc_name,patient_name,year
0,210,Jon,Julia,2015
1,212,Ke,James,2016
2,212,Ke,Jason,2012


For every visit, I would like to create a table with the doctor that saw the patient (If one exists) in 2015.

In [15]:
#Add second condition in the WHERE, which filters after
#the join has been executed
sqldf("SELECT D.doctor_id, D.doc_name, V.patient_name, V.year FROM visits AS V JOIN doctors AS D ON V.doctor_id = D.doctor_id WHERE year = 2015 ")

Unnamed: 0,doctor_id,doc_name,patient_name,year
0,210,Jon,Julia,2015


Find the number of patients served by each doctor

In [16]:
#Join with a group by. Gives the number of patients each doctor saw
sqldf("SELECT D.doctor_id, D.doc_name, COUNT(*) AS num_visits FROM visits AS V JOIN doctors AS D ON V.doctor_id = D.doctor_id GROUP BY D.doctor_id ")

Unnamed: 0,doctor_id,doc_name,num_visits
0,210,Jon,1
1,212,Ke,2


For every visit, I would like to create a table with the doctor that saw the patient (Put blank if one does not exist).

In [17]:
#Getting all patients with doctors even if we don't know the doctor with a LEFT JOIN (We will get a null or blank in this case)
sqldf("SELECT   V.patient_name, V.year, D.doc_name FROM visits AS V LEFT JOIN doctors AS D ON D.doctor_id = V.doctor_id ")

Unnamed: 0,patient_name,year,doc_name
0,Julia,2015,Jon
1,TJ,2014,
2,John,2015,
3,James,2016,Ke
4,Jason,2012,Ke


## Airlines Example


In [21]:
airlines = pd.read_csv("Data_Sets/Airlines_Data/airlines.csv", )
airlines.head()

Unnamed: 0,Id,Airline,Abbreviation,Country
0,1,United Airlines,UAL,USA
1,2,US Airways,USAir,USA
2,3,Delta Airlines,Delta,USA
3,4,Southwest Airlines,Southwest,USA
4,5,American Airlines,American,USA


In [22]:
flights = pd.read_csv("Data_Sets/Airlines_Data/flights.csv")
flights.head()


Unnamed: 0,Airline,FlightNo,SourceAirport,DestAirport
0,1,28,APG,ASY
1,1,29,ASY,APG
2,1,44,CVO,ACV
3,1,45,ACV,CVO
4,1,54,AHD,AHT


In [23]:
airports = pd.read_csv("Data_Sets/Airlines_Data/airports.csv", encoding='utf8', engine='python')
#Replace spaces in column names
airports.columns = [c.replace(' ', '') for c in airports.columns]
airports.head()

Unnamed: 0,City,AirportCode,AirportName,Country,CountryAbbrev
0,Aberdeen,APG,Phillips AAF,United States,US
1,Aberdeen,ABR,Municipal,United States,US
2,Abilene,DYS,Dyess AFB,United States,US
3,Abilene,ABI,Municipal,United States,US
4,Abingdon,VJI,Virginia Highlands,United States,US


For each airline, find the most popular source and destination city

Gameplan:

- Find the airline associated with each flight (W/ Join)

- Find the city associated with each source/destination airport (W/ Join)

- Get counts for each airline

- Pick out the most popular



In [24]:
#Find the airline associated with each city (W/ Join)
df_step1 = sqldf("SELECT A.Airline, B.FlightNo, B.SourceAirport, B.DestAirport FROM flights AS B JOIN airlines AS A \
ON A.Id = B.Airline ")
df_step1.head()


Unnamed: 0,Airline,FlightNo,SourceAirport,DestAirport
0,United Airlines,28,APG,ASY
1,United Airlines,29,ASY,APG
2,United Airlines,44,CVO,ACV
3,United Airlines,45,ACV,CVO
4,United Airlines,54,AHD,AHT


In [25]:
#Find the city associated with each source/destination airport (W/ Join)
df_step2a  =sqldf("SELECT A.*,B.City As SourceCity FROM df_step1 AS A JOIN \
airports AS B ON B.AirportCode = A.SourceAirport  ")

df_step2  =sqldf("SELECT A.*,B.City As DestCity FROM df_step2a AS A JOIN \
airports AS B ON B.AirportCode = A.DestAirport  ")

df_step2.head()

Unnamed: 0,Airline,FlightNo,SourceAirport,DestAirport,SourceCity,DestCity
0,United Airlines,28,APG,ASY,Aberdeen,Ashley
1,United Airlines,29,ASY,APG,Ashley,Aberdeen
2,United Airlines,44,CVO,ACV,Albany,Arcata
3,United Airlines,45,ACV,CVO,Arcata,Albany
4,United Airlines,54,AHD,AHT,Ardmore,Amchitka


In [26]:
#Get counts for each airlines for each source and destination
df_step3_source = sqldf("SELECT Airline, SourceCity, COUNT(*) \
AS numFlightsSource FROM df_step2 GROUP BY Airline, SourceCity")

df_step3_dest = sqldf("SELECT Airline, DestCity, COUNT(*) \
AS numFlightsDest FROM df_step2 GROUP BY Airline, DestCity")

df_step3_dest.head()

Unnamed: 0,Airline,DestCity,numFlightsDest
0,AirTran Airways,Alexandria LA,1
1,AirTran Airways,Aliceville AL,3
2,AirTran Airways,Amarillo,2
3,AirTran Airways,Ann Arbor MI,2
4,AirTran Airways,Anniston AL,1


In [27]:
#Find Maximums
df_step4_maxsource = sqldf("SELECT Airline, SourceCity,MAX(numFlightsSource) AS popSource \
                           FROM df_step3_source GROUP BY Airline")

df_step4_maxsource.head()

Unnamed: 0,Airline,SourceCity,popSource
0,AirTran Airways,Abilene,5
1,Allegiant Air,Altus,5
2,American Airlines,Albany,5
3,Continental Airlines,Akiak,5
4,Delta Airlines,Appleton,4


## Union and Intersection

These commands allow us to take the union or intersection of a bunch of rows

Here is the general use:

$$
"\text{SELECT columnNames FROM dfName1 UNION/INTERSECTION SELECT columnNames FROM dfName2 }"
$$

Lets see this in action

In [28]:
#Create Two data frames
df1 = pd.DataFrame({ "RushPoints":[12,32,34,12], "Week":[1,2,3,4]})
df1 = df1[["Week", "RushPoints"]]
df1

Unnamed: 0,Week,RushPoints
0,1,12
1,2,32
2,3,34
3,4,12


In [29]:
df2 = pd.DataFrame({"Week":[2,3,5], "RecPoints":[101,310,234]})
df2 = df2[["Week", "RecPoints"]]
df2

Unnamed: 0,Week,RecPoints
0,2,101
1,3,310
2,5,234


Lets say I wanted to combine the reconds and put a zero if one of the points categories did not exists for the given week.

In [31]:
#I'll first do to left joins
left1 = sqldf("SELECT A.Week,A.RushPoints, B.RecPoints FROM df1 AS A LEFT JOIN df2 AS B ON A.Week=B.Week ")
left1


Unnamed: 0,Week,RushPoints,RecPoints
0,1,12,
1,2,32,101.0
2,3,34,310.0
3,4,12,


In [32]:
left2 = sqldf("SELECT A.Week,A.RecPoints, B.RushPoints FROM df2 AS A LEFT JOIN df1 AS B ON A.Week=B.Week ")
#Reset columns
left2 = left2[list(left1.columns)]
left2

Unnamed: 0,Week,RushPoints,RecPoints
0,2,32.0,101
1,3,34.0,310
2,5,,234


In [34]:
#Take the union of the two
final = sqldf("SELECT * FROM left1 UNION SELECT * FROM left2 ")
final

Unnamed: 0,Week,RushPoints,RecPoints
0,1,12.0,
1,2,32.0,101.0
2,3,34.0,310.0
3,4,12.0,
4,5,,234.0


In [35]:
#Replace the NAN with 0
final.fillna(0, inplace = True)
final

Unnamed: 0,Week,RushPoints,RecPoints
0,1,12.0,0.0
1,2,32.0,101.0
2,3,34.0,310.0
3,4,12.0,0.0
4,5,0.0,234.0
