# Basic SQL 2 - HAVING, ORDER BY, LIMIT, LIKE, BETWEEN, CASE

This lecture covers more of the core SQL commands. Let's read in the mtcars dataframe and fix its index as we did before.



In [1]:
from pandasql import sqldf
import pandas as pd


    
#Read in the mtcars data set, using the first CSV column (the car names) as the index,
#and label that index "name" so it can be referred to by name later on.
#Built as one non-mutating chain instead of set_index(..., inplace=True).
mtcars = pd.read_csv("Data_Sets/mtcars.csv", index_col=0).rename_axis("name")
mtcars.head()

Unnamed: 0_level_0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


## HAVING Clause

- Like WHERE clause but used after a GROUP BY

Here is the general use:

$$
"\text{SELECT FUNC(columnNames) FROM dfName WHERE booleanConditions GROUP BY columnNames HAVING booleanConditions }"
$$

- Filters after groups have been formed

In [2]:
#HAVING filters the groups produced by GROUP BY:
#count the cars per cyl value and keep only the groups with more than 8 cars
query = "SELECT cyl,COUNT(*) AS count FROM mtcars GROUP BY cyl HAVING count>8"
sqldf(query)

Unnamed: 0,cyl,count
0,4,11
1,8,14


In [3]:
#WHERE and HAVING together: WHERE keeps only rows with mpg>20 before grouping,
#then HAVING keeps only the cyl groups that still have more than 8 cars
query = "SELECT cyl,COUNT(*) AS count FROM mtcars WHERE mpg>20 GROUP BY cyl HAVING count>8"
sqldf(query)

Unnamed: 0,cyl,count
0,4,11


## ORDER BY Clause

- ORDER BY determines order in which rows are returned: ASC or DESC

Here is the general use:

$$
"\text{SELECT FUNC(columnNames) FROM dfName WHERE booleanConditions GROUP BY columnNames HAVING booleanConditions ORDER BY columnNames ASC/DESC }"
$$

In [4]:
#Return mpg and cyl for every car, highest mpg first
query = "SELECT mpg,cyl FROM mtcars ORDER BY mpg DESC"
df = sqldf(query)
df.head(5)

Unnamed: 0,mpg,cyl
0,33.9,4
1,32.4,4
2,30.4,4
3,30.4,4
4,27.3,4


In [5]:
#Sort on two columns: first by cyl (descending), then by hp (descending) within each cyl value
query = "SELECT cyl, hp FROM mtcars ORDER BY cyl DESC,hp DESC "
df = sqldf(query)
df.head(5)

Unnamed: 0,cyl,hp
0,8,335
1,8,264
2,8,245
3,8,245
4,8,230


In [6]:
#Combining everything so far. The clauses are applied in this order:
#  WHERE    keeps only the rows with mpg>15,
#  GROUP BY forms one group per cyl value and COUNT(*) counts the rows in each,
#  HAVING   keeps only the groups with more than 3 rows,
#  ORDER BY sorts the remaining groups by their count, largest first.
#So the ordering happens after the groups have been formed.
#(The trailing backslash continues the string literal onto the next line.)
df = sqldf("SELECT cyl, COUNT(*) As count FROM mtcars WHERE \
             mpg>15 GROUP BY cyl HAVING count>3 ORDER BY count DESC")
df

Unnamed: 0,cyl,count
0,4,11
1,8,8
2,6,7


## Limit Clause

- LIMIT controls the number of rows we get
- Good way to find the top 3, for example, of some list
- Can specify a starting point (an offset): `LIMIT 3,5` skips the first 3 rows and then returns the next 5

The general use is:

$$
"\text{SELECT FUNC(columnNames) FROM dfName WHERE booleanConditions GROUP BY columnNames HAVING booleanConditions ORDER BY columnNames ASC/DESC LIMIT number }"
$$

In [7]:
#Sort by mpg (highest first) and keep only the first 5 rows
query = "SELECT mpg, cyl FROM mtcars ORDER BY mpg DESC LIMIT 5"
df = sqldf(query)
df

Unnamed: 0,mpg,cyl
0,33.9,4
1,32.4,4
2,30.4,4
3,30.4,4
4,27.3,4


In [17]:
#Sorted by mpg (highest first). "LIMIT 3,5" means offset 3, count 5:
#skip the first 3 rows and return the next 5, i.e. the 4th through 8th rows
#of the sorted result (not rows 3-8).
query = "SELECT mpg, cyl FROM mtcars ORDER BY mpg DESC LIMIT 3,5"
df = sqldf(query)
df

Unnamed: 0,mpg,cyl
0,30.4,4
1,27.3,4
2,26.0,4
3,24.4,4
4,22.8,4


## Advanced Filtering

In [8]:
#BETWEEN is inclusive: rows with mpg equal to 30.4 or 32.4 are returned too
query = "SELECT mpg FROM mtcars  WHERE mpg BETWEEN 30.4 AND 32.4 "
df = sqldf(query)
df.head(10)

Unnamed: 0,mpg
0,32.4
1,30.4
2,30.4


In [9]:
#NOT BETWEEN is the complement of BETWEEN: the boundary values 30.4 and 32.4
#are excluded along with everything in between
query = "SELECT mpg FROM mtcars  WHERE mpg NOT BETWEEN 30.4 AND 32.4 "
df = sqldf(query)
df.head(10)

Unnamed: 0,mpg
0,21.0
1,21.0
2,22.8
3,21.4
4,18.7
5,18.1
6,14.3
7,24.4
8,22.8
9,19.2


In [34]:
#IN keeps the rows whose cyl value appears in the given list (here 4 or 6)
query = "SELECT mpg,cyl FROM mtcars  WHERE cyl IN (4,6) "
df = sqldf(query)
df.head(10)

Unnamed: 0,mpg,cyl
0,21.0,6
1,21.0,6
2,22.8,4
3,21.4,6
4,18.1,6
5,24.4,4
6,22.8,4
7,19.2,6
8,17.8,6
9,32.4,4


In [22]:
#Reset the index so the car names become a regular column called "name".
#The check on the index name makes the cell safe to re-run: once the index has
#been reset it is no longer called "name", so a second run changes nothing
#(an unguarded reset_index would add a spurious "index" column each time).
if mtcars.index.name == "name":
    mtcars = mtcars.reset_index()
mtcars.head()

Unnamed: 0,name,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [24]:
#Attempt to get all Mercedes cars - this returns 0 rows!
#Without wildcards, LIKE has to match the whole name, and no car is called just 'Merc'
query = "SELECT mpg,cyl FROM mtcars  WHERE name Like 'Merc' "
df = sqldf(query)
df.head(5)

Unnamed: 0,mpg,cyl


In [26]:
#Get one specific Mercedes car: the pattern is the complete name, so exactly that car matches
query = "SELECT name, mpg,cyl FROM mtcars  WHERE name Like 'Merc 240D' "
df = sqldf(query)
df.head(5)

Unnamed: 0,name,mpg,cyl
0,Merc 240D,24.4,4


In [27]:
#% stands for zero or more characters, so '%Merc%' matches any name containing 'Merc'
query = "SELECT name, mpg,cyl FROM mtcars  WHERE name Like '%Merc%' "
df = sqldf(query)
df

Unnamed: 0,name,mpg,cyl
0,Merc 240D,24.4,4
1,Merc 230,22.8,4
2,Merc 280,19.2,6
3,Merc 280C,17.8,6
4,Merc 450SE,16.4,8
5,Merc 450SL,17.3,8
6,Merc 450SLC,15.2,8


In [29]:
#_ is a substitute for exactly one character, so '_oyo%' matches any name made of
#one arbitrary character, then 'oyo', then anything (e.g. 'Toyota ...').
#To test a column against two patterns, write two LIKE conditions and join them with OR.
#(The trailing backslash continues the string literal onto the next line.)
df = sqldf("SELECT name, mpg,cyl FROM mtcars  WHERE name Like '_oyo%' \
           OR name Like '%Ford%' ")
df

Unnamed: 0,name,mpg,cyl
0,Toyota Corolla,33.9,4
1,Toyota Corona,21.5,4
2,Ford Pantera L,15.8,8


## Case Clause

- Use in SELECT, WHERE, HAVING
- Use like an IF Statement in Excel

In [10]:
#In a SELECT statement CASE can be used to create a new column.
#The WHEN branches are tested in order and the first one that matches wins,
#so the 'Medium' branch only needs a lower bound: any hp >= 150 has already
#been labelled 'Fast' by the branch above it.
df = sqldf(
    "SELECT hp, "
    "CASE WHEN hp >= 150 THEN 'Fast' "
    "WHEN hp >= 100 THEN 'Medium' "
    "ELSE 'Slow' END AS Speed "
    "FROM mtcars"
)
df.head(10)

Unnamed: 0,hp,Speed
0,110,Medium
1,110,Medium
2,93,Slow
3,110,Medium
4,175,Fast
5,105,Medium
6,245,Fast
7,62,Slow
8,95,Slow
9,123,Medium


In [11]:
#With HAVING or WHERE, CASE lets us filter on different criteria for different rows/groups.
#Here the rows are grouped by (cyl, gear) and the average mpg of each group is computed;
#a 4-cylinder group is kept only if its avg_mpg is above 20, every other group
#only if its avg_mpg is above 15. The result is sorted by cyl and then gear.
#(The trailing backslashes continue the string literal onto the next line.)
df= sqldf("SELECT cyl, gear,  AVG(mpg) AS avg_mpg FROM mtcars  \
          GROUP BY cyl, gear HAVING CASE WHEN cyl =4 THEN avg_mpg > 20 \
          ELSE avg_mpg>15 END ORDER BY cyl, gear  ")
df

Unnamed: 0,cyl,gear,avg_mpg
0,4,3,21.5
1,4,4,26.925
2,4,5,28.2
3,6,3,19.75
4,6,4,19.75
5,6,5,19.7
6,8,3,15.05
7,8,5,15.4


## Individual Row Functions

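These functions operate on each row individually; they are not aggregate functions, so they do not summarize the groups formed by GROUP BY.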

- CEILING(num): Closest integer above num (rounds up)

- FLOOR(num): Closest integer below num (rounds down)

- ROUND(num1, num2): Round num1 to num2 decimal places


In [12]:
#ROUND with a single argument rounds each mpg value to zero decimal places
query = "SELECT ROUND(mpg) AS rounded_mpg FROM mtcars"
df = sqldf(query)
df.head(5)

Unnamed: 0,rounded_mpg
0,21.0
1,21.0
2,23.0
3,21.0
4,19.0


## Wine Data Set

In [13]:
#Load the wine data set from its CSV file and preview the first five rows
wine_csv_path = "Data_Sets/wine.csv"
df_wine = pd.read_csv(wine_csv_path)
df_wine.head(5)


Unnamed: 0,No,Grape,Winery,Appelation,State,Name,Year,Price,Score,Cases,Drink
0,1,Zinfandel,Robert Biale,St. Helena,California,Old Kraft Vineyard,2008,44,93,275.0,now
1,2,Zinfandel,Chiarello Family,Napa Valley,California,Giana,2008,35,93,480.0,now
2,3,Zinfandel,Robert Biale,Napa Valley,California,Black Chicken,2008,40,91,2700.0,2012
3,4,Zinfandel,Robert Biale,Napa Valley,California,Napa Ranches,2008,38,89,525.0,now
4,5,Zinfandel,Robert Biale,St. Helena,California,Varozza Vineyard,2008,44,88,275.0,2012


Find the cheapest wine at each score and sort this by descending score.


In [14]:
#For each score, find the lowest price among the wines with that score;
#list the scores from highest to lowest
query = "SELECT Score, MIN(Price) FROM df_wine GROUP BY Score ORDER BY Score DESC"
df_rankings = sqldf(query)
df_rankings.head(5)

Unnamed: 0,Score,MIN(Price)
0,98,43
1,97,135
2,96,135
3,95,33
4,94,40


Find the cheapest wine at each score in each year. Sort first by year (ascending) and then by score (descending).

In [15]:
#Same lowest-price lookup, but per (Year, Score) pair:
#years in ascending order and, within each year, scores from highest to lowest
query = "SELECT Year, Score, MIN(Price) FROM df_wine GROUP BY Year, Score ORDER BY Year, Score DESC"
df_rankings_year = sqldf(query)
df_rankings_year.head(5)

Unnamed: 0,Year,Score,MIN(Price)
0,2004,92,75
1,2004,91,50
2,2004,90,45
3,2005,95,60
4,2005,93,29


Which type of grape has the highest average score?

In [16]:
#Average the scores per grape and sort from best to worst,
#so the grape with the highest average score is the first row
query = "SELECT Grape, Avg(Score) AS avg_score FROM df_wine GROUP BY Grape ORDER BY avg_score DESC"
df_rankings_grape = sqldf(query)
df_rankings_grape.head(5)

Unnamed: 0,Grape,avg_score
0,Cabernet Franc,92.0
1,Chardonnay,91.096154
2,Roussanne,91.0
3,Cabernet Sauvingnon,90.955882
4,Pinot Noir,90.688073


Between the years 2004 and 2006 a wine received a gold medal if it had a score of at least 90. In 2007 and 2008 this threshold increased to 93. After 2008, it increased to 96. How many wines in this list received a gold medal?

In [17]:
#Add a "medal" column holding the gold label. The score needed for 'Gold' depends on the year:
#  2004-2006 (inclusive): Score >= 90
#  2007-2008 (inclusive): Score >= 93
#  after 2008:            Score >= 96
#Every row that matches none of these branches gets the string 'None'.
#(The trailing backslashes continue the string literal onto the next line.)
df_gold_label = sqldf("SELECT *, CASE WHEN (Year BETWEEN 2004 AND 2006) AND Score >=90 THEN 'Gold'\
                WHEN (Year BETWEEN 2007 AND 2008) AND Score >=93 THEN 'Gold'\
                WHEN Year >2008 AND Score >=96 THEN 'Gold'\
                ELSE 'None' END AS medal FROM df_wine ")

#Check to see if it worked by previewing the first rows with the new medal column
df_gold_label.head()

Unnamed: 0,No,Grape,Winery,Appelation,State,Name,Year,Price,Score,Cases,Drink,medal
0,1,Zinfandel,Robert Biale,St. Helena,California,Old Kraft Vineyard,2008,44,93,275.0,now,Gold
1,2,Zinfandel,Chiarello Family,Napa Valley,California,Giana,2008,35,93,480.0,now,Gold
2,3,Zinfandel,Robert Biale,Napa Valley,California,Black Chicken,2008,40,91,2700.0,2012,
3,4,Zinfandel,Robert Biale,Napa Valley,California,Napa Ranches,2008,38,89,525.0,now,
4,5,Zinfandel,Robert Biale,St. Helena,California,Varozza Vineyard,2008,44,88,275.0,2012,


In [18]:
#Count the rows that were labelled 'Gold' in the medal column
query = "SELECT COUNT(*) FROM df_gold_label WHERE medal='Gold'"
gold_count = sqldf(query)
gold_count

Unnamed: 0,COUNT(*)
0,155
