In this notebook, we practice data warehousing queries. We have a small warehouse with the same star schema as in the lecture notes, stored in the tables `salesdb_sales`, `salesdb_item`, `salesdb_customer`, and `salesdb_store`.


**Submission Instruction**

1- Make a copy and replace blank with your name

2- Complete and run all cells

3- Download .ipynb and submit on Gradescope

In [37]:
!pip install mysql-connector-python



In [38]:
import mysql.connector
import pandas as pd

In [39]:
# Connection settings read by get_conn_cur().
# NOTE(review): the credentials are hard-coded in the notebook; consider
# loading them from the environment if this is reused elsewhere.
mysql_address = '131.193.32.85'
mysql_username = 'de_student'
mysql_password = 'DE_Student_PaSS'

# Schema that holds the salesdb_* tables queried below.
mysql_database = 'my_dataengineering_dbs'


def get_conn_cur():
    """Open a new MySQL connection and return it together with a cursor.

    The connection parameters are taken from the module-level ``mysql_*``
    settings; the port is fixed to 3306.

    Returns:
        A ``(connection, cursor)`` tuple. Nothing is closed here, so the
        caller has to close both objects when it is done with them.
    """
    connection_settings = dict(
        user=mysql_username,
        password=mysql_password,
        host=mysql_address,
        database=mysql_database,
        port='3306',
    )
    connection = mysql.connector.connect(**connection_settings)
    cursor = connection.cursor()
    return (connection, cursor)

def run_query(query_string):
    """Execute a SQL statement and return its result set as a DataFrame.

    A fresh connection is opened for every call and is always released,
    including when the statement fails.

    Args:
        query_string: SQL text to execute. It must produce a result set
            (e.g. a SELECT), because all rows are fetched afterwards.

    Returns:
        A pandas DataFrame with one row per fetched record; the column
        labels are the cursor's ``column_names``.

    Raises:
        Whatever the MySQL driver raises for connection or query errors;
        the exception propagates after the connection has been closed.
    """
    conn, cur = get_conn_cur()
    try:
        cur.execute(query_string)
        rows = cur.fetchall()
        return pd.DataFrame(rows, columns=cur.column_names)
    finally:
        # Release the cursor and the connection even if execute/fetch
        # raised; the nested finally guarantees the connection is closed
        # when closing the cursor itself fails.
        try:
            cur.close()
        finally:
            conn.close()


def sql_head(table_name, n_rows=5):
    """Return the first rows of a table as a DataFrame.

    Args:
        table_name: Name of the table to preview. Identifiers cannot be
            bound as query parameters, so the name is interpolated into
            the SQL text as-is; pass trusted table names only.
        n_rows: Maximum number of rows to fetch (default 5). The value is
            coerced with ``int()`` before it is placed in the query.

    Returns:
        A pandas DataFrame holding up to ``n_rows`` rows, with the table's
        column names as column labels.

    Raises:
        Whatever the MySQL driver raises for connection or query errors;
        the exception propagates after the connection has been closed.
    """
    head_query = """ SELECT * FROM %s LIMIT %d; """ % (table_name, int(n_rows))

    conn, cur = get_conn_cur()
    try:
        cur.execute(head_query)
        rows = cur.fetchall()
        return pd.DataFrame(rows, columns=cur.column_names)
    finally:
        # Release the cursor and the connection even when the query fails
        # (for example, when the table does not exist).
        try:
            cur.close()
        finally:
            conn.close()

In [40]:
# Preview the first rows of the customer table.
sql_head("salesdb_customer")

Unnamed: 0,custID,cName,gender,age
0,cust1,Amy,F,20
1,cust2,Bob,M,21
2,cust3,Craig,M,25
3,cust4,Doris,F,22


In [41]:
# Preview the first rows of the store table.
sql_head('salesdb_store')

Unnamed: 0,storeID,city,county,state
0,store1,Palo Alto,Santa Clara,CA
1,store2,Mountain View,Santa Clara,CA
2,store3,Menlo Park,San Mateo,CA
3,store4,Belmont,San Mateo,CA
4,store5,Seattle,King,WA


In [42]:
# Preview the first rows of the item table.
sql_head('salesdb_item')

Unnamed: 0,itemID,category,color
0,item1,Tshirt,blue
1,item2,Jacket,blue
2,item3,Tshirt,red
3,item4,Jacket,blue
4,item5,Jacket,red


In [43]:
# Preview the first rows of the sales table; each row references a store,
# an item and a customer by ID and carries the sale price.
sql_head("salesdb_sales")

Unnamed: 0,storeID,itemID,custID,price
0,store1,item1,cust1,10
1,store1,item1,cust2,15
2,store1,item1,cust3,20
3,store1,item1,cust3,25
4,store1,item2,cust1,30


Now we look at a couple of example queries.

 All inexpensive (price < 25) Tshirts sold in California to young people (age < 22)

In [44]:
# Join the sales table (F) to the store (S), item (I) and customer (C)
# tables on their IDs, then keep only rows matching all four filters:
# state, category, age and price.
sql = """
SELECT S.city, I.color, C.cName, F.price
    FROM salesdb_sales F, salesdb_store S, salesdb_item I, salesdb_customer C
    WHERE F.storeID = S.storeID AND F.itemID = I.itemID
    AND F.custID = C.custID AND S.state = 'CA'
    AND I.category = 'Tshirt' AND C.age < 22 AND F.price < 25; """
run_query(sql)

Unnamed: 0,city,color,cName,price
0,Palo Alto,blue,Amy,10
1,Palo Alto,blue,Bob,15
2,Belmont,red,Bob,20
3,Belmont,red,Bob,15
4,Belmont,red,Bob,10


Total sales by store ID and customer name

In [45]:
# Join sales to customers on custID and sum the price for every
# (storeID, cName) combination.
sql= """
SELECT storeID, cName, sum(price)
FROM salesdb_sales S, salesdb_customer C
WHERE S.custID=C.custID
GROUP BY storeID, cName; """
run_query(sql)

Unnamed: 0,storeID,cName,sum(price)
0,store1,Amy,145.0
1,store1,Bob,95.0
2,store1,Craig,85.0
3,store2,Amy,280.0
4,store2,Bob,215.0
5,store2,Craig,145.0
6,store2,Doris,165.0
7,store3,Bob,330.0
8,store3,Craig,295.0
9,store4,Amy,65.0


"Drill-down" Total sales by store ID, category, and customer

In [46]:
# Drill-down: the item table is joined in as well, so the totals are
# broken down by item category in addition to store and customer.
sql = """
SELECT storeID, I.category, cName, sum(price)
FROM salesdb_sales S, salesdb_customer C, salesdb_item I
WHERE S.custID=C.custID AND S.itemID=I.itemID
GROUP BY storeID, I.category, cName; """
run_query(sql)

Unnamed: 0,storeID,category,cName,sum(price)
0,store1,Jacket,Amy,30.0
1,store1,Jacket,Bob,80.0
2,store1,Jacket,Craig,40.0
3,store1,Tshirt,Amy,115.0
4,store1,Tshirt,Bob,15.0
5,store1,Tshirt,Craig,45.0
6,store2,Jacket,Amy,280.0
7,store2,Jacket,Bob,90.0
8,store2,Jacket,Craig,145.0
9,store2,Jacket,Doris,165.0


"Slice" Total sales by store ID, category, and customer for "store6" only


In [47]:
# Slice: restrict the store dimension to the single member 'store6'.
# storeID is qualified in every clause (including GROUP BY) because both
# salesdb_sales (S) and salesdb_store (T) have a column with that name.
sql = """
SELECT S.storeID, I.category, cName, sum(price)
FROM salesdb_sales S, salesdb_customer C, salesdb_item I, salesdb_store T
WHERE S.custID=C.custID AND S.itemID=I.itemID
AND S.storeID=T.storeID AND T.storeID ='store6'
GROUP BY S.storeID, I.category, cName; """
run_query(sql)


Unnamed: 0,storeID,category,cName,sum(price)
0,store6,Jacket,Craig,270.0
1,store6,Jacket,Doris,360.0
2,store6,Tshirt,Doris,165.0


"Dice" Total sales by store ID, category, and customer for "store6" and "Jacket" only

In [48]:
# Dice: restrict two dimensions at once, the store ('store6') and the
# item category ('Jacket').
# storeID is qualified in every clause (including GROUP BY) because both
# salesdb_sales (S) and salesdb_store (T) have a column with that name.
sql = """
SELECT S.storeID, I.category, cName, sum(price)
FROM salesdb_sales S, salesdb_customer C, salesdb_item I, salesdb_store T
WHERE S.custID=C.custID AND S.itemID=I.itemID
AND S.storeID=T.storeID AND T.storeID ='store6' AND I.category='Jacket'
GROUP BY S.storeID, I.category, cName;"""

run_query(sql)

Unnamed: 0,storeID,category,cName,sum(price)
0,store6,Jacket,Craig,270.0
1,store6,Jacket,Doris,360.0


"Roll-up" Total sales by category

In [49]:
# Roll-up: only the item table is joined, and the totals are grouped by
# category alone (store and customer are no longer grouping columns).
sql = """
SELECT I.category, sum(price)
FROM salesdb_sales S, salesdb_item I
WHERE S.itemID=I.itemID
GROUP BY I.category;"""
run_query(sql)

Unnamed: 0,category,sum(price)
0,Jacket,2435.0
1,Tshirt,915.0


Total sales by state, county, city

In [50]:
# Join sales (F) to stores (S) and sum the price for every
# (state, county, city) combination.
sql = """
SELECT state, county, city, sum(price)
FROM salesdb_sales F, salesdb_store S
WHERE F.storeID = S.storeID
GROUP BY state, county, city; """
run_query(sql)

Unnamed: 0,state,county,city,sum(price)
0,CA,San Mateo,Belmont,225.0
1,CA,San Mateo,Menlo Park,625.0
2,CA,Santa Clara,Mountain View,805.0
3,CA,Santa Clara,Palo Alto,325.0
4,WA,King,Redmond,795.0
5,WA,King,Seattle,575.0


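Total sales by state, county, city WITH ROLLUP. The ROLLUP modifier generates multiple grouping sets from the columns or expressions specified in the GROUP BY clause (here state, county, and city): in addition to the regular groups, it returns a subtotal row for each (state, county) and for each state, plus a grand total. In these subtotal rows the rolled-up columns are NULL, which is why county and/or city appear empty in some rows of the output below.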


In [51]:
# Same grouping as the plain GROUP BY state, county, city query, with the
# WITH ROLLUP modifier adding subtotal rows; rolled-up columns come back
# as NULL (shown as empty cells in the output).
sql = """
SELECT state, county, city, sum(price)
FROM salesdb_sales F, salesdb_store S
WHERE F.storeID = S.storeID
GROUP BY state, county, city WITH ROLLUP; """
run_query(sql)

Unnamed: 0,state,county,city,sum(price)
0,CA,San Mateo,Belmont,225.0
1,CA,San Mateo,Menlo Park,625.0
2,CA,San Mateo,,850.0
3,CA,Santa Clara,Mountain View,805.0
4,CA,Santa Clara,Palo Alto,325.0
5,CA,Santa Clara,,1130.0
6,CA,,,1980.0
7,WA,King,Redmond,795.0
8,WA,King,Seattle,575.0
9,WA,King,,1370.0


Question 1: Write the query to list total sales by state of store and age of customer - [3 points]

In [52]:
# Question 1: join sales (F) to stores (S) for the state and to customers
# (C) for the age, then sum the price per (state, age) group.
sql = """
SELECT state, age, sum(price)
FROM salesdb_sales F, salesdb_store S, salesdb_customer C
WHERE F.storeID = S.storeID
AND F.custID = C.custID
GROUP BY state, age; """
run_query(sql)


Unnamed: 0,state,age,sum(price)
0,CA,20,490.0
1,CA,21,710.0
2,CA,22,165.0
3,CA,25,615.0
4,WA,20,180.0
5,WA,21,225.0
6,WA,22,695.0
7,WA,25,270.0


Question 2: Drill down to items by item color (on the basis of the previous query) - [3 points]  

In [53]:
# Question 2: drill down by also joining the item table (I) and adding
# its color column to the grouping columns.
sql = """
SELECT state, age, color, sum(price)
FROM salesdb_sales F, salesdb_store S, salesdb_customer C, salesdb_item I
WHERE F.storeID = S.storeID
AND F.custID = C.custID
AND F.itemID = I.itemID
GROUP BY state, age, color; """
run_query(sql)

Unnamed: 0,state,age,color,sum(price)
0,CA,20,blue,320.0
1,CA,20,red,170.0
2,CA,21,blue,415.0
3,CA,21,red,295.0
4,CA,22,blue,165.0
5,CA,25,blue,465.0
6,CA,25,red,150.0
7,WA,20,blue,180.0
8,WA,21,blue,70.0
9,WA,21,red,155.0


Question 3: Use "with rollup" with the previous query - [3 points]

In [54]:
# Question 3: the (state, age, color) aggregation with WITH ROLLUP, which
# adds subtotal rows; rolled-up columns come back as NULL (shown as empty
# cells in the output).
sql = """
SELECT state, age, color, sum(price)
FROM salesdb_sales F, salesdb_store S, salesdb_customer C, salesdb_item I
WHERE F.storeID = S.storeID
AND F.custID = C.custID
AND F.itemID = I.itemID
GROUP BY state, age, color WITH ROLLUP; """
run_query(sql)


Unnamed: 0,state,age,color,sum(price)
0,CA,20.0,blue,320.0
1,CA,20.0,red,170.0
2,CA,20.0,,490.0
3,CA,21.0,blue,415.0
4,CA,21.0,red,295.0
5,CA,21.0,,710.0
6,CA,22.0,blue,165.0
7,CA,22.0,,165.0
8,CA,25.0,blue,465.0
9,CA,25.0,red,150.0


Question 4: Slice by listing only items with blue color (on the basis of query of Question 2) - [3 points]

In [55]:
# Question 4: slice the (state, age, color) aggregation down to the single
# color 'blue'.
# color stays in GROUP BY so that every selected non-aggregate column is a
# grouping column, and the literal uses single quotes like the other
# queries in this notebook (double quotes denote identifiers when the
# server runs in ANSI_QUOTES mode).
sql = """
SELECT state, age, color, sum(price)
FROM salesdb_sales F, salesdb_store S, salesdb_customer C, salesdb_item I
WHERE F.storeID = S.storeID
AND F.custID = C.custID
AND F.itemID = I.itemID
AND I.color = 'blue'
GROUP BY state, age, color; """
run_query(sql)

Unnamed: 0,state,age,color,sum(price)
0,CA,20,blue,320.0
1,CA,21,blue,415.0
2,CA,22,blue,165.0
3,CA,25,blue,465.0
4,WA,20,blue,180.0
5,WA,21,blue,70.0
6,WA,22,blue,405.0
7,WA,25,blue,95.0


Question 5: Rollup total sales by customer age and item color (on the basis of query of Question 2)  -- the remaining dimensions are customer age and item color. [3 points]

In [56]:
# Question 5: roll up the store dimension so that only customer age and
# item color remain as grouping columns.
# No store column is selected, filtered or grouped, so salesdb_store is
# not joined here.
# NOTE(review): this matches the joined version as long as every sale
# references exactly one existing store row - confirm against the schema.
sql = """
SELECT age, color, sum(price)
FROM salesdb_sales F, salesdb_customer C, salesdb_item I
WHERE F.custID = C.custID
AND F.itemID = I.itemID
GROUP BY age, color WITH ROLLUP; """
run_query(sql)


Unnamed: 0,age,color,sum(price)
0,20.0,blue,500.0
1,20.0,red,170.0
2,20.0,,670.0
3,21.0,blue,485.0
4,21.0,red,450.0
5,21.0,,935.0
6,22.0,blue,570.0
7,22.0,red,290.0
8,22.0,,860.0
9,25.0,blue,560.0
