#### Exercise 1: Create data model 

![Data Model](img/data_model2.png)

#### Exercise 2: 
1. Load dimension tables  

In [2]:
import os
import sys
import pandas as pd
import logging
from google.cloud import bigquery 
 
# BigQuery target: fully-qualified ids further down are built as
# "<PROJECT_NAME>.<DATASET_NAME>[.<table>]".
PROJECT_NAME = "electric-glyph-372116"
DATASET_NAME = "airlines"

# Local input data; the directory is given relative to the working directory.
DATA_DIR = "../data/"
DEFAULT_TICKET_FILE = os.path.join(DATA_DIR, "tickets.json")
 
# **** TABLE SCHEMAS ****
## Dimension Tables
# Column layout of each dimension table, written as
# (column name, BigQuery type, mode) triples in table column order.
_DIMENSION_COLUMNS = {
    'passengers_d': (
        ('sk_pass_id', 'STRING', 'REQUIRED'),
        ('email', 'STRING', 'REQUIRED'),
        ('first_name', 'STRING', 'NULLABLE'),
        ('last_name', 'STRING', 'NULLABLE'),
        ('gender', 'STRING', 'NULLABLE'),
        ('birth_date', 'DATE', 'NULLABLE'),
        ('street', 'STRING', 'NULLABLE'),
        ('city', 'STRING', 'NULLABLE'),
        # NOTE(review): an INTEGER column cannot keep leading zeros of a ZIP
        # code -- confirm against the source data that this is acceptable.
        ('zip', 'INTEGER', 'NULLABLE'),
        ('state', 'STRING', 'NULLABLE'),
        ('start_date', 'DATE', 'NULLABLE'),
        ('end_date', 'DATE', 'NULLABLE'),
    ),
    'airlines_d': (
        ('airline_id', 'STRING', 'REQUIRED'),
        ('name', 'STRING', 'NULLABLE'),
        ('icao', 'STRING', 'NULLABLE'),
        ('callsign', 'STRING', 'NULLABLE'),
        ('country', 'STRING', 'NULLABLE'),
    ),
    'airports_d': (
        ('airport_id', 'STRING', 'REQUIRED'),
        ('name', 'STRING', 'NULLABLE'),
        ('city', 'STRING', 'NULLABLE'),
        ('country', 'STRING', 'NULLABLE'),
        ('icao', 'STRING', 'NULLABLE'),
        ('latitude', 'FLOAT', 'NULLABLE'),
        ('longitude', 'FLOAT', 'NULLABLE'),
        ('altitude', 'INTEGER', 'NULLABLE'),
        ('tz_timezone', 'STRING', 'NULLABLE'),
    ),
}

# Keyed by table id. Each entry holds the table's name (identical to its key)
# and its schema as a list of bigquery.SchemaField objects.
TABLE_METADATA = {
    table_key: {
        'table_name': table_key,
        'schema': [
            bigquery.SchemaField(col_name, col_type, mode=col_mode)
            for col_name, col_type, col_mode in columns
        ],
    }
    for table_key, columns in _DIMENSION_COLUMNS.items()
}

# **** SETUP LOGGING ****
# Configure the root logger: records are written to stdout with a
# "[LEVEL][timestamp][module:line] : message" prefix.
logging.basicConfig(
    format='[%(levelname)-5s][%(asctime)s][%(module)s:%(lineno)04d] : %(message)s',
    level=logging.INFO,
    stream=sys.stdout,
)
# Alias the root logger as `logger`. getLogger() with no name always returns
# the root logger; getLogger('root') only does so on Python >= 3.9 and yields
# a separate logger that merely happens to be named 'root' on older versions.
logger: logging.Logger = logging.getLogger()
# basicConfig() is a no-op when the root logger already has handlers, so the
# effective level is assigned explicitly here.
logger.setLevel(logging.DEBUG)

 
# **** BIGQUERY CLIENT ****
# The client is constructed without arguments, i.e. it relies on the
# library's default resolution of project and credentials.
logger.debug("Creating bigquery client")
client = bigquery.Client()

logger.info("Setup Completed")

# **** CREATE DATASET (IF NEEDED) ****
# Fully-qualified dataset id: "<project>.<dataset>".
dataset_id = ".".join((PROJECT_NAME, DATASET_NAME))

# Describe the dataset locally (id + location) before asking the service
# to create it.
dataset = bigquery.Dataset(dataset_id)
dataset.location = "US"

# exists_ok=True: an already existing dataset is not treated as an error,
# so this cell can be re-run.
dataset = client.create_dataset(dataset, exists_ok=True)

logger.info(f"Created dataset: {dataset.full_dataset_id}")

[DEBUG][2023-01-06 13:18:55,381][2239541206:0068] : Creating bigquery client
[INFO ][2023-01-06 13:18:55,384][2239541206:0071] : Setup Completed
[INFO ][2023-01-06 13:18:55,976][2239541206:0079] : Created Gary's Store dataset: electric-glyph-372116:airlines


In [6]:
# **** CREATE TABLES ****
# Create one BigQuery table per TABLE_METADATA entry and print the schema
# the service reports for it.
logger.debug("Creating tables:")

for table_id, metadata in TABLE_METADATA.items():
    full_table_id = f"{PROJECT_NAME}.{DATASET_NAME}.{table_id}"
    schema = metadata['schema']
    table = bigquery.Table(full_table_id, schema=schema)
    # exists_ok=True keeps the cell re-runnable (same policy as the dataset
    # creation): an existing table is fetched instead of raising Conflict.
    # The call returns the table as stored by the service, so no separate
    # get_table() round trip is needed.
    table_ref = client.create_table(table, exists_ok=True)
    logger.info(f"Created table: {full_table_id}")
    # List the table schema as reported by the service. For a table that
    # already existed this is its stored schema, which may differ from
    # TABLE_METADATA.
    for column in table_ref.schema:
        print(f"\t{column.name}\t{column.field_type}")
    print("\n")

[DEBUG][2023-01-06 13:42:11,061][1101560865:0002] : Creating tables:
[INFO ][2023-01-06 13:42:11,429][1101560865:0009] : Created table: electric-glyph-372116.airlines.passengers_d
	sk_pass_id	STRING
	email	STRING
	first_name	STRING
	last_name	STRING
	gender	STRING
	birth_date	DATE
	street	STRING
	city	STRING
	zip	INTEGER
	state	STRING
	start_date	DATE
	end_date	DATE


[INFO ][2023-01-06 13:42:11,974][1101560865:0009] : Created table: electric-glyph-372116.airlines.airlines_d
	airline_id	STRING
	name	STRING
	icao	STRING
	callsign	STRING
	country	STRING


[INFO ][2023-01-06 13:42:12,422][1101560865:0009] : Created table: electric-glyph-372116.airlines.airports_d
	airport_id	STRING
	name	STRING
	city	FLOAT
	country	FLOAT
	icao	FLOAT
	latitude	FLOAT
	longitude	FLOAT
	altitude	FLOAT
	tz_timezone	FLOAT




In [None]:
# **** LOADING TABLES  ****
def load_table(
    df: pd.DataFrame,
    client: "bigquery.Client",
    table_name: str,
    schema: "list[bigquery.SchemaField]",
    create_disposition: str = 'CREATE_IF_NEEDED',
    write_disposition: str = 'WRITE_TRUNCATE',
) -> None:
    """Load a dataframe into a BigQuery table and wait for the load job.

    The bigquery annotations are quoted (forward references), so defining
    the function needs no extra typing import.

    Args:
        df (pd.DataFrame): dataframe to load
        client (bigquery.Client): bigquery client
        table_name (str): full table name including project and dataset id
        schema (list[bigquery.SchemaField]): table schema with data types
        create_disposition (str, optional): create table disposition.
            Defaults to 'CREATE_IF_NEEDED'.
        write_disposition (str, optional): overwrite table disposition.
            Defaults to 'WRITE_TRUNCATE'.

    Raises:
        ValueError: if `table_name` is not of the form
            "<project>.<dataset>.<table>".
    """
    # *** run some checks ***
    # The table name must be a full table name including project and dataset
    # name, i.e. exactly three non-empty parts separated by two dots.
    # Raised explicitly (not asserted) so the check survives `python -O`.
    name_parts = table_name.split('.')
    if len(name_parts) != 3 or not all(name_parts):
        raise ValueError(
            f"Table name must be a full bigquery table name including project and dataset id: '{table_name}'"
        )
    # setup bigquery load job:
    #  create table if needed, replace rows, define the table schema
    job_config = bigquery.LoadJobConfig(
        create_disposition=create_disposition,
        write_disposition=write_disposition,
        schema=schema,
    )
    logger.info(f"loading table: '{table_name}'")
    job = client.load_table_from_dataframe(df, destination=table_name, job_config=job_config)
    job.result()        # wait for the job to finish
    # get the resulting table to report how many rows it now holds
    table = client.get_table(table_name)
    logger.info(f"loaded {table.num_rows} rows into {table.full_table_id}")

# NOTE(review): this section appears to be carried over from a receipts /
# line-items exercise rather than the airline dimension tables -- confirm
# before running:
#   * `df` is not assigned in any of the visible cells.
#   * TABLE_METADATA as defined in this notebook only has the keys
#     'passengers_d', 'airlines_d' and 'airports_d', so the 'receipts' and
#     'receipts_lineitems' lookups below would raise KeyError.

# get unique receipts (not line items): keep only the receipt-level columns
# and drop repeated rows, keeping the first occurrence
receipts = df[['receipt_id', 'receipt_num', 'receipt_date', 'customer_id', 'item_count', 'receipt_total']].drop_duplicates(keep='first')
logger.info(f"Preparing {len(receipts.index)} unique receipts to load to bigquery")
display(receipts.head(n=10))
# load to bigquery: destination is "<project>.<dataset>.<table_name>"
table_name = f"{PROJECT_NAME}.{DATASET_NAME}.{TABLE_METADATA['receipts']['table_name']}"
schema = TABLE_METADATA['receipts']['schema']
load_table(receipts, client, table_name, schema)


# line items: column subset of `df`, loaded without de-duplication
receipts_lineitems = df[['lineitem_id', 'receipt_id', 'product_id', 'amount', 'subtotal']]
logger.info(f"Preparing {len(receipts_lineitems.index)} receipt line items to load to bigquery")
display(receipts_lineitems.head(n=10))
# load to bigquery: destination is "<project>.<dataset>.<table_name>"
table_name = f"{PROJECT_NAME}.{DATASET_NAME}.{TABLE_METADATA['receipts_lineitems']['table_name']}"
schema = TABLE_METADATA['receipts_lineitems']['schema']
load_table(receipts_lineitems, client, table_name, schema)
