#### Exercise 1: Create data model 

![Data Model](img/data_model2.png)

#### Exercise 2: 
1. Load dimension tables  

In [7]:
import os
import sys
import pandas as pd
import logging
from google.cloud import bigquery 
 
DATA_DIR = "../data/"
DEFAULT_TICKET_FILE = os.path.join(DATA_DIR, "tickets.json")
PROJECT_NAME = "electric-glyph-372116"
DATASET_NAME = "airlines"
 
# **** TABLE SCHEMAS ****
## Dimension Tables
TABLE_METADATA = {
   'passengers_d': {
       'table_name': 'passengers_d',
       'schema': [
           bigquery.SchemaField('sk_pass_id', 'STRING', mode='REQUIRED'),
           bigquery.SchemaField('email', 'STRING', mode='NULLABLE'),
           bigquery.SchemaField('first_name', 'STRING', mode='NULLABLE'),
           bigquery.SchemaField('last_name', 'STRING', mode='NULLABLE'),
           bigquery.SchemaField('gender', 'STRING', mode='NULLABLE'),
           bigquery.SchemaField('birth_date', 'DATE', mode='NULLABLE'),
           bigquery.SchemaField('street', 'STRING', mode='NULLABLE'),
           bigquery.SchemaField('city', 'STRING', mode='NULLABLE'),
           bigquery.SchemaField('zip', 'INTEGER', mode='NULLABLE'),
           bigquery.SchemaField('state', 'STRING', mode='NULLABLE'),
           bigquery.SchemaField('start_date', 'DATE', mode='NULLABLE'),
           bigquery.SchemaField('end_date', 'DATE', mode='NULLABLE')
       ]
   }, 'airlines_d': {
       'table_name': 'airlines_d',
       'schema': [
           bigquery.SchemaField('airline_id', 'STRING', mode='REQUIRED'),
           bigquery.SchemaField('name', 'STRING', mode='NULLABLE'),
           bigquery.SchemaField('icao', 'STRING', mode='NULLABLE'),
           bigquery.SchemaField('callsign', 'STRING', mode='NULLABLE'),
           bigquery.SchemaField('country', 'STRING', mode='NULLABLE')
       ]
   },  'airports_d': {
       'table_name': 'airports_d',
       'schema': [
           bigquery.SchemaField('airport_id', 'STRING', mode='REQUIRED'),
           bigquery.SchemaField('name', 'STRING', mode='NULLABLE'),
           bigquery.SchemaField('city', 'STRING', mode='NULLABLE'),
           bigquery.SchemaField('country', 'STRING', mode='NULLABLE'),
           bigquery.SchemaField('icao', 'STRING', mode='NULLABLE'),
           bigquery.SchemaField('latitude', 'FLOAT', mode='NULLABLE'),
           bigquery.SchemaField('longitude', 'FLOAT', mode='NULLABLE'),
           bigquery.SchemaField('altitude', 'INTEGER', mode='NULLABLE'),
           bigquery.SchemaField('tz_timezone', 'STRING', mode='NULLABLE'),
       ]
   }
}

# **** SETUP LOGGING ****
# setup logging and logger
logging.basicConfig(            # setting up the root logger
   format='[%(levelname)-5s][%(asctime)s][%(module)s:%(lineno)04d] : %(message)s',
   level=logging.INFO,
   stream=sys.stdout
)
logger: logging.Logger = logging.getLogger('root')      # alias the root logger as `logger`
logger.setLevel(logging.DEBUG)                 # programmatically reassign the logging level

 
# **** BIGQUERY CLIENT ****
logger.debug(f"Creating bigquery client")
client = bigquery.Client()

logger.info(f"Setup Completed")

# **** CREATE DATASET (IF NEEDED) ****
dataset_id = f"{PROJECT_NAME}.{DATASET_NAME}"
dataset = bigquery.Dataset(dataset_id)
dataset.location = "US"
dataset = client.create_dataset(dataset, exists_ok=True)

logger.info(f"Created dataset: {dataset.full_dataset_id}")

[DEBUG][2023-01-06 13:50:23,555][3829676985:0068] : Creating bigquery client
[INFO ][2023-01-06 13:50:23,562][3829676985:0071] : Setup Completed
[INFO ][2023-01-06 13:50:24,340][3829676985:0079] : Created dataset: electric-glyph-372116:airlines


In [8]:
# **** CREATE TABLES ****
logger.debug(f"Creating tables:")

for table_id in TABLE_METADATA:
  full_table_id = f"{PROJECT_NAME}.{DATASET_NAME}.{table_id}"
  schema = TABLE_METADATA[f'{table_id}']['schema']
  table = bigquery.Table(full_table_id, schema = schema)
  client.create_table(table)
  logger.info(f"Created table: {table_id}")
  # List table Schema
  table_ref = client.get_table(table)
  for column in table_ref.schema:
    print(f"\t{column.name}\t{column.field_type}") 
  print("\n")

[DEBUG][2023-01-06 13:50:28,989][1382955814:0002] : Creating tables:
[INFO ][2023-01-06 13:50:29,235][1382955814:0009] : Created table: passengers_d
	sk_pass_id	STRING
	email	STRING
	first_name	STRING
	last_name	STRING
	gender	STRING
	birth_date	DATE
	street	STRING
	city	STRING
	zip	INTEGER
	state	STRING
	start_date	DATE
	end_date	DATE


[INFO ][2023-01-06 13:50:29,684][1382955814:0009] : Created table: airlines_d
	airline_id	STRING
	name	STRING
	icao	STRING
	callsign	STRING
	country	STRING


[INFO ][2023-01-06 13:50:30,138][1382955814:0009] : Created table: airports_d
	airport_id	STRING
	name	STRING
	city	STRING
	country	STRING
	icao	STRING
	latitude	FLOAT
	longitude	FLOAT
	altitude	INTEGER
	tz_timezone	STRING




In [94]:
# **** READ DATA ****
import json
import datetime as dt

output = []
with open("data/tickets.json", "r") as json_file:
  for line in json_file:
    row = json.loads(line.strip())
    output.append(row)
df = pd.json_normalize(output)





Unnamed: 0,eticket_num,confirmation,ticket_date,price,seat,status,airline.name,airline.iata,airline.icao,airline.callsign,...,passenger.last_name,passenger.gender,passenger.birth_date,passenger.email,passenger.street,passenger.city,passenger.state,passenger.zip,origin,destination
0,498-938211-0795,ZVFDC4,2022-03-23,723.42,31I,active,China Eastern Airlines,MU,CES,CHINA EASTERN,...,Brown,M,1969-02-17,robert.brown.69@hotmail.com,5007 Thomas Way,Lake Hollystad,DC,20027,,
1,482-850738-6048,IL5GUI,2022-03-23,765.18,29B,active,Hawaiian Airlines,HA,HAL,HAWAIIAN,...,Kent,F,1998-08-05,laura.kent.98@hotmail.com,13991 Davis Village,North Catherineborough,PA,16516,,
2,275-207321-8092,CYEFBC,2022-03-21,753.89,26I,active,Wizz Air,W6,WZZ,WIZZ AIR,...,Tucker,F,1965-01-22,lisa.tucker.65@hotmail.com,04135 Marvin Via,North Kristabury,MA,01093,,
3,246-793315-3102,ZNGPC2,2022-03-22,793.89,15A,active,AirAsia,AK,AXM,ASIAN EXPRESS,...,Yates,NB,1975-03-31,matthew.yates.75@yahoo.com,76045 Samantha Road Suite 111,Lake Jeffrey,DE,19898,,
4,091-128904-1226,MGSBD9,2022-03-24,820.25,17F,active,Xiamen Airlines,MF,CXA,XIAMEN AIR,...,Villanueva,NB,1945-08-14,megan.villanueva.45@hotmail.com,848 Melissa Springs Suite 947,Kellerstad,TX,76177,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4091,030-327889-3270,MVIBWK,2022-03-24,600.37,23E,active,Hainan Airlines,HU,CHH,HAINAN,...,Zamora,F,2000-06-14,janice.zamora.00@gmail.com,4759 William Haven Apt. 194,West Corey,KS,66153,,
4092,513-551750-0628,WZZGGB,2022-03-21,583.41,24F,active,Malaysia Airlines,MH,MAS,MALAYSIAN,...,Tucker,F,1965-01-22,lisa.tucker.65@hotmail.com,04135 Marvin Via,North Kristabury,MA,01093,,
4093,118-106280-2530,WUD4KR,2022-03-22,203.45,17H,active,Frontier Airlines,F9,FFT,FRONTIER FLIGHT,...,Zamora,F,2000-06-14,janice.zamora.00@gmail.com,4759 William Haven Apt. 194,West Corey,KS,66153,,
4094,961-278558-3018,VI5039,2022-03-21,554.59,18G,active,Royal Air Maroc,AT,RAM,ROYALAIR MAROC,...,Cook,M,1983-06-14,corey.cook.83@gmail.com,9606 Barton Station Apt. 271,Jacquelinemouth,IN,47081,,


In [None]:
# **** LOADING TABLES  ****
# def load_table(
#     df: pd.DataFrame, 
#     client: bigquery.Client, 
#     table_name: str, 
#     schema: List[bigquery.SchemaField], 
#     create_disposition: str = 'CREATE_IF_NEEDED', 
#     write_disposition: str = 'WRITE_TRUNCATE'
#     ) -> None:
#     """load dataframe into bigquery table

#     Args:
#         df (pd.DataFrame): dataframe to load
#         client (bigquery.Client): bigquery client
#         table_name (str): full table name including project and dataset id
#         schema (List[bigquery.SchemaField]): table schema with data types
#         create_disposition (str, optional): create table disposition. Defaults to 'CREATE_IF_NEEDED'.
#         write_disposition (str, optional): overwrite table disposition. Defaults to 'WRITE_TRUNCATE'.
#     """
#     # *** run some checks ***
#     # test table name to be full table name including project and dataset name. It must contain to dots
#     assert len(table_name.split('.')) == 3, f"Table name must be a full bigquery table name including project and dataset id: '{table_name}'"
#     # setup bigquery load job:
#     #  create table if needed, replace rows, define the table schema
#     job_config = bigquery.LoadJobConfig(
#         create_disposition=create_disposition,
#         write_disposition=write_disposition,
#         schema=schema
#     )
#     logger.info(f"loading table: '{table_name}'")
#     job = client.load_table_from_dataframe(df, destination=table_name, job_config=job_config)
#     job.result()        # wait for the job to finish
#     # get the resulting table
#     table = client.get_table(table_name)
#     logger.info(f"loaded {table.num_rows} rows into {table.full_table_id}")

# get unique receipts (not line items)
receipts = df[['receipt_id', 'receipt_num', 'receipt_date', 'customer_id', 'item_count', 'receipt_total']].drop_duplicates(keep='first')
logger.info(f"Preparing {len(receipts.index)} unique receipts to load to bigquery")
display(receipts.head(n=10))
# load to bigquery
table_name = f"{PROJECT_NAME}.{DATASET_NAME}.{TABLE_METADATA['receipts']['table_name']}"
schema = TABLE_METADATA['receipts']['schema']
load_table(receipts, client, table_name, schema)


receipts_lineitems = df[['lineitem_id', 'receipt_id', 'product_id', 'amount', 'subtotal']]
logger.info(f"Preparing {len(receipts_lineitems.index)} receipt line items to load to bigquery")
display(receipts_lineitems.head(n=10))
# load to bigquery
table_name = f"{PROJECT_NAME}.{DATASET_NAME}.{TABLE_METADATA['receipts_lineitems']['table_name']}"
schema = TABLE_METADATA['receipts_lineitems']['schema']
load_table(receipts_lineitems, client, table_name, schema)
