## Create Tables

#### Imports

In [1]:
from google.cloud.exceptions import NotFound
from google.cloud import bigquery
import pandas as pd
import datetime
import logging
import os 

#### Setup logging

In [2]:
# Notebook-wide logger that echoes DEBUG-and-above messages to the cell output.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# getLogger() returns the same object on every call, so re-running this cell
# would otherwise stack another StreamHandler and print each message twice.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler())

#### Essentials

In [3]:
# Location of the service-account key file, relative to the notebook's working directory.
SERVICE_ACCOUNT_CREDENTIALS = './../credentials/vai-key.json'
# Export the path through GOOGLE_APPLICATION_CREDENTIALS for the rest of this process.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS=SERVICE_ACCOUNT_CREDENTIALS)

In [4]:
# BigQuery client shared by every cell below; `client.project` is used to build the dataset ID.
client = bigquery.Client()

#### Create dataset

In [5]:
# Dataset ID in "<project>.<dataset>" form, built from the client's project.
dataset_id = f"{client.project}.flight_reservations"
logger.info("%s", dataset_id)

arun-genai-bb.flight_reservations


In [6]:
# Reuse the dataset when it is already there; otherwise create it in the US location.
try:
    dataset = client.get_dataset(dataset_id)
except NotFound:
    new_dataset = bigquery.Dataset(dataset_id)
    new_dataset.location = "US"
    dataset = client.create_dataset(new_dataset)
    logger.info(f"Dataset {dataset_id} created.")
else:
    logger.info(f"Dataset {dataset_id} already exists!")

Dataset arun-genai-bb.flight_reservations already exists!


#### Create tables 

##### Create `customers` table

In [7]:
# ID of the `customers` table inside the dataset.
table_id = f"{dataset_id}.customers"
logger.info("%s", table_id)

arun-genai-bb.flight_reservations.customers


In [8]:
# Columns of the `customers` table as (name, type) pairs; every column is REQUIRED.
customers_schema = [
    bigquery.SchemaField(column_name, column_type, mode="REQUIRED")
    for column_name, column_type in (
        ("customer_id", "INT64"),
        ("first_name", "STRING"),
        ("last_name", "STRING"),
        ("email", "STRING"),
        ("date_of_birth", "DATE"),
        ("created_at", "DATETIME"),
    )
]


In [9]:
# Fetch the `customers` table if it exists; otherwise create it from `customers_schema`.
try:
    customers_table = client.get_table(table_id)
except NotFound:
    customers_table = client.create_table(
        bigquery.Table(table_id, schema=customers_schema)
    )
    logger.info(f"Table {table_id} created.")
else:
    logger.info(f"Table {table_id} already exists!")

Table arun-genai-bb.flight_reservations.customers already exists!


##### Create `flights` table

In [10]:
# ID of the `flights` table inside the dataset.
table_id = f"{dataset_id}.flights"
logger.info("%s", table_id)

arun-genai-bb.flight_reservations.flights


In [11]:
# Columns of the `flights` table as (name, type) pairs; every column is REQUIRED.
flights_schema = [
    bigquery.SchemaField(column_name, column_type, mode="REQUIRED")
    for column_name, column_type in (
        ("flight_id", "INT64"),
        ("origin", "STRING"),
        ("destination", "STRING"),
        ("departure_datetime", "DATETIME"),
        ("arrival_datetime", "DATETIME"),
        ("carrier", "STRING"),
        ("price", "FLOAT64"),
    )
]

In [12]:
# Fetch the `flights` table if it exists; otherwise create it from `flights_schema`.
try:
    flights_table = client.get_table(table_id)
except NotFound:
    flights_table = client.create_table(
        bigquery.Table(table_id, schema=flights_schema)
    )
    logger.info(f"Table {table_id} created.")
else:
    logger.info(f"Table {table_id} already exists!")

Table arun-genai-bb.flight_reservations.flights already exists!


##### Create `reservations` table

In [13]:
# ID of the `reservations` table inside the dataset.
table_id = f"{dataset_id}.reservations"
logger.info("%s", table_id)

arun-genai-bb.flight_reservations.reservations


In [14]:
# Columns of the `reservations` table as (name, type) pairs; every column is REQUIRED.
reservations_schema = [
    bigquery.SchemaField(column_name, column_type, mode="REQUIRED")
    for column_name, column_type in (
        ("reservation_id", "INT64"),
        ("customer_id", "INT64"),
        ("flight_id", "INT64"),
        ("reservation_datetime", "DATETIME"),
        ("status", "STRING"),
    )
]

In [15]:
# Fetch the `reservations` table if it exists; otherwise create it from `reservations_schema`.
try:
    reservations_table = client.get_table(table_id)
except NotFound:
    reservations_table = client.create_table(
        bigquery.Table(table_id, schema=reservations_schema)
    )
    logger.info(f"Table {table_id} created.")
else:
    logger.info(f"Table {table_id} already exists!")

Table arun-genai-bb.flight_reservations.reservations already exists!


##### Create `transactions` table

In [16]:
# ID of the `transactions` table inside the dataset.
table_id = f"{dataset_id}.transactions"
logger.info("%s", table_id)

arun-genai-bb.flight_reservations.transactions


In [17]:
# Columns of the `transactions` table as (name, type) pairs; every column is REQUIRED.
transactions_schema = [
    bigquery.SchemaField(column_name, column_type, mode="REQUIRED")
    for column_name, column_type in (
        ("transaction_id", "INT64"),
        ("reservation_id", "INT64"),
        ("amount", "FLOAT64"),
        ("transaction_datetime", "DATETIME"),
    )
]

In [18]:
# Fetch the `transactions` table if it exists; otherwise create it from `transactions_schema`.
try:
    transactions_table = client.get_table(table_id)
except NotFound:
    transactions_table = client.create_table(
        bigquery.Table(table_id, schema=transactions_schema)
    )
    logger.info(f"Table {table_id} created.")
else:
    logger.info(f"Table {table_id} already exists!")

Table arun-genai-bb.flight_reservations.transactions already exists!


##### Create `loyality_points` table 

In [19]:
# ID of the `loyality_points` table inside the dataset.
table_id = f"{dataset_id}.loyality_points"
logger.info("%s", table_id)

arun-genai-bb.flight_reservations.loyality_points


In [20]:
# Columns of the `loyality_points` table as (name, type) pairs; every column is REQUIRED.
loyalty_points_schema = [
    bigquery.SchemaField(column_name, column_type, mode="REQUIRED")
    for column_name, column_type in (
        ("customer_id", "INT64"),
        ("points", "INT64"),
        ("last_updated_datetime", "DATETIME"),
    )
]

In [21]:
# Fetch the `loyality_points` table if it exists; otherwise create it from `loyalty_points_schema`.
try:
    loyality_points_table = client.get_table(table_id)
except NotFound:
    loyality_points_table = client.create_table(
        bigquery.Table(table_id, schema=loyalty_points_schema)
    )
    logger.info(f"Table {table_id} created.")
else:
    logger.info(f"Table {table_id} already exists!")

Table arun-genai-bb.flight_reservations.loyality_points already exists!


#### Populate tables

In [None]:
# Read the customers CSV and preview its first rows.
customers_csv_path = './../DATA/Example-2/customers.csv'
df = pd.read_csv(customers_csv_path)
df.head(n=5)

In [None]:
# One tuple per DataFrame row, in column order, to be inserted into `customers_table`.
data = [tuple(row) for row in df.to_numpy()]
# insert_rows() reports per-row failures through its return value instead of
# raising, so check it explicitly; otherwise a failed load passes silently
# and "Run All" carries on to the next cell.
insert_errors = client.insert_rows(customers_table, data)
if insert_errors:
    raise RuntimeError(f"Failed to insert rows into customers: {insert_errors}")
logger.info(f"Inserted {len(data)} rows into customers.")

In [None]:
# Read the flights CSV and preview its first rows.
flights_csv_path = './../DATA/Example-2/flights.csv'
df = pd.read_csv(flights_csv_path)
df.head(n=5)

In [None]:
# One tuple per DataFrame row, in column order, to be inserted into `flights_table`.
data = [tuple(row) for row in df.to_numpy()]
# insert_rows() reports per-row failures through its return value instead of
# raising, so check it explicitly; otherwise a failed load passes silently
# and "Run All" carries on to the next cell.
insert_errors = client.insert_rows(flights_table, data)
if insert_errors:
    raise RuntimeError(f"Failed to insert rows into flights: {insert_errors}")
logger.info(f"Inserted {len(data)} rows into flights.")

In [None]:
# Read the reservations CSV and preview its first rows.
reservations_csv_path = './../DATA/Example-2/reservations.csv'
df = pd.read_csv(reservations_csv_path)
df.head(n=5)

In [None]:
# One tuple per DataFrame row, in column order, to be inserted into `reservations_table`.
data = [tuple(row) for row in df.to_numpy()]
# insert_rows() reports per-row failures through its return value instead of
# raising, so check it explicitly; otherwise a failed load passes silently
# and "Run All" carries on to the next cell.
insert_errors = client.insert_rows(reservations_table, data)
if insert_errors:
    raise RuntimeError(f"Failed to insert rows into reservations: {insert_errors}")
logger.info(f"Inserted {len(data)} rows into reservations.")

In [None]:
# Read the transactions CSV and preview its first rows.
transactions_csv_path = './../DATA/Example-2/transactions.csv'
df = pd.read_csv(transactions_csv_path)
df.head(n=5)

In [None]:
# One tuple per DataFrame row, in column order, to be inserted into `transactions_table`.
data = [tuple(row) for row in df.to_numpy()]
# insert_rows() reports per-row failures through its return value instead of
# raising, so check it explicitly; otherwise a failed load passes silently
# and "Run All" carries on to the next cell.
insert_errors = client.insert_rows(transactions_table, data)
if insert_errors:
    raise RuntimeError(f"Failed to insert rows into transactions: {insert_errors}")
logger.info(f"Inserted {len(data)} rows into transactions.")

In [None]:
# Read the loyality_points CSV and preview its first rows.
loyality_points_csv_path = './../DATA/Example-2/loyality_points.csv'
df = pd.read_csv(loyality_points_csv_path)
df.head(n=5)

In [None]:
# One tuple per DataFrame row, in column order, to be inserted into `loyality_points_table`.
data = [tuple(row) for row in df.to_numpy()]
# insert_rows() reports per-row failures through its return value instead of
# raising, so check it explicitly; otherwise a failed load passes silently
# and "Run All" carries on to the next cell.
insert_errors = client.insert_rows(loyality_points_table, data)
if insert_errors:
    raise RuntimeError(f"Failed to insert rows into loyality_points: {insert_errors}")
logger.info(f"Inserted {len(data)} rows into loyality_points.")

**Note:** Populating tables in BigQuery can take time, typically between 1 and 2 hours.

To check if the tables are fully populated, run the cell provided below.

If every table reports a non-zero number of rows, the tables are populated.

Once verified, execute the next notebook titled `02-text-to-sql.ipynb`.

In [22]:
# `Table.num_rows` is a snapshot of the metadata captured when the Table object
# was fetched or created, so re-running this cell against the cached objects
# would keep printing the same value. Re-fetch each table to log its current count.
for populated_table in (
    customers_table,
    flights_table,
    reservations_table,
    transactions_table,
    loyality_points_table,
):
    logger.info(client.get_table(populated_table).num_rows)

20
20
20
16
20
