# B''H

In [1]:
from google.cloud import bigquery

---
### Prerequisite Step 1 - set the environment variable that points to the service-account key file used to access the BigQuery API

In [2]:
# Set GOOGLE_APPLICATION_CREDENTIALS for this kernel to the service-account key file.
# NOTE(review): this is an absolute, machine-specific path - adjust it for your own environment.
%set_env GOOGLE_APPLICATION_CREDENTIALS=/home/baruch/app-keys/data-science-course-a1544568093e.json

env: GOOGLE_APPLICATION_CREDENTIALS=/home/baruch/app-keys/data-science-course-a1544568093e.json


---

### Prerequisite Step 2 - Instantiate a BigQuery Python client
- Use the **Project ID** not the **Project Name**
- See https://console.cloud.google.com/home/dashboard?project=data-science-course-226116 for the project name and ID

In [3]:
# Create the BigQuery client, bound explicitly to the project ID (not the project name).
bq_client = bigquery.Client(
    project='data-science-course-226116',
)

In [4]:
# Display the client object to confirm it was created.
bq_client

<google.cloud.bigquery.client.Client at 0x7ffa2c5114a8>

---

### Get project-id

In [5]:
# Read the project ID back from the client rather than repeating the literal.
project = bq_client.project

# Display the project ID.
project

'data-science-course-226116'

---

### List datasets for the client’s project:

In [6]:
# Materialise the dataset listing into a list so it can be displayed and looped over.
datasets = [*bq_client.list_datasets()]

datasets

[<google.cloud.bigquery.dataset.DatasetListItem at 0x7ffa58249320>]

In [7]:
# Print one dataset ID per line; the list's own repr only shows object addresses.
for dataset in datasets:
    print(dataset.dataset_id)

sql_lessons


### List tables for the dataset:

In [8]:
# Build a reference to the `sql_lessons` dataset in the client's project.
# `Client.dataset()` is deprecated in google-cloud-bigquery; constructing the
# DatasetReference directly yields the same reference without the deprecation warning.
dataset_ref = bigquery.DatasetReference(bq_client.project, 'sql_lessons')

# Materialise the table listing so it can be displayed and looped over.
tables = list(bq_client.list_tables(dataset_ref))  # API request(s)

In [9]:
# Display the raw list of table items (object reprs only).
tables

[<google.cloud.bigquery.table.TableListItem at 0x7ffa2c4a2b00>,
 <google.cloud.bigquery.table.TableListItem at 0x7ffa2c4a2be0>,
 <google.cloud.bigquery.table.TableListItem at 0x7ffa2c4a2a58>,
 <google.cloud.bigquery.table.TableListItem at 0x7ffa2c4a2b38>,
 <google.cloud.bigquery.table.TableListItem at 0x7ffa2c4a2d68>,
 <google.cloud.bigquery.table.TableListItem at 0x7ffa2c4a2cf8>,
 <google.cloud.bigquery.table.TableListItem at 0x7ffa2c4a28d0>,
 <google.cloud.bigquery.table.TableListItem at 0x7ffa2c4a2940>,
 <google.cloud.bigquery.table.TableListItem at 0x7ffa2c4a24e0>,
 <google.cloud.bigquery.table.TableListItem at 0x7ffa2c4a2ba8>,
 <google.cloud.bigquery.table.TableListItem at 0x7ffa2c4a2c88>,
 <google.cloud.bigquery.table.TableListItem at 0x7ffa2c4a2908>,
 <google.cloud.bigquery.table.TableListItem at 0x7ffa2c4a2860>,
 <google.cloud.bigquery.table.TableListItem at 0x7ffa2c4a2828>,
 <google.cloud.bigquery.table.TableListItem at 0x7ffa2c4a2780>,
 <google.cloud.bigquery.table.TableListI

In [10]:
# Print one table ID per line for a readable listing of the dataset's tables.
for table in tables:
    print(table.table_id)

Daily_routine
EURUSD_prices
arky_family
auto
branches
company_party
compare_costs
counties
counties_task_5
counties_task_five
departments
employees
employees_with_dupes
employees_with_sha512
google_ads
google_ads_ay
google_ads_barky
google_ads_baruch
google_ads_chaim
google_ads_dani
google_ads_dani_ammended
google_ads_etl_step_1
google_ads_etl_step_2
google_ads_hash_Yossi
google_ads_hash_baruch
google_ads_mendy
google_ads_yossi
group_by_sandbox
join_test_counties
join_test_states
las_vegas
las_vegas_split
las_vegas_temp
life_expectancy
produce_cost
shopping_items
states
stock_exchanges_raw_input
x_stg_create_from_dataframe
x_stg_create_from_query
x_stg_mock_salaries
x_stg_mock_test


---
### Writing SQL query results to a destination table:

#### **`1:`** Setup job config details 

In [11]:
# Start from an empty query-job configuration; its fields are set in the cells that follow.
job_config = bigquery.QueryJobConfig()

# Display the configuration object.
job_config

<google.cloud.bigquery.job.QueryJobConfig at 0x7ffa2c4f2080>

In [12]:
# Set the destination table: a reference to `arky_family` inside the dataset referenced by `dataset_ref`.
table_ref = dataset_ref.table('arky_family')

# The query results will be written to this table.
job_config.destination = table_ref

In [13]:
# With WRITE_TRUNCATE, any existing rows in the table are overwritten by the query results.
# Use the library's enum constant (same value as the bare string) so this matches
# the load-job configuration later in the notebook and a typo fails immediately.
job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

#### **`2:`** Define the query string

In [14]:
# Four literal rows (name, age, birthday) combined with UNION ALL.
# This is the data that gets written to the destination table.
query_str = """
select 'Baruch'  name, 35 age, 'chaf nissan' birthday  union all
select 'Zehava' name, 33 age, 'yud adar beis' birthday  union all
select 'Mendel' name, 5 age, 'yud-tes cheshvan' birthday  union all
select 'Chaigi' name, 0 age, 'yud-beis adar beis' birthday  
"""



#### **`3:`** Run the BigQuery query job

In [15]:
# API request - submits the query job using the destination/write settings in job_config.
query_job = bq_client.query(
    query_str,
    job_config=job_config,
    # Location must match that of the dataset(s) referenced in the query and of the destination table.
    location='US',
)

# Iterating the job waits for the query to finish, then yields its result rows.
rows = list(query_job)

In [16]:
# Display the rows returned by the query.
rows

[Row(('Chaigi', 0, 'yud-beis adar beis'), {'name': 0, 'age': 1, 'birthday': 2}),
 Row(('Mendel', 5, 'yud-tes cheshvan'), {'name': 0, 'age': 1, 'birthday': 2}),
 Row(('Zehava', 33, 'yud adar beis'), {'name': 0, 'age': 1, 'birthday': 2}),
 Row(('Baruch', 35, 'chaf nissan'), {'name': 0, 'age': 1, 'birthday': 2})]

In [17]:
# The query unions four literal rows, so exactly four rows must come back.
assert len(rows) == 4

---
### Extract a table to Google Cloud Storage:

---

Make sure the project service account has access to GCS

1. See account name above: 
    - data-science-course-a1544568093e.json

2. Find name for that ID `a154456...`
    - https://console.cloud.google.com/iam-admin/serviceaccounts?folder=&organizationId=&project=data-science-course-226116
    - In this case it is `ds-course-service-account`

3. Add the Storage Admin role to that account:
    - https://console.cloud.google.com/iam-admin/iam?project=data-science-course-226116

---

#### **`1:`** Set the GCS destination URI
- https://console.cloud.google.com/storage/browser?project=data-science-course-226116&folder&organizationId

In [18]:
# Bucket that will receive the exported file.
bucket_name = 'data-sci-class'

# Full GCS URI of the CSV object to write.
destination_uri = 'gs://%s/%s' % (bucket_name, 'arky-family.csv')

destination_uri

'gs://data-sci-class/arky-family.csv'

#### **`2:`** Set the table you want to extract

In [19]:
# Source table for the extract: `arky_family` in the dataset referenced by `dataset_ref`.
table_ref       = dataset_ref.table('arky_family')

#### **`3:`** Run the BigQuery extract job

In [20]:
# Start the extract job that exports the table to the GCS destination URI.
extract_job = bq_client.extract_table(
    table_ref,
    destination_uri,
    location='US',  # Location must match that of the source table.
)

# Waits for job to complete; as the cell's last expression it also displays the job object.
extract_job.result()

<google.cloud.bigquery.job.ExtractJob at 0x7ffa2c4f2d68>

---

See file in GCS:
- https://console.cloud.google.com/storage/browser/data-sci-class?project=data-science-course-226116&folder&organizationId

---
### Overwrite / replace an existing table with a CSV file from Cloud Storage:

#### **`1:`** Set the destination table

In [21]:
# Destination table for the load: `arky_family` in the dataset referenced by `dataset_ref`.
table_ref = dataset_ref.table('arky_family')

#### **`2:`** Set the source GCS file

In [27]:
# Bucket that holds the CSV file to load.
bucket_name = 'data-sci-class'

# Full GCS URI of the source CSV object.
gcs_uri = 'gs://%s/%s' % (bucket_name, 'arky-family.csv')

#### **`3:`** Setup job config info

In [28]:
# Fresh configuration object for the load job (replaces the earlier query-job config).
job_config = bigquery.LoadJobConfig()

# The source format defaults to CSV, so this line is optional; it is set to make the format explicit.
job_config.source_format = bigquery.SourceFormat.CSV

# Skip the first row of the file - presumably the header row of the CSV; confirm against the file.
job_config.skip_leading_rows = 1

# Replace whatever the destination table currently holds.
job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

#### **`4:`** Define the table schema
- https://cloud.google.com/bigquery/docs/schemas#specify-schema-manual-python

In [29]:
# Explicit schema for the CSV columns, in file order.
# `age` is declared INTEGER (not NUMERIC): the query that created the table
# produces whole-number ages, and because the load uses WRITE_TRUNCATE the
# schema given here replaces the table's schema. Keeping INTEGER means the
# export/reload round trip does not silently change the column type.
job_config.schema = [
    bigquery.SchemaField("name", "STRING"),
    bigquery.SchemaField("age", "INTEGER"),
    bigquery.SchemaField("birthday", "STRING"),
]

#### **`5:`** Run the load-table job

In [30]:
# API request - starts a job that loads the GCS file into the destination table.
load_job = bq_client.load_table_from_uri(
    gcs_uri,
    table_ref,
    # Location must match that of the destination table; stated explicitly,
    # as for the query and extract jobs that run against the same dataset.
    location='US',
    job_config=job_config,
)

# Sanity check: the client returned a load job.
assert load_job.job_type == 'load'

# Waits for table load to complete; as the cell's last expression it also displays the job object.
load_job.result()


<google.cloud.bigquery.job.LoadJob at 0x7ffa2c4f2400>

#### **`6:`** Final assertions

In [32]:
# The load job must have reached its terminal state.
assert load_job.state == 'DONE'

# Fetch the table metadata and confirm the reloaded table holds the expected four rows.
assert bq_client.get_table(table_ref).num_rows == 4