In [None]:
# reload edited modules automatically before each cell runs, so changes to the
# local am4894bq package are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [None]:
import os
from dotenv import load_dotenv
import pandas as pd
from google.cloud import bigquery
from am4894bq.schema import get_schema, df_to_bq_schema, schema_diff, update_bq_schema, update_df_schema
from am4894bq.utils import does_table_exist
from am4894bq.pd import df_to_gbq

# pull settings such as BQ_PROJECT_ID from a local .env file into the environment
load_dotenv()

# os.getenv returns None for a missing variable, which would silently produce
# the bogus table id 'None.tmp.tmp' below and only fail later inside BigQuery
# calls - so stop here with a clear message instead
bq_project_id = os.getenv('BQ_PROJECT_ID')
if not bq_project_id:
    raise EnvironmentError('BQ_PROJECT_ID is not set; define it in .env or the environment')

# fully qualified id (project.dataset.table) of the scratch table used below
table_id = f'{bq_project_id}.tmp.tmp'

In [None]:
# build a tiny one-row df of string values to start from
cols = ['col1', 'col2']
data = [['1', '2']]
df = pd.DataFrame(data=data, columns=cols)
print(df.shape, df, sep='\n')

(1, 2)
  col1 col2
0    1    2


In [None]:
# get the BigQuery schema (a list of SchemaField objects) that this df maps to
schema = df_to_bq_schema(df)
print(schema)

[SchemaField('col1', 'STRING', 'NULLABLE', None, ()), SchemaField('col2', 'STRING', 'NULLABLE', None, ())]


In [None]:
# write the df to the scratch table, replacing whatever is already there
df.to_gbq(destination_table='tmp.tmp', project_id=bq_project_id, if_exists='replace')

1it [00:03,  3.14s/it]


In [None]:
# now read the schema of the table we just wrote back from BigQuery, so it can
# be compared with the schema derived from the df
schema = get_schema(table_id)
schema

[SchemaField('col1', 'STRING', 'NULLABLE', None, ()),
 SchemaField('col2', 'STRING', 'NULLABLE', None, ())]

In [None]:
# add col3
df['col3'] = '3'

# drop col1
df = df.drop(columns=['col1'])

# append a new strange row: a copy of the last row with a different col2 value
df_tmp = df.tail(1).copy()
df_tmp['col2'] = 'x'
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported replacement and, like append, keeps the original index labels
df = pd.concat([df, df_tmp])

print(df.shape)
print(df)

(2, 2)
  col2 col3
0    2    3
0    x    3


In [None]:
# schema implied by the modified df, to compare against what BigQuery holds
new_schema = df_to_bq_schema(df)
new_schema

[SchemaField('col2', 'STRING', 'NULLABLE', None, ()),
 SchemaField('col3', 'STRING', 'NULLABLE', None, ())]

In [None]:
# work out what changed between the table in BigQuery and the current df
old_schema = get_schema(table_id)
new_schema = df_to_bq_schema(df)
diffs = schema_diff(old_schema=old_schema, new_schema=new_schema)
diffs

[('add', SchemaField('col3', 'STRING', 'NULLABLE', None, ())),
 ('drop', SchemaField('col1', 'STRING', 'NULLABLE', None, ()))]

In [None]:
# peek at the df before its columns are reconciled with the BigQuery table
df.head()

Unnamed: 0,col2,col3
0,2,3
0,x,3


In [None]:
# client used for the schema calls below (project/credentials come from the
# environment, since none are passed here)
bq_client = bigquery.Client()

# step 1: apply the diffs to the table schema in BigQuery
update_bq_schema(bq_client, table_id, diffs)

# step 2: reshape the df so its columns are what BigQuery now expects
df = update_df_schema(bq_client, table_id, diffs, df)

adding SchemaField('col3', 'STRING', 'NULLABLE', None, ()) to netdata-analytics-bi.tmp.tmp
adding col1 to df


In [None]:
# peek at the df again now that update_df_schema has been applied
df.head()

Unnamed: 0,col1,col2,col3
0,,2,3
0,,x,3


In [None]:
# append the reconciled df to the existing scratch table
df.to_gbq(destination_table='tmp.tmp', project_id=bq_project_id, if_exists='append')

1it [00:02,  2.92s/it]


In [None]:
# read the table back; pass the project explicitly so the query runs against
# the same project the rows were written to, rather than whichever project the
# ambient credentials default to
df = pd.read_gbq("select * from tmp.tmp", project_id=bq_project_id)
print(df.shape)
df

Downloading: 100%|██████████| 3/3 [00:00<00:00, 15.86rows/s]

(3, 3)





Unnamed: 0,col1,col2,col3
0,,2,3.0
1,,x,3.0
2,1.0,2,


In [None]:
# add a final new col (col4) and drop an existing one (col2)
df = df.assign(col4='col4').drop(columns=['col2'])

# save to bq enforcing schema consistency
# NOTE(review): the displayed df gains col2 after this call, so df_to_gbq
# appears to modify the df it is given in place - confirm against am4894bq
df_to_gbq(df, 'tmp.tmp', bq_project_id)

print(df.shape)
df

adding SchemaField('col4', 'STRING', 'NULLABLE', None, ()) to netdata-analytics-bi.tmp.tmp
adding col2 to df


1it [00:03,  3.89s/it]

(3, 4)





Unnamed: 0,col1,col3,col4,col2
0,,3.0,col4,
1,,3.0,col4,
2,1.0,,col4,


In [None]:
# final read of the table; pass the project explicitly so the query runs
# against the same project the rows were written to, rather than whichever
# project the ambient credentials default to
df = pd.read_gbq("select * from tmp.tmp", project_id=bq_project_id)
print(df.shape)
df

Downloading: 100%|██████████| 6/6 [00:00<00:00, 14.46rows/s]

(6, 4)





Unnamed: 0,col1,col2,col3,col4
0,1.0,2,,
1,,,3.0,col4
2,,,3.0,col4
3,1.0,,,col4
4,,2,3.0,
5,,x,3.0,
