In [None]:
# Load IPython's autoreload extension and switch it to mode 2 so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
import os
from dotenv import load_dotenv
import pandas as pd
from am4894bq.schema import get_schema, df_to_bq_schema

# load_dotenv() (python-dotenv) is expected to populate the process environment
# from a local .env file before the lookups below.
load_dotenv()

# The project id is read from the BQ_PROJECT_ID environment variable, and
# table_id is the fully qualified `<project>.tmp.tmp` name built from it.
# NOTE(review): os.getenv returns None when the variable is unset, which would
# make table_id the literal string 'None.tmp.tmp' — confirm the variable is set.
bq_project_id = os.getenv('BQ_PROJECT_ID')
table_id = f'{bq_project_id}.tmp.tmp'

In [None]:
# Build a small demo frame: three identical rows of integers in two columns.
cols = ['col1', 'col2']
data = [[1, 2] for _ in range(3)]
df = pd.DataFrame(data=data, columns=cols)

# Show the dimensions first, then the frame itself, one per line.
print(df.shape, df, sep='\n')

(3, 2)
   col1  col2
0     1     2
1     1     2
2     1     2


In [None]:
# Derive a BigQuery schema from the in-memory DataFrame with df_to_bq_schema
# and print it; the recorded output is a list of SchemaField entries, one per
# column.
schema = df_to_bq_schema(df)
print(schema)

[SchemaField('col1', 'INTEGER', 'NULLABLE', None, ()), SchemaField('col2', 'INTEGER', 'NULLABLE', None, ())]


In [None]:
# Save the DataFrame to BigQuery as table `tmp.tmp` in the configured project;
# if_exists='replace' asks for an existing table to be replaced.
# NOTE(review): the 'tmp.tmp' literal repeats the dataset.table suffix used to
# build table_id — keep the two in sync if either changes.
df.to_gbq('tmp.tmp', project_id=bq_project_id, if_exists='replace')

1it [00:02,  2.82s/it]


In [None]:
# Read the schema back from BigQuery for table_id. This rebinds `schema`,
# replacing the value derived from the DataFrame in the earlier cell; the bare
# expression on the last line displays it as the cell output.
schema = get_schema(table_id)
schema

[SchemaField('col1', 'INTEGER', 'NULLABLE', None, ()),
 SchemaField('col2', 'INTEGER', 'NULLABLE', None, ())]

In [None]:
# Reshape the frame so that its schema diverges from the table already written
# to BigQuery: one column is added, one is dropped, and one changes type.

# add col3 (constant integer column)
df['col3'] = 3

# drop col1
df = df.drop(columns=['col1'])

# change col2 to string
df['col2'] = df['col2'].astype('str')

# append a new strange row: copy the last row and give col2 a non-numeric value
df_tmp = df.tail(1).copy()
df_tmp['col2'] = 'x'
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported replacement. The index is intentionally not reset, so the
# copied row keeps its original label, exactly as append did.
df = pd.concat([df, df_tmp])

print(df.shape)
print(df)

(4, 2)
  col2  col3
0    2     3
1    2     3
2    2     3
2    x     3


In [None]:
# Derive the BigQuery schema for the modified DataFrame; the bare expression
# on the last line displays it as the cell output.
new_schema = df_to_bq_schema(df)
new_schema

[SchemaField('col2', 'STRING', 'NULLABLE', None, ()),
 SchemaField('col3', 'INTEGER', 'NULLABLE', None, ())]

In [None]:
def schema_diff(old_schema, new_schema):
    """Describe how ``new_schema`` differs from ``old_schema``.

    Both arguments are iterables of field objects exposing a ``name``
    attribute and supporting ``!=`` comparison. Fields are matched by name.

    Returns a list of tuples, in this order:

    * for every field of ``new_schema`` (in its own order):
      ``('add', new_field)`` when the name is absent from ``old_schema``, or
      ``('update', old_field, new_field)`` when the name exists in both but
      the fields compare unequal;
    * then, for every field of ``old_schema`` (in its own order) whose name
      is missing from ``new_schema``: ``('drop', old_field)``.

    Fields that are present in both schemas and compare equal produce no
    entry.
    """
    # Index both schemas by field name; dicts keep insertion order.
    before = {field.name: field for field in old_schema}
    after = {field.name: field for field in new_schema}

    changes = []

    # Additions and modifications, following the order of the new schema.
    for name, new_field in after.items():
        if name in before:
            old_field = before[name]
            if new_field != old_field:
                changes.append(('update', old_field, new_field))
        else:
            changes.append(('add', new_field))

    # Removals, following the order of the old schema.
    changes.extend(
        ('drop', old_field)
        for name, old_field in before.items()
        if name not in after
    )
    return changes

In [None]:
# Compare the schema read back from BigQuery (`schema`) with the schema of the
# modified DataFrame (`new_schema`) and display the resulting change list.
diffs = schema_diff(schema, new_schema)
diffs

[('update',
  SchemaField('col2', 'INTEGER', 'NULLABLE', None, ()),
  SchemaField('col2', 'STRING', 'NULLABLE', None, ())),
 ('add', SchemaField('col3', 'INTEGER', 'NULLABLE', None, ())),
 ('drop', SchemaField('col1', 'INTEGER', 'NULLABLE', None, ()))]