# BigQuery-Table Data-Audit

---



### Before you begin


1.   Use the [Cloud Resource Manager](https://console.cloud.google.com/cloud-resource-manager) to check if you have access to the Cloud Platform project that has access to the table you want to audit.
2.   Make sure you have query access to the table you want to audit (you can check this in the [Cloud Console](https://console.cloud.google.com)).
3.   [Enable the BigQuery API](https://console.cloud.google.com/flows/enableapi?apiid=bigquery) for the project.
4. Provide the variables to apply:

In [5]:
# Location of the BigQuery table to audit (project / dataset / table).
project_id = 'rituals-prod-1149'
dataset_id = 'cdp'
# Other tables in this dataset that can be audited instead:
# table_id = 'customer_email'
# table_id = 'customer_purchase'
table_id = 'customer_profile'

# Use exact counts (uses more computational resources).
exact_count = True

# Number of most frequent values to report per field
# (a bigger n uses more computational resources).
n = 3

# Fully-qualified table reference, echoed so the target can be verified.
table_ref = '.'.join((project_id, dataset_id, table_id))
print('Table reference to audit: \'{}\''.format(table_ref))

Table reference to audit: 'rituals-prod-1149.cdp.customer_profile'


### Provide your credentials to the runtime

In [2]:
# Authenticate the current user through the Colab auth helper.
# NOTE(review): presumably the BigQuery and Sheets clients created in later
# cells pick up these credentials - confirm when running outside Colab.
from google.colab import auth
auth.authenticate_user()
print('Authenticated')

Authenticated


### Retrieve BQ table schema

In [6]:
from google.cloud import bigquery
import pandas as pd

# BigQuery client bound to the audited project; the query helpers in this
# notebook all issue their queries through this module-level client.
client = bigquery.Client(project=project_id)

def get_row_count(table_ref, nesting):
  '''Return the exact number of rows in the table.

  table_ref: table reference, placed between backticks in the query.
  nesting: optional list of repeated column names; each one is added to the
    FROM clause as ", UNNEST(name) name", so the count is taken over the
    unnested rows. None or an empty list counts the table rows themselves.
  '''
  unnest_clauses = ''.join(
      ', UNNEST({}) {}'.format(name, name) for name in (nesting or []))
  sql = 'SELECT count(*) FROM `{}`'.format(table_ref) + unnest_clauses
  rows = list(client.query(sql).result())
  # The query yields a single row with a single column: the count.
  return rows[0].values()[0]

def _split_struct_members(members):
  '''Split a STRUCT member list on commas that are not inside <...> or (...).'''
  parts = []
  depth = 0
  start = 0
  for pos, char in enumerate(members):
    if char in '<(':
      depth += 1
    elif char in '>)':
      depth -= 1
    elif char == ',' and depth == 0:
      parts.append(members[start:pos].strip())
      start = pos + 1
  tail = members[start:].strip()
  if tail:
    parts.append(tail)
  return parts

def get_field_info(table_ref, nesting, column_name, data_type):
  '''Describe a table column as one or more auditable fields.

  table_ref: table reference the column belongs to.
  nesting: list of repeated (ARRAY) column names enclosing this column; the
    list passed in is not modified.
  column_name: name of the column.
  data_type: column type string, e.g. "STRING", "ARRAY<STRING>" or
    "ARRAY<STRUCT<a STRING, b INT64>>".

  Returns a dict with keys table_ref, nesting, field_name and data_type for a
  scalar column (no row_count; the caller adds it), or a list of such dicts
  including row_count, one per member, for a STRUCT column.
  '''
  if data_type.startswith('ARRAY<') and data_type.endswith('>'):
    # Record the array as a nesting level and describe its element type. The
    # column name is kept because the queries alias UNNEST(name) AS name, so
    # the same name addresses the unnested element.
    return get_field_info(table_ref, nesting + [column_name], column_name, data_type[6:-1])

  if data_type.startswith('STRUCT<') and data_type.endswith('>'):
    count = get_row_count(table_ref, nesting)
    fields = []
    # Split only on top-level commas so that members with nested or
    # parameterized types (e.g. "x STRUCT<a INT64, b INT64>") stay intact.
    for member in _split_struct_members(data_type[7:-1]):
      member_name, _, member_type = member.partition(' ')
      fields.append({
          'table_ref': table_ref,
          'nesting': nesting,
          # Prefix with the column itself; this also works for a STRUCT that
          # is not wrapped in an ARRAY, where nesting is empty.
          'field_name': column_name + '.' + member_name,
          'data_type': member_type,
          'row_count': count,
      })
    return fields

  # Scalar types, and any other type not broken down above, are reported
  # as-is instead of falling through and returning None.
  return {'table_ref': table_ref, 'nesting': nesting, 'field_name': column_name, 'data_type': data_type}

def get_table_schema(project_id, dataset_id, table_id):
  '''Return the list of auditable field descriptions for a BigQuery table.

  Reads the table's columns from the dataset's INFORMATION_SCHEMA.COLUMNS
  view and expands each one with get_field_info. Scalar columns receive the
  table's total row count; columns that expand into a list of fields are
  added as returned.
  '''
  table_ref = '{}.{}.{}'.format(project_id, dataset_id, table_id)
  print('retrieving table schema for BQ table: \'{}\''.format(table_ref))

  columns_sql = '''
    SELECT * EXCEPT(is_generated, generation_expression, is_stored, is_updatable)
    FROM `{}`.{}.INFORMATION_SCHEMA.COLUMNS
    WHERE table_name="{}"'''.format(project_id, dataset_id, table_id)
  columns_df = client.query(columns_sql).to_dataframe()

  # Count the table rows once; every scalar column shares this number.
  total_rows = get_row_count(table_ref, None)

  schema = []
  for column_name, data_type in zip(columns_df['column_name'], columns_df['data_type']):
    info = get_field_info(table_ref, [], column_name, data_type)
    if isinstance(info, list):
      schema.extend(info)
      continue
    info['row_count'] = total_rows
    schema.append(info)
  print('\n')
  return schema

# Build the list of field descriptions for the configured table.
table_schema = get_table_schema(project_id, dataset_id, table_id)

# Wrap the schema in a DataFrame; as the last expression of the cell it is
# displayed as a table by the notebook.
df = pd.DataFrame(table_schema)
df

retrieving table schema for BQ table: 'rituals-prod-1149.cdp.customer_profile'




Unnamed: 0,table_ref,nesting,field_name,data_type,row_count
0,rituals-prod-1149.cdp.customer_profile,[],contactId,STRING,9682051
1,rituals-prod-1149.cdp.customer_profile,[],gender,STRING,9682051
2,rituals-prod-1149.cdp.customer_profile,[],dateOfBirth,DATE,9682051
3,rituals-prod-1149.cdp.customer_profile,[],creationDate,TIMESTAMP,9682051
4,rituals-prod-1149.cdp.customer_profile,[],preferredLocale,STRING,9682051
5,rituals-prod-1149.cdp.customer_profile,[],employee,BOOL,9682051
6,rituals-prod-1149.cdp.customer_profile,[],city,STRING,9682051
7,rituals-prod-1149.cdp.customer_profile,[],countryCode,STRING,9682051
8,rituals-prod-1149.cdp.customer_profile,[],emailAddress,STRING,9682051
9,rituals-prod-1149.cdp.customer_profile,[],emailHashCode,STRING,9682051


### Perform data audit on table schema fields

In [7]:
def get_unnest_statement(field):
  '''Build the FROM-clause suffix that unnests every record the field is nested in.

  Returns one ", UNNEST(name) AS name" clause per entry of the field's
  nesting list, concatenated, or an empty string when there is no nesting.
  '''
  records = field['nesting']
  if not records:
    return ''
  clauses = [', UNNEST({}) AS {}'.format(record_name, record_name) for record_name in records]
  return ''.join(clauses)

def get_cardinality(field):
  '''Return the number of distinct values of the field.

  Uses the approximate count only when the module-level exact_count flag is
  off and the field type is one of INT64, NUMERIC, STRING or BYTES; in every
  other case the exact count is used.
  '''
  if exact_count:
    return get_exact_cardinality(field)
  if field['data_type'] in ('INT64', 'NUMERIC', 'STRING', 'BYTES'):
    return get_approx_cardinality(field)
  return get_exact_cardinality(field)

def get_approx_cardinality(field):
  '''Return an approximate distinct-value count for the field.

  Runs an HLL_COUNT.INIT / HLL_COUNT.MERGE query over the field's table,
  unnesting any enclosing records, and returns the single resulting value.
  '''
  print('using approx_cardinality')
  sql = 'SELECT HLL_COUNT.MERGE(hll_count) approx FROM ( SELECT HLL_COUNT.INIT({}) hll_count FROM `{}`{})'.format(
      field['field_name'], field['table_ref'], get_unnest_statement(field))
  first_row = list(client.query(sql).result())[0]
  return first_row.values()[0]

def get_exact_cardinality(field):
  '''Return the exact distinct-value count for the field.

  Runs a COUNT(DISTINCT ...) query over the field's table, unnesting any
  enclosing records, and returns the single resulting value.
  '''
  sql = 'SELECT COUNT(DISTINCT({})) exact FROM `{}`{}'.format(
      field['field_name'], field['table_ref'], get_unnest_statement(field))
  first_row = list(client.query(sql).result())[0]
  return first_row.values()[0]

def count_missing_values(field):
  '''Count the rows in which the field has no usable value.

  A value counts as missing when it is NULL or, for STRING fields, when it is
  empty after trimming. Returns 0 when no row matches.
  '''
  field_name = field['field_name']
  # Compare the base type by equality. The previous `in ('STRING')` was a
  # substring test against a bare string, not tuple membership. Stripping
  # "(...)" also lets a parameterized type such as STRING(10) match.
  is_string = field['data_type'].split('(', 1)[0] == 'STRING'

  where_clause = '{} IS NULL'.format(field_name)
  if is_string:
    where_clause += ' OR TRIM({}) = ""'.format(field_name)

  query = 'SELECT SUM(1) num_missing FROM `{}`{} WHERE {}'.format(field['table_ref'], get_unnest_statement(field), where_clause)
  query_job = client.query(query)
  result = list(query_job.result())[0].values()[0]
  # SUM over zero matching rows yields NULL (None); report that as 0.
  return result or 0

def get_statistics(field):
  '''Return the (min, max, avg, std) values of the field as one result row.

  MIN and MAX are always computed. AVG is computed for INT64, NUMERIC and
  FLOAT64 fields and STDDEV for FLOAT64 fields; for other types the query
  returns NaN in that position.
  '''
  field_name = field['field_name']
  # Match on the base type so parameterized types such as NUMERIC(10, 2) are
  # recognised, and compare by equality/tuple membership: the previous
  # `in ('FLOAT64')` was a substring test against a bare string.
  base_type = field['data_type'].split('(', 1)[0]
  nan_expr = 'CAST(\'NaN\' AS FLOAT64)'
  avg_expr = 'AVG({})'.format(field_name) if base_type in ('INT64', 'NUMERIC', 'FLOAT64') else nan_expr
  # NOTE(review): STDDEV stays limited to FLOAT64 as before - confirm whether
  # it should cover the same numeric types as AVG.
  std_expr = 'STDDEV({})'.format(field_name) if base_type == 'FLOAT64' else nan_expr
  query = 'SELECT MIN({}) min, MAX({}) max, {} avg, {} std FROM `{}`{}'.format(field_name, field_name, avg_expr, std_expr, field['table_ref'], get_unnest_statement(field))
  query_job = client.query(query)
  return list(query_job.result())[0].values()

def get_top_n(field, n):
  '''Return the n most frequent values of the field as one display string.

  NULL values, and for STRING fields values that are empty after trimming,
  are ignored. Each entry is rendered as 'value' [count] and the entries are
  joined with ", " in descending order of count.
  '''
  field_name = field['field_name']
  # Compare the base type by equality. The previous `in ('STRING')` was a
  # substring test against a bare string, not tuple membership. Stripping
  # "(...)" also lets a parameterized type such as STRING(10) match.
  is_string = field['data_type'].split('(', 1)[0] == 'STRING'

  where_clause = '{} IS NOT NULL'.format(field_name)
  if is_string:
    where_clause += ' AND TRIM({}) != ""'.format(field_name)

  query = '''
    SELECT ARRAY_AGG(CONCAT("'", CAST(val AS STRING), "' [", CAST(cnt AS STRING), "]") ORDER BY cnt DESC LIMIT {}) AS top_{}
    FROM (SELECT {} AS val, count(*) AS cnt FROM `{}`{}
    WHERE {} GROUP BY 1)'''.format(n, n, field_name, field['table_ref'], get_unnest_statement(field), where_clause)
  query_job = client.query(query)
  top_values = list(query_job.result())[0].values()[0]
  # Guard against a missing aggregate (no qualifying rows) so join() cannot
  # fail on None.
  return ', '.join(top_values or [])


# Audit every field of the schema in place: each field dict gains its
# cardinality, missing-value count, min/max/avg/stddev and top-n values.
print('performing data audit on BQ table: \'{}\''.format(table_ref))
total_fields = len(table_schema)
for position, field in enumerate(table_schema, start=1):
  print('- auditing field: \'{}\' [{}/{}]'.format(field['field_name'], position, total_fields))
  field['cardinality'] = get_cardinality(field)
  field['num_missing_values'] = count_missing_values(field)
  # get_statistics returns the values in the order min, max, avg, std.
  field['min'], field['max'], field['avg'], field['stddev'] = get_statistics(field)
  field['top-{}'.format(n)] = get_top_n(field, n)

print('\ndone\n')
# Display the audit results without the internal 'nesting' bookkeeping column.
df = pd.DataFrame(table_schema).drop(columns=['nesting'])
df

performing data audit on BQ table: 'rituals-prod-1149.cdp.customer_profile'
- auditing field: 'contactId' [1/23]
- auditing field: 'gender' [2/23]
- auditing field: 'dateOfBirth' [3/23]
- auditing field: 'creationDate' [4/23]
- auditing field: 'preferredLocale' [5/23]
- auditing field: 'employee' [6/23]
- auditing field: 'city' [7/23]
- auditing field: 'countryCode' [8/23]
- auditing field: 'emailAddress' [9/23]
- auditing field: 'emailHashCode' [10/23]
- auditing field: 'loyalty.loyaltyId' [11/23]
- auditing field: 'loyalty.issueDate' [12/23]
- auditing field: 'loyalty.welcomeGiftRedeemed' [13/23]
- auditing field: 'loyalty.tier' [14/23]
- auditing field: 'loyalty.subscribeSource' [15/23]
- auditing field: 'optinNewsletterGeneral' [16/23]
- auditing field: 'optinNewsletterLoyalty' [17/23]
- auditing field: 'optinSMS' [18/23]
- auditing field: 'subscribeSourceDeep' [19/23]
- auditing field: 'subscrSource' [20/23]
- auditing field: 'optinDate' [21/23]
- auditing field: 'optoutDate' [22/

Unnamed: 0,table_ref,field_name,data_type,row_count,cardinality,num_missing_values,min,max,avg,stddev,top-3
0,rituals-prod-1149.cdp.customer_profile,contactId,STRING,9682051,9682051,0,1,99990496,,,"'53230270' [1], '48520146' [1], '52120449' [1]"
1,rituals-prod-1149.cdp.customer_profile,gender,STRING,9682051,141,3881673,,vrouwelijk,,,"'F' [3617786], 'U' [1707127], 'M' [475034]"
2,rituals-prod-1149.cdp.customer_profile,dateOfBirth,DATE,9682051,36351,5396909,0001-01-02,9999-12-20,,,"'1900-01-01' [84081], '1900-06-16' [41527], '1..."
3,rituals-prod-1149.cdp.customer_profile,creationDate,TIMESTAMP,9682051,8920376,12517,0018-02-11 23:00:00+00:00,2019-10-31 23:14:37.050000+00:00,,,"'0018-02-11 23:00:00+00' [1886], '2013-10-22 1..."
4,rituals-prod-1149.cdp.customer_profile,preferredLocale,STRING,9682051,654,106107,,zh_US,,,"'nl_NL' [1726846], 'de_DE' [1508398], 'es_ES' ..."
5,rituals-prod-1149.cdp.customer_profile,employee,BOOL,9682051,2,0,False,True,,,"'false' [9682028], 'true' [23]"
6,rituals-prod-1149.cdp.customer_profile,city,STRING,9682051,79289,8704492,,�ghez�e,,,"'Bruxelles' [16435], 'Antwerpen' [10443], 'Ams..."
7,rituals-prod-1149.cdp.customer_profile,countryCode,STRING,9682051,173,107072,150,ZW,,,"'NL' [1832019], 'DE' [1513656], 'BE' [1181573]"
8,rituals-prod-1149.cdp.customer_profile,emailAddress,STRING,9682051,9670885,5161,\tbunnybraids@hotmail.com,����������������������������������������������...,,,"'null@rituals.com' [12], '4c6bda2ea1e92a3b9d75..."
9,rituals-prod-1149.cdp.customer_profile,emailHashCode,STRING,9682051,9633716,41467,00000060de5495e3030e30bd3a068b275ed84fec97e06d...,fffffe23dfb334487e0bbcbfc1ccbaa1cf56b23ead5c7c...,,,'254b199f43bb82c6c2aeabae6327b717dafe189df7dbb...


### Prepare final data audit DataFrame
Here you can add more derived metrics, such as percentages.

In [0]:
# Final audit table: one row per audited field, without the internal
# 'nesting' column.
data_audit_df = pd.DataFrame(table_schema).drop(columns=['nesting'])

# Derived metric: share of rows with a missing value, as a percentage.
missing_counts = data_audit_df['num_missing_values']
row_counts = data_audit_df['row_count']
data_audit_df['perc_missing'] = 100 * missing_counts / row_counts

### Export to Google Spreadsheet

In [9]:
from oauth2client.client import GoogleCredentials
import gspread
import gspread_dataframe as gd
import datetime

# Authorize gspread with the application-default credentials of the runtime.
gc = gspread.authorize(GoogleCredentials.get_application_default())

# Name the spreadsheet after the audited table and today's date (YYYYMMDD).
now = datetime.datetime.now()
date = now.strftime("%Y%m%d")
spreadhseet_name = 'Data_Audit_{}_{}'.format(table_id, date)

# create new spreadsheet
spreadhseet = gc.create(spreadhseet_name)

# Write to sheet1 of the spreadsheet that was just created. Use the handle
# returned by create() rather than re-opening by title: titles are not unique,
# so a second run on the same day could resolve to an older spreadsheet.
worksheet = spreadhseet.sheet1
gd.set_with_dataframe(worksheet, data_audit_df)

print('Created \'{}\' in your Google Drive home folder.\n\n{}'.format(spreadhseet_name, spreadhseet.fetch_sheet_metadata()['spreadsheetUrl']))

Created 'Data_Audit_customer_profile_20191101' in your Google Drive home folder.

https://docs.google.com/a/crystalloids.com/spreadsheets/d/1s4UMd-cYhIYVV1Bv8_JhSGx1EP0-SPMpP3FN9dNX_NU/edit
