In [14]:
from google.cloud import bigquery
from dotenv import load_dotenv
import os
import pandera as pa

from sql_guard.validator.SQLValidator import SQLValidator
from sql_guard.translators import SchemaParsers

import pprint
import json

import pandas as pd

# Load variables from a local .env file (if one exists) into the process
# environment; GOOGLE_APPLICATION_CREDENTIALS is read via os.getenv further down.
load_dotenv()

True

# Getting Data From BigQuery

It's a table that contains information about students, their courses and their grades. Imagine those students are from a Computer Science major or from an Electrical Engineering major.  
The idea is that each row holds the grade of one student in one course.  
```
name: Name of the student  
age: Age of the student  
major: Major of the student (for example, Computer Science, Electrical Engineering etc.)  
semester: Semester of university, for example 1S/2024 indicating the first semester of 2024  
course: Course, for example, Algorithms, Data Structures, Calculus I, Calculus II etc.  
grade: Grade for the course. for example: 0, 5, 10  
failed: Boolean to indicate if the student failed the course based on grade (>=5)  
```

In [2]:
# BigQuery table identifier, already wrapped in backticks because it is
# interpolated verbatim into the SQL text of the queries below.
TABLE_PATH = "`central-rampart-451901-k9.test.student`"

In [3]:
# Authenticate with the service-account key file whose path is stored in the
# GOOGLE_APPLICATION_CREDENTIALS environment variable.
client = bigquery.Client.from_service_account_json(
    json_credentials_path=os.getenv("GOOGLE_APPLICATION_CREDENTIALS"),
)

# Select every row and column of the students table.
QUERY = f'''

    SELECT * FROM {TABLE_PATH}

'''

# Submit the query (API request), then block until it has finished.
query_job = client.query(QUERY)
query_result = query_job.result()

# Materialise the result locally as a pandas DataFrame and display it.
df = query_result.to_dataframe()

df



Unnamed: 0,name,age,major,semester,course,grade,failed
0,Daniel Carter,19,Computer Science,1S/2024,Algorithms,8.5,False
1,Theo Hill,19,Computer Science,1S/2024,Algorithms,9.0,False
2,Jessica Hall,19,Computer Science,1S/2024,Algorithms,8.0,False
3,Liam Carter,19,Computer Science,1S/2024,Algorithms,8.5,False
4,Zackary Hill,19,Computer Science,1S/2024,Algorithms,7.0,False
...,...,...,...,...,...,...,...
98,Nora Turner,21,Computer Science,1S/2024,Data Structures,6.0,True
99,Riley Turner,21,Computer Science,1S/2024,Data Structures,5.0,True
100,Leo Adams,21,Computer Science,1S/2024,Data Structures,5.5,True
101,Fiona Lopez,21,Computer Science,1S/2024,Data Structures,5.0,True


# Validating

## Setting up pa.DataFrameSchema

In [4]:
# Expected shape of the students table: one pandera Column per field, each with
# its Python type and, where relevant, a value check.
pandera_schema = pa.DataFrameSchema(
    {
        # Must start with a capital letter
        "name": pa.Column(str, checks=pa.Check.str_matches(r"^[A-Z].*")),
        # Students must be between 15 and 150 years old
        "age": pa.Column(int, checks=pa.Check.in_range(min_value=15, max_value=150)),
        # Major can only be Computer Science or Electrical Engineering
        "major": pa.Column(str, checks=pa.Check.isin(["Computer Science", "Electrical Engineering"])),
        # Only grades from the 1st semester of 2024
        "semester": pa.Column(str, checks=pa.Check.equal_to("1S/2024")),
        # Only the five courses listed here are accepted
        "course": pa.Column(
            str,
            checks=pa.Check.isin(["Algorithms", "Data Structures", "Circuit Analysis", "Calculus I", "Calculus II"]),
        ),
        # Grade can only be between 0 and 10
        "grade": pa.Column(float, checks=pa.Check.between(min_value=0, max_value=10)),
        # Boolean value, no extra check
        "failed": pa.Column(bool),
    }
)

print(pandera_schema)

<Schema DataFrameSchema(
    columns={
        'name': <Schema Column(name=name, type=DataType(str))>
        'age': <Schema Column(name=age, type=DataType(int64))>
        'major': <Schema Column(name=major, type=DataType(str))>
        'semester': <Schema Column(name=semester, type=DataType(str))>
        'course': <Schema Column(name=course, type=DataType(str))>
        'grade': <Schema Column(name=grade, type=DataType(float64))>
        'failed': <Schema Column(name=failed, type=DataType(bool))>
    },
    checks=[],
    parsers=[],
    coerce=False,
    dtype=None,
    index=None,
    strict=False,
    name=None,
    ordered=False,
    unique_column_names=False,
    metadata=None, 
    add_missing_columns=False
)>


## Validating with Pandera

In [5]:
# Let pandas pick the best-fitting dtypes for every column, then cast `failed`
# to plain bool, which is what the schema's pa.Column(bool) declares.
df_d = df.convert_dtypes().astype({'failed': bool})

# Run the pandera checks; the cell displays whatever validate() returns.
pandera_schema.validate(df_d)

Unnamed: 0,name,age,major,semester,course,grade,failed
0,Daniel Carter,19,Computer Science,1S/2024,Algorithms,8.5,False
1,Theo Hill,19,Computer Science,1S/2024,Algorithms,9.0,False
2,Jessica Hall,19,Computer Science,1S/2024,Algorithms,8.0,False
3,Liam Carter,19,Computer Science,1S/2024,Algorithms,8.5,False
4,Zackary Hill,19,Computer Science,1S/2024,Algorithms,7.0,False
...,...,...,...,...,...,...,...
98,Nora Turner,21,Computer Science,1S/2024,Data Structures,6.0,True
99,Riley Turner,21,Computer Science,1S/2024,Data Structures,5.0,True
100,Leo Adams,21,Computer Science,1S/2024,Data Structures,5.5,True
101,Fiona Lopez,21,Computer Science,1S/2024,Data Structures,5.0,True


So, our validation passed and we didn't get any wrong values.  
As a result, pandera returned our original dataframe for us.

Now, imagine our dataframe is too big to fit in local memory when we query BigQuery using the Python Client.  
In this case, it would be smart to use BigQuery's own capability of running SQL as it's built for up to petabytes of processing.

## Validating with SQL Guard

In [6]:
# Get SQL Guard's parser for pandera schemas and turn the schema into SQL
# Guard's rule format: a dict of column name -> list of ValidationCheck entries.
panderaParser = SchemaParsers.SchemaParser.get_parser("pandera")
data_rules = panderaParser.parse(pandera_schema)
pprint.pp(data_rules)

{'name': [ValidationCheck(check_name='is_string',
                          params=None,
                          error_msg=None,
                          ignore_nulls=False),
          ValidationCheck(check_name='regex_contains',
                          params={'value': '^[A-Z].*'},
                          error_msg=None,
                          ignore_nulls=False)],
 'age': [ValidationCheck(check_name='is_integer',
                         params=None,
                         error_msg=None,
                         ignore_nulls=False),
         ValidationCheck(check_name='between',
                         params={'min': 15, 'max': 150},
                         error_msg=None,
                         ignore_nulls=False)],
 'major': [ValidationCheck(check_name='is_string',
                           params=None,
                           error_msg=None,
                           ignore_nulls=False),
           ValidationCheck(check_name='is_in',
                         

In [7]:
# Build a validator from the parsed rules and render them as one SQL statement
# over TABLE_PATH (a SELECT whose WHERE clause ANDs every check together).
sql_schema = SQLValidator(data_rules)
validation_query = sql_schema.generate_sql(from_source=TABLE_PATH)

print(validation_query)


SELECT * FROM `central-rampart-451901-k9.test.student`
            WHERE ((SAFE_CAST(name AS STRING) IS NOT NULL) AND (REGEXP_CONTAINS(name, r'^[A-Z].*'))) AND ((SAFE_CAST(age AS INT) IS NOT NULL) AND (age BETWEEN 15 AND 150)) AND ((SAFE_CAST(major AS STRING) IS NOT NULL) AND (major IN ('Computer Science', 'Electrical Engineering'))) AND ((SAFE_CAST(semester AS STRING) IS NOT NULL) AND (semester = '1S/2024')) AND ((SAFE_CAST(course AS STRING) IS NOT NULL) AND (course IN ('Algorithms', 'Data Structures', 'Circuit Analysis', 'Calculus I', 'Calculus II'))) AND ((SAFE_CAST(grade AS FLOAT64) IS NOT NULL) AND (grade BETWEEN 0 AND 10)) AND ((SAFE_CAST(failed AS BOOL) IS NOT NULL))


In [9]:
# Run the generated validation SQL inside BigQuery instead of validating locally.
query_job = client.query(validation_query)  # API request
query_result = query_job.result()  # Waits for query to finish

# NOTE(review): this rebinds `df`, replacing the unfiltered frame loaded earlier.
df = query_result.to_dataframe()

df



Unnamed: 0,name,age,major,semester,course,grade,failed
0,Daniel Carter,19,Computer Science,1S/2024,Algorithms,8.5,False
1,Theo Hill,19,Computer Science,1S/2024,Algorithms,9.0,False
2,Jessica Hall,19,Computer Science,1S/2024,Algorithms,8.0,False
3,Liam Carter,19,Computer Science,1S/2024,Algorithms,8.5,False
4,Zackary Hill,19,Computer Science,1S/2024,Algorithms,7.0,False
...,...,...,...,...,...,...,...
98,Nora Turner,21,Computer Science,1S/2024,Data Structures,6.0,True
99,Riley Turner,21,Computer Science,1S/2024,Data Structures,5.0,True
100,Leo Adams,21,Computer Science,1S/2024,Data Structures,5.5,True
101,Fiona Lopez,21,Computer Science,1S/2024,Data Structures,5.0,True


And so, we got the same result as the Pandera DataFrameSchema making sure all validations passed.

## Pandera x SQL Guard

In [21]:
# Variables that we are going to use

# Table to validate; backticks are included because the value is interpolated
# verbatim into SQL text.
TABLE_PATH = "`central-rampart-451901-k9.test.student`"

# Permissive schema: every row currently in the table is expected to satisfy it.
pandera_schema = pa.DataFrameSchema(
    {
        # Must start with a capital letter
        "name": pa.Column(str, checks=pa.Check.str_matches(r"^[A-Z].*")),
        # Students must be between 15 and 150 years old
        "age": pa.Column(int, checks=pa.Check.in_range(min_value=15, max_value=150)),
        # Major can only be Computer Science or Electrical Engineering
        "major": pa.Column(str, checks=pa.Check.isin(["Computer Science", "Electrical Engineering"])),
        # Only grades from the 1st semester of 2024
        "semester": pa.Column(str, checks=pa.Check.equal_to("1S/2024")),
        # Only the five courses listed here are accepted
        "course": pa.Column(
            str,
            checks=pa.Check.isin(["Algorithms", "Data Structures", "Circuit Analysis", "Calculus I", "Calculus II"]),
        ),
        # Grade can only be between 0 and 10
        "grade": pa.Column(float, checks=pa.Check.between(min_value=0, max_value=10)),
        # Boolean value, no extra check
        "failed": pa.Column(bool),
    }
)

In [30]:
# Functions that we are going to use for comparison

def run_pandera(schema: pa.DataFrameSchema) -> None:
    '''Download the whole table from BigQuery and validate it locally with pandera.

    Args:
        schema: The pandera schema the downloaded rows are validated against.

    Returns:
        None. Prints a success line, or the collected pandera failures as JSON.
    '''

    # Select every row of the table named by the module-level TABLE_PATH.
    query = f'''

        SELECT * FROM {TABLE_PATH}

    '''

    query_job = client.query(query)  # API request
    query_result = query_job.result()  # Waits for query to finish

    df = query_result.to_dataframe()

    # Convert pandas Dtypes to python types if possible
    df_d = df.convert_dtypes()
    df_d = df_d.astype({'failed': bool})

    # Validate schema with pandera
    print("--------RUN_PANDERA--------")
    try:
        # Validate against the schema that was passed in (previously the global
        # `pandera_schema` was used and the argument was silently ignored).
        # lazy=True makes pandera raise SchemaErrors with all failures collected.
        schema.validate(df_d, lazy=True)
        print("All validations passed!")
    except pa.errors.SchemaErrors as exc:
        print(json.dumps(exc.message, indent=2))
    print()



def run_sql_guard(schema: pa.DataFrameSchema) -> None:
    '''Translate a pandera schema to SQL with SQL Guard and run the report in BigQuery.

    Args:
        schema: The pandera schema whose checks are converted into a SQL report query.

    Returns:
        None. Prints the report returned by BigQuery as plain text.
    '''
    panderaParser = SchemaParsers.SchemaParser.get_parser("pandera")
    # Parse the schema that was passed in (previously the global
    # `pandera_schema` was used and the argument was silently ignored).
    data_rules = panderaParser.parse(schema)
    sql_schema = SQLValidator(data_rules)
    validation_query = sql_schema.generate_sql_report(from_source=TABLE_PATH)

    query_job = client.query(validation_query)  # API request
    query_result = query_job.result()  # Waits for query to finish

    df = query_result.to_dataframe()

    print("--------RUN_SQL_GUARD--------")
    # to_string() prints the full frame regardless of pandas display truncation
    print(df.to_string())
    print()

In [23]:
# Lift pandas' row/column display limits for the rest of the notebook.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Run the same schema through both engines so their outputs can be compared.
run_pandera(pandera_schema)
run_sql_guard(pandera_schema)



--------RUN_PANDERA--------
All validations passed!





--------RUN_SQL_GUARD--------
Empty DataFrame
Columns: [column_name, check_name, params, error_msg, ignore_nulls, wrong_value]
Index: []



In [31]:
# Stricter schema than before (narrower age range, one major, fewer courses) so
# that both engines have failures to report.
pandera_schema = pa.DataFrameSchema(
    {
        # Must start with a capital letter
        "name": pa.Column(str, checks=pa.Check.str_matches(r"^[A-Z].*")),
        # Students must be between 15 and 21 years old
        "age": pa.Column(int, checks=pa.Check.in_range(min_value=15, max_value=21)),
        # Major can only be Computer Science
        "major": pa.Column(str, checks=pa.Check.isin(["Computer Science"])),
        # Only grades from the 1st semester of 2024
        "semester": pa.Column(str, checks=pa.Check.equal_to("1S/2024")),
        # Only Algorithms, Data Structures and Calculus I are accepted
        "course": pa.Column(str, checks=pa.Check.isin(["Algorithms", "Data Structures", "Calculus I"])),
        # Grade can only be between 0 and 10
        "grade": pa.Column(float, checks=pa.Check.between(min_value=0, max_value=10)),
        # Boolean value, no extra check
        "failed": pa.Column(bool),
    }
)

run_pandera(pandera_schema)
run_sql_guard(pandera_schema)



--------RUN_PANDERA--------
{
  "DATA": {
    "DATAFRAME_CHECK": [
      {
        "schema": null,
        "column": "age",
        "check": "in_range(15, 21)",
        "error": "Column 'age' failed element-wise validator number 0: in_range(15, 21) failure cases: 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22"
      },
      {
        "schema": null,
        "column": "major",
        "check": "isin(['Computer Science'])",
        "error": "Column 'major' failed element-wise validator number 0: isin(['Computer Science']) failure cases: Electrical Engineering, Electrical Engineering, Electrical Engineering, Electrical Engineering, Electrical Engineering, Electrical Engineering, Electrical Engineering, Electrical Engineering, Electrical Engineering, Electrical Engineering, Electrical Engineering, Electrical Engineering, Electrical Engineering, Electrical Engineering, Electrical Engineering, Electrical Engineering, Electrical Enginee



--------RUN_SQL_GUARD--------
  column_name check_name                                                      params  error_msg  ignore_nulls             wrong_value
0      course      is_in  {'value': ['Algorithms', 'Data Structures', 'Calculus I']}       <NA>         False             Calculus II
1      course      is_in  {'value': ['Algorithms', 'Data Structures', 'Calculus I']}       <NA>         False        Circuit Analysis
2       major      is_in                             {'value': ['Computer Science']}       <NA>         False  Electrical Engineering
3         age    between                                      {'min': 15, 'max': 21}       <NA>         False                      22

