# 4.2 Data Quality Checks

Data quality checks includes

1. Data schema of every dimensional table matches data model
2. No empty table after running ETL data pipeline


In [1]:
import os
import configparser
from pathlib import Path
from pyspark.sql import SparkSession
from helper import thousands_separator
import json
import schema
import importlib
importlib.reload(schema)

<module 'schema' from '/home/workspace/schema.py'>

In [2]:
config = configparser.ConfigParser()
config.read("configuration.cfg", encoding="utf-8-sig")

os.environ["AWS_ACCESS_KEY_ID"] = config["AWS"]["AWS_ACCESS_KEY_ID"]
os.environ["AWS_SECRET_ACCESS_KEY"] = config["AWS"]["AWS_SECRET_ACCESS_KEY"]

SOURCE = config["S3"]["SOURCE_S3_BUCKET"]
DESTINATION = config["S3"]["DEST_S3_BUCKET"]


In [3]:
# spark = (
#     SparkSession.builder.config(
#         "spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11"
#     )
#     .enableHiveSupport()
#     .getOrCreate()
# )

spark = (
    SparkSession.builder.config(
        "spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0"
    )
    .enableHiveSupport()
    .getOrCreate()
)


### 1. Data schema of every dimensional table matches data model


Please refer to [data_dictionary.ipynb](data_dictionary.ipynb).


For local usage use


In [4]:
bucket = Path("output_data")


For AWS S3


In [5]:
# bucket = Path(SOURCE)


In [6]:
tables = []

In [7]:
for file_directory in bucket.iterdir():
    if file_directory.is_dir():
        path = str(file_directory)
        df = spark.read.parquet(path)
        print(f"Path:{path}")
        table = path.split("/")[-1]
        tables.append(table)
        print("Table: " + table)
        schema = df.printSchema()


Path:output_data/city_code
Table: city_code
root
 |-- code: string (nullable = true)
 |-- city: string (nullable = true)

Path:output_data/dim_immigration_airline
Table: dim_immigration_airline
root
 |-- cic_id: long (nullable = true)
 |-- airline: string (nullable = true)
 |-- admin_num: long (nullable = true)
 |-- flight_number: string (nullable = true)
 |-- visa_type: string (nullable = true)
 |-- immi_airline_id: integer (nullable = true)

Path:output_data/dim_demog_statistics
Table: dim_demog_statistics
root
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- median_age: integer (nullable = true)
 |-- avg_household_size: float (nullable = true)
 |-- demog_stat_id: integer (nullable = true)

Path:output_data/fact_immigration
Table: fact_immigration
root
 |-- cic_id: long (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- city_code: string (nullable = true)
 |-- arrive_date: date (nullable = true)
 |-- departur

In [8]:
tables

['city_code',
 'dim_immigration_airline',
 'dim_demog_statistics',
 'fact_immigration',
 'dim_demog_population',
 'country_code',
 'state_code',
 'dim_immigration_personal',
 'dim_temperature']

In [9]:
city_code = spark.read.parquet(f"{bucket}/city_code")
dim_immigration_airline = spark.read.parquet(f"{bucket}/dim_immigration_airline")
dim_demog_statistics = spark.read.parquet(f"{bucket}/dim_demog_statistics")
dim_demog_population = spark.read.parquet(f"{bucket}/dim_demog_population")
country_code = spark.read.parquet(f"{bucket}/country_code")
state_code = spark.read.parquet(f"{bucket}/state_code")
dim_immigration_personal = spark.read.parquet(f"{bucket}/dim_immigration_personal")
dim_temperature = spark.read.parquet(f"{bucket}/dim_temperature")
fact_immigration = spark.read.parquet(f"{bucket}/fact_immigration")

AttributeError: 'NoneType' object has no attribute 'city_code'

In [10]:
check_schema = [
    city_code == schema.city_code,
    dim_immigration_airline == schema.dim_immigration_airline,
    dim_demog_statistics == schema.dim_demog_statistics,
    dim_demog_population == schema.dim_demog_population,
    country_code == schema.country_code,
    state_code == schema.state_code,
    dim_immigration_personal == schema.dim_immigration_personal,
    dim_temperature == schema.dim_temperature,
    fact_immigration == schema.fact_immigration
]

AttributeError: 'NoneType' object has no attribute 'city_code'

In [None]:
for table, value in zip(check_schema,tables):
    assert !value:
        print("There is miss match!")
        print(f"SOURCE: table")
        
# if any(check_schema):
#     print("There is miss match!")
# else:
#     print("Data schema of every table matches data model")

### 2. No empty table after running ETL data pipeline


In [None]:
for file_directory in bucket.iterdir():
    if file_directory.is_dir():
        path = str(file_directory)
        df = spark.read.parquet(path)
        record_num = df.count()
        if record_num <= 0:
            raise ValueError("This table is empty!")
        else:
            print(
                "Table: "
                + path.split("/")[-1]
                + f" is not empty: total {thousands_separator(record_num)} records."
            )
