In [5]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
import pydeequ
from pydeequ import Check, CheckLevel, AnalysisRunner
from pydeequ.analyzers import *
from pydeequ.suggestions import *
from pydeequ.repository import FileSystemMetricsRepository, ResultKey
from pydeequ.verification import VerificationSuite, VerificationResult
import time
import pytest
import import_ipynb
from test_methods import *


# Set Up
# Create the Spark session used by every check below and print the Spark
# version the session reports.
spark = create_session()
spark_version = spark.version
print(spark_version)


# Analysis tools
# generate_suggestions(parquet_files)
# analyze_file(parquet_files)
# check_constraints(parquet_files)


# Tests
# Every NVD interim parquet dataset is verified the same way: build a fresh
# deequ Check, read the dataset, run the VerificationSuite, print the
# per-constraint statuses and hand the results to check_results(). The
# datasets are listed once below instead of repeating the stanza per test case.
NVD_DATA_ROOT = 's3://tahoeqa-interim-data/nvd/'
MIN_ROW_COUNT = 100000
# Runtime label attached to every check; spelling intentionally left unchanged.
CHECK_DESCRIPTION = "NVD Intrim Parquet Data Check"
SEPARATOR = "*********************************************************"

# (dataset directory under NVD_DATA_ROOT, columns that must be complete and
# non-negative). List order defines the test-case number (1-based).
# Only the top-level "nvd" dataset has column-level constraints; uniqueness
# checks on its date columns were disabled in the original script.
NVD_DATASETS = [
    ('nvd', ("publishedDate", "lastModifiedDate")),
    ('vendor', ()),
    ('nvd_configurations.nodes.val.cpe_match.val.cpe_name', ()),
    ('nvd_configurations.nodes.val.cpe_match', ()),
    ('nvd_cve.description.description_data', ()),
    ('nvd_cve.problemtype.problemtype_data.val.description', ()),
    ('nvd_cve.problemtype.problemtype_data', ()),
    ('nvd_cve.references.reference_data.val.tags', ()),
    ('nvd_cve.references.reference_data', ()),
    ('nvd_configurations.nodes.val.children', ()),
    ('nvd_configurations.nodes.val.children.val.cpe_match', ()),
    ('nvd_configurations.nodes.val.children.val.cpe_match.val.cpe_name', ()),
    ('nvd_configurations.nodes.val.children.val.children', ()),
]

# The loop runs at module level on purpose: `check`, `df`, `checkResult` and
# `checkResult_df` stay bound to the last dataset afterwards, exactly as they
# were after the original unrolled script.
for test_number, (dataset, checked_columns) in enumerate(NVD_DATASETS, start=1):
    data_path = NVD_DATA_ROOT + dataset

    # A new Check per dataset so constraints never accumulate across runs.
    check = Check(spark, CheckLevel.Warning, CHECK_DESCRIPTION)
    print("test constraint check set " + str(test_number))
    print("Reading " + data_path + " Data")
    df = spark.read.parquet(data_path)

    # Constraint order matches the original: per column completeness then
    # non-negativity, and the row-count constraint last.
    for column in checked_columns:
        check = check.isComplete(column).isNonNegative(column)
    check = check.hasSize(lambda x: x >= MIN_ROW_COUNT)

    checkResult = VerificationSuite(spark).onData(df).addCheck(check).run()

    checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, checkResult)
    # checkResult_df.show()  # uncomment to inspect the full result table
    print(checkResult_df.select("constraint_status").collect())
    check_results(checkResult.checkResults)
    print(SEPARATOR)

print(SEPARATOR)

# NOTE(review): tpass_count / tfail_count reach this namespace through
# `from test_methods import *`, which copies the bindings at import time. If
# check_results() rebinds the counters inside test_methods (not visible from
# here - confirm), the star-imported copies would be stale. Reading the live
# module attributes is correct in either case; the star-imported names remain
# the fallback.
import test_methods as _test_methods

passed_total = getattr(_test_methods, "tpass_count", tpass_count)
failed_total = getattr(_test_methods, "tfail_count", tfail_count)

print("Total Passed Tests = " + str(passed_total))
print("Total Failed Tests = " + str(failed_total))
print("Total Tests = " + str(passed_total + failed_total))
print(SEPARATOR)

#Clean up
# Shut down the py4j callback server on the session's gateway first, then
# stop the Spark session itself.
py4j_gateway = spark.sparkContext._gateway
py4j_gateway.shutdown_callback_server()
spark.stop()




Creating Spark Session
3.1.1-amzn-0
test constraint check set 1
Reading s3://tahoeqa-interim-data/nvd/nvd Data
PythonCallback server restarted!
[Row(constraint_status='Success'), Row(constraint_status='Success'), Row(constraint_status='Success'), Row(constraint_status='Success'), Row(constraint_status='Success')]
Passed - 	CompletenessConstraint(Completeness(publishedDate,None)) passed
Passed - 	ComplianceConstraint(Compliance(publishedDate is non-negative,COALESCE(CAST(publishedDate AS DECIMAL(20,10)), 0.0) >= 0,None)) passed
Passed - 	CompletenessConstraint(Completeness(lastModifiedDate,None)) passed
Passed - 	ComplianceConstraint(Compliance(lastModifiedDate is non-negative,COALESCE(CAST(lastModifiedDate AS DECIMAL(20,10)), 0.0) >= 0,None)) passed
Passed - 	SizeConstraint(Size(None)) passed
*********************************************************
test constraint check set 2
Reading s3://tahoeqa-interim-data/nvd/vendor Data
[Row(constraint_status='Success')]
Passed - 	SizeConstraint(