In [1]:
import os
import sys
import json

notebook_path = os.getcwd()
project_path = os.path.abspath(os.path.join(notebook_path, '..','..','..'))
print(f"Project path: {project_path}")
# Add the project path to sys.path
sys.path.append(project_path)

Project path: c:\Users\dalej\Documents\_Coding\DragonRegen


## Load Data Schema

In [2]:
# load the schema from the JSON file
# output_file_path = os.path.join(project_path, r'docs\AIGuardian\Tasks\output_schema_kafka.json')
output_file_path = os.path.join(project_path, r'docs\AIGuardian\Tasks\tvad_final_schema_kafka.json')

with open(output_file_path, 'r') as output_file:
    loaded_schema = json.load(output_file)

print(loaded_schema.keys())
# print(json.dumps(loaded_schema['customers'], indent=2))

dict_keys(['companies', 'ad_spots', 'attribution_results', 'data_quality_logs', 'attribution_models', 'data_sources', 'ad_campaigns', 'ad_creatives', 'sales_data', 'tv_networks'])


In [4]:
print(json.dumps(loaded_schema['ad_campaigns'], indent=2))

{
  "purpose": "Tracks television ad campaigns run by companies",
  "fields": [
    "{\"name\": \"campaign_id\", \"type\": \"Integer\", \"nullable\": false, \"metadata\": {\"description\": \"unique ID for each campaign\", \"unique_fl\": true, \"default_value\": null, \"col_type\": null}}",
    "{\"name\": \"company_id\", \"type\": \"Integer\", \"nullable\": false, \"metadata\": {\"description\": \"Foreign key to companies table that identifies which company ran the television ad campaign.\", \"unique_fl\": false, \"default_value\": null, \"col_type\": null}}",
    "{\"name\": \"goals\", \"type\": \"String\", \"nullable\": true, \"metadata\": {\"description\": \"campaign objectives and KPIs\", \"unique_fl\": false, \"default_value\": null, \"col_type\": null}}",
    "{\"name\": \"end_date\", \"type\": \"Date\", \"nullable\": true, \"metadata\": {\"description\": \"end date of the campaign\", \"unique_fl\": false, \"default_value\": null, \"col_type\": null}}",
    "{\"name\": \"status\"

In [5]:
fields = loaded_schema['ad_campaigns']['fields']
for field in fields:
    # if field['name'] in ['customer_id', 'first_name', 'zip_code']:
    print(f"Field [{type(field)}]: {field}")
    print(json.dumps(field, indent=2))

Field [<class 'str'>]: {"name": "campaign_id", "type": "Integer", "nullable": false, "metadata": {"description": "unique ID for each campaign", "unique_fl": true, "default_value": null, "col_type": null}}
"{\"name\": \"campaign_id\", \"type\": \"Integer\", \"nullable\": false, \"metadata\": {\"description\": \"unique ID for each campaign\", \"unique_fl\": true, \"default_value\": null, \"col_type\": null}}"
Field [<class 'str'>]: {"name": "company_id", "type": "Integer", "nullable": false, "metadata": {"description": "Foreign key to companies table that identifies which company ran the television ad campaign.", "unique_fl": false, "default_value": null, "col_type": null}}
"{\"name\": \"company_id\", \"type\": \"Integer\", \"nullable\": false, \"metadata\": {\"description\": \"Foreign key to companies table that identifies which company ran the television ad campaign.\", \"unique_fl\": false, \"default_value\": null, \"col_type\": null}}"
Field [<class 'str'>]: {"name": "goals", "type":

In [6]:
from pyspark.sql.types import *

col = loaded_schema['ad_campaigns']['fields'][0]
print(type(col))
print(col)
dict_col = json.loads(col)
print(json.dumps(dict_col, indent=2))
# schema_col = StructField.fromJson(json.loads(col))

<class 'str'>
{"name": "campaign_id", "type": "Integer", "nullable": false, "metadata": {"description": "unique ID for each campaign", "unique_fl": true, "default_value": null, "col_type": null}}
{
  "name": "campaign_id",
  "type": "Integer",
  "nullable": false,
  "metadata": {
    "description": "unique ID for each campaign",
    "unique_fl": true,
    "default_value": null,
    "col_type": null
  }
}


In [5]:
from src.DataCreator.SchemaGenerators.SchemaSpark import SchemaSpark

# create spark and t-sql schemas with in there engines
sturct_col = SchemaSpark.generate_schema(loaded_schema)
print(loaded_schema.keys())
print(sturct_col.keys())
print('=='*20)
print(sturct_col['ad_campaigns'])


dict_keys(['companies', 'ad_spots', 'attribution_results', 'data_quality_logs', 'attribution_models', 'data_sources', 'ad_campaigns', 'ad_creatives', 'sales_data', 'tv_networks'])
dict_keys(['companies', 'ad_spots', 'attribution_results', 'data_quality_logs', 'attribution_models', 'data_sources', 'ad_campaigns', 'ad_creatives', 'sales_data', 'tv_networks'])
StructType([StructField('campaign_id', IntegerType(), False), StructField('company_id', IntegerType(), False), StructField('goals', StringType(), True), StructField('end_date', DateType(), True), StructField('status', StringType(), False), StructField('campaign_name', StringType(), False), StructField('start_date', DateType(), False), StructField('budget', FloatType(), False), StructField('target_audience', StringType(), True), StructField('created_at', TimestampType(), False), StructField('updated_at', TimestampType(), False)])


In [12]:
# Confirm Metadata is being passed correctly
print(sturct_col['quotes']['payment_frequency'])
col_ex = sturct_col['quotes']['payment_frequency']
print(col_ex.metadata)

StructField('payment_frequency', StringType(), True)
{'description': 'frequency of payments (monthly, quarterly, annually)', 'unique_fl': False, 'default_value': None, 'col_type': None}


## Create DataFrames

In [6]:
from src.DataCreator.DataSets.DataSetGenStandard import DataSetGenStandard
from pyspark.sql import SparkSession
from pyspark.sql.types import *

spark = SparkSession.builder \
    .appName("Test Data Set Generation") \
    .enableHiveSupport() \
    .master("local[*]") \
    .getOrCreate()

In [7]:
testSchema = sturct_col['ad_campaigns']
print(testSchema)

StructType([StructField('campaign_id', IntegerType(), False), StructField('company_id', IntegerType(), False), StructField('goals', StringType(), True), StructField('end_date', DateType(), True), StructField('status', StringType(), False), StructField('campaign_name', StringType(), False), StructField('start_date', DateType(), False), StructField('budget', FloatType(), False), StructField('target_audience', StringType(), True), StructField('created_at', TimestampType(), False), StructField('updated_at', TimestampType(), False)])


In [8]:
# Create First Name and Last Name column generators (need long term fix for this)
from src.DataCreator.DataGenerators.FirstNameData import FirstNameData
from src.DataCreator.DataGenerators.LastNameData import LastNameData

FirstNameGen = FirstNameData(spark, s_file_path=r'C:\Users\dalej\Documents\_Coding\DragonRegen\gitData\SSA_FirstNames_Stats')
LastNameGen = LastNameData(spark, s_file_path=r'C:\Users\dalej\Documents\_Coding\DragonRegen\gitData\census_surname_bounds.parquet')

In [9]:
FirstNameGen.df_first_names.show(5, False)
LastNameGen.df_last_names.show(5, False)

+----------+------------+-------------------+-------------------+
|first_name|overall_rank|profile_lower_bound|profile_upper_bound|
+----------+------------+-------------------+-------------------+
|Michael   |1           |0                  |3955521            |
|James     |2           |3955521            |6894384            |
|David     |3           |6894384            |9819819            |
|John      |4           |9819819            |12601194           |
|Robert    |5           |12601194           |15203120           |
+----------+------------+-------------------+-------------------+
only showing top 5 rows

+---------+-----------+-------------------+-------------------+
|last_name|unqiue_rank|profile_lower_bound|profile_upper_bound|
+---------+-----------+-------------------+-------------------+
|SMITH    |1          |0                  |82819              |
|JOHNSON  |2          |82819              |148342             |
|WILLIAMS |3          |148342             |203439            

In [10]:
data_gen = DataSetGenStandard(spark, testSchema, 100)
# Generate the DataFrame
# TODO: So things are a list some are DataFrames (First and Last Name)
# TODO: maybe have the generator detect or see and attribute that is labeled as DataFrame/appended to the schema
df = data_gen.generate_data()
print(type(df))
df.show(50, truncate=False)

==> Column Name:campaign_id
Nothing found
Checking subclass: Categorical and requirements: None
Checking subclass: StringBasic and requirements: None
Checking subclass: ColBasic and requirements: <class 'src.DataCreator.ColGenerators.ColBasic.ColBasic'>
==> Column Name:company_id
Nothing found
Checking subclass: Categorical and requirements: None
Checking subclass: StringBasic and requirements: None
Checking subclass: ColBasic and requirements: <class 'src.DataCreator.ColGenerators.ColBasic.ColBasic'>
==> Column Name:goals
Nothing found
Checking subclass: Categorical and requirements: None
Checking subclass: StringBasic and requirements: <class 'src.DataCreator.ColGenerators.StringBasic.StringBasic'>
==> Column Name:end_date
Nothing found
Checking subclass: Categorical and requirements: None
Checking subclass: StringBasic and requirements: None
Checking subclass: ColBasic and requirements: <class 'src.DataCreator.ColGenerators.ColBasic.ColBasic'>
==> Column Name:status
Nothing found
Ch