In [1]:
import os
import sys
import json

# Resolve the project root: two directories above the notebook's working directory.
notebook_path = os.getcwd()
project_path = os.path.abspath(os.path.join(notebook_path, '..', '..'))
print(f"Project path: {project_path}")
# Add the project path to sys.path so the `src.*` packages can be imported.
# Guarded so that re-running this cell does not append duplicate entries.
if project_path not in sys.path:
    sys.path.append(project_path)

Project path: c:\Users\dalej\Documents\_Coding\DragonRegen


## Load Data Schema

In [2]:
# Load the schema from the JSON file in the notebook's working directory.
output_file_path = os.path.join(notebook_path, 'output_schema.json')
# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, which can mis-read non-ASCII text in the file.
with open(output_file_path, 'r', encoding='utf-8') as output_file:
    loaded_schema = json.load(output_file)

# Show the top-level keys of the loaded schema.
print(loaded_schema.keys())
# print(json.dumps(loaded_schema['customers'], indent=2))

dict_keys(['customers', 'vehicles', 'drivers', 'driving_history', 'coverage_options', 'quotes', 'quote_coverages', 'discount_types', 'quote_discounts'])


In [3]:
# Pretty-print the raw schema entries for a few representative customer columns.
# NOTE(review): in the recorded output, zip_code's "unique_fl" is the string
# "False" rather than a JSON boolean like the other fields - confirm upstream.
fields = loaded_schema['customers']['fields']
preview_names = ('customer_id', 'first_name', 'zip_code')
for field in fields:
    if field['name'] not in preview_names:
        continue
    print(json.dumps(field, indent=2))

{
  "name": "customer_id",
  "type": "Integer",
  "nullable": false,
  "metadata": {
    "description": "unique ID, primary key for customer identification",
    "unique_fl": true,
    "default_value": null,
    "col_type": "Unique_Identifier"
  }
}
{
  "name": "first_name",
  "type": "String",
  "nullable": false,
  "metadata": {
    "description": "customer's first name",
    "unique_fl": false,
    "default_value": null,
    "col_type": "StringFirstName"
  }
}
{
  "name": "zip_code",
  "type": "String",
  "nullable": false,
  "metadata": {
    "description": "Customer's postal code used for location-based insurance quote calculations and regional risk assessment.",
    "unique_fl": "False",
    "column_values": [
      "10001",
      "90210",
      "60601",
      "02108",
      "33101"
    ],
    "col_type": "Categorical"
  }
}


In [4]:
from src.DataCreator.SchemaGenerators.SchemaSpark import SchemaSpark

# Build the Spark schemas from the loaded JSON schema, then compare the
# keys before and after and inspect the generated 'customers' entry.
sturct_col = SchemaSpark.generate_schema(loaded_schema)
for table_names in (loaded_schema.keys(), sturct_col.keys()):
    print(table_names)
print('==' * 20)
print(sturct_col['customers'])


dict_keys(['customers', 'vehicles', 'drivers', 'driving_history', 'coverage_options', 'quotes', 'quote_coverages', 'discount_types', 'quote_discounts'])
dict_keys(['customers', 'vehicles', 'drivers', 'driving_history', 'coverage_options', 'quotes', 'quote_coverages', 'discount_types', 'quote_discounts'])
StructType([StructField('customer_id', IntegerType(), False), StructField('first_name', StringType(), False), StructField('last_name', StringType(), False), StructField('email', StringType(), False), StructField('phone_number', StringType(), False), StructField('date_of_birth', DateType(), False), StructField('address_line_1', StringType(), False), StructField('address_line_2', StringType(), True), StructField('city', StringType(), True), StructField('state', StringType(), False), StructField('zip_code', StringType(), False), StructField('created_at', TimestampType(), False), StructField('updated_at', TimestampType(), True)])


In [5]:
# Confirm the schema metadata is carried through onto the generated field.
col_ex = sturct_col['customers']['customer_id']
print(col_ex)
print(col_ex.metadata)

StructField('customer_id', IntegerType(), False)
{'description': 'unique ID, primary key for customer identification', 'unique_fl': True, 'default_value': None, 'col_type': 'Unique_Identifier'}


## Create DataFrames

In [6]:
from src.DataCreator.DataSets.DataSetGenStandard import DataSetGenStandard
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Get (or create) a Spark session with Hive support enabled and a local[*] master.
spark = (
    SparkSession.builder
    .appName("Test Data Set Generation")
    .enableHiveSupport()
    .master("local[*]")
    .getOrCreate()
)

In [7]:
# Pick the 'customers' schema as the one used for the data generation below.
testSchema = sturct_col['customers']
print(testSchema)

StructType([StructField('customer_id', IntegerType(), False), StructField('first_name', StringType(), False), StructField('last_name', StringType(), False), StructField('email', StringType(), False), StructField('phone_number', StringType(), False), StructField('date_of_birth', DateType(), False), StructField('address_line_1', StringType(), False), StructField('address_line_2', StringType(), True), StructField('city', StringType(), True), StructField('state', StringType(), False), StructField('zip_code', StringType(), False), StructField('created_at', TimestampType(), False), StructField('updated_at', TimestampType(), True)])


In [8]:
# Create the first-name and last-name column generators.
# TODO: this manual setup needs a long-term fix.
from src.DataCreator.DataGenerators.FirstNameData import FirstNameData
from src.DataCreator.DataGenerators.LastNameData import LastNameData

# Both generators are constructed with the active Spark session.
FirstNameGen = FirstNameData(spark)
LastNameGen = LastNameData(spark)

In [9]:
# Preview the first 5 rows of each name lookup DataFrame without truncating values.
# NOTE(review): the last-name output shows a column spelled `unqiue_rank`;
# confirm whether that is a typo in the upstream data/generator.
FirstNameGen.df_first_names.show(5, False)
LastNameGen.df_last_names.show(5, False)

+----------+------------+-------------------+-------------------+
|first_name|overall_rank|profile_lower_bound|profile_upper_bound|
+----------+------------+-------------------+-------------------+
|Michael   |1           |0                  |3955521            |
|James     |2           |3955521            |6894384            |
|David     |3           |6894384            |9819819            |
|John      |4           |9819819            |12601194           |
|Robert    |5           |12601194           |15203120           |
+----------+------------+-------------------+-------------------+
only showing top 5 rows

+---------+-----------+-------------------+-------------------+
|last_name|unqiue_rank|profile_lower_bound|profile_upper_bound|
+---------+-----------+-------------------+-------------------+
|SMITH    |1          |0                  |82819              |
|JOHNSON  |2          |82819              |148342             |
|WILLIAMS |3          |148342             |203439            

In [10]:
# Build the data set generator for the customers schema.
# NOTE(review): the third argument (100) is presumably the number of rows to
# generate - confirm against DataSetGenStandard.
data_gen = DataSetGenStandard(spark, testSchema, 100)
# Generate the DataFrame
# TODO: Some things are a list, some are DataFrames (First and Last Name).
# TODO: Maybe have the generator detect an attribute that is labeled as
#       DataFrame / appended to the schema.
df = data_gen.generate_data()
print(type(df))
# Show up to 50 rows without truncating column values.
df.show(50, truncate=False)

==> Column Name:customer_id
Nothing found
Checking subclass: Categorical and requirements: None
Checking subclass: StringBasic and requirements: None
Checking subclass: ColBasic and requirements: <class 'src.DataCreator.ColGenerators.ColBasic.ColBasic'>
==> Column Name:first_name
==> Column Name:last_name
==> Column Name:email
Nothing found
Checking subclass: Categorical and requirements: None
Checking subclass: StringBasic and requirements: <class 'src.DataCreator.ColGenerators.StringBasic.StringBasic'>
==> Column Name:phone_number
==> Column Name:date_of_birth
Nothing found
Checking subclass: Categorical and requirements: None
Checking subclass: StringBasic and requirements: None
Checking subclass: ColBasic and requirements: <class 'src.DataCreator.ColGenerators.ColBasic.ColBasic'>
==> Column Name:address_line_1
Nothing found
Checking subclass: Categorical and requirements: None
Checking subclass: StringBasic and requirements: <class 'src.DataCreator.ColGenerators.StringBasic.StringB