In [2]:
#input_path ="C:\\Users\\arpit\\Downloads\\dyson\\input_files"

In [32]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, current_timestamp
from pyspark.sql.functions import *

import yaml

In [40]:
# Function to read the configuration from config.yaml
def read_config():
    try:
        with open('config.yaml', 'r') as config_file:
            config = yaml.safe_load(config_file)
        return config
    except FileNotFoundError:
        print("Error: config.yaml file not found.")
        return None
    except yaml.YAMLError as e:
        print(f"Error loading config.yaml: {e}")
        return None

if __name__ == "__main__":
    # Read the configuration from config.yaml
    config = read_config()

    if config:
        # Access the input_path from the config
        input_path = config.get('input_path')
        output_path = config.get('output_path')

        # Print the input_path (you can use it in your PySpark pipeline)
        print(f"Input Path: {input_path}")
        print(f"Output Path: {output_path}")
    else:
        print("Configuration not loaded. Please check the YAML file and its location.")

    
    # Create a Spark session with the configured app name
    spark = SparkSession.builder.appName(config["spark"]["app_name"]).getOrCreate()



Input Path: /home/jovyan/work/digital/source
Output Path: /home/jovyan/work/digital/target


In [41]:
# Read the CSV file into a DataFrame
df_char = spark.read.csv(f"{input_path}/characters.csv", header=True, inferSchema=True)
df_char.show(5)

+-----------+---------------+
|characterID|           name|
+-----------+---------------+
|    1009220|Captain America|
|    1010740| Winter Soldier|
|    1009471|      Nick Fury|
|    1009552|   S.H.I.E.L.D.|
|    1009228|  Sharon Carter|
+-----------+---------------+
only showing top 5 rows



In [42]:
# Read the CSV file into a DataFrame
df_stats = spark.read.csv(f"{input_path}/characters_stats.csv", header=True, inferSchema=True)
df_stats = df_stats.withColumnRenamed("Name","name")
df_stats.show(5)

+-----------+---------+------------+--------+-----+----------+-----+------+-----+
|       name|Alignment|Intelligence|Strength|Speed|Durability|Power|Combat|Total|
+-----------+---------+------------+--------+-----+----------+-----+------+-----+
|    3-D Man|     good|          50|      31|   43|        32|   25|    52|  233|
|     A-Bomb|     good|          38|     100|   17|        80|   17|    64|  316|
| Abe Sapien|     good|          88|      14|   35|        42|   35|    85|  299|
|   Abin Sur|     good|          50|      90|   53|        64|   84|    65|  406|
|Abomination|      bad|          63|      80|   53|        90|   55|    95|  436|
+-----------+---------+------------+--------+-----+----------+-----+------+-----+
only showing top 5 rows



In [43]:
df_char_stats = df_char.join(df_stats, on="name", how="inner")
# Add audit columns
df_char_stats = df_char_stats.withColumn("batch_id", lit("101"))
df_char_stats = df_char_stats.withColumn("load_date", current_timestamp().cast("string"))
df_char_stats.show(5)


+---------------+-----------+---------+------------+--------+-----+----------+-----+------+-----+--------+--------------------+
|           name|characterID|Alignment|Intelligence|Strength|Speed|Durability|Power|Combat|Total|batch_id|           load_date|
+---------------+-----------+---------+------------+--------+-----+----------+-----+------+-----+--------+--------------------+
|Captain America|    1009220|     good|          63|      19|   35|        56|   46|   100|  319|     101|2023-09-16 05:46:...|
| Winter Soldier|    1010740|     good|          56|      32|   35|        65|   60|    84|  332|     101|2023-09-16 05:46:...|
|      Nick Fury|    1009471|     good|          75|      11|   23|        42|   25|   100|  276|     101|2023-09-16 05:46:...|
|       Punisher|    1009515|     good|          50|      16|   23|        28|   22|   100|  239|     101|2023-09-16 05:46:...|
|      Red Skull|    1009535|      bad|          75|      10|   12|        14|   19|    80|  210|     10

In [55]:
# Save the DataFrame to a Parquet file
dfp_char_stats = df_char_stats.toPandas()
try:
    df_char_stats.write.parquet(f"{output_path}/char_stats", mode="overwrite")
    # Print a message to confirm the file has been saved
    print(f"DataFrame saved to Parquet file: {output_path}")
except Exception as e:
    print(f"Error writing DataFrame to Parquet: {str(e)}")


DataFrame saved to Parquet file: /home/jovyan/work/digital/target


In [77]:
# Define the path to your SQL script
sql_script_path = "modelling.sql"

# Read and execute SQL statements from the script
with open(sql_script_path, "r") as script_file:
    sql_statements = script_file.read().split(";")  # Split statements by semicolon

    # Remove empty statements
    sql_statements = [statement.strip() for statement in sql_statements if statement.strip()]

    # Execute each SQL statement separately
    for statement in sql_statements:
        spark.sql(statement)
    # Save the SparkSession to a file
#spark.sparkContext.setCheckpointDir("path/to/checkpoint")  # Set checkpoint directory
#spark.save()
print("Table created Successfully")

Table created Successfully


In [78]:
spark.sql("describe formatted  db_sil_marvel.char_stats_day_dly").show(truncate=False)

+----------------------------+----------------------------+-------+
|col_name                    |data_type                   |comment|
+----------------------------+----------------------------+-------+
|name                        |string                      |null   |
|characterID                 |int                         |null   |
|Alignment                   |string                      |null   |
|Intelligence                |int                         |null   |
|Strength                    |int                         |null   |
|Speed                       |int                         |null   |
|Durability                  |int                         |null   |
|Power                       |int                         |null   |
|Combat                      |int                         |null   |
|Total                       |int                         |null   |
|batch_id                    |string                      |null   |
|load_date                   |string            

In [80]:
spark.sql("select * from db_sil_marvel.char_stats_day_dly").show(5,truncate=False)

+---------------+-----------+---------+------------+--------+-----+----------+-----+------+-----+--------+--------------------------+
|name           |characterID|Alignment|Intelligence|Strength|Speed|Durability|Power|Combat|Total|batch_id|load_date                 |
+---------------+-----------+---------+------------+--------+-----+----------+-----+------+-----+--------+--------------------------+
|Captain America|1009220    |good     |63          |19      |35   |56        |46   |100   |319  |101     |2023-09-16 05:52:14.072456|
|Winter Soldier |1010740    |good     |56          |32      |35   |65        |60   |84    |332  |101     |2023-09-16 05:52:14.072456|
|Nick Fury      |1009471    |good     |75          |11      |23   |42        |25   |100   |276  |101     |2023-09-16 05:52:14.072456|
|Punisher       |1009515    |good     |50          |16      |23   |28        |22   |100   |239  |101     |2023-09-16 05:52:14.072456|
|Red Skull      |1009535    |bad      |75          |10      |1

In [81]:
spark.sql('select count(1) total_heros,alignment from db_sil_marvel.char_stats_day_dly  group by 2;').show(truncate=False)

+-----------+---------+
|total_heros|alignment|
+-----------+---------+
|5          |neutral  |
|50         |bad      |
|143        |good     |
+-----------+---------+



In [2]:
pip install psycopg2-binary


Collecting psycopg2-binary
  Obtaining dependency information for psycopg2-binary from https://files.pythonhosted.org/packages/20/81/4940235d18747f865d47eb38b98f38acc24b39278b12e20a0fdd20e0a132/psycopg2_binary-2.9.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading psycopg2_binary-2.9.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading psycopg2_binary-2.9.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: psycopg2-binary
Successfully installed psycopg2-binary-2.9.7
Note: you may need to restart the kernel to use updated packages.


In [1]:
pip install soda-sql

Collecting soda-sql
  Using cached soda_sql-2.0.0b27-py3-none-any.whl (84 kB)
Collecting asn1crypto==1.4.0 (from soda-sql)
  Using cached asn1crypto-1.4.0-py2.py3-none-any.whl (104 kB)
Collecting azure-common==1.1.25 (from soda-sql)
  Using cached azure_common-1.1.25-py2.py3-none-any.whl (12 kB)
Collecting azure-core==1.8.2 (from soda-sql)
  Using cached azure_core-1.8.2-py2.py3-none-any.whl (122 kB)
Collecting azure-storage-blob==12.5.0 (from soda-sql)
  Using cached azure_storage_blob-12.5.0-py2.py3-none-any.whl (326 kB)
Collecting boto3==1.15.18 (from soda-sql)
  Using cached boto3-1.15.18-py2.py3-none-any.whl (129 kB)
Collecting botocore==1.18.18 (from soda-sql)
  Using cached botocore-1.18.18-py2.py3-none-any.whl (6.7 MB)
Collecting cachetools==4.2.0 (from soda-sql)
  Using cached cachetools-4.2.0-py3-none-any.whl (12 kB)
Collecting certifi==2020.12.5 (from soda-sql)
  Using cached certifi-2020.12.5-py2.py3-none-any.whl (147 kB)
Collecting cffi==1.14.3 (from soda-sql)
  Using cach

In [1]:
pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.


In [2]:
import unittest
from soda.spark import SparkDFAssert

class TestCharStats(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # Load configuration from config.yaml
        with open("config.yaml", "r") as config_file:
            config = yaml.safe_load(config_file)
        
        # Create a Spark session with the configured app name
        #cls.spark = SparkSession.builder.appName(config["spark"]["app_name"]).getOrCreate()

    @classmethod
    def tearDownClass(cls):
        # Stop the Spark session
        cls.spark.stop()

    def test_table_structure(self):
        # Load configuration from config.yaml
        with open("config.yaml", "r") as config_file:
            config = yaml.safe_load(config_file)

        # Assert table schema and column names
        soda_assert = SparkDFAssert(self.spark)
        df = self.spark.table(config["database"]["table_name"])

        # Check if the table has the expected columns
        expected_columns = ["name", "characterID", "Alignment", "Intelligence", "Strength", "Speed", "Durability", "Power", "Combat", "Total", "batch_id", "load_date"]
        soda_assert.assertColumnNames(df, expected_columns)

        # Add more assertions on the table structure as needed
        # soda_assert.assertColumnType(...)
        # soda_assert.assertColumnNotNull(...)

    def test_data_quality(self):
        # Load configuration from config.yaml
        with open("config.yaml", "r") as config_file:
            config = yaml.safe_load(config_file)

        # Assert data quality checks (e.g., check for null values, data types, etc.)
        soda_assert = SparkDFAssert(self.spark)
        df = self.spark.table(config["database"]["table_name"])

        # Example: Check if the 'name' column does not contain null values
        soda_assert.assertColumnNotNull(df, "name")

        # Add more data quality checks as needed

    def test_data_consistency(self):
        # Load configuration from config.yaml
        with open("config.yaml", "r") as config_file:
            config = yaml.safe_load(config_file)

        # Assert data consistency checks (e.g., check for valid data values)
        soda_assert = SparkDFAssert(self.spark)
        df = self.spark.table(config["database"]["table_name"])

        # Example: Check if 'Total' column values are consistent with the sum of other attributes
        soda_assert.assertColumnEqualToSum(df, "Total", ["Intelligence", "Strength", "Speed", "Durability", "Power", "Combat"])

        # Add more data consistency checks as needed

if __name__ == '__main__':
    unittest.main()


ModuleNotFoundError: No module named 'soda.spark'

In [4]:
import unittest
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from soda import Context

class TestCharStats(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # Create a Spark session
        cls.spark = SparkSession.builder.appName("TestCharStats").getOrCreate()

    @classmethod
    def tearDownClass(cls):
        # Stop the Spark session
        cls.spark.stop()

    def test_data_quality(self):
        # Load your DataFrame
        df = self.spark.read.parquet("path_to_parquet_file")  # Replace with the actual path

        # Create a Soda Context
        context = Context(self.spark)

        # Define data tests using Soda's DSL
        tests = context.create_test_suite()
        tests.expect_column_values_to_be_between("Intelligence", min_value=0, max_value=100)
        tests.expect_column_values_to_be_between("Strength", min_value=0, max_value=100)
        tests.expect_column_values_to_be_between("Speed", min_value=0, max_value=100)

        # Run the data tests
        results = tests.run(df)

        # Check the test results
        self.assertTrue(results["success"])

if __name__ == '__main__':
    unittest.main()


[31mERROR: Could not find a version that satisfies the requirement soda-pyspark (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for soda-pyspark[0m[31m
[0m