In [1]:
# import library
import pyspark

In [2]:
# check pyspark version
pyspark.__version__

'3.5.0'

In [3]:
# import SparkSession
from pyspark.sql import SparkSession

In [4]:
# Create (or reuse, via getOrCreate) the Spark session that the extract/load/profiling
# functions in this notebook reference through the global name `spark`.
spark = SparkSession \
    .builder \
    .appName("Final Project PySpark") \
    .getOrCreate()

In [5]:
spark

## Load and Handle Failure Data

In [6]:
from dotenv import load_dotenv
import os
import pandas as pd

In [7]:
load_dotenv(".env", override=True)

True

In [8]:
# MinIO credentials are read from the environment (populated by load_dotenv above).
# os.getenv returns None when a variable is not set.
ACCESS_KEY_MINIO = os.getenv("ACCESS_KEY_MINIO")
SECRET_KEY_MINIO = os.getenv("SECRET_KEY_MINIO")

In [9]:
!pip install minio



In [10]:
# The Minio library is used to interact with a MinIO server.
from minio import Minio

# BytesIO provides a way to work with binary data in memory as if it were a file.
from io import BytesIO

In [11]:
def handle_error(data, bucket_name:str, table_name:str):
    """Upload ``data`` as a timestamped CSV object to MinIO, then print the bucket listing.

    Args:
        data: object whose ``to_csv()`` returns the CSV text
            (presumably a pandas DataFrame - confirm against callers).
        bucket_name: destination bucket; it is created when it does not exist.
        table_name: prefix of the uploaded object name
            (``<table_name>_<YYYY-MM-DD HH:MM:SS>.csv``).
    """
    # Timestamp is captured before any MinIO call is made
    etl_stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Plain-HTTP client against the local MinIO endpoint
    minio_client = Minio(
        'localhost:9000',
        access_key=ACCESS_KEY_MINIO,
        secret_key=SECRET_KEY_MINIO,
        secure=False,
    )

    bucket_missing = not minio_client.bucket_exists(bucket_name)
    if bucket_missing:
        minio_client.make_bucket(bucket_name)

    # put_object needs a file-like object plus the exact byte length
    payload = data.to_csv().encode('utf-8')

    minio_client.put_object(
        bucket_name=bucket_name,
        object_name=f"{table_name}_{etl_stamp}.csv",
        data=BytesIO(payload),
        length=len(payload),
        content_type='application/csv'
    )

    # Echo every object currently stored in the bucket
    for entry in minio_client.list_objects(bucket_name, recursive=True):
        print(entry.object_name)

In [12]:
import csv
from datetime import datetime
import json

In [13]:
# INIT LOGS
def log_to_csv(log_msg: dict, filename: str):
    """Append one ETL log record to ``<cwd>/logs/<filename>``.

    The file is a CSV with the fixed columns ``step, status, source,
    table_name, etl_date``. The header row is written when the file is
    missing or empty, so an empty placeholder file still ends up with a header.

    This function is best-effort: any failure (including creating the
    ``logs`` directory) is printed and swallowed, because callers invoke it
    from ``finally`` blocks where a raised error would mask the original one.

    Args:
        log_msg: mapping whose keys are a subset of the column names above.
            Unknown keys make ``csv.DictWriter`` raise ``ValueError``, which
            is reported via ``print`` like any other failure.
        filename: name of the log file inside the ``logs`` directory.
    """
    log_dir = os.path.join(os.getcwd(), 'logs')
    file_path = os.path.join(log_dir, filename)

    # Column order of the log file
    headers = ["step", "status", "source", "table_name", "etl_date"]

    try:
        # exist_ok avoids the check-then-create race of os.path.exists + makedirs
        os.makedirs(log_dir, exist_ok=True)

        # A zero-byte file has no header yet, so treat it like a missing file
        needs_header = (not os.path.isfile(file_path)) or os.path.getsize(file_path) == 0

        with open(file_path, mode='a', newline='') as file:
            writer = csv.DictWriter(file, fieldnames=headers)

            if needs_header:
                writer.writeheader()

            # Append the log message
            writer.writerow(log_msg)

        print(f"Log written to {file_path}")

    except Exception as e:
        print(f"Error writing log to {file_path}: {e}")

## Extract

### CSV Files

In [14]:
def extract_csv(file_path, table_name):
    """Read a CSV file (first line as header) into a Spark DataFrame and log the outcome.

    The DataFrame is previewed with ``show()`` before the success entry is logged.

    Args:
        file_path: path of the CSV file handed to ``spark.read``.
        table_name: label stored in the ``table_name`` column of the ETL log.

    Returns:
        The Spark DataFrame, or ``None`` when reading or previewing failed.
    """
    def _log_entry(status):
        # One row for etl_log.csv; the timestamp is taken when the entry is built
        return {
            "step": "Extract",
            "status": status,
            "source": "CSV",
            "table_name": table_name,
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }

    try:
        csv_reader = spark.read.option("header", "true")
        extracted = csv_reader.csv(file_path)

        # Preview the extracted rows
        extracted.show()

        log_to_csv(_log_entry("Success"), "etl_log.csv")
    except Exception as err:
        log_to_csv(_log_entry(f"Failed: {err}"), "etl_log.csv")
        print(f"Error extracting {file_path}: {err}")
        return None

    return extracted

In [15]:
# Extract the people CSV. extract_csv already previews the data and returns None on
# failure, in which case the .show() below raises AttributeError.
df_people = extract_csv("data/people.csv", "people_data")
df_people.show()

+---------+---------+----------+----------+--------------------+--------------------+
|people_id|object_id|first_name| last_name|          birthplace|    affiliation_name|
+---------+---------+----------+----------+--------------------+--------------------+
|        1|      p:2|       Ben|   Elowitz|                NULL|           Blue Nile|
|        2|      p:3|     Kevin|  Flaherty|                NULL|            Wetpaint|
|        3|      p:4|      Raju|   Vegesna|                NULL|                Zoho|
|        4|      p:5|       Ian|     Wenig|                NULL|                Zoho|
|        5|      p:6|     Kevin|      Rose|         Redding, CA|        i/o Ventures|
|        6|      p:7|       Jay|   Adelson|         Detroit, MI|                Digg|
|        7|      p:8|      Owen|     Byrne|                NULL|                Digg|
|        8|      p:9|       Ron|Gorodetzky|                NULL|                Digg|
|        9|     p:10|      Mark|Zuckerberg|           

In [16]:
# Extract the relationships CSV. extract_csv already previews the data and returns None
# on failure, in which case the .show() below raises AttributeError.
df_relations = extract_csv("data/relationships.csv", "relationships_data")
df_relations.show()

+---------------+----------------+----------------------+--------------------+--------------------+-------+--------+--------------------+--------------------+--------------------+
|relationship_id|person_object_id|relationship_object_id|            start_at|              end_at|is_past|sequence|               title|          created_at|          updated_at|
+---------------+----------------+----------------------+--------------------+--------------------+-------+--------+--------------------+--------------------+--------------------+
|              1|             p:2|                   c:1|                NULL|                NULL|  false|       8|Co-Founder/CEO/Bo...|2007-05-25 07:03:...|2013-06-03 09:58:...|
|              2|             p:3|                   c:1|                NULL|                NULL|  false|  279242|        VP Marketing|2007-05-25 07:04:...|2010-05-21 16:31:...|
|              3|             p:4|                   c:3|                NULL|                NULL| 

### Database

In [17]:
# Connection settings for the source database, read from the environment.
# DB_URL is passed to Spark as the JDBC "url" option by extract_from_db;
# os.getenv returns None for any variable that is not set.

DB_URL = os.getenv("DB_URL")
DB_USER = os.getenv("DB_USER")
DB_PASS = os.getenv("DB_PASS")

In [18]:
def extract_from_db():
    """Read every table of the source database's ``public`` schema into Spark DataFrames.

    Table names are discovered through ``information_schema.tables``. Each table is
    read over JDBC and its outcome is appended to the ETL log; a table that fails is
    logged and skipped, it does not abort the remaining tables.

    Returns:
        dict mapping table name -> Spark DataFrame for the tables that were read.
        An empty dict when the table discovery itself fails.
    """
    def _jdbc_reader(dbtable):
        # Unloaded JDBC reader bound to the source database for one relation
        return (
            spark.read
            .format("jdbc")
            .option("url", DB_URL)
            .option("dbtable", dbtable)
            .option("user", DB_USER)
            .option("password", DB_PASS)
            .option("driver", "org.postgresql.Driver")
        )

    def _log(status, name):
        # Append one "Extract" row for this source to the ETL log
        log_to_csv({
            "step": "Extract",
            "status": status,
            "source": "PostgreSQL",
            "table_name": name,
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }, "etl_log.csv")

    try:
        # Discover the table names of the public schema
        catalog = _jdbc_reader("information_schema.tables").load()
        public_names = catalog.filter("table_schema = 'public'").select("table_name")
        table_list = public_names.rdd.flatMap(lambda row: row).collect()

        print(f"Found tables: {table_list}")

        extracted = {}
        for name in table_list:
            try:
                extracted[name] = _jdbc_reader(name).load()
                _log("Success", name)
                print(f"Successfully extracted table: {name}")
            except Exception as err:
                # Record the failure and continue with the next table
                _log(f"Failed: {err}", name)
                print(f"Failed to extract table: {name} - Error: {err}")

        return extracted

    except Exception as err:
        # Discovery (or anything outside the per-table handling) failed
        _log(f"Failed: {err}", "N/A")
        print(f"Failed to extract tables: {err}")
        return {}

In [19]:
# Extract all tables of the source database; `tables` maps table name -> Spark DataFrame
# and is empty when the table discovery failed.
tables = extract_from_db()
print(f"Extracted tables: {list(tables.keys())}")

Found tables: ['company', 'acquisition', 'funding_rounds', 'funds', 'investments', 'ipos']
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: company
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: acquisition
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: funding_rounds
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: funds
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: investments
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: ipos
Extracted tables: ['company', 'acquisition', 'funding_rounds', 'funds', 'investments', 'ipos']


In [20]:
# Bind each extracted source table to its own name. A table that extract_from_db
# failed to read is absent from `tables`, so its lookup raises KeyError here.
df_acquisition = tables["acquisition"]
df_company = tables["company"]
df_funding_rounds = tables["funding_rounds"]
df_funds = tables["funds"]
df_investments = tables["investments"]
df_ipos = tables["ipos"]

# Preview one table as a sanity check
df_company.show()

+---------+---------+-----------------+--------------------+--------------------+----------+--------------+----------+----------+------------+---------+-----------+-------------------+-------------------+
|office_id|object_id|      description|              region|            address1|  address2|          city|  zip_code|state_code|country_code| latitude|  longitude|         created_at|         updated_at|
+---------+---------+-----------------+--------------------+--------------------+----------+--------------+----------+----------+------------+---------+-----------+-------------------+-------------------+
|        8|      c:8|                 |              SF Bay|959 Skyway Road, ...|          |    San Carlos|     94070|        CA|         USA|37.506885|-122.247573|2007-01-01 22:19:54|2007-01-01 22:19:54|
|        9|      c:9|     Headquarters|         Los Angeles|9229 W. Sunset Blvd.|          |West Hollywood|     90069|        CA|         USA|34.090368|-118.393064|2007-01-01 22:19

### From API

In [21]:
import requests

In [22]:
def extract_api(link_api: str, list_parameter: dict, data_name: str, timeout: float = 30):
    """Fetch JSON from an HTTP API and return it as a Spark DataFrame, logging the outcome.

    Args:
        link_api: URL of the endpoint (requested with GET).
        list_parameter: query-string parameters passed to ``requests.get``.
        data_name: label stored in the ``table_name`` column of the ETL log.
        timeout: seconds to wait for the server, forwarded to ``requests.get``
            (a ``(connect, read)`` tuple is accepted by requests as well).
            Without a timeout a stalled server would block the notebook forever.

    Returns:
        A Spark DataFrame built from the JSON payload, or ``None`` on any failure
        (HTTP/connection error, timeout, empty or unparsable payload, Spark
        conversion error). Failures are printed and logged, never raised.
    """
    def _log(status: str) -> None:
        # Append one "Extract" row for this API call to the ETL log
        log_to_csv({
            "step": "Extract",
            "status": status,
            "source": "API",
            "table_name": data_name,
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }, "etl_log.csv")

    try:
        # Establish connection to API; bounded by `timeout` so it cannot hang
        resp = requests.get(link_api, params=list_parameter, timeout=timeout)
        resp.raise_for_status()  # Raise an exception for HTTP errors

        # Parse the response JSON
        raw_response = resp.json()

        # Convert JSON data to pandas DataFrame
        df_api = pd.DataFrame(raw_response)

        if df_api.empty:
            raise ValueError("Empty response from API")

        # Convert pandas DataFrame to PySpark DataFrame
        spark_df = spark.createDataFrame(df_api)

        _log("Success")
        print(f"Successfully extracted data from API: {data_name}")
        return spark_df

    except requests.exceptions.RequestException as e:
        # Connection errors, timeouts and HTTP error statuses
        _log(f"Failed: {e}")
        print(f"Request failed: {e}")

    except ValueError as e:
        # Empty payload, or a payload pandas could not turn into a frame
        _log(f"Failed: {e}")
        print(f"Parsing error: {e}")

    except Exception as e:
        # Catch any other errors
        _log(f"Failed: {e}")
        print(f"An error occurred: {e}")

    return None

In [23]:
# Extract milestones dated 2008-01-01 through 2010-12-31 from the API.
# extract_api returns None on failure, in which case the .show() below raises
# AttributeError.
link_api = "https://api-milestones.vercel.app/api/data"
list_parameter = {
    "start_date": "2008-01-01",
    "end_date": "2010-12-31"
}

df_milestones = extract_api(link_api, list_parameter, "milestones")
df_milestones.show()

Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted data from API: milestones
+--------------------+--------------------+------------+--------------+------------+---------+--------------------+--------------------+--------------------+
|          created_at|         description|milestone_at|milestone_code|milestone_id|object_id|  source_description|          source_url|          updated_at|
+--------------------+--------------------+------------+--------------+------------+---------+--------------------+--------------------+--------------------+
|2008-06-18 08:14:...|Survives iPhone 3...|  2008-06-09|         other|           1|     c:12|Twitter Fails To ...|http://www.techcr...|2008-06-18 08:14:...|
|2008-06-18 08:50:...|More than 4 Billi...|  2008-06-18|         other|           3|     c:59|11 Billion Videos...|http://www.comsco...|2008-06-18 08:50:...|
|2008-06-19 04:14:...|Reddit goes Open ...|  2008-06-18|         other|           4|    c:314|reddit goes open 

## Load - Staging 

In [24]:
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException

In [25]:
!pip install pangres



In [26]:
# Connection settings for the staging database, read from the environment.
# os.getenv returns None for any variable that is not set.

DB_STAGING_URL = os.getenv("DB_STAGING_URL")
DB_STAGING_USER = os.getenv("DB_STAGING_USER")
DB_STAGING_PASS = os.getenv("DB_STAGING_PASS")

In [33]:
from pyspark.sql.utils import AnalysisException
from pangres import upsert
from sqlalchemy import create_engine
import pandas as pd
from datetime import datetime

def load_staging2(df, table_name, mode="overwrite", use_upsert=False, idx_name=None, schema=None, source=None):
    """Load a Spark DataFrame into the staging PostgreSQL database and log the outcome.

    Two load paths exist:

    * ``use_upsert=False`` (default): write through Spark JDBC using ``mode``.
    * ``use_upsert=True``: convert to pandas, index by ``idx_name`` and upsert with
      pangres (existing rows are updated, new rows inserted).

    On failure the rows are dumped to ``logs/failed_<table>_<timestamp>.csv`` with
    two extra columns (``error_message``, ``etl_date``). Errors are printed and
    logged, never raised; this includes errors while writing that dump.

    Args:
        df: Spark DataFrame to load.
        table_name: destination table in the staging database.
        mode: Spark save mode for the JDBC path (ignored for upsert).
        use_upsert: select the pangres upsert path.
        idx_name: column name (or list of names) used as the upsert key;
            required when ``use_upsert`` is true.
        schema: database schema forwarded to pangres ``upsert``.
        source: currently unused; the log always records "Staging" as source.

    Returns:
        ``df`` for the JDBC path. For the upsert path, the pandas DataFrame
        (indexed by ``idx_name`` once indexing succeeded); ``df`` if the pandas
        conversion itself failed.
    """
    # NOTE(review): both connection strings are hard-coded to
    # host.docker.internal:5432/pyspark_task_staging instead of using
    # DB_STAGING_URL - confirm whether that is intentional.
    pandas_data = None   # un-indexed pandas copy (upsert path only)
    indexed_data = None  # pandas copy indexed by idx_name (upsert path only)
    status = "Success"

    try:
        if use_upsert:
            # Convert the Spark DataFrame to a pandas DataFrame
            pandas_data = df.toPandas()

            # pangres needs a named index to match existing rows
            if idx_name is None:
                raise ValueError("Index name is required for upsert mode")

            # Create the PostgreSQL connection
            conn = create_engine(f"postgresql://{DB_STAGING_USER}:{DB_STAGING_PASS}@host.docker.internal:5432/pyspark_task_staging")

            indexed_data = pandas_data.set_index(idx_name)

            # Upsert: insert rows that do not exist yet, update those that do
            upsert(
                con=conn,
                df=indexed_data,
                table_name=table_name,
                schema=schema,
                if_row_exists="update"
            )
            print(f"Data upserted to table '{table_name}' successfully!")
        else:
            # Load with Spark over JDBC
            df.write \
                .format("jdbc") \
                .option("url", "jdbc:postgresql://host.docker.internal:5432/pyspark_task_staging") \
                .option("dbtable", table_name) \
                .option("user", DB_STAGING_USER) \
                .option("password", DB_STAGING_PASS) \
                .option("driver", "org.postgresql.Driver") \
                .mode(mode) \
                .save()

            print(f"Data loaded to table '{table_name}' successfully!")

    except Exception as e:
        status = "Failed"
        print(f"Error loading data to table '{table_name}': {e}")

        # Dumping the failed rows is best-effort: it must not mask the load
        # error nor prevent the log entry below from being written.
        try:
            # Work on the un-indexed copy so the key column is kept in the dump
            # and the DataFrame returned to the caller is not mutated.
            if pandas_data is not None:
                failed_data = pandas_data.copy()
            else:
                failed_data = df.toPandas()
            failed_data['error_message'] = str(e)
            failed_data['etl_date'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

            # The logs directory may not exist yet on a first run
            os.makedirs('logs', exist_ok=True)
            dump_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            failed_log_path = f'logs/failed_{table_name}_{dump_stamp}.csv'
            failed_data.to_csv(failed_log_path, index=False)
            print(f"Failed data saved to: {failed_log_path}")
        except Exception as dump_error:
            print(f"Could not save failed data for table '{table_name}': {dump_error}")

    # Write the log entry for either outcome
    log_to_csv({
        "step": "Load Staging",
        "status": status,
        "source": "Staging",
        "table_name": table_name,
        "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    }, 'etl_log.csv')

    if not use_upsert:
        return df
    if indexed_data is not None:
        return indexed_data
    return pandas_data if pandas_data is not None else df


In [34]:
# Load the API-sourced milestones into the staging table "milestones"
# (Spark JDBC path, default mode "overwrite").
load_staging2(df_milestones, "milestones")

Data loaded to table 'milestones' successfully!
Log written to /home/jovyan/work/logs/etl_log.csv


DataFrame[created_at: string, description: string, milestone_at: string, milestone_code: string, milestone_id: bigint, object_id: string, source_description: string, source_url: string, updated_at: string]

In [35]:
# Load the CSV-sourced DataFrames into staging. Note the staging table is named
# "relationship" (singular), which is the key used when reading staging back.
load_staging2(df_relations, "relationship")
load_staging2(df_people, "people", mode="overwrite")

Data loaded to table 'relationship' successfully!
Log written to /home/jovyan/work/logs/etl_log.csv
Data loaded to table 'people' successfully!
Log written to /home/jovyan/work/logs/etl_log.csv


DataFrame[people_id: string, object_id: string, first_name: string, last_name: string, birthplace: string, affiliation_name: string]

In [36]:
# Load the database-sourced DataFrames into staging. Note df_ipos is written to the
# staging table "ipo" (singular), unlike the source table name "ipos".

load_staging2(df_acquisition, "acquisition") 
load_staging2(df_funding_rounds, "funding_rounds") 
load_staging2(df_funds, "funds")
load_staging2(df_investments, "investments")
load_staging2(df_ipos, "ipo")
load_staging2(df_company, "company") 

Data loaded to table 'acquisition' successfully!
Log written to /home/jovyan/work/logs/etl_log.csv
Data loaded to table 'funding_rounds' successfully!
Log written to /home/jovyan/work/logs/etl_log.csv
Data loaded to table 'funds' successfully!
Log written to /home/jovyan/work/logs/etl_log.csv
Data loaded to table 'investments' successfully!
Log written to /home/jovyan/work/logs/etl_log.csv
Data loaded to table 'ipo' successfully!
Log written to /home/jovyan/work/logs/etl_log.csv
Data loaded to table 'company' successfully!
Log written to /home/jovyan/work/logs/etl_log.csv


DataFrame[office_id: int, object_id: string, description: string, region: string, address1: string, address2: string, city: string, zip_code: string, state_code: string, country_code: string, latitude: decimal(9,6), longitude: decimal(9,6), created_at: timestamp, updated_at: timestamp]

## Extract Data from Staging

In [63]:
# Remove all cached tables from Spark's in-memory cache before re-reading staging
# (presumably to avoid stale results - confirm whether anything was cached earlier).
spark.catalog.clearCache()

In [64]:
# Staging connection settings, re-read from the environment. These are the same
# three names already assigned in the "Load - Staging" section.

DB_STAGING_URL = os.getenv("DB_STAGING_URL")
DB_STAGING_USER = os.getenv("DB_STAGING_USER")
DB_STAGING_PASS = os.getenv("DB_STAGING_PASS")

In [65]:
def extract_from_staging():
    """Read every table of the staging database's ``public`` schema into Spark DataFrames.

    Table names come from a sub-query on ``information_schema.tables``. Each table
    is read over JDBC and its outcome is appended to the ETL log; a table that fails
    is logged and skipped, it does not abort the remaining tables.

    Returns:
        dict mapping table name -> Spark DataFrame for the tables that were read.
        An empty dict when the table discovery itself fails.
    """
    def _jdbc_reader(dbtable):
        # Unloaded JDBC reader bound to the staging database for one relation
        return (
            spark.read
            .format("jdbc")
            .option("url", DB_STAGING_URL)
            .option("dbtable", dbtable)
            .option("user", DB_STAGING_USER)
            .option("password", DB_STAGING_PASS)
            .option("driver", "org.postgresql.Driver")
        )

    def _log(status, name):
        # Append one "Extract" row for the staging source to the ETL log
        log_to_csv({
            "step": "Extract",
            "status": status,
            "source": "PostgreSQL (Staging)",
            "table_name": name,
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }, "etl_log.csv")

    try:
        # Discover the table names of the public schema
        names_query = "(SELECT table_name FROM information_schema.tables WHERE table_schema = 'public') AS tbl"
        names_df = _jdbc_reader(names_query).load().select("table_name")
        table_list = names_df.rdd.flatMap(lambda row: row).collect()

        print(f"Found tables in staging: {table_list}")

        extracted = {}
        for name in table_list:
            try:
                extracted[name] = _jdbc_reader(name).load()
                _log("Success", name)
                print(f"Successfully extracted table: {name}")
            except Exception as err:
                # Record the failure and continue with the next table
                _log(f"Failed: {err}", name)
                print(f"Failed to extract table: {name} - Error: {err}")

        return extracted

    except Exception as err:
        # Discovery (or anything outside the per-table handling) failed
        _log(f"Failed: {err}", "N/A")
        print(f"Failed to extract tables: {err}")
        return {}

In [66]:
# Extract all tables from staging; `data` maps staging table name -> Spark DataFrame
# and is empty when the table discovery failed.
data = extract_from_staging()
print(f"Extracted tables: {list(data.keys())}")

Found tables in staging: ['investments', 'relationship', 'people', 'ipo', 'company', 'acquisition', 'funding_rounds', 'milestones', 'funds']
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: investments
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: relationship
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: people
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: ipo
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: company
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: acquisition
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: funding_rounds
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: milestones
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: funds
Extracted tables: ['investments', 'relationship', '

In [68]:
# Bind each staging table to its own name. Keys are the staging table names
# (note "ipo" and "relationship"); a table missing from `data` raises KeyError here.
acquisition = data["acquisition"]
company = data["company"]
funding_rounds = data["funding_rounds"]
funds = data["funds"]
investments = data["investments"]
ipos = data["ipo"]
milestones = data["milestones"]
people = data["people"]
relationship = data["relationship"]

# Preview one table as a sanity check
company.show()

+---------+---------+-----------------+--------------------+--------------------+----------+--------------+----------+----------+------------+---------+-----------+-------------------+-------------------+
|office_id|object_id|      description|              region|            address1|  address2|          city|  zip_code|state_code|country_code| latitude|  longitude|         created_at|         updated_at|
+---------+---------+-----------------+--------------------+--------------------+----------+--------------+----------+----------+------------+---------+-----------+-------------------+-------------------+
|        8|      c:8|                 |              SF Bay|959 Skyway Road, ...|          |    San Carlos|     94070|        CA|         USA|37.506885|-122.247573|2007-01-01 22:19:54|2007-01-01 22:19:54|
|        9|      c:9|     Headquarters|         Los Angeles|9229 W. Sunset Blvd.|          |West Hollywood|     90069|        CA|         USA|34.090368|-118.393064|2007-01-01 22:19

## Data Checking

### Data Profiling

In [75]:
import os
import json
from datetime import datetime, date
from decimal import Decimal

# Helper that converts values into JSON-compatible types
def convert_to_serializable(obj):
    """Return a JSON-friendly version of ``obj``.

    ``datetime``/``date`` values become ISO-8601 strings, ``Decimal`` values
    become ``float``; anything else is returned unchanged.
    """
    if isinstance(obj, (datetime, date)):
        return obj.isoformat()
    return float(obj) if isinstance(obj, Decimal) else obj

def profile_data(person, df, table_name, format_file):
    """Profile every column of a Spark DataFrame, save the report as JSON and log the outcome.

    For each column the report holds: data type, up to five distinct sample
    values, distinct count, null count and percentage, min/max, and - for
    ``date``/``timestamp`` columns - the percentage of non-null values.
    The report is written to ``data_profiling/<table_name>_profiling.json``.

    Args:
        person: name stored as ``person_in_charge`` in the report.
        df: Spark DataFrame to profile.
        table_name: label used in the report, the file name and the ETL log.
        format_file: origin label stored in the report and as log ``source``.

    Returns:
        The profiling dict, or ``None`` if profiling failed before the dict was
        built. If only saving the JSON failed, the dict is still returned while
        the log records the failure. Errors are printed and logged, not raised.
    """
    dict_profiling = None

    try:
        n_rows = df.count()
        n_cols = len(df.columns)

        column_info = {}
        for col in df.columns:
            data_type = df.schema[col].dataType.simpleString()
            distinct_values = df.select(col).distinct()

            # Up to five distinct values; reported under both "sample_values"
            # and "unique_value" (one Spark job instead of two identical ones).
            sample_values = distinct_values.limit(5).rdd.flatMap(lambda x: x).collect()
            unique_values = sample_values

            null_count = df.filter(df[col].isNull()).count()
            unique_count = distinct_values.count()

            # Min and max values; columns whose type cannot be aggregated get None
            try:
                min_value = df.agg({col: "min"}).collect()[0][0]
                max_value = df.agg({col: "max"}).collect()[0][0]
            except Exception:
                min_value = None
                max_value = None

            # Percentage of missing values
            percentage_missing = round((null_count / n_rows) * 100, 2) if n_rows > 0 else 0.0

            # Percentage of non-null values, reported for date/timestamp columns only
            percentage_valid_date = None
            if data_type in ['date', 'timestamp']:
                # Non-null rows are exactly the rows not counted as null above
                valid_date_count = n_rows - null_count
                percentage_valid_date = round((valid_date_count / n_rows) * 100, 2) if n_rows > 0 else 0.0

            column_info[col] = {
                "data_type": data_type,
                "sample_values": [convert_to_serializable(v) for v in sample_values] if sample_values else None,
                "unique_count": unique_count,
                "unique_value": [convert_to_serializable(v) for v in unique_values] if unique_values else None,
                "null_count": null_count,
                "percentage_missing_value": percentage_missing,
                "min_value": convert_to_serializable(min_value),
                "max_value": convert_to_serializable(max_value),
                "percentage_valid_date": percentage_valid_date
            }

        dict_profiling = {
            "created_at": datetime.now().isoformat(),
            "person_in_charge": person,
            "profiling_result": {
                "table_name": table_name,
                "format_file": format_file,
                "n_rows": n_rows,
                "n_cols": n_cols,
                "report": column_info
            }
        }

        # Save profiling result to JSON
        folder_path = "data_profiling"
        os.makedirs(folder_path, exist_ok=True)

        file_path = os.path.join(folder_path, f"{table_name}_profiling.json")
        with open(file_path, "w") as f:
            json.dump(dict_profiling, f, indent=4, default=convert_to_serializable)

        print(f"Profiling saved to: {file_path}")
        status = "Success"

    except Exception as e:
        print(f"Error profiling table {table_name}: {e}")
        status = f"Failed: {e}"

    # Save log to CSV for either outcome
    log_to_csv({
        "step": "Profiling",
        "status": status,
        "source": format_file,
        "table_name": table_name,
        "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    }, "etl_log.csv")

    return dict_profiling


In [76]:
# Profile the people staging table and pretty-print the resulting report
profiling_result = profile_data("Mr. A", people, "people_data", "from Staging")
print(json.dumps(profiling_result, indent=2))

Profiling saved to: data_profiling/people_data_profiling.json
Log written to /home/jovyan/work/logs/etl_log.csv
{
  "created_at": "2025-03-22T09:50:39.336805",
  "person_in_charge": "Mr. A",
  "profiling_result": {
    "table_name": "people_data",
    "format_file": "from Staging",
    "n_rows": 226709,
    "n_cols": 6,
    "report": {
      "people_id": {
        "data_type": "string",
        "sample_values": [
          "177264",
          "296",
          "91421",
          "467",
          "177595"
        ],
        "unique_count": 226709,
        "unique_value": [
          "177264",
          "296",
          "91421",
          "467",
          "177595"
        ],
        "null_count": 0,
        "percentage_missing_value": 0.0,
        "min_value": "1",
        "max_value": "99999",
        "percentage_valid_date": null
      },
      "object_id": {
        "data_type": "string",
        "sample_values": [
          "p:105829",
          "p:73",
          "p:171",
          "p

In [77]:
# Profile the remaining staging tables. Each call writes
# data_profiling/<table_name>_profiling.json and appends a row to the ETL log.
# The last call is the cell's final expression, so its report is displayed as output.
profile_data("Mr. CCC", relationship, "relationship_data", "from Staging")
profile_data("Mrs. H", acquisition, "acquisition_data", "from Staging")
profile_data("Mrs. OP", company, "company_data", "from Staging")
profile_data("Mr. CCC", funding_rounds, "funding_rounds_data", "from Staging")
profile_data("Mr. A", funds, "funds_data", "from Staging")
profile_data("Mrs. H", investments, "investments_data", "from Staging")
profile_data("Mr. A", ipos, "ipos_data", "from Staging")
profile_data("Mrs. OP", milestones, "milestones_data", "from Staging")

Profiling saved to: data_profiling/relationship_data_profiling.json
Log written to /home/jovyan/work/logs/etl_log.csv
Profiling saved to: data_profiling/acquisition_data_profiling.json
Log written to /home/jovyan/work/logs/etl_log.csv
Profiling saved to: data_profiling/company_data_profiling.json
Log written to /home/jovyan/work/logs/etl_log.csv
Profiling saved to: data_profiling/funding_rounds_data_profiling.json
Log written to /home/jovyan/work/logs/etl_log.csv
Profiling saved to: data_profiling/funds_data_profiling.json
Log written to /home/jovyan/work/logs/etl_log.csv
Profiling saved to: data_profiling/investments_data_profiling.json
Log written to /home/jovyan/work/logs/etl_log.csv
Profiling saved to: data_profiling/ipos_data_profiling.json
Log written to /home/jovyan/work/logs/etl_log.csv
Profiling saved to: data_profiling/milestones_data_profiling.json
Log written to /home/jovyan/work/logs/etl_log.csv


{'created_at': '2025-03-22T09:53:41.820808',
 'person_in_charge': 'Mrs. OP',
 'profiling_result': {'table_name': 'milestones_data',
  'format_file': 'from Staging',
  'n_rows': 8152,
  'n_cols': 9,
  'report': {'created_at': {'data_type': 'string',
    'sample_values': ['2010-09-30 04:46:05.000',
     '2010-10-04 23:53:31.000',
     '2010-05-26 23:08:38.000',
     '2010-07-09 05:01:33.000',
     '2010-07-10 12:53:28.000'],
    'unique_count': 7504,
    'unique_value': ['2010-09-30 04:46:05.000',
     '2010-10-04 23:53:31.000',
     '2010-05-26 23:08:38.000',
     '2010-07-09 05:01:33.000',
     '2010-07-10 12:53:28.000'],
    'null_count': 0,
    'percentage_missing_value': 0.0,
    'min_value': '2008-06-18 08:14:06.000',
    'max_value': '2013-12-10 20:15:30.000',
    'percentage_valid_date': None},
   'description': {'data_type': 'string',
    'sample_values': ["Viewfinity named in 'Hottest Boston Companies' List",
     'Centralway invested in \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n