In [1]:
# import library
import pyspark

In [2]:
# check pyspark version
pyspark.__version__

'3.5.0'

In [3]:
# import SparkSession
from pyspark.sql import SparkSession

In [4]:
spark = SparkSession \
    .builder \
    .appName("Final Project PySpark") \
    .getOrCreate()

In [5]:
spark

## Load and Handle Failure Data

In [6]:
from dotenv import load_dotenv
import os
import pandas as pd

In [7]:
load_dotenv(".env", override=True)

True

In [8]:
import csv
from datetime import datetime
import json
from sqlalchemy import create_engine, Table, MetaData
from datetime import datetime

In [9]:
# Setup Connection to db logger

DB_LOGGER_URL = os.getenv("DB_LOGGER_URL")
DB_LOGGER_USER = os.getenv("DB_LOGGER_USER")
DB_LOGGER_PASS = os.getenv("DB_LOGGER_PASS")

In [10]:
from sqlalchemy import create_engine, Table, MetaData
from datetime import datetime

def log_to_db(log_msg: dict):
    try:
        # Setup connection to PostgreSQL
        engine = create_engine(f"postgresql://{DB_LOGGER_USER}:{DB_LOGGER_PASS}@host.docker.internal:5432/pyspark_task_logger")
        metadata = MetaData()
        metadata.reflect(bind=engine)

        # Take table object
        log_table = metadata.tables.get("etl_logs")
        if log_table is None:
            raise Exception("Table 'etl_logs' not found in database")

        # Add timestamp if not exists
        if "etl_date" not in log_msg:
            log_msg["etl_date"] = datetime.now()

        # Use transaction (auto commit)
        with engine.begin() as conn:
            conn.execute(log_table.insert().values(**log_msg))

        print("Log successfully written to database")

    except Exception as e:
        print(f"Error writing log to database: {e}")


In [11]:
# test connection
from sqlalchemy import create_engine, text

engine = create_engine(f"postgresql://{DB_LOGGER_USER}:{DB_LOGGER_PASS}@host.docker.internal:5432/pyspark_task_logger")
conn = engine.connect()

result = conn.execute(text("SELECT COUNT(*) FROM etl_logs"))
print(result.fetchone())


(740,)


In [12]:
import pandas as pd
from sqlalchemy import create_engine
from datetime import datetime
import os

def save_invalid_ids(invalid_ids, table_name):
    if not invalid_ids:
        print(f"No invalid IDs to save from table '{table_name}'.")
        return

    try:
        # Normalize jumlah kolom menjadi maksimal 6 kolom (3 pasang)
        max_cols = 6  # 3 pasang: entity/object
        if len(invalid_ids[0]) > max_cols:
            raise ValueError("Too many columns in invalid_ids input")

        # Buat DataFrame dengan nama kolom fleksibel
        base_columns = [
            "entity_type", "object_id",
            "extra_entity_type_1", "extra_object_id_1",
        ]
        df = pd.DataFrame(invalid_ids, columns=base_columns[:len(invalid_ids[0])])
        df["table_name"] = table_name
        df["logged_at"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # Save ke DB
        engine = create_engine(f"postgresql://{DB_LOGGER_USER}:{DB_LOGGER_PASS}@host.docker.internal:5432/pyspark_task_logger")
        with engine.begin() as conn:
            df.to_sql("invalid_ids", con=conn, index=False, if_exists="append")
            print(f"{len(df)} invalid IDs from table '{table_name}' saved to logger DB.")

    except Exception as e:
        print(f"Error saving invalid IDs to logger DB: {e}")


## Extract

### CSV Files

In [None]:
def extract_csv(file_path, table_name):
    try:
        # Read CSV using Spark
        df = spark.read.option("header", "true").csv(file_path)

        # Show extracted data
        df.show()

        # Log success
        log_to_db({
            "step": "Extract",
            "status": "Success",
            "source": "CSV",
            "table_name": table_name,
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        return df
    
    except Exception as e:
        # Log failure
        log_to_db({
            "step": "Extract",
            "status": f"Failed: {e}",
            "source": "CSV",
            "table_name": table_name,
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        print(f"Error extracting {file_path}: {e}")
        return None

In [15]:
# Extract from CSV

df_people = extract_csv("data/people.csv", "people_data")
df_people.show()

+---------+---------+----------+----------+--------------------+--------------------+
|people_id|object_id|first_name| last_name|          birthplace|    affiliation_name|
+---------+---------+----------+----------+--------------------+--------------------+
|        1|      p:2|       Ben|   Elowitz|                NULL|           Blue Nile|
|        2|      p:3|     Kevin|  Flaherty|                NULL|            Wetpaint|
|        3|      p:4|      Raju|   Vegesna|                NULL|                Zoho|
|        4|      p:5|       Ian|     Wenig|                NULL|                Zoho|
|        5|      p:6|     Kevin|      Rose|         Redding, CA|        i/o Ventures|
|        6|      p:7|       Jay|   Adelson|         Detroit, MI|                Digg|
|        7|      p:8|      Owen|     Byrne|                NULL|                Digg|
|        8|      p:9|       Ron|Gorodetzky|                NULL|                Digg|
|        9|     p:10|      Mark|Zuckerberg|           

In [16]:
# Extract from CSV

df_relations = extract_csv("data/relationships.csv", "relationships_data")
df_relations.show()

+---------------+----------------+----------------------+--------------------+--------------------+-------+--------+--------------------+--------------------+--------------------+
|relationship_id|person_object_id|relationship_object_id|            start_at|              end_at|is_past|sequence|               title|          created_at|          updated_at|
+---------------+----------------+----------------------+--------------------+--------------------+-------+--------+--------------------+--------------------+--------------------+
|              1|             p:2|                   c:1|                NULL|                NULL|  false|       8|Co-Founder/CEO/Bo...|2007-05-25 07:03:...|2013-06-03 09:58:...|
|              2|             p:3|                   c:1|                NULL|                NULL|  false|  279242|        VP Marketing|2007-05-25 07:04:...|2010-05-21 16:31:...|
|              3|             p:4|                   c:3|                NULL|                NULL| 

### Database

In [10]:
# Set Variable for Database (data source)

DB_URL = os.getenv("DB_URL")
DB_USER = os.getenv("DB_USER")
DB_PASS = os.getenv("DB_PASS")

In [None]:
def extract_from_db():
    try:
        # Get list of tables from the database
        table_list = spark.read \
            .format("jdbc") \
            .option("url", DB_URL) \
            .option("dbtable", "information_schema.tables") \
            .option("user", DB_USER) \
            .option("password", DB_PASS) \
            .option("driver", "org.postgresql.Driver") \
            .load() \
            .filter("table_schema = 'public'") \
            .select("table_name") \
            .rdd.flatMap(lambda x: x).collect()

        print(f"Found tables: {table_list}")

        tables = {}
        for table in table_list:
            try:
                # Read each table into a DataFrame
                df = spark.read \
                    .format("jdbc") \
                    .option("url", DB_URL) \
                    .option("dbtable", table) \
                    .option("user", DB_USER) \
                    .option("password", DB_PASS) \
                    .option("driver", "org.postgresql.Driver") \
                    .load()

                tables[table] = df

                # Log success for each table
                log_to_db({
                    "step": "Extract",
                    "status": "Success",
                    "source": "PostgreSQL",
                    "table_name": table,
                    "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                })

                print(f"Successfully extracted table: {table}")

            except Exception as e:
                # Log failure for specific table
                log_to_db({
                    "step": "Extract",
                    "status": f"Failed: {e}",
                    "source": "PostgreSQL",
                    "table_name": table,
                    "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                })
                print(f"Failed to extract table: {table} - Error: {e}")

        return tables
    
    except Exception as e:
        # Log failure for the whole extraction process
        log_to_db({
            "step": "Extract",
            "status": f"Failed: {e}",
            "source": "PostgreSQL",
            "table_name": "N/A",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })
        print(f"Failed to extract tables: {e}")
        return {}

In [19]:
# Extract from db

tables = extract_from_db()
print(f"Extracted tables: {list(tables.keys())}")

Found tables: ['company', 'acquisition', 'funding_rounds', 'funds', 'investments', 'ipos']
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: company
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: acquisition
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: funding_rounds
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: funds
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: investments
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: ipos
Extracted tables: ['company', 'acquisition', 'funding_rounds', 'funds', 'investments', 'ipos']


In [20]:
# Read all table

df_acquisition = tables["acquisition"]
df_company = tables["company"]
df_funding_rounds = tables["funding_rounds"]
df_funds = tables["funds"]
df_investments = tables["investments"]
df_ipos = tables["ipos"]

# check
df_company.show()

+---------+---------+-----------------+--------------------+--------------------+----------+--------------+----------+----------+------------+---------+-----------+-------------------+-------------------+
|office_id|object_id|      description|              region|            address1|  address2|          city|  zip_code|state_code|country_code| latitude|  longitude|         created_at|         updated_at|
+---------+---------+-----------------+--------------------+--------------------+----------+--------------+----------+----------+------------+---------+-----------+-------------------+-------------------+
|        8|      c:8|                 |              SF Bay|959 Skyway Road, ...|          |    San Carlos|     94070|        CA|         USA|37.506885|-122.247573|2007-01-01 22:19:54|2007-01-01 22:19:54|
|        9|      c:9|     Headquarters|         Los Angeles|9229 W. Sunset Blvd.|          |West Hollywood|     90069|        CA|         USA|34.090368|-118.393064|2007-01-01 22:19

### From API

In [21]:
import requests

In [None]:
def extract_api(link_api: str, list_parameter: dict, data_name: str):
    try:
        # Establish connection to API
        resp = requests.get(link_api, params=list_parameter)
        resp.raise_for_status()  # Raise an exception for HTTP errors

        # Parse the response JSON
        raw_response = resp.json()

        # Convert JSON data to pandas DataFrame
        df_api = pd.DataFrame(raw_response)

        if df_api.empty:
            raise ValueError("Empty response from API")

        # Convert pandas DataFrame to PySpark DataFrame
        spark_df = spark.createDataFrame(df_api)

        # Log success
        log_msg = {
            "step": "Extract",
            "status": "Success",
            "source": "API",
            "table_name": data_name,
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }
        log_to_db(log_msg)

        print(f"Successfully extracted data from API: {data_name}")
        return spark_df

    except requests.exceptions.RequestException as e:
        # Log request failure
        log_msg = {
            "step": "Extract",
            "status": f"Failed: {e}",
            "source": "API",
            "table_name": data_name,
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }
        log_to_db(log_msg)
        print(f"Request failed: {e}")

    except ValueError as e:
        # Log parsing failure
        log_msg = {
            "step": "Extract",
            "status": f"Failed: {e}",
            "source": "API",
            "table_name": data_name,
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }
        log_to_db(log_msg)
        print(f"Parsing error: {e}")

    except Exception as e:
        # Catch any other errors
        log_msg = {
            "step": "Extract",
            "status": f"Failed: {e}",
            "source": "API",
            "table_name": data_name,
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }
        log_to_db(log_msg)
        print(f"An error occurred: {e}")

    return None

In [23]:
# Extract from API (year 2008 - 2010)

link_api = "https://api-milestones.vercel.app/api/data"
list_parameter = {
    "start_date": "2008-01-01",
    "end_date": "2010-12-31"
}

df_milestones = extract_api(link_api, list_parameter, "milestones")
df_milestones.show()

Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted data from API: milestones
+--------------------+--------------------+------------+--------------+------------+---------+--------------------+--------------------+--------------------+
|          created_at|         description|milestone_at|milestone_code|milestone_id|object_id|  source_description|          source_url|          updated_at|
+--------------------+--------------------+------------+--------------+------------+---------+--------------------+--------------------+--------------------+
|2008-06-18 08:14:...|Survives iPhone 3...|  2008-06-09|         other|           1|     c:12|Twitter Fails To ...|http://www.techcr...|2008-06-18 08:14:...|
|2008-06-18 08:50:...|More than 4 Billi...|  2008-06-18|         other|           3|     c:59|11 Billion Videos...|http://www.comsco...|2008-06-18 08:50:...|
|2008-06-19 04:14:...|Reddit goes Open ...|  2008-06-18|         other|           4|    c:314|reddit goes open 

## Load - Staging 

In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException

In [11]:
!pip install pangres



In [13]:
# Set Variable for Staging

DB_STAGING_URL = os.getenv("DB_STAGING_URL")
DB_STAGING_USER = os.getenv("DB_STAGING_USER")
DB_STAGING_PASS = os.getenv("DB_STAGING_PASS")

In [None]:
from pyspark.sql.utils import AnalysisException
from pangres import upsert
from sqlalchemy import create_engine
import pandas as pd
from datetime import datetime

def load_staging2(df, table_name, mode="overwrite", use_upsert=False, idx_name=None, schema=None, source=None):
    try:
        if use_upsert:
            # Convert Spark DataFrame to Pandas DataFrame
            data = df.toPandas()

            # Create connection to PostgreSQL
            conn = create_engine(f"postgresql://{DB_STAGING_USER}:{DB_STAGING_PASS}@host.docker.internal:5432/pyspark_task_staging")

            # Set index for upsert
            if idx_name is None:
                raise ValueError("Index name is required for upsert mode")

            data = data.set_index(idx_name)

            # Upsert
            upsert(
                con=conn,
                df=data,
                table_name=table_name,
                schema=schema,
                if_row_exists="update"
            )
            print(f"Data upserted to table '{table_name}' successfully!")
        else:
            # Load using Spark
            df.write \
                .format("jdbc") \
                .option("url", "jdbc:postgresql://host.docker.internal:5432/pyspark_task_staging") \
                .option("dbtable", table_name) \
                .option("user", DB_STAGING_USER) \
                .option("password", DB_STAGING_PASS) \
                .option("driver", "org.postgresql.Driver") \
                .mode(mode) \
                .save()

            print(f"Data loaded to table '{table_name}' successfully!")

        # Success log
        log_msg = {
            "step": "Load Staging",
            "status": "Success",
            "source": "Staging",
            "table_name": table_name,
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }

    except Exception as e:
        print(f"Error loading data to table '{table_name}': {e}")

        # Failed DataFrame
        failed_data = df.toPandas() if not use_upsert else data
        failed_data['error_message'] = str(e)
        failed_data['etl_date'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # Fail log
        log_msg = {
            "step": "Load Staging",
            "status": "Failed",
            "source": "Staging",
            "table_name": table_name,
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "error_message": str(e)
        }

        # Save failed data to CSV
        failed_log_path = f'logs/failed_{table_name}_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
        failed_data.to_csv(failed_log_path, index=False)
        print(f"Failed data saved to: {failed_log_path}")

    finally:
        # Delete error_message before save it to log
        if 'error_message' in log_msg:
            del log_msg['error_message']

        # Simpan log ke CSV
        log_to_db(log_msg, 'etl_log.csv')

    return df if not use_upsert else data


In [34]:
# from API

load_staging2(df_milestones, "milestones")

Data loaded to table 'milestones' successfully!
Log written to /home/jovyan/work/logs/etl_log.csv


DataFrame[created_at: string, description: string, milestone_at: string, milestone_code: string, milestone_id: bigint, object_id: string, source_description: string, source_url: string, updated_at: string]

In [35]:
# from CSV

load_staging2(df_relations, "relationship")
load_staging2(df_people, "people", mode="overwrite")

Data loaded to table 'relationship' successfully!
Log written to /home/jovyan/work/logs/etl_log.csv
Data loaded to table 'people' successfully!
Log written to /home/jovyan/work/logs/etl_log.csv


DataFrame[people_id: string, object_id: string, first_name: string, last_name: string, birthplace: string, affiliation_name: string]

In [36]:
# from database

load_staging2(df_acquisition, "acquisition") 
load_staging2(df_funding_rounds, "funding_rounds") 
load_staging2(df_funds, "funds")
load_staging2(df_investments, "investments")
load_staging2(df_ipos, "ipo")
load_staging2(df_company, "company") 

Data loaded to table 'acquisition' successfully!
Log written to /home/jovyan/work/logs/etl_log.csv
Data loaded to table 'funding_rounds' successfully!
Log written to /home/jovyan/work/logs/etl_log.csv
Data loaded to table 'funds' successfully!
Log written to /home/jovyan/work/logs/etl_log.csv
Data loaded to table 'investments' successfully!
Log written to /home/jovyan/work/logs/etl_log.csv
Data loaded to table 'ipo' successfully!
Log written to /home/jovyan/work/logs/etl_log.csv
Data loaded to table 'company' successfully!
Log written to /home/jovyan/work/logs/etl_log.csv


DataFrame[office_id: int, object_id: string, description: string, region: string, address1: string, address2: string, city: string, zip_code: string, state_code: string, country_code: string, latitude: decimal(9,6), longitude: decimal(9,6), created_at: timestamp, updated_at: timestamp]

## Extract Data from Staging

In [15]:
spark.catalog.clearCache()

In [14]:
# Set Variable for Staging

DB_STAGING_URL = os.getenv("DB_STAGING_URL")
DB_STAGING_USER = os.getenv("DB_STAGING_USER")
DB_STAGING_PASS = os.getenv("DB_STAGING_PASS")

In [17]:
def extract_from_staging():
    try:
        # Get list of tables from staging
        table_list = spark.read \
            .format("jdbc") \
            .option("url", DB_STAGING_URL) \
            .option("dbtable", "(SELECT table_name FROM information_schema.tables WHERE table_schema = 'public') AS tbl") \
            .option("user", DB_STAGING_USER) \
            .option("password", DB_STAGING_PASS) \
            .option("driver", "org.postgresql.Driver") \
            .load() \
            .select("table_name") \
            .rdd.flatMap(lambda x: x).collect()

        print(f"Found tables in staging: {table_list}")

        tables = {}
        for table in table_list:
            try:
                # Read each table into a DataFrame
                df = spark.read \
                    .format("jdbc") \
                    .option("url", DB_STAGING_URL) \
                    .option("dbtable", table) \
                    .option("user", DB_STAGING_USER) \
                    .option("password", DB_STAGING_PASS) \
                    .option("driver", "org.postgresql.Driver") \
                    .load()

                tables[table] = df

                # Log success for each table
                log_to_db({
                    "step": "Extract",
                    "status": "Success",
                    "source": "PostgreSQL (Staging)",
                    "table_name": table,
                    "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                })

                print(f"Successfully extracted table: {table}")

            except Exception as e:
                # Log failure for specific table
                log_to_db({
                    "step": "Extract",
                    "status": f"Failed: {e}",
                    "source": "PostgreSQL (Staging)",
                    "table_name": table,
                    "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                })
                print(f"Failed to extract table: {table} - Error: {e}")

        return tables
    
    except Exception as e:
        # Log failure for the whole extraction process
        log_to_db({
            "step": "Extract",
            "status": f"Failed: {e}",
            "source": "PostgreSQL (Staging)",
            "table_name": "N/A",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })
        print(f"Failed to extract tables: {e}")
        return {}

In [66]:
# Extract All Tables from Staging

data = extract_from_staging()
print(f"Extracted tables: {list(data.keys())}")

Found tables in staging: ['investments', 'relationship', 'people', 'ipo', 'company', 'acquisition', 'funding_rounds', 'milestones', 'funds']
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: investments
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: relationship
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: people
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: ipo
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: company
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: acquisition
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: funding_rounds
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: milestones
Log written to /home/jovyan/work/logs/etl_log.csv
Successfully extracted table: funds
Extracted tables: ['investments', 'relationship', '

In [68]:
# Read All Data from Staging

acquisition = data["acquisition"]
company = data["company"]
funding_rounds = data["funding_rounds"]
funds = data["funds"]
investments = data["investments"]
ipos = data["ipo"]
milestones = data["milestones"]
people = data["people"]
relationship = data["relationship"]

# check
company.show()

+---------+---------+-----------------+--------------------+--------------------+----------+--------------+----------+----------+------------+---------+-----------+-------------------+-------------------+
|office_id|object_id|      description|              region|            address1|  address2|          city|  zip_code|state_code|country_code| latitude|  longitude|         created_at|         updated_at|
+---------+---------+-----------------+--------------------+--------------------+----------+--------------+----------+----------+------------+---------+-----------+-------------------+-------------------+
|        8|      c:8|                 |              SF Bay|959 Skyway Road, ...|          |    San Carlos|     94070|        CA|         USA|37.506885|-122.247573|2007-01-01 22:19:54|2007-01-01 22:19:54|
|        9|      c:9|     Headquarters|         Los Angeles|9229 W. Sunset Blvd.|          |West Hollywood|     90069|        CA|         USA|34.090368|-118.393064|2007-01-01 22:19

## Data Profiling

In [None]:
import os
import json
from datetime import datetime, date
from decimal import Decimal

# Helper function to convert values to JSON format
def convert_to_serializable(obj):
    if isinstance(obj, Decimal):
        return float(obj)
    if isinstance(obj, (datetime, date)):
        return obj.isoformat()
    return obj

def profile_data(person, df, table_name, format_file):
    try:
        n_rows = df.count()
        n_cols = len(df.columns)
        
        column_info = {}
        for col in df.columns:
            data_type = df.schema[col].dataType.simpleString()
            sample_values = df.select(col).distinct().limit(5).rdd.flatMap(lambda x: x).collect()
            null_count = df.filter(df[col].isNull()).count()
            unique_count = df.select(col).distinct().count()
            
            # Min and max values (if numeric or date type)
            try:
                min_value = df.agg({col: "min"}).collect()[0][0]
                max_value = df.agg({col: "max"}).collect()[0][0]
            except:
                min_value = None
                max_value = None
            
            # Persentase missing value
            percentage_missing = round((null_count / n_rows) * 100, 2) if n_rows > 0 else 0.0
            
            # Ambil 5 nilai unik sebagai sampel
            unique_values = df.select(col).distinct().limit(5).rdd.flatMap(lambda x: x).collect()
            
            # Persentase valid date (khusus untuk tipe date dan datetime)
            percentage_valid_date = None
            if data_type in ['date', 'timestamp']:
                valid_date_count = df.filter(df[col].isNotNull()).count()
                percentage_valid_date = round((valid_date_count / n_rows) * 100, 2) if n_rows > 0 else 0.0

            column_info[col] = {
                "data_type": data_type,
                "sample_values": [convert_to_serializable(v) for v in sample_values] if sample_values else None,
                "unique_count": unique_count,
                "unique_value": [convert_to_serializable(v) for v in unique_values] if unique_values else None,
                "null_count": null_count,
                "percentage_missing_value": percentage_missing,
                "min_value": convert_to_serializable(min_value),
                "max_value": convert_to_serializable(max_value),
                "percentage_valid_date": percentage_valid_date
            }
        
        dict_profiling = {
            "created_at": datetime.now().isoformat(),
            "person_in_charge": person,
            "profiling_result": {
                "table_name": table_name,
                "format_file": format_file,
                "n_rows": n_rows,
                "n_cols": n_cols,
                "report": column_info
            }
        }
        
        # Save profiling result to JSON
        folder_path = "data_profiling"
        os.makedirs(folder_path, exist_ok=True)

        file_path = os.path.join(folder_path, f"{table_name}_profiling.json")
        with open(file_path, "w") as f:
            json.dump(dict_profiling, f, indent=4, default=convert_to_serializable)

        print(f"Profiling saved to: {file_path}")

        # Create success log message
        log_msg = {
            "step": "Profiling",
            "status": "Success",
            "source": format_file,
            "table_name": table_name,
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S") 
        }

    except Exception as e:
        print(f"Error profiling table {table_name}: {e}")

        # Create fail log message
        log_msg = {
            "step": "Profiling",
            "status": f"Failed: {e}",
            "source": format_file,
            "table_name": table_name,
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S") 
        }

    finally:
        # Save log to CSV
        log_to_db(log_msg)

    return dict_profiling if 'dict_profiling' in locals() else None


In [76]:
# test 1
profiling_result = profile_data("Mr. A", people, "people_data", "from Staging")
print(json.dumps(profiling_result, indent=2))

Profiling saved to: data_profiling/people_data_profiling.json
Log written to /home/jovyan/work/logs/etl_log.csv
{
  "created_at": "2025-03-22T09:50:39.336805",
  "person_in_charge": "Mr. A",
  "profiling_result": {
    "table_name": "people_data",
    "format_file": "from Staging",
    "n_rows": 226709,
    "n_cols": 6,
    "report": {
      "people_id": {
        "data_type": "string",
        "sample_values": [
          "177264",
          "296",
          "91421",
          "467",
          "177595"
        ],
        "unique_count": 226709,
        "unique_value": [
          "177264",
          "296",
          "91421",
          "467",
          "177595"
        ],
        "null_count": 0,
        "percentage_missing_value": 0.0,
        "min_value": "1",
        "max_value": "99999",
        "percentage_valid_date": null
      },
      "object_id": {
        "data_type": "string",
        "sample_values": [
          "p:105829",
          "p:73",
          "p:171",
          "p

In [77]:
# Profiling All Data

profile_data("Mr. CCC", relationship, "relationship_data", "from Staging")
profile_data("Mrs. H", acquisition, "acquisition_data", "from Staging")
profile_data("Mrs. OP", company, "company_data", "from Staging")
profile_data("Mr. CCC", funding_rounds, "funding_rounds_data", "from Staging")
profile_data("Mr. A", funds, "funds_data", "from Staging")
profile_data("Mrs. H", investments, "investments_data", "from Staging")
profile_data("Mr. A", ipos, "ipos_data", "from Staging")
profile_data("Mrs. OP", milestones, "milestones_data", "from Staging")

Profiling saved to: data_profiling/relationship_data_profiling.json
Log written to /home/jovyan/work/logs/etl_log.csv
Profiling saved to: data_profiling/acquisition_data_profiling.json
Log written to /home/jovyan/work/logs/etl_log.csv
Profiling saved to: data_profiling/company_data_profiling.json
Log written to /home/jovyan/work/logs/etl_log.csv
Profiling saved to: data_profiling/funding_rounds_data_profiling.json
Log written to /home/jovyan/work/logs/etl_log.csv
Profiling saved to: data_profiling/funds_data_profiling.json
Log written to /home/jovyan/work/logs/etl_log.csv
Profiling saved to: data_profiling/investments_data_profiling.json
Log written to /home/jovyan/work/logs/etl_log.csv
Profiling saved to: data_profiling/ipos_data_profiling.json
Log written to /home/jovyan/work/logs/etl_log.csv
Profiling saved to: data_profiling/milestones_data_profiling.json
Log written to /home/jovyan/work/logs/etl_log.csv


{'created_at': '2025-03-22T09:53:41.820808',
 'person_in_charge': 'Mrs. OP',
 'profiling_result': {'table_name': 'milestones_data',
  'format_file': 'from Staging',
  'n_rows': 8152,
  'n_cols': 9,
  'report': {'created_at': {'data_type': 'string',
    'sample_values': ['2010-09-30 04:46:05.000',
     '2010-10-04 23:53:31.000',
     '2010-05-26 23:08:38.000',
     '2010-07-09 05:01:33.000',
     '2010-07-10 12:53:28.000'],
    'unique_count': 7504,
    'unique_value': ['2010-09-30 04:46:05.000',
     '2010-10-04 23:53:31.000',
     '2010-05-26 23:08:38.000',
     '2010-07-09 05:01:33.000',
     '2010-07-10 12:53:28.000'],
    'null_count': 0,
    'percentage_missing_value': 0.0,
    'min_value': '2008-06-18 08:14:06.000',
    'max_value': '2013-12-10 20:15:30.000',
    'percentage_valid_date': None},
   'description': {'data_type': 'string',
    'sample_values': ["Viewfinity named in 'Hottest Boston Companies' List",
     'Centralway invested in \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n

## Transformation

In [20]:
!pip install unidecode



In [21]:
!pip install pangres

Collecting pangres
  Downloading pangres-4.2.1.tar.gz (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.0/58.0 kB[0m [31m813.0 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: pangres
  Building wheel for pangres (setup.py) ... [?25ldone
[?25h  Created wheel for pangres: filename=pangres-4.2.1-py3-none-any.whl size=66624 sha256=13f4c919d3ae6110532b6e97d6caa40dd7bc48a6fb0ac7d3800bb4225e1ab570
  Stored in directory: /home/jovyan/.cache/pip/wheels/4b/a2/b0/4cc3bc0c120d7b5ba88691fc6773b771c88bd18f2152e1d752
Successfully built pangres
Installing collected packages: pangres
Successfully installed pangres-4.2.1


In [15]:
from pyspark.sql.functions import col, lit, to_date, when, udf
from pyspark.sql.types import IntegerType, StringType
from sqlalchemy import create_engine
from pangres import upsert
import pandas as pd
import re
from unidecode import unidecode
import unicodedata

In [16]:
# Setup Connection to Data Warehouse

DWH_URL = os.getenv("DWH_URL")
DWH_USER = os.getenv("DWH_USER")
DWH_PASS = os.getenv("DWH_PASS")
engine = create_engine(f"postgresql://{DWH_USER}:{DWH_PASS}@host.docker.internal:5432/pyspark_task_dwh")

### Helper Function to Analyze Column Values

In [17]:
# Cleaning Values to Make it More Readable

@udf(returnType=IntegerType())
def clean_integer(value):
    if isinstance(value, str):
        match = re.match(r"^[a-zA-Z]:(\d+)", value) 
        if match:
            return int(match.group(1))  # Catch value after ":"
        else:
            return None 
    return value
    

@udf(returnType=StringType())
def clean_text(value):
    if value:
        try:
            # Handle encoding issue 
            value = value.encode('latin1').decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            pass
        # Normalization
        value = unicodedata.normalize("NFKD", value)
        # Handle strange character
        value = re.sub(r'[^\x00-\x7F]+', '', value)
        value = value.strip()
        value = unidecode(value)
    return value


@udf(returnType=StringType())
def normalize_text(value):
    if not isinstance(value, str) or not value.strip():
        return None  
    
    # Make it lowercase
    value = value.lower()
    
    # HDelete strange char
    value = re.sub(r'[^\w\s,&/]', '', value)  # Alphanumeric, space, coma, apersand, slash
    
    value = re.sub(r'[/,&]', ' ', value) 
    
    # Delete exaggerated space
    value = re.sub(r'\s+', ' ', value).strip()
    
    return value


@udf(returnType=StringType())
def clean_alpha_text(text):
    if text:
        # Delete all strange char, except alphanumeric and space
        return re.sub(r'[^\w\s]', '', text).strip()
    return None


@udf(returnType=StringType())
def fix_encoding(s):
    if s is not None:
        try:
            return unidecode(s)
        except Exception as e:
            return None
    return s

In [18]:
# For Extracting Prefix and Numeric ID

@udf(returnType=StringType())
def extract_prefix(value):
    if value and ":" in value:
        return value.split(":")[0]
    return None

@udf(returnType=IntegerType())
def extract_id(value):
    if value and ":" in value:
        try:
            return int(value.split(":")[1])
        except ValueError:
            return None
    return None

In [19]:
# For Handle Stock-related Column

@udf(returnType=StringType())
def extract_stock_market(value):
    if value and ":" in value:
        return value.split(":")[0]
    return None

@udf(returnType=StringType())
def extract_stock_symbol(value):
    if value and ":" in value:
        return value.split(":")[1]
    return None

### Company Data

In [20]:
from pyspark.sql.functions import col, udf, split
from pyspark.sql.types import StringType, IntegerType
from datetime import datetime

def transform_company(df):
    try:
        log_to_db({
            "step": "Transform",
            "status": "STARTED",
            "source": "staging",
            "table_name": "company",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        # Step 1: Replace "" to null
        df = df.na.replace("", None)

        # Step 2: Format data type
        df = df.withColumn("latitude", col("latitude").cast("decimal(9,6)"))
        df = df.withColumn("longitude", col("longitude").cast("decimal(9,6)"))
        
        # Extract Extract prefix and ID 
        df = df.withColumn("entity_type", extract_prefix(col("object_id")))
        df = df.withColumn("object_id", extract_id(col("object_id")))

        # Step 3: Encoding
        df = df.withColumn("description", clean_text("description"))
        df = df.withColumn("address1", clean_text("address1"))
        df = df.withColumn("zip_code", clean_text("zip_code"))
        df = df.withColumn("region", clean_text("region"))
        
        log_to_db({
            "step": "Format Data",
            "status": "SUCCESS",
            "source": "staging",
            "table_name": "company",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        # Step 4: Mapping to target column
        df_transformed = df.select(
            col("office_id").alias("company_id"),
            col("entity_type").alias("company_entity_type"),
            col("object_id").alias("company_object_id"),  # INT
            col("description").alias("description"),
            col("address1").alias("address"),
            col("region").alias("region"),
            col("city").alias("city"),
            col("zip_code").alias("zip_code"),
            col("state_code").alias("state_code"),
            col("country_code").alias("country_code"),
            col("latitude").alias("latitude"),
            col("longitude").alias("longitude"),
            col("created_at").alias("created_at"),
            col("updated_at").alias("updated_at")
        )

        log_to_db({
            "step": "Map Data",
            "status": f"SUCCESS ({df_transformed.count()} rows)",
            "source": "staging",
            "table_name": "company",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        # Step 5: Data cleansing 
        df_transformed = df_transformed.fillna({
            "description": "Unknown",
            "address": "Unknown",
            "region": "Unknown",
            "city": "Unknown",
            "zip_code": "Unknown",
            "state_code": "Unknown",
            "country_code": "Unknown"
        })

        # Step 6: Drop duplicate data  and latitude/longitude with value = 0
        df_transformed = df_transformed.dropDuplicates(["company_object_id"])
        df_transformed = df_transformed.filter((col("latitude") != 0) & (col("longitude") != 0))

        log_to_db({
            "step": "Clean Data",
            "status": f"SUCCESS ({df_transformed.count()} rows after cleansing)",
            "source": "staging",
            "table_name": "company",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })
        
        print("The data is successfully transformed")

        return df_transformed

    except Exception as e:
        log_to_db({
            "step": "Transform",
            "status": f"FAILED - {str(e)}",
            "source": "staging",
            "table_name": "company",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })
        raise

    


In [21]:
# Read from staging
company = spark.read \
       .format("jdbc") \
       .option("url", DB_STAGING_URL) \
       .option("dbtable", "company") \
       .option("user", DB_STAGING_USER) \
       .option("password", DB_STAGING_PASS) \
       .option("driver", "org.postgresql.Driver") \
       .load()

In [22]:
transformed_company = transform_company(company)

Log successfully written to database
Log successfully written to database
Log successfully written to database
Log successfully written to database
The data is successfully transformed


### People Data

In [23]:
from pyspark.sql.functions import col, concat_ws, broadcast
from datetime import datetime

def transform_people(df, enable_company_validation=False): # kalau dibutuhkan validasi ke company, ubah jadi true
    try:
        log_to_db({
            "step": "Transform",
            "status": "STARTED",
            "source": "Staging",
            "table_name": "people",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        # Step 1: Replace empty strings with NULL
        df = df.na.replace("", None)

        # Step 2: Create full_name from first_name + last_name
        df = df.withColumn("full_name", concat_ws(" ", col("first_name"), col("last_name")))

        # Step 3: Extract prefix and ID
        df = df.withColumn("people_entity_type", extract_prefix(col("object_id")))
        df = df.withColumn("people_object_id", extract_id(col("object_id")))

        log_to_db({
            "step": "Format Data",
            "status": f"SUCCESS ({df.count()} rows)",
            "source": "staging",
            "table_name": "people",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        # Step 4: Mapping target column
        df_transformed = df.select(
            col("people_id").alias("people_id"),
            col("people_entity_type"),
            col("people_object_id"),
            col("full_name"),
            col("birthplace"),
            col("affiliation_name")
        )

        # Step 5: Cleaning
        df_transformed = df_transformed.withColumn("people_object_id", clean_integer(col("people_object_id")))
        df_transformed = df_transformed.withColumn("full_name", clean_alpha_text(col("full_name")))
        df_transformed = df_transformed.withColumn("birthplace", fix_encoding(col("birthplace")))
        df_transformed = df_transformed.withColumn("affiliation_name", clean_alpha_text(col("affiliation_name")))

        log_to_db({
            "step": "Map Data",
            "status": f"SUCCESS ({df_transformed.count()} rows)",
            "source": "staging",
            "table_name": "people",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        # Step 6: Fillna & dedup
        df_transformed = df_transformed.fillna({
            "full_name": "Unknown",
            "birthplace": "Unknown",
            "affiliation_name": "Unknown"
        })
        df_transformed = df_transformed.dropDuplicates(["people_entity_type", "people_object_id"])
        df_transformed = df_transformed.filter(col("full_name") != "Unknown")

        log_to_db({
            "step": "Clean Data",
            "status": f"SUCCESS ({df_transformed.count()} rows after cleansing)",
            "source": "staging",
            "table_name": "people",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        # Step 7: OPTIONAL validation against dim_company
        if enable_company_validation and df_company is not None:
            df_company_ids = df_company.select(col("company_object_id"))

            df_valid = df_transformed.join(
                broadcast(df_company_ids),
                on=col("people_object_id") == col("company_object_id"),
                how="inner"
            ).drop("company_object_id")

            df_invalid = df_transformed.join(
                broadcast(df_company_ids),
                on=col("people_object_id") == col("company_object_id"),
                how="left_anti"
            )

            if df_invalid.count() > 0:
                invalid_ids = df_invalid.select("people_entity_type", "people_object_id").toPandas().values.tolist()
                save_invalid_ids(invalid_ids, table_name="people")

                log_to_db({
                    "step": "Validation",
                    "status": f"{df_invalid.count()} rows missing object_id",
                    "source": "staging",
                    "table_name": "people",
                    "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                })

            return df_valid
        else:
            return df_transformed

    except Exception as e:
        log_to_db({
            "step": "Transform",
            "status": f"FAILED - {str(e)}",
            "source": "staging",
            "table_name": "people",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })
        raise


In [24]:
# Read from staging
people = spark.read \
    .format("jdbc") \
    .option("url", DB_STAGING_URL) \
    .option("dbtable", "people") \
    .option("user", DB_STAGING_USER) \
    .option("password", DB_STAGING_PASS) \
    .option("driver", "org.postgresql.Driver") \
    .load()


In [25]:
# Transform People Data
transformed_people = transform_people(people)

Log successfully written to database
Log successfully written to database
Log successfully written to database
Log successfully written to database


### Milestones Data

In [25]:
def transform_milestones(df):
    try:
        log_to_db({
            "step": "Transform",
            "status": "STARTED",
            "source": "staging",
            "table_name": "milestones",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        # Step 1: Replace "" to null
        df = df.na.replace("", None)
        df = df.na.replace("NaN", None)
        
        # Step 2: Format data type
        df = df.withColumn("milestone_date", to_date(col("milestone_at")))

        # Step 3: Extract prefix and ID dari object_id
        df = df.withColumn("entity_type", extract_prefix(col("object_id")))
        df = df.withColumn("object_id", extract_id(col("object_id")))
        
        log_to_db({
            "step": "Format Data",
            "status": "SUCCESS",
            "source": "staging",
            "table_name": "milestones",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        # Step 4: Mapping to target column
        df_transformed = df.select(
            col("milestone_id").alias("milestone_id"),
            col("entity_type").alias("milestone_entity_type"),
            col("object_id").alias("milestone_object_id"),
            col("milestone_date").alias("milestone_date"),
            col("description").alias("description"),
            col("source_url").alias("source_url"),
            col("source_description").alias("source_description"),
            col("created_at").alias("created_at"),
            col("updated_at").alias("updated_at")
        )
            
        # Step 5: Handle strange values
        df_transformed = df_transformed.withColumn("milestone_object_id", clean_integer(col("milestone_object_id")))
        df_transformed = df_transformed.withColumn("description", clean_alpha_text("description"))
        df_transformed = df_transformed.withColumn("source_url", when(col("source_url").rlike(r"^(http|https)://.*"), col("source_url")).otherwise("Unknown"))
        df_transformed = df_transformed.withColumn("source_description", clean_alpha_text("source_description"))

        log_to_db({
            "step": "Map Data",
            "status": f"SUCCESS ({df_transformed.count()} rows)",
            "source": "staging",
            "table_name": "milestones",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        # Step 6: Cleaning data
        df_transformed = df_transformed.fillna({
            "source_url": "Unknown",
            "description": "No Description",
            "source_description": "Unknown"
        })
        
        # Step 7: Drop duplicate data 
        df_transformed = df_transformed.dropDuplicates(["milestone_id"])
        df_transformed = df_transformed.dropDuplicates(["milestone_entity_type", "milestone_object_id"])

        log_to_db({
            "step": "Clean Data",
            "status": f"SUCCESS ({df_transformed.count()} rows after cleansing)",
            "source": "staging",
            "table_name": "milestones",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        # Step 8: Validation object_id in company
        # Take object_id from transform_company
        df_company_ids = transformed_company.select(col("company_object_id"))

        # Convert object_id to integer (if needed)
        df_transformed = df_transformed.withColumn("milestone_object_id", col("milestone_object_id").cast("int"))
        
        # Filter NULL object_id explicitly
        df_transformed = df_transformed.filter(col("milestone_object_id").isNotNull())
        
        # Valid data (match with `transformed_company`)
        df_valid = df_transformed.join(
            broadcast(df_company_ids),
            on=col("milestone_object_id") == col("company_object_id"),
            how="inner"
        ).drop("company_object_id")
        
        # Invalid data
        df_invalid = df_transformed.join(
            broadcast(df_company_ids),
            on=col("milestone_object_id") == col("company_object_id"),
            how="left_anti"
        )

        if df_invalid.count() > 0:
            invalid_ids = df_invalid.select("milestone_entity_type", "milestone_object_id").toPandas().values.tolist()
            save_invalid_ids(invalid_ids, table_name="milestones")
        
            log_to_db({
                "step": "Validation",
                "status": f"{df_invalid.count()} rows missing object_id",
                "source": "staging",
                "table_name": "milestones",
                "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })

        print("The data is successfully transformed")

        return df_valid


    except Exception as e:
        log_to_db({
            "step": "Transform",
            "status": f"FAILED - {str(e)}",
            "source": "staging",
            "table_name": "milestones",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })
        raise


In [26]:
# Read from staging
milestones = spark.read \
    .format("jdbc") \
    .option("url", DB_STAGING_URL) \
    .option("dbtable", "milestones") \
    .option("user", DB_STAGING_USER) \
    .option("password", DB_STAGING_PASS) \
    .option("driver", "org.postgresql.Driver") \
    .load()


In [27]:
# Transform Milestones Data
transformed_milestones = transform_milestones(milestones)


Log successfully written to database
Log successfully written to database
Log successfully written to database
Log successfully written to database
2305 invalid IDs from table 'milestones' saved to logger DB.
Log successfully written to database
The data is successfully transformed


### Acquisition Data - Fact

In [72]:
from pyspark.sql.functions import col, to_date, broadcast
from pyspark.sql.types import IntegerType
from datetime import datetime
from src.utils.logger import log_to_db, save_invalid_ids
from src.utils.helper import extract_prefix, extract_id

def transform_acquisition(df):
    try:
        log_to_db({
            "step": "Transform",
            "status": "STARTED",
            "source": "staging",
            "table_name": "acquisition",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        # Clean & basic transform
        df = df.na.replace("", None)
        df = df.withColumn("price_amount", col("price_amount").cast("decimal(15,2)"))
        df = df.withColumn("acquired_at", to_date(col("acquired_at")))
        df = df.withColumn("acquiring_entity_type", extract_prefix(col("acquiring_object_id")))
        df = df.withColumn("acquired_entity_type", extract_prefix(col("acquired_object_id")))
        df = df.withColumn("acquiring_object_id", extract_id(col("acquiring_object_id")).cast(IntegerType()))
        df = df.withColumn("acquired_object_id", extract_id(col("acquired_object_id")).cast(IntegerType()))

        # Select relevant columns
        df_transformed = df.select(
            "acquisition_id", "acquiring_entity_type", "acquiring_object_id",
            "acquired_entity_type", "acquired_object_id", "price_amount",
            "price_currency_code", "acquired_at", "source_url", "created_at", "updated_at"
        ).fillna({
            "price_amount": 0.0,
            "price_currency_code": "Unknown",
            "source_url": "Unknown"
        }).dropDuplicates(["acquisition_id"]) \
         .filter(col("price_amount") != 0.0) \
         .na.drop(subset=["acquired_at"])

        log_to_db({
            "step": "Clean Data",
            "status": f"SUCCESS ({df_transformed.count()} rows after cleansing)",
            "source": "staging",
            "table_name": "acquisition",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        # Validate object_id against dim_company
        df_company_ids = transformed_company.select("company_object_id", "company_id")

        df_validated = (
            df_transformed
            .join(broadcast(df_company_ids.withColumnRenamed("company_object_id", "acq_obj_id")
                           .withColumnRenamed("company_id", "acq_id")),
                  col("acquiring_object_id") == col("acq_obj_id"), "left")
            .join(broadcast(df_company_ids.withColumnRenamed("company_object_id", "acqed_obj_id")
                           .withColumnRenamed("company_id", "acqed_id")),
                  col("acquired_object_id") == col("acqed_obj_id"), "left")
        )
        
        df_valid = (
            df_validated
            .filter(col("acq_obj_id").isNotNull() & col("acqed_obj_id").isNotNull())
            .select(
                "acquisition_id",
                "acquiring_entity_type",
                "acquired_entity_type",
                "price_amount",
                "price_currency_code",
                "acquired_at",
                "source_url",
                "created_at",
                "updated_at",
                col("acq_obj_id").alias("acquiring_object_id"),
                col("acqed_obj_id").alias("acquired_object_id")
            )
        )

        # Invalid records
        df_invalid = df_validated.filter(
            col("acq_obj_id").isNull() | col("acqed_obj_id").isNull()
        )


        if df_invalid.count() > 0:
            invalid_ids = df_invalid.select(
                "acquiring_entity_type", "acquiring_object_id",
                "acquired_entity_type", "acquired_object_id"
            ).toPandas().values.tolist()

            save_invalid_ids(invalid_ids, table_name="acquisition")

            log_to_db({
                "step": "Validation",
                "status": f"{df_invalid.count()} rows with missing object_id in dim_company",
                "source": "staging",
                "table_name": "acquisition",
                "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })

        print("The data is successfully transformed")
        return df_valid

    except Exception as e:
        log_to_db({
            "step": "Transform",
            "status": f"FAILED - {str(e)}",
            "source": "staging",
            "table_name": "acquisition",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })
        raise


In [73]:
# Read from staging
acquisition = spark.read \
    .format("jdbc") \
    .option("url", DB_STAGING_URL) \
    .option("dbtable", "acquisition") \
    .option("user", DB_STAGING_USER) \
    .option("password", DB_STAGING_PASS) \
    .option("driver", "org.postgresql.Driver") \
    .load()

In [74]:
# transform acquisition data

transformed_acquisition = transform_acquisition(acquisition)

Log successfully written to database
Log successfully written to database
1329 invalid IDs from table 'acquisition' saved to logger DB.
Log successfully written to database
The data is successfully transformed


### Investments Data - Fact

In [31]:
from pyspark.sql.functions import col, to_date, broadcast
from datetime import datetime

def transform_investments(df):
    try:
        log_to_db({
            "step": "Transform",
            "status": "STARTED",
            "source": "Staging",
            "table_name": "investments",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        # Step 1: Replace "" to null
        df = df.na.replace("", None)

        # Step 2: Extract prefix and ID
        df = df.withColumn("funded_entity_type", extract_prefix(col("funded_object_id")))
        df = df.withColumn("investor_entity_type", extract_prefix(col("investor_object_id")))
        df = df.withColumn("funded_object_id", extract_id(col("funded_object_id")))
        df = df.withColumn("investor_object_id", extract_id(col("investor_object_id")))

        log_to_db({
            "step": "Format Data",
            "status": f"SUCCESS ({df.count()} rows)",
            "source": "staging",
            "table_name": "investments",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        # Step 3: Mapping to target column
        df_transformed = df.select(
            col("investment_id").alias("investment_id"),
            col("funding_round_id").alias("funding_round_id"),
            col("funded_entity_type").alias("funded_entity_type"),
            col("funded_object_id").alias("funded_object_id"),
            col("investor_entity_type").alias("investor_entity_type"),
            col("investor_object_id").alias("investor_object_id"),
            col("created_at").alias("created_at"),
            col("updated_at").alias("updated_at")
        )

        # Step 4: Clean integer values
        df_transformed = df_transformed.withColumn("funded_object_id", clean_integer(col("funded_object_id")))
        df_transformed = df_transformed.withColumn("investor_object_id", clean_integer(col("investor_object_id")))
        
        log_to_db({
            "step": "Map Data",
            "status": f"SUCCESS ({df_transformed.count()} rows)",
            "source": "staging",
            "table_name": "investments",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        # Step 6: Drop duplicates
        df_transformed = df_transformed.dropDuplicates(["investment_id"])

        log_to_db({
            "step": "Clean Data",
            "status": f"SUCCESS ({df_transformed.count()} rows after cleansing)",
            "source": "staging",
            "table_name": "investments",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        # Step 7: Validation object_id in company data and people data
        # Take object_id from transform_company
        df_company_ids = transformed_company.select("company_object_id", "company_id")

        # Take object_id from transform_people
        df_people_ids = transformed_people.select("people_object_id", "people_id") 

        # Join with companies and people tables
        df_transformed = df_transformed \
            .join(df_company_ids.alias("comp"), df_transformed["investor_object_id"] == col("comp.company_object_id"), "left") \
            .join(df_people_ids.alias("peop"), df_transformed["investor_object_id"] == col("peop.people_object_id"), "left") \
            .withColumn("investor_object_id", when(col("comp.company_id").isNotNull(), col("comp.company_id"))
                        .otherwise(col("peop.people_id"))) \
            .withColumn("investor_entity_type", when(col("comp.company_id").isNotNull(), lit("company"))
                        .otherwise(lit("people")))

        df_transformed = df_transformed \
            .join(df_company_ids.alias("funded"), df_transformed["funded_object_id"] == col("funded.company_object_id"), "left") \
            .withColumn("funded_object_id", col("funded.company_id"))

        df_transformed = df_transformed.select(
            "investment_id",
            "funding_round_id",
            "funded_entity_type",
            "funded_object_id",  # ini udah di-replace dengan company_id hasil join
            "investor_entity_type",
            "investor_object_id",
            "created_at",
            "updated_at"
        )


        # Validation (check if object_id mapping failed)
        df_invalid = df_transformed.filter(col("investor_object_id").isNull() | col("funded_object_id").isNull())
        df_valid = df_transformed.filter(col("investor_object_id").isNotNull() & col("funded_object_id").isNotNull())

        
        if df_invalid.count() > 0:
            invalid_ids = df_invalid.select("investor_entity_type", "investor_object_id", 
                                            "funded_entity_type", "funded_object_id").toPandas().values.tolist()
            save_invalid_ids(invalid_ids, table_name="investments")
            
            log_to_db({
                "step": "Validation",
                "status": f"{df_invalid.count()} rows with missing object_id",
                "source": "staging",
                "table_name": "investments",
                "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })

        print("The data is successfully transformed")

        return df_valid
    
    except Exception as e:
        log_to_db({
            "step": "Transform",
            "status": f"FAILED - {str(e)}",
            "source": "staging",
            "table_name": "investments",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })
        raise


In [32]:
# Read from staging
investments = spark.read \
    .format("jdbc") \
    .option("url", DB_STAGING_URL) \
    .option("dbtable", "investments") \
    .option("user", DB_STAGING_USER) \
    .option("password", DB_STAGING_PASS) \
    .option("driver", "org.postgresql.Driver") \
    .load()

In [33]:
# Transform investments data

transformed_investments = transform_investments(investments)

Log successfully written to database
Log successfully written to database
Log successfully written to database
Log successfully written to database
34648 invalid IDs from table 'investments' saved to logger DB.
Log successfully written to database
The data is successfully transformed


### Funding Rounds Data

In [78]:
from pyspark.sql.functions import col, to_date, broadcast, when, lit
from pyspark.sql.types import IntegerType, StringType
from datetime import datetime

def transform_funding_rounds(df):
    try:
        log_to_db({
            "step": "Transform",
            "status": "STARTED",
            "source": "Staging",
            "table_name": "funding_rounds",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        # Step 1: Replace "" to null
        df = df.na.replace("", None)

        # Step 2: Extract prefix and ID
        df = df.withColumn("funding_entity_type", extract_prefix(col("object_id")))
        df = df.withColumn("object_id", extract_id(col("object_id")))
        df = df.withColumn("object_id", col("object_id").cast(IntegerType()))

        # Step 3: Format data type
        df = df.withColumn("funding_date", to_date(col("funded_at")))
        df = df.withColumn("funding_entity_type", col("funding_entity_type").cast(StringType()))
        df = df.withColumn("participants", col("participants").cast(IntegerType()))

        log_to_db({
            "step": "Format Data",
            "status": f"SUCCESS ({df.count()} rows)",
            "source": "staging",
            "table_name": "funding_rounds",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        # Step 4: Mapping to target column
        df_transformed = df.select(
            col("funding_round_id"),
            col("funding_entity_type"),
            col("object_id").alias("funding_object_id"),
            col("funding_round_type").alias("round_type"),
            col("funding_date"),
            col("raised_currency_code").alias("raised_currency"),
            col("raised_amount"),
            col("raised_amount_usd"),
            col("pre_money_currency_code").alias("pre_money_currency"),
            col("pre_money_valuation"),
            col("pre_money_valuation_usd"),
            col("post_money_currency_code").alias("post_money_currency"),
            col("post_money_valuation"),
            col("post_money_valuation_usd"),
            col("participants"),
            col("source_url"),
            col("source_description"),
            col("created_at"),
            col("updated_at")
        )

        # Step 5: Handle null values
        df_transformed = df_transformed.fillna({
            "round_type": "Unknown",
            "raised_currency": "USD",
            "pre_money_currency": "USD",
            "post_money_currency": "USD",
            "raised_amount_usd": 0.0,
            "source_description": "Unknown",
        })

        # Step 6: Drop duplicates dan invalid data
        df_transformed = df_transformed.dropDuplicates(["funding_round_id"])
        df_transformed = df_transformed.filter(col("round_type") != "Unknown")

        df_transformed = df_transformed.withColumn(
            "source_url", when(col("source_url").rlike(r"^(http|https)://.*"), col("source_url")).otherwise("Unknown")
        )
        df_transformed = df_transformed.withColumn(
            "source_description", clean_alpha_text("source_description")
        )

        log_to_db({
            "step": "Clean Data",
            "status": f"SUCCESS ({df_transformed.count()} rows after cleansing)",
            "source": "staging",
            "table_name": "funding_rounds",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        # Step 7: Load dim_company and dim_people for validation
        # Take object_id from transform_company
        df_company_ids = transformed_company.select("company_object_id", "company_id")

        # Take object_id from transform_people
        df_people_ids = transformed_people.select("people_object_id", "people_id") 

        # Validation on funded_object_id
        df_valid = df_transformed \
            .join(broadcast(df_company_ids), df_transformed["funding_object_id"] == df_company_ids["company_object_id"], "left") \
            .join(broadcast(df_people_ids), df_transformed["funding_object_id"] == df_people_ids["people_object_id"], "left") \
            .withColumn("funding_object_id",
                        when(col("company_id").isNotNull(), col("company_id"))
                        .otherwise(col("people_id"))) \
            .drop("company_id", "people_id")

        df_invalid = df_valid.filter(col("company_object_id").isNull() | col("people_object_id").isNull() | col("funding_object_id").isNull())
        df_valid = df_valid.filter(col("company_object_id").isNotNull() & col("people_object_id").isNotNull() & col("funding_object_id").isNotNull())

        if df_invalid.count() > 0:
            invalid_ids = df_invalid.select("funding_entity_type", "funding_object_id").toPandas().values.tolist()
            # invalid_ids = df_invalid.limit(10).toPandas().values.tolist()
            save_invalid_ids(invalid_ids, table_name="funding_rounds")
            
            log_to_db({
                "step": "Validation",
                "status": f"{df_invalid.count()} rows with missing object_id",
                "source": "staging",
                "table_name": "funding_rounds",
                "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })

        print("The data is successfully transformed")

        return df_valid
    
    except Exception as e:
        log_to_db({
            "step": "Transform",
            "status": f"FAILED - {str(e)}",
            "source": "staging",
            "table_name": "funding_rounds",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })
        raise

In [79]:
# Read from staging

funding_rounds = spark.read \
    .format("jdbc") \
    .option("url", DB_STAGING_URL) \
    .option("dbtable", "funding_rounds") \
    .option("user", DB_STAGING_USER) \
    .option("password", DB_STAGING_PASS) \
    .option("driver", "org.postgresql.Driver") \
    .load()


In [80]:
# Transform funding rounds

transformed_funding_rounds = transform_funding_rounds(funding_rounds)

Log successfully written to database
Log successfully written to database
Log successfully written to database
34100 invalid IDs from table 'funding_rounds' saved to logger DB.
Log successfully written to database
The data is successfully transformed


### Relationship Data - Fact

In [82]:
from pyspark.sql.functions import col, to_date, to_timestamp, broadcast, when
from datetime import datetime
from src.utils.logger import log_to_db, save_invalid_ids
from src.utils.helper import extract_prefix, extract_id, normalize_text

def transform_relationship(df):
    try:
        log_to_db({
            "step": "Transform",
            "status": "STARTED",
            "source": "staging",
            "table_name": "relationship",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        # Step 1: Replace "" to null
        df = df.na.replace("", None)

        # Step 2: Format data types
        df = df.withColumn("start_at", to_date(col("start_at")))
        df = df.withColumn("end_at", to_date(col("end_at")))
        df = df.withColumn("created_at", to_timestamp(col("created_at")))
        df = df.withColumn("updated_at", to_timestamp(col("updated_at")))

        # Step 3: Extract IDs & clean
        df = df.withColumn("people_entity_type", extract_prefix(col("person_object_id")))
        df = df.withColumn("relationship_entity_type", extract_prefix(col("relationship_object_id")))
        df = df.withColumn("people_object_id", extract_id(col("person_object_id")))
        df = df.withColumn("relationship_object_id", extract_id(col("relationship_object_id")))
        df = df.withColumn("title", normalize_text(col("title")))

        # Step 4: Select target columns
        df_transformed = df.select(
            col("relationship_id"),
            col("people_entity_type"),
            col("people_object_id"),
            col("relationship_entity_type"),
            col("relationship_object_id"),
            col("start_at"),
            col("end_at"),
            col("title"),
            col("created_at"),
            col("updated_at")
        )

        # Step 5: Fill null values
        df_transformed = df_transformed.fillna({"title": "Unknown"})

        # Step 6: Drop duplicates by primary key
        df_transformed = df_transformed.dropDuplicates(["relationship_id"])

        log_to_db({
            "step": "Clean Data",
            "status": f"SUCCESS ({df_transformed.count()} rows after cleansing)",
            "source": "staging",
            "table_name": "relationship",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        # Step 7: Prepare company & people IDs for validation
        df_company_ids = transformed_company.select("company_object_id", "company_id")
        df_people_ids = transformed_people.select("people_object_id", "people_id")

        # Step 8: Join to validate relationship_object_id (company/people)
        df_validated = df_transformed \
            .join(broadcast(df_company_ids), df_transformed["relationship_object_id"] == df_company_ids["company_object_id"], "left") \
            .join(broadcast(df_people_ids), df_transformed["relationship_object_id"] == df_people_ids["people_object_id"], "left") \
            .withColumn("relationship_object_id", when(
                col("company_id").isNotNull(), col("company_id")
            ).otherwise(col("people_id"))) \
            .drop("company_id", "people_id", "company_object_id", "people_object_id")

        # Step 9: Join to validate people_object_id (from person)
        df_validated = df_validated \
            .join(broadcast(df_people_ids), df_transformed["people_object_id"] == df_people_ids["people_object_id"], "left") \
            .withColumn("people_object_id", col("people_id")) \
            .drop("people_id")

        # Step 10: Filter invalid rows
        df_invalid = df_validated.filter(
            col("relationship_object_id").isNull() | col("people_object_id").isNull()
        )
        df_valid = df_validated.filter(
            col("relationship_object_id").isNotNull() & col("people_object_id").isNotNull()
        )

        if df_invalid.count() > 0:
            invalid_ids = df_invalid.select(
                "relationship_entity_type", "relationship_object_id"
            ).toPandas().values.tolist()
            save_invalid_ids(invalid_ids, table_name="relationship")
            log_to_db({
                "step": "Validation",
                "status": f"{df_invalid.count()} rows with missing object_id",
                "source": "staging",
                "table_name": "relationship",
                "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })
            
        # Rename hasil validasi (overwrite nilai lama)
        df_valid = df_valid.drop("people_object_id").withColumnRenamed("people_id", "people_object_id")


        print("The data is successfully transformed")
        return df_valid

    except Exception as e:
        log_to_db({
            "step": "Transform",
            "status": f"FAILED - {str(e)}",
            "source": "staging",
            "table_name": "relationship",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })
        raise


In [90]:
from pyspark.sql.functions import col, to_date, broadcast, to_timestamp
from datetime import datetime

def transform_relationship(df):
    try:
        log_to_db({
            "step": "Transform",
            "status": "STARTED",
            "source": "Staging",
            "table_name": "relationship",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        # Step 1: Replace "" to null
        df = df.na.replace("", None)

        # Step 2: Format data type
        df = df.withColumn("start_at", to_date(col("start_at")))
        df = df.withColumn("end_at", to_date(col("end_at")))
        df = df.withColumn("created_at", to_timestamp(col("created_at")))
        df = df.withColumn("updated_at", to_timestamp(col("updated_at")))

        # Step 3: Extract prefix and ID, normalize
        df = df.withColumn("people_entity_type", extract_prefix(col("person_object_id")))
        df = df.withColumn("relationship_entity_type", extract_prefix(col("relationship_object_id")))
        df = df.withColumn("people_object_id", extract_id(col("person_object_id")))
        df = df.withColumn("relationship_object_id", extract_id(col("relationship_object_id")))
        df = df.withColumn("title", normalize_text(col("title")))

        log_to_db({
            "step": "Format Data",
            "status": f"SUCCESS ({df.count()} rows)",
            "source": "staging",
            "table_name": "relationship",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        # Step 3: Mapping to target column
        df_transformed = df.select(
            col("relationship_id"),
            col("people_entity_type").alias("people_entity_type"),
            col("people_object_id").alias("people_object_id"),
            col("relationship_entity_type").alias("relationship_entity_type"),
            col("relationship_object_id").alias("relationship_object_id"),
            col("start_at").alias("start_at"),
            col("end_at").alias("end_at"),
            col("title").alias("title"),
            col("created_at").alias("created_at"),
            col("updated_at").alias("updated_at")
        )
        
        log_to_db({
            "step": "Map Data",
            "status": f"SUCCESS ({df_transformed.count()} rows)",
            "source": "staging",
            "table_name": "relationship",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        # Step 4: Handle null values
        df_transformed = df_transformed.fillna({
            "title": "Unknown"
        })

        # Step 5: Drop duplicates and null values
        df_transformed = df_transformed.dropDuplicates(["relationship_id"])

        log_to_db({
            "step": "Clean Data",
            "status": f"SUCCESS ({df_transformed.count()} rows after cleansing)",
            "source": "staging",
            "table_name": "relationship",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        # Step 6: Load dim_company dan dim_people for validation
        # Take object_id from transform_company
        df_company_ids = transformed_company.select("company_object_id", "company_id")

        # Take object_id from transform_people
        df_people_ids = transformed_people.select("people_object_id", "people_id") 

        # Validation on funded_object_id
        df_valid = df_transformed \
            .join(broadcast(df_company_ids), df_transformed["relationship_object_id"] == df_company_ids["company_object_id"], "left") \
            .join(broadcast(df_people_ids), df_transformed["relationship_object_id"] == df_people_ids["people_object_id"], "left") \
            .withColumn("relationship_object_id",
                        when(col("company_id").isNotNull(), col("company_id"))
                        .otherwise(col("people_id"))) \
            .drop("company_id", "people_id")

        df_invalid = df_valid.filter(col("company_id").isNull() | col("people_id").isNull() | col("relationship_object_id").isNull())
        df_valid = df_valid.filter(col("company_id").isNotNull() & col("people_id").isNotNull() & col("relationship_object_id").isNotNull())
        
        # DROP yang lama biar gak dobel
        df_valid = df_valid.drop("people_object_id")
        
        # Rename hasil validasi
        df_valid = df_valid.withColumnRenamed("people_id", "people_object_id")

        
        if df_invalid.count() > 0:
            invalid_ids = df_invalid.select("relationship_entity_type", "relationship_object_id").toPandas().values.tolist()
            # invalid_ids = df_invalid.limit(10).toPandas().values.tolist()
            save_invalid_ids(invalid_ids, table_name="relationship")

            log_to_db({
                "step": "Validation",
                "status": f"{df_invalid.count()} rows with missing object_id",
                "source": "staging",
                "table_name": "relationship",
                "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })

        print("The data is successfully transformed")

        return df_valid


    except Exception as e:
        log_to_db({
            "step": "Transform",
            "status": f"FAILED - {str(e)}",
            "source": "staging",
            "table_name": "relationship",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })
        raise


In [91]:
# Read from staging
relationship = spark.read \
    .format("jdbc") \
    .option("url", DB_STAGING_URL) \
    .option("dbtable", "relationship") \
    .option("user", DB_STAGING_USER) \
    .option("password", DB_STAGING_PASS) \
    .option("driver", "org.postgresql.Driver") \
    .load()


In [92]:
# Transform relationship data

transformed_relationship = transform_relationship(relationship)


Log successfully written to database
Log successfully written to database
Log successfully written to database
Log successfully written to database
85169 invalid IDs from table 'relationship' saved to logger DB.
Log successfully written to database
The data is successfully transformed


### IPO Data - Fact

In [74]:
def transform_ipo(df):
    try:
        log_to_db({
            "step": "Transform",
            "status": "STARTED",
            "source": "Staging",
            "table_name": "ipo",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        # Step 1: Replace "" to null
        df = df.na.replace("", None)

        # Step 2: Format data type
        df = df.withColumn("public_at", to_date(col("public_at")))
        df = df.withColumn("created_at", to_timestamp(col("created_at")))
        df = df.withColumn("updated_at", to_timestamp(col("updated_at")))

        # Step 3: Extract and normalize
        df = df.withColumn("ipo_entity_type", extract_prefix(col("object_id")))
        df = df.withColumn("ipo_object_id", extract_id(col("object_id")))
        df = df.withColumn("stock_market", extract_stock_market(col("stock_symbol")))
        df = df.withColumn("stock_symbol", extract_stock_symbol(col("stock_symbol")))

        log_to_db({
            "step": "Format Data",
            "status": f"SUCCESS ({df.count()} rows)",
            "source": "staging",
            "table_name": "ipo",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        # Step 4: Select target columns
        df_transformed = df.select(
            col("ipo_id"),
            col("ipo_entity_type"),
            col("ipo_object_id"),
            col("valuation_currency_code").alias("valuation_currency"),
            col("valuation_amount"),
            col("raised_currency_code").alias("raised_currency"),
            col("raised_amount"),
            col("public_at"),
            col("stock_market"),
            col("stock_symbol"),
            col("source_url"),
            col("source_description"),
            col("created_at"),
            col("updated_at")
        )

        # Step 5: Fill null values
        df_transformed = df_transformed.fillna({
            "valuation_amount": 0.0,
            "valuation_currency": "USD",
            "raised_amount": 0.0,
            "raised_currency": "USD",
            "stock_market": "N/A",
            "stock_symbol": "N/A",
            "source_url": "Unknown",
            "source_description": "Unknown"
        })

        # Step 6: Drop duplicate IPOs
        df_transformed = df_transformed.dropDuplicates(["ipo_id"])

        log_to_db({
            "step": "Clean Data",
            "status": f"SUCCESS ({df_transformed.count()} rows after cleansing)",
            "source": "staging",
            "table_name": "ipo",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        # Step 7: Validasi ke dim_company dan dim_people
        df_company_ids = transformed_company.select("company_object_id", "company_id")
        df_people_ids = transformed_people.select("people_object_id", "people_id")

        df_valid = (
            df_transformed
            .join(broadcast(df_company_ids), df_transformed["ipo_object_id"] == df_company_ids["company_object_id"], "left")
            .join(broadcast(df_people_ids), df_transformed["ipo_object_id"] == df_people_ids["people_object_id"], "left")
            .withColumn("ipo_object_id",
                        when(col("company_id").isNotNull(), col("company_id"))
                        .otherwise(col("people_id")))
            .drop("company_id", "people_id", "company_object_id", "people_object_id")
        )

        # Step 8: Filter invalid dan logging
        df_invalid = df_valid.filter(col("ipo_object_id").isNull())
        df_valid = df_valid.filter(col("ipo_object_id").isNotNull())

        if df_invalid.count() > 0:
            invalid_ids = df_invalid.select("ipo_entity_type", "ipo_id").toPandas().values.tolist()
            save_invalid_ids(invalid_ids, table_name="ipo")
            log_to_db({
                "step": "Validation",
                "status": f"{df_invalid.count()} rows with missing object_id",
                "source": "staging",
                "table_name": "ipo",
                "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            })


        # Validasi: pisahkan berdasarkan entitas
        df_company_ipo = df_valid.filter(col("ipo_entity_type") == "c") \
            .join(transformed_company.select("company_id"), col("ipo_object_id") == col("company_id"), "inner") \
            .drop("company_id")
        
        df_people_ipo = df_valid.filter(col("ipo_entity_type") == "p") \
            .join(transformed_people.select("people_id"), col("ipo_object_id") == col("people_id"), "inner") \
            .drop("people_id")
        
        # Gabungkan kembali
        df_final = df_company_ipo.unionByName(df_people_ipo)

        print("The data is successfully transformed")

        return df_final

    except Exception as e:
        log_to_db({
            "step": "Transform",
            "status": f"FAILED - {str(e)}",
            "source": "staging",
            "table_name": "ipo",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })
        raise

In [75]:
# Read from staging

ipo = spark.read \
    .format("jdbc") \
    .option("url", DB_STAGING_URL) \
    .option("dbtable", "ipo") \
    .option("user", DB_STAGING_USER) \
    .option("password", DB_STAGING_PASS) \
    .option("driver", "org.postgresql.Driver") \
    .load()


In [76]:
# Transform IPO Data

transformed_ipo = transform_ipo(ipo)

Log successfully written to database
Log successfully written to database
Log successfully written to database
122 invalid IDs from table 'ipo' saved to logger DB.
Log successfully written to database
The data is successfully transformed


### Funds Data

In [89]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import col, to_date, broadcast, when
from src.utils.logger import log_to_db, save_invalid_ids
from src.utils.helper import extract_prefix, extract_id, clean_alpha_text
from datetime import datetime

def transform_funds(df):
    try:
        log_to_db({
            "step": "Transform",
            "status": "STARTED",
            "source": "Staging",
            "table_name": "funds",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        df = df.na.replace("", None)

        df = df.withColumn("fund_entity_type", extract_prefix(col("object_id")))
        df = df.withColumn("fund_object_id", extract_id(col("object_id")).cast(IntegerType()))
        df = df.withColumn("funding_date", to_date(col("funded_at")))

        df_transformed = df.select(
            col("fund_id"),
            col("fund_entity_type"),
            col("fund_object_id"),
            col("name").alias("fund_name"),
            col("funding_date"),
            col("raised_currency_code").alias("raised_currency"),
            col("raised_amount"),
            col("source_url"),
            col("source_description"),
            col("created_at"),
            col("updated_at")
        )

        df_transformed = df_transformed.fillna({
            "raised_currency": "USD",
            "raised_amount": 0.0,
            "source_url": "Unknown",
            "source_description": "Unknown"
        })

        df_transformed = df_transformed.dropDuplicates(["fund_id"])
        df_transformed = df_transformed.na.drop(subset=["funding_date"])
        df_transformed = df_transformed.withColumn(
            "source_url", when(col("source_url").rlike(r"^(http|https)://.*"), col("source_url")).otherwise("Unknown")
        ).withColumn(
            "source_description", clean_alpha_text("source_description")
        )

        log_to_db({
            "step": "Clean Data",
            "status": f"SUCCESS ({df_transformed.count()} rows after cleansing)",
            "source": "staging",
            "table_name": "funds",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

        print("The data is successfully transformed")
        
        return df_transformed


    except Exception as e:
        log_to_db({
            "step": "Transform",
            "status": f"FAILED - {str(e)}",
            "source": "staging",
            "table_name": "funds",
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })
        raise


In [90]:
# Read from staging

funds = spark.read \
    .format("jdbc") \
    .option("url", DB_STAGING_URL) \
    .option("dbtable", "funds") \
    .option("user", DB_STAGING_USER) \
    .option("password", DB_STAGING_PASS) \
    .option("driver", "org.postgresql.Driver") \
    .load()


In [91]:
# Transform funds Data

transformed_funds = transform_funds(funds)

Log successfully written to database
Log successfully written to database
The data is successfully transformed


## Load to Data Warehouse


In [30]:
def load_to_dwh(df, table_name, mode="overwrite", use_upsert=False, idx_name=None, schema=None, source=None):

    os.makedirs("logs", exist_ok=True)

    try:
        if not DWH_USER or not DWH_PASS:
            raise EnvironmentError("DWH_USER or DWH_PASS is not set")

        if use_upsert:
            data = df.toPandas()
            if idx_name is None:
                raise ValueError("Index name is required for upsert mode")
            data = data.set_index(idx_name)

            conn = create_engine(f"postgresql://{DWH_USER}:{DWH_PASS}@host.docker.internal:5432/pyspark_task_dwh")
            upsert(
                con=conn,
                df=data,
                table_name=table_name,
                schema=schema,
                if_row_exists="update"
            )
            print(f"Data upserted to table '{table_name}' successfully!")
        else:
            df.write \
              .format("jdbc") \
              .option("url", DWH_URL) \
              .option("dbtable", table_name) \
              .option("user", DWH_USER) \
              .option("password", DWH_PASS) \
              .option("driver", "org.postgresql.Driver") \
              .mode(mode) \
              .save()
            print(f"Data loaded to table '{table_name}' successfully!")

        log_msg = {
            "step": "Load to DWH",
            "status": "Success",
            "source": "transformed data",
            "table_name": table_name,
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }

    except Exception as e:
        print(f"Error loading data to table '{table_name}': {e}")
        failed_data = data if use_upsert else df.toPandas()
        failed_data['error_message'] = str(e)
        failed_data['etl_date'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        log_msg = {
            "step": "Load to DWH",
            "status": "Failed",
            "source": "transformed data",
            "table_name": table_name,
            "etl_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "error_message": str(e)
        }

        # failed_log_path = f'logs/failed_{table_name}_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
        # failed_data.to_csv(failed_log_path, index=False)
        # print(f"Failed data saved to: {failed_log_path}")

    finally:
        log_msg.pop("error_message", None)
        log_to_db(log_msg)

    return df if not use_upsert else data


In [None]:
# dim_company

load_to_dwh(transformed_company, table_name="dim_company", use_upsert=True, idx_name="company_id", schema="public")

In [None]:
# dim_people

load_to_dwh(transformed_people, table_name="dim_people", use_upsert=True, idx_name="people_id", schema="public")

In [None]:
# dim_milestones

load_to_dwh(transformed_milestones, table_name="dim_milestones", use_upsert=True, idx_name="people_id", schema="public")

In [75]:
# fact_acquisition

load_to_dwh(transformed_acquisition, table_name="fact_acquisition", use_upsert=True, idx_name="acquisition_id", schema="public")

Data upserted to table 'fact_acquisition' successfully!
Log successfully written to database


Unnamed: 0_level_0,acquiring_entity_type,acquired_entity_type,price_amount,price_currency_code,acquired_at,source_url,created_at,updated_at,acquiring_object_id,acquired_object_id
acquisition_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,c,c,20000000.00,USD,2007-05-30,http://venturebeat.com/2007/05/30/fox-interact...,2007-05-31 22:19:54,2008-05-21 19:23:44,11,10
7,c,c,60000000.00,USD,2007-07-01,http://www.techcrunch.com/2007/07/02/deal-is-c...,2007-07-03 08:14:50,2011-05-06 21:51:05,59,72
8,c,c,280000000.00,USD,2007-05-01,http://www.techcrunch.com/2007/05/30/cbs-acqui...,2007-07-12 04:19:24,2008-05-19 04:48:50,24,132
9,c,c,100000000.00,USD,2007-06-01,http://techcrunch.com/2007/05/23/100-million-p...,2007-07-13 09:52:59,2012-06-05 03:22:17,59,155
10,c,c,25000000.00,USD,2007-07-01,http://blog.seattlepi.nwsource.com/venture/arc...,2007-07-20 05:29:07,2008-02-25 00:23:47,212,215
...,...,...,...,...,...,...,...,...,...,...
10103,c,c,31000000.00,USD,2012-07-26,http://architectpartners.com/ma_alert/netgear-...,2013-10-20 13:58:14,2013-10-20 13:58:14,16675,11888
10275,c,c,30000000.00,USD,2013-11-10,http://techcrunch.com/2013/11/10/vox-buys-curb...,2013-11-11 03:26:04,2013-11-11 05:01:18,12906,1254
10427,c,c,200000000.00,USD,2013-12-02,http://techcrunch.com/2013/12/02/apple-buys-to...,2013-12-02 23:44:12,2013-12-02 23:44:12,1654,23588
10486,c,c,350000000.00,USD,2013-12-09,http://techcrunch.com/2013/12/09/verizon-confi...,2013-12-09 14:01:38,2013-12-09 14:01:38,4843,1587


In [None]:
# fact_investments

load_to_dwh(transformed_investments, table_name="fact_investments", use_upsert=True, idx_name="investment_id", schema="public")

In [81]:
# dim_funding_rounds

load_to_dwh(transformed_funding_rounds, table_name="dim_funding_rounds", use_upsert=True, idx_name="funding_round_id", schema="public")

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



Unnamed: 0_level_0,funding_entity_type,funding_object_id,round_type,funding_date,raised_currency,raised_amount,raised_amount_usd,pre_money_currency,pre_money_valuation,pre_money_valuation_usd,...,post_money_valuation_usd,participants,source_url,source_description,created_at,updated_at,company_object_id,people_object_id,error_message,etl_date
funding_round_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12,c,16,series-b,2007-06-01,USD,40000000.00,40000000.00,USD,0.00,0.00,...,0.00,4,http://www.socaltech.com/slacker_raises_4_m/s-...,Unknown,2007-06-04 08:48:44,2008-11-20 02:59:19,16,16,"(psycopg2.errors.UndefinedColumn) column ""comp...",2025-04-20 03:56:33
27,c,33,series-a,2006-05-01,USD,2500000.00,2500000.00,USD,0.00,0.00,...,0.00,2,http://venturebeat.com/2007/03/06/widget-compa...,Widget company Clearspring says it leads marke...,2007-06-13 09:04:50,2013-09-06 11:20:07,34,34,"(psycopg2.errors.UndefinedColumn) column ""comp...",2025-04-20 03:56:33
28,c,33,series-b,2007-02-01,USD,5500000.00,5500000.00,USD,0.00,0.00,...,15500000.00,6,http://venturebeat.com/2007/03/06/widget-compa...,Widget company Clearspring says it leads marke...,2007-06-13 09:05:28,2013-09-06 11:20:07,34,34,"(psycopg2.errors.UndefinedColumn) column ""comp...",2025-04-20 03:56:33
31,c,34,series-a,2007-06-01,USD,5000000.00,5000000.00,USD,0.00,0.00,...,0.00,4,http://www.techcrunch.com/2007/06/13/openads-o...,Unknown,2007-06-13 18:26:31,2013-07-18 17:28:21,35,35,"(psycopg2.errors.UndefinedColumn) column ""comp...",2025-04-20 03:56:33
34,c,39,series-c+,2007-08-01,USD,25000000.00,25000000.00,USD,0.00,0.00,...,0.00,5,http://newteevee.com/2007/06/14/veoh-goes-for-...,Veoh Goes for the Big Money,2007-06-15 09:44:28,2008-06-03 19:26:18,40,40,"(psycopg2.errors.UndefinedColumn) column ""comp...",2025-04-20 03:56:33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57859,c,19206,venture,2010-08-18,USD,3111261.00,3111261.00,USD,0.00,0.00,...,0.00,0,http://www.sec.gov/Archives/edgar/data/1418028...,SEC,2013-12-12 07:44:35,2013-12-12 07:44:35,27385,27385,"(psycopg2.errors.UndefinedColumn) column ""comp...",2025-04-20 03:56:33
57862,c,19206,series-b,2010-03-30,USD,9009000.00,9009000.00,USD,0.00,0.00,...,0.00,0,http://www.sec.gov/Archives/edgar/data/1418028...,SEC,2013-12-12 07:48:10,2013-12-12 07:48:10,27385,27385,"(psycopg2.errors.UndefinedColumn) column ""comp...",2025-04-20 03:56:33
57887,c,109446,other,2010-02-01,USD,5000000.00,5000000.00,USD,0.00,0.00,...,0.00,0,http://www.sec.gov/Archives/edgar/data/1421000...,SEC,2013-12-12 11:01:46,2013-12-12 11:01:46,238878,238878,"(psycopg2.errors.UndefinedColumn) column ""comp...",2025-04-20 03:56:33
57908,c,105384,other,2010-04-14,USD,3475000.00,3475000.00,USD,0.00,0.00,...,0.00,0,http://www.sec.gov/Archives/edgar/data/1083672...,SEC,2013-12-12 11:50:10,2013-12-12 11:50:10,232285,232285,"(psycopg2.errors.UndefinedColumn) column ""comp...",2025-04-20 03:56:33


In [93]:
# fact_relationship

load_to_dwh(transformed_relationship, table_name="fact_relationship", use_upsert=True, idx_name="relationship_id", schema="public")

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



Log successfully written to database


Unnamed: 0_level_0,people_entity_type,relationship_entity_type,relationship_object_id,start_at,end_at,title,created_at,updated_at,company_object_id,error_message,etl_date
relationship_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
10009,p,c,3043,,,vp product,2008-04-10 03:53:18,2008-04-10 03:53:18,4125,"(psycopg2.errors.UndefinedColumn) column ""comp...",2025-04-20 04:11:25
10010,p,c,3043,,,vp engineering,2008-04-10 03:53:18,2008-04-10 03:53:18,4125,"(psycopg2.errors.UndefinedColumn) column ""comp...",2025-04-20 04:11:25
100262,p,c,91323,2010-01-01,,board of directors,2011-02-01 19:38:26,2011-07-14 21:14:52,32540,"(psycopg2.errors.UndefinedColumn) column ""comp...",2025-04-20 04:11:25
100263,p,c,91323,2010-09-01,,board of directors,2011-02-01 19:38:27,2013-07-04 03:59:15,32540,"(psycopg2.errors.UndefinedColumn) column ""comp...",2025-04-20 04:11:25
100488,p,c,91990,2010-08-01,2012-09-01,board member,2011-02-03 07:24:59,2013-06-26 08:17:00,67806,"(psycopg2.errors.UndefinedColumn) column ""comp...",2025-04-20 04:11:25
...,...,...,...,...,...,...,...,...,...,...,...
99881,p,c,29018,2011-01-01,2012-01-17,advisory board member,2011-01-29 07:45:25,2012-01-16 11:44:14,40335,"(psycopg2.errors.UndefinedColumn) column ""comp...",2025-04-20 04:11:25
99916,p,c,6534,,,cfo executive vp,2011-01-29 16:12:24,2013-06-07 22:42:31,7060,"(psycopg2.errors.UndefinedColumn) column ""comp...",2025-04-20 04:11:25
99950,p,c,106183,2010-08-10,,director,2011-01-30 00:03:04,2011-02-02 19:25:41,39162,"(psycopg2.errors.UndefinedColumn) column ""comp...",2025-04-20 04:11:25
99964,p,f,330,,,Unknown,2011-01-30 04:12:05,2011-02-02 19:26:01,336,"(psycopg2.errors.UndefinedColumn) column ""comp...",2025-04-20 04:11:25


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 52852)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.11/socketserver.py", line 755, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 267, in poll
    if self.rfile in r and func():
                           ^^^^^^
  File "/usr/local/spark/python/pyspark/accumulators.py", line 271, in accum_updates
    num_updates =

In [77]:
# fact_ipo

load_to_dwh(transformed_ipo, table_name="fact_ipo", use_upsert=True, idx_name="ipo_id", schema="public")

Data upserted to table 'fact_ipo' successfully!
Log successfully written to database


Unnamed: 0_level_0,ipo_entity_type,ipo_object_id,valuation_currency,valuation_amount,raised_currency,raised_amount,public_at,stock_market,stock_symbol,source_url,source_description,created_at,updated_at
ipo_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,c,1624,USD,0.00,USD,0.00,1980-12-19,NASDAQ,AAPL,Unknown,Unknown,2008-02-09 05:17:45,2012-04-12 04:02:59
1002,c,1045,USD,0.00,USD,65000000.00,2013-07-02,NYSE,YUME,http://techcrunch.com/2013/07/02/yume-ipo/,Main Event Page news Comment 0 inShare9 Anothe...,2013-07-03 00:00:56,2013-07-03 00:00:56
1009,c,107162,USD,0.00,USD,128000000.00,2013-08-08,NASDAQ,FOXF,http://www.bizjournals.com/sanjose/news/2013/0...,"Fox raises $128M in public debut, ends day up 24%",2013-07-09 04:24:09,2013-08-09 03:18:34
101,c,94033,USD,0.00,USD,0.00,2009-04-02,,,Unknown,Unknown,2009-04-06 19:46:21,2009-04-06 19:46:21
1011,c,107179,USD,0.00,USD,0.00,,OTCQB,AVXL,Unknown,Unknown,2013-07-09 06:08:51,2013-07-09 06:08:51
...,...,...,...,...,...,...,...,...,...,...,...,...,...
981,c,53630,USD,0.00,USD,80000000.00,2013-05-10,NYSE,BIOA,http://www.biofuelsdigest.com/bdigest/2013/05/...,BioAmber completes IPO,2013-06-21 04:22:05,2013-06-21 04:22:05
982,c,17538,USD,0.00,USD,947000000.00,2013-05-09,NYSE,Q,http://uk.reuters.com/article/2013/05/09/us-qu...,Quintiles IPO raises more-than-planned $947 mi...,2013-06-21 04:40:55,2013-06-21 04:48:52
984,c,31572,USD,0.00,USD,0.00,,NASDAQ,ECTE,Unknown,Unknown,2013-06-21 06:31:30,2013-06-21 06:31:30
988,c,82649,USD,0.00,USD,0.00,,NYSE,RAD,Unknown,Unknown,2013-06-22 03:55:00,2013-06-22 03:55:00


In [92]:
# dim_funds

load_to_dwh(transformed_funds, table_name="dim_funds", use_upsert=True, idx_name="fund_id", schema="public")

Data upserted to table 'dim_funds' successfully!
Log successfully written to database


Unnamed: 0_level_0,fund_entity_type,fund_object_id,fund_name,funding_date,raised_currency,raised_amount,source_url,source_description,created_at,updated_at
fund_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,f,371,Second Fund,2008-12-16,USD,300000000.00,http://www.pehub.com/26194/dfj-dragon-raising-...,peHub,2008-12-17 03:07:16,2008-12-17 03:07:16
100,f,260,TA XI,2009-08-12,USD,4000000000.00,http://www.preqin.com/item/ta-associates-close...,TA Associates Closes 4B Fund,2009-11-02 22:16:07,2009-11-02 22:16:07
1002,f,10437,Lightspeed China Partners I,2013-01-29,USD,168000000.00,http://www.finsmes.com/2013/01/lightspeed-chin...,Lightspeed China Partners Closes 168M Fund,2013-01-29 02:32:22,2013-01-29 02:32:22
1005,f,8154,Serent Capital II,2013-01-30,USD,350000000.00,http://www.finsmes.com/2013/01/serent-capital-...,Serent Capital Closes Second Fund at 350M,2013-01-31 02:50:10,2013-01-31 02:50:10
1006,f,1866,Cleantech Fund II,2013-01-30,USD,74000000.00,http://pevc.dowjones.com/article?an=DJFVW00020...,Israel Cleantech Ventures Closes 74M Second Fund,2013-01-31 03:41:42,2013-01-31 03:41:42
...,...,...,...,...,...,...,...,...,...,...
991,f,13,Redpoint V,2013-01-16,USD,400000000.00,http://techcrunch.com/2013/01/16/redpoint-vent...,Redpoint Ventures Closes On 400 Million For It...,2013-01-17 02:37:25,2013-01-17 02:57:59
992,f,9467,Disruptive Innovation Fund,2007-08-01,USD,0.00,Unknown,Unknown,2013-01-18 06:39:20,2013-01-21 03:14:34
993,f,9913,Venture capital fund,2013-01-23,USD,100000000.00,http://venturebeat.com/2013/01/22/ribbit-capit...,Ribbit Capital lands 100M fund for financefocu...,2013-01-23 03:38:54,2013-01-23 03:40:52
994,f,929,Prime Ventures IV,2013-01-24,EUR,100000000.00,http://www.finsmes.com/2013/01/prime-ventures-...,Prime Ventures Holds First Close VC Fund IV at...,2013-01-25 11:01:18,2013-01-25 11:01:23
