# Delta Lake for ETL

In [1]:
import requests
from datetime import datetime, timedelta
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit
from delta import *

In [3]:
builder = SparkSession.builder.master("local[4]").appName("parallel") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [24]:
# Define the Delta table path
delta_table_path = "/data/github_data"

# Step 1: Fetch data from GitHub API
def fetch_github_data():
    # GitHub API URL
    repo = "apache/spark"
    end_date = datetime.now()
    start_date = end_date - timedelta(days=1)
    params = {
        "state": "all",
        "since": start_date.isoformat(),
        "per_page": 100
    }
    
    issues_url = f"https://api.github.com/repos/{repo}/issues"
    prs_url = f"https://api.github.com/repos/{repo}/pulls"
    
    headers = {
        "Accept": "application/vnd.github.v3+json"
    }
    
    issues_response = requests.get(issues_url, headers=headers, params=params)
    prs_response = requests.get(prs_url, headers=headers, params=params)
    
    issues = issues_response.json()
    prs = prs_response.json()
    
    return issues, prs

In [25]:
issues, prs = fetch_github_data()

In [26]:
len(issues), len(prs)

(41, 100)

In [5]:
# Step 2: Transform data
def transform_data(issues, prs):
    issues_df = spark.createDataFrame(issues)
    prs_df = spark.createDataFrame(prs)
    
    # Add a new column to differentiate between issues and PRs
    issues_df = issues_df.withColumn("type", lit("issue"))
    prs_df = prs_df.withColumn("type", lit("pull_request"))
    
    # Union the dataframes
    df = issues_df.union(prs_df)
    
    # Basic data cleaning: remove duplicates
    df = df.dropDuplicates()
    
    return df

In [27]:
df = transform_data(issues, prs)

PySparkValueError: [CANNOT_DETERMINE_TYPE] Some of types cannot be determined after inferring.

In [6]:
# Step 3: Load data to Delta Lake
def load_data_to_delta(df):
    # Check if the Delta table exists
    if not DeltaTable.isDeltaTable(spark, delta_table_path):
        # If not, create the table
        df.write.format("delta").mode("overwrite").save(delta_table_path)
        DeltaTable.createOrReplace(spark).tableName("github_data").location(delta_table_path).execute()
    else:
        # If it exists, merge the new data
        delta_table = DeltaTable.forPath(spark, delta_table_path)
        delta_table.alias("old_data").merge(
            df.alias("new_data"),
            "old_data.id = new_data.id"
        ).whenNotMatchedInsertAll().execute()

In [7]:
# Step 4: Demonstrate Benefits
def demonstrate_benefits():
    # Reliability: ACID transactions ensure data consistency
    df = spark.read.format("delta").load(delta_table_path)
    df.createOrReplaceTempView("github_data")
    
    # Scalability: handle large volumes of data efficiently
    total_count = spark.sql("SELECT COUNT(*) FROM github_data").collect()[0][0]
    print(f"Total records: {total_count}")
    
    # Schema Enforcement and Evolution: demonstrate adding a new column
    df = df.withColumn("processed_at", lit(datetime.now().isoformat()))
    df.write.format("delta").mode("overwrite").save(delta_table_path)
    
    # Query Performance: optimized for fast data retrieval
    recent_issues = spark.sql("SELECT * FROM github_data WHERE type = 'issue' AND created_at >= date_sub(current_date(), 7)")
    recent_issues.show()

In [None]:
if __name__ == "__main__":
    issues, prs = fetch_github_data()
    df = transform_data(issues, prs)
    load_data_to_delta(df)
    demonstrate_benefits()