# Delta Lake Clone - Testing

Based on [the Databricks docs](https://docs.databricks.com/en/delta/clone.html#language-python).

## Write some Delta code

This code fetches PR and user data from a GitHub repo (`delta-io/delta`), joins users to PRs, and removes duplicates.

In [1]:
import os
from datetime import datetime, timedelta

import requests
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit
from delta import *

In [2]:
# Describe a local Spark session ("local[4]" master, app name "parallel") with
# the Delta Lake SQL extension and catalog implementation configured.
builder = (
    SparkSession.builder
    .master("local[4]")
    .appName("parallel")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Pass the builder through the delta helper, then start (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 55730)
Traceback (most recent call last):
  File "/Users/rpelgrim/miniforge3/envs/pyspark-350-delta-320/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/Users/rpelgrim/miniforge3/envs/pyspark-350-delta-320/lib/python3.11/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/Users/rpelgrim/miniforge3/envs/pyspark-350-delta-320/lib/python3.11/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/Users/rpelgrim/miniforge3/envs/pyspark-350-delta-320/lib/python3.11/socketserver.py", line 755, in __init__
    self.handle()
  File "/Users/rpelgrim/miniforge3/envs/pyspark-350-delta-320/lib/python3.11/site-packages/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/Users/r

In [3]:
# Define the Delta table path.
# Kept relative (resolved against the notebook's working directory) so that it
# names the same location the write/read cells use: "data/github_data".
delta_table_path = "data/github_data"

# Step 1: Fetch data from GitHub API
def fetch_github_data():
    """Fetch one page of pull requests for the `delta-io/delta` repository.

    Returns:
        list: the decoded JSON payload, one dict per pull request
        (a single page, requested with ``per_page=100``).

    Raises:
        requests.HTTPError: if GitHub answers with a 4xx/5xx status, e.g. when
            the unauthenticated rate limit is exhausted.
        ValueError: if the decoded payload is not a JSON list.
    """
    repo = "delta-io/delta"
    end_date = datetime.now()
    start_date = end_date - timedelta(days=1)
    # NOTE(review): `since` does not appear among the documented query
    # parameters of the "List pull requests" endpoint, so this one-day window is
    # probably ignored by the API -- confirm against the GitHub REST docs.
    params = {
        "state": "all",
        "since": start_date.isoformat(),
        "per_page": 100
    }

    prs_url = f"https://api.github.com/repos/{repo}/pulls"
    # A timeout keeps the cell from hanging forever on a stalled connection.
    prs_response = requests.get(prs_url, params=params, timeout=30)
    # Fail here, with the HTTP status, instead of handing an error payload
    # (a dict such as {"message": ...}) to the cells that iterate over PRs.
    prs_response.raise_for_status()
    prs = prs_response.json()

    if not isinstance(prs, list):
        raise ValueError(
            f"Expected a list of pull requests from {prs_url}, got {type(prs).__name__}"
        )

    return prs

In [4]:
# Fetch the pull requests once and keep the raw result for the cells below.
prs = fetch_github_data()

In [5]:
# Collect the author (login + id) of each PR. There is one entry per PR, so a
# user appears once for every PR they opened until duplicates are dropped.
prs_users = []
user_keys = ['login', 'id']

for pr in prs:
    pr_user = pr.get('user')
    # Skip PRs without usable author data. Swallowing the lookup error and
    # appending anyway would re-append the previous PR's author (or raise
    # NameError if the very first PR is the one that fails).
    if not pr_user or any(k not in pr_user for k in user_keys):
        continue
    prs_users.append({k: pr_user[k] for k in user_keys})

users_df = spark.createDataFrame(prs_users)
print(f"{users_df.count()} users made PRs in this timeframe.")

# Remove duplicates
users_df = users_df.dropDuplicates()
print(f"Unique users: {users_df.count()}")

100 users made PRs in this timeframe.
Unique users: 40


In [6]:
# Select relevant fields
keys_to_include = ['id', 'number', 'title', 'body']
prs_simple = []

# Keep only the selected fields of each PR, plus the author's id as `user_id`
# so the PRs can be joined to the users DataFrame.
for pr in prs:
    pr_subset = {k: pr[k] for k in keys_to_include}
    # A PR whose `user` entry is missing or null gets a null user_id instead of
    # aborting the whole cell with a TypeError/KeyError.
    pr_subset['user_id'] = (pr.get('user') or {}).get('id')
    prs_simple.append(pr_subset)

# Create DataFrame
prs_df = spark.createDataFrame(prs_simple)

In [7]:
# Preview the PR DataFrame, including the user_id column added above.
prs_df.show()

+--------------------+----------+------+--------------------+---------+
|                body|        id|number|               title|  user_id|
+--------------------+----------+------+--------------------+---------+
|#### Which Delta ...|2001658776|  3470|[Spark] Uses java...|  1597914|
|<!--\r\nThanks fo...|2001644490|  3469|populate Delta cl...|  1174914|
|<!--\r\nThanks fo...|2001045563|  3467|[Spark] Add Row T...|107926660|
|<!--\r\nThanks fo...|2000308883|  3466|[Kernel] Configur...|   271029|
|#### Which Delta ...|2000227884|  3465|[Spark] Fix Delta...|135709731|
|<!--\r\nThanks fo...|1999340541|  3464|[WIP][KERNEL][VAR...| 87336575|
|Cherry-pick 03bdf...|1998901690|  3463|[3.2 Cherry Pick]...| 59617782|
|Cherry-pick 03bdf...|1998901641|  3462|[3.1 Cherry Pick]...| 59617782|
|Cherry-pick 03bdf...|1998901548|  3461|[3.0 Cherry Pick]...| 59617782|
|<!--\r\nThanks fo...|1998456564|  3460|Remove the `setSc...|  1134248|
|<!--\r\nThanks fo...|1998115737|  3459|[Spark] Support c...|173

In [8]:
# Left join: every PR row is kept, and `login` is null when no user matches.
joined = (
    prs_df.alias("a")
    .join(users_df.alias("b"), on=prs_df.user_id == users_df.id, how="left")
    .select("a.body", "a.id", "a.number", "a.title", "a.user_id", "b.login")
)

In [9]:
# Step 2: Transform data
def transform_data(prs):
    """Build a de-duplicated Spark DataFrame of pull requests.

    As a side effect, builds a DataFrame of PR authors and prints how many
    author entries and how many distinct authors were found. That author
    DataFrame is used only for the printed summary and is not returned.

    Relies on the notebook-level `spark` session.

    Args:
        prs: list of PR dicts; each must contain the keys 'id', 'number',
            'title' and 'body'. The 'user' entry may be missing or None.

    Returns:
        Spark DataFrame built from the 'id', 'number', 'title' and 'body'
        fields of each PR, with duplicate rows removed.
    """
    # Select relevant fields
    keys_to_include = ['id', 'number', 'title', 'body']
    prs_simple = [{k: pr[k] for k in keys_to_include} for pr in prs]

    # Create DataFrame
    prs_df = spark.createDataFrame(prs_simple)

    # Create DataFrame with usernames and ids
    prs_users = []
    user_keys = ['login', 'id']

    for pr in prs:
        pr_user = pr.get('user')
        # Skip PRs without usable author data. Swallowing the lookup error and
        # appending anyway would re-append the previous PR's author (or raise
        # NameError if the very first PR is the one that fails).
        if not pr_user or any(k not in pr_user for k in user_keys):
            continue
        prs_users.append({k: pr_user[k] for k in user_keys})

    users_df = spark.createDataFrame(prs_users)
    print(f"{users_df.count()} users made PRs in this timeframe.")

    # Remove duplicates
    users_df = users_df.dropDuplicates()
    print(f"Unique users: {users_df.count()}")

    prs_df = prs_df.dropDuplicates()

    return prs_df

In [10]:
# Note: this rebinds prs_df. The earlier prs_df carried a user_id column; the
# DataFrame returned by transform_data is built from id, number, title and body only.
prs_df = transform_data(prs)

100 users made PRs in this timeframe.
Unique users: 40


In [11]:
# Preview the de-duplicated PR DataFrame returned by transform_data.
prs_df.show()

+--------------------+----------+------+--------------------+
|                body|        id|number|               title|
+--------------------+----------+------+--------------------+
|<!--\r\nThanks fo...|1999340541|  3464|[WIP][KERNEL][VAR...|
|## Description\r\...|1994920628|  3446|[Kernel][Clean up...|
|#### Which Delta ...|1994302327|  3442|[Spark] Make Conf...|
|<!--\r\nThanks fo...|1997863204|  3457|[Spark] Block uns...|
|## Description\r\...|1994406792|  3443|[Spark] Handle ty...|
|<!--\r\nThanks fo...|2000308883|  3466|[Kernel] Configur...|
|#### Which Delta ...|1997975330|  3458|[Spark] Add commi...|
|<!--\r\nThanks fo...|1996871547|  3453|Use correct parti...|
|<!--\r\nThanks fo...|1997396832|  3454|[Spark] Allow sta...|
|#### Which Delta ...|2001658776|  3470|[Spark] Uses java...|
|<!--\r\nThanks fo...|1998456564|  3460|Remove the `setSc...|
|## Description\r\...|1997838561|  3456|[Spark] Execute M...|
|Cherry-pick 03bdf...|1998901641|  3462|[3.1 Cherry Pick]...|
|<!--\r\

In [12]:
# LOAD data to Delta Table
# The path is relative, so the table lands under the kernel's current working
# directory; "overwrite" mode replaces the data of a table already at that path.
prs_df.write.format("delta").mode("overwrite").save("data/github_data")

## Clone Delta Lake

In [None]:
# read Delta Lake

In [14]:
from delta.tables import *

In [15]:
# Open the Delta table written above (same relative path) as a DeltaTable handle.
deltaTable = DeltaTable.forPath(spark, "data/github_data/")

In [16]:
# Print the working directory that the relative "data/..." paths resolve against.
!pwd

/Users/rpelgrim/Documents/git/delta-playground


In [18]:
# Resolve both table locations from the current working directory instead of
# hardcoding one machine's absolute path, so the cell runs wherever the
# notebook is checked out. The source is the table written to "data/github_data".
source_path = os.path.abspath("data/github_data")
clone_path = os.path.abspath("data/github_data_clone")

# Create a new Delta table at clone_path as a shallow clone of the source table.
# The returned DataFrame reports clone metrics (source size, copied files, ...).
spark.sql(f"CREATE TABLE delta.`{clone_path}` SHALLOW CLONE delta.`{source_path}`")

DataFrame[source_table_size: bigint, source_num_of_files: bigint, num_removed_files: bigint, num_copied_files: bigint, removed_files_size: bigint, copied_files_size: bigint]