In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Reuse the active Spark session if there is one; otherwise start a session
# registered under this application name.
session_builder = SparkSession.builder.appName("Incremental_Load_Example")
spark = session_builder.getOrCreate()



In [0]:
# Seed rows for the initial load: (id, name, last_modified written as text).
initial_data = [
    (1, "Alice", "2024-08-01 10:00:00"),
    (2, "Bob", "2024-08-02 11:30:00"),
    (3, "Charlie", "2024-08-03 14:15:00"),
    (4, "David", "2024-08-04 09:45:00"),
    (5, "Eva", "2024-08-05 13:00:00")
]

# Create the DataFrame and cast last_modified to a timestamp BEFORE persisting.
# Writing first and casting afterwards stores the column as a string in the
# Delta table, while the incremental rows merged into it later carry a real
# timestamp column.
columns = ["id", "name", "last_modified"]
df_initial = spark.createDataFrame(initial_data, columns).withColumn(
    "last_modified", col("last_modified").cast("timestamp")
)

# Write the initial data to a Delta table. overwriteSchema lets a re-run
# replace a table that a previous run created with a string last_modified.
(
    df_initial.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("dbfs:/user/hive/warehouse/initial_load")
)


In [0]:
# Make sure last_modified is a timestamp column so max() compares instants.
df_initial = df_initial.withColumn(
    "last_modified",
    col("last_modified").cast("timestamp"),
)

# The watermark is the newest last_modified value, kept as a one-row frame.
watermark_df = df_initial.selectExpr("max(last_modified) as max_last_modified")

# Persist the watermark at its own Delta location, replacing what was there.
watermark_writer = watermark_df.write.format("delta").mode("overwrite")
watermark_writer.save("dbfs:/user/hive/warehouse/watermark_table")


In [0]:
# Incremental batch: two rows for ids that already exist, two rows for new ids.
incremental_data = [
    (2, "Bob Smith", "2024-08-06 11:30:00"),  # id=2: changed name, newer timestamp
    (5, "Eva", "2024-08-07 13:00:00"),        # id=5: same name, newer timestamp
    (6, "Frank", "2024-08-07 15:00:00"),      # id=6: not in the initial data
    (7, "Grace", "2024-08-07 16:00:00")       # id=7: not in the initial data
]

# Build the frame with the column names held in `columns` and convert
# last_modified from text to a timestamp in one expression.
df_incremental = spark.createDataFrame(incremental_data, columns).withColumn(
    "last_modified",
    col("last_modified").cast("timestamp"),
)


In [0]:
# Read the last watermark: the largest max_last_modified stored in the
# watermark table (None when the table holds no non-null value).
last_load_timestamp = (
    spark.read.format("delta")
    .load("dbfs:/user/hive/warehouse/watermark_table")
    .selectExpr("max(max_last_modified)")
    .collect()[0][0]
)

# Keep only rows modified after the watermark.
# - The value is compared as a Column literal rather than interpolated into a
#   SQL string, so it is not round-tripped through its text representation.
# - With no watermark, the old string filter became "last_modified > 'None'",
#   which silently matched nothing; treat that case as "load everything".
if last_load_timestamp is None:
    new_changes_df = df_incremental
else:
    new_changes_df = df_incremental.filter(col("last_modified") > last_load_timestamp)


In [0]:
from delta.tables import DeltaTable

# Handle on the Delta table that holds the initial load.
deltaTable = DeltaTable.forPath(spark, "dbfs:/user/hive/warehouse/initial_load")

# Upsert keyed on id: a matching target row has every column overwritten from
# the source row; a source row with no match is inserted as a new row.
upsert = (
    deltaTable.alias("target")
    .merge(new_changes_df.alias("source"), "target.id = source.id")
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
)
upsert.execute()


In [0]:
# Newest last_modified in the batch that was just merged (None if the batch
# was empty).
new_last_modified = new_changes_df.selectExpr("max(last_modified) as max_last_modified").collect()[0][0]

# Save the new watermark only when the batch had rows. With an empty batch the
# value is None: createDataFrame cannot infer a column type from a lone None,
# and overwriting would discard the existing watermark anyway.
if new_last_modified is not None:
    (
        spark.createDataFrame([(new_last_modified,)], ["max_last_modified"])
        .write.format("delta")
        .mode("overwrite")
        .save("dbfs:/user/hive/warehouse/watermark_table")
    )

In [0]:
# End-to-end watermark-based incremental load on a small in-memory example:
# initial load -> store watermark -> filter new batch -> merge -> advance watermark.
from delta.tables import DeltaTable
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.appName("Incremental_Load_Example").getOrCreate()

# Define the initial data: (id, name, last_modified written as text).
initial_data = [
    (1, "Alice", "2024-08-01 10:00:00"),
    (2, "Bob", "2024-08-02 11:30:00"),
    (3, "Charlie", "2024-08-03 14:15:00"),
    (4, "David", "2024-08-04 09:45:00"),
    (5, "Eva", "2024-08-05 13:00:00")
]

# Create the DataFrame and cast last_modified to a timestamp BEFORE persisting,
# so the Delta table stores the same type as the incremental rows merged in
# below (writing first and casting afterwards stored the column as a string).
columns = ["id", "name", "last_modified"]
df_initial = spark.createDataFrame(initial_data, columns).withColumn(
    "last_modified", col("last_modified").cast("timestamp")
)

# Write the initial data to a Delta table. overwriteSchema lets a re-run
# replace a table that a previous run created with a string last_modified.
(
    df_initial.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("dbfs:/user/hive/warehouse/initial_load")
)

# Get the watermark value (latest timestamp)
watermark_df = df_initial.selectExpr("max(last_modified) as max_last_modified")

# Write the watermark to another Delta table
watermark_df.write.format("delta").mode("overwrite").save("dbfs:/user/hive/warehouse/watermark_table")

# Define the incremental data
incremental_data = [
    (2, "Bob Smith", "2024-08-06 11:30:00"),  # Updated record for id=2
    (5, "Eva", "2024-08-07 13:00:00"),        # Updated record for id=5
    (6, "Frank", "2024-08-07 15:00:00"),      # New record
    (7, "Grace", "2024-08-07 16:00:00")       # New record
]

# Create a DataFrame with last_modified as a timestamp
df_incremental = spark.createDataFrame(incremental_data, columns).withColumn(
    "last_modified", col("last_modified").cast("timestamp")
)

# Read the last watermark (None when the table holds no non-null value)
last_load_timestamp = (
    spark.read.format("delta")
    .load("dbfs:/user/hive/warehouse/watermark_table")
    .selectExpr("max(max_last_modified)")
    .collect()[0][0]
)

# Filter the incremental data based on the last load timestamp. The value is
# compared as a Column literal instead of being interpolated into a SQL string;
# with no watermark the string form became "last_modified > 'None'" and
# silently matched nothing, so that case now loads every row.
if last_load_timestamp is None:
    new_changes_df = df_incremental
else:
    new_changes_df = df_incremental.filter(col("last_modified") > last_load_timestamp)

# Load the existing Delta table
deltaTable = DeltaTable.forPath(spark, "dbfs:/user/hive/warehouse/initial_load")

# Merge new changes into the existing Delta table: rows matching on id are
# updated in full, unmatched source rows are inserted.
deltaTable.alias("target").merge(
    new_changes_df.alias("source"),
    "target.id = source.id"
).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

# Update the watermark with the new last_modified (None if the batch was empty)
new_last_modified = new_changes_df.selectExpr("max(last_modified) as max_last_modified").collect()[0][0]

# Save the new watermark only when the batch had rows: createDataFrame cannot
# infer a column type from a lone None, and overwriting would discard the
# existing watermark.
if new_last_modified is not None:
    (
        spark.createDataFrame([(new_last_modified,)], ["max_last_modified"])
        .write.format("delta")
        .mode("overwrite")
        .save("dbfs:/user/hive/warehouse/watermark_table")
    )



In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Get (or reuse) the Spark session under the "Inc Load" application name.
spark = SparkSession.builder.appName("Inc Load").getOrCreate()

# Read the Bitcoin CSV, treating its first line as the column names.
file_path = "/FileStore/tables/Bitcoin_Merged_Data.csv"
csv_reader = spark.read.option("header", True)
df_initial = csv_reader.csv(file_path)

# Preview the first rows, then leave the column names as the cell's output.
df_initial.show()
df_initial.columns

+--------------------+-------+------+-------------+
|           Timestamp|  Price|Volume|   Market Cap|
+--------------------+-------+------+-------------+
|2013-11-30 00:00:...|1127.45|   0.0|13593664650.0|
|2013-12-01 00:00:...|1033.39|   0.0|12464724345.0|
|2013-12-02 00:00:...| 974.03|   0.0|11753157346.0|
|2013-12-03 00:00:...|1078.64|   0.0|13020236474.0|
|2013-12-04 00:00:...|1121.48|   0.0|13542487814.0|
|2013-12-05 00:00:...| 989.04|   0.0|11947405392.0|
|2013-12-06 00:00:...|1019.66|   0.0|12322234219.0|
|2013-12-07 00:00:...| 811.98|   0.0| 9816777302.0|
|2013-12-08 00:00:...| 743.11|   0.0| 8987358118.0|
|2013-12-09 00:00:...| 897.89|   0.0|10863885372.0|
|2013-12-10 00:00:...| 927.78|   0.0|11230173843.0|
|2013-12-11 00:00:...| 887.08|   0.0|10741252534.0|
|2013-12-12 00:00:...| 878.17|   0.0|10636987805.0|
|2013-12-13 00:00:...| 897.27|   0.0|10872198158.0|
|2013-12-14 00:00:...| 866.99|   0.0|10509219285.0|
|2013-12-15 00:00:...| 854.62|   0.0|10363485334.0|
|2013-12-16 

In [0]:
# Rename "Market Cap" so the column name contains no space before the frame is
# written to Delta.
df_initial = df_initial.withColumnRenamed("Market Cap", "Market_Cap")

# Cast the Timestamp column (read from the CSV as text) to a timestamp type.
# The previously commented-out cast referred to "Timestamp_", which is not a
# column of this frame.
df_initial = df_initial.withColumn("Timestamp", col("Timestamp").cast("timestamp"))

# Write the DataFrame to a Delta table. The merge step later in the notebook
# loads this exact path, so the table has to be created here.
df_initial.write.format("delta").mode("overwrite").save("dbfs:/user/hive/warehouse/load")


In [0]:
# Calculate the watermark (latest timestamp). The CSV is read without schema
# inference, so Timestamp may still be a string here; cast inside the aggregate
# so the watermark is always a timestamp. This matches the type every other
# cell writes to the watermark table, and the cast is a no-op if the column is
# already a timestamp.
watermark_df = df_initial.selectExpr("max(cast(Timestamp as timestamp)) as max_last_modified")

# Write the watermark value to another Delta table
watermark_df.write.format("delta").mode("overwrite").save("dbfs:/user/hive/warehouse/watermark_table")


In [0]:
# Incremental batch for the Bitcoin table: (Timestamp text, Price, Volume, Market Cap).
incremental_data = [
    ("2024-08-06 11:30:00", 15000, 1.5, 300000),
    ("2024-08-07 13:00:00", 20000, 2.0, 400000),
    ("2024-08-07 15:00:00", 30000, 3.0, 600000),
    ("2024-08-07 16:00:00", 25000, 2.5, 500000)
]

# Column names use the CSV header spelling, including the space in "Market Cap".
columns = ["Timestamp", "Price", "Volume", "Market Cap"]

# Build the frame and convert Timestamp from text to a timestamp in one step.
df_incremental = spark.createDataFrame(incremental_data, columns).withColumn(
    "Timestamp",
    col("Timestamp").cast("timestamp"),
)



In [0]:
# Read the last watermark: the largest max_last_modified stored in the
# watermark table (None when the table holds no non-null value).
last_load_timestamp = (
    spark.read.format("delta")
    .load("dbfs:/user/hive/warehouse/watermark_table")
    .selectExpr("max(max_last_modified)")
    .collect()[0][0]
)

# Keep only rows newer than the watermark. Comparing against None yields NULL
# for every row, which would silently drop the whole batch, so a missing
# watermark is treated as "load everything".
if last_load_timestamp is None:
    new_changes_df = df_incremental
else:
    new_changes_df = df_incremental.filter(col("Timestamp") > last_load_timestamp)


In [0]:
# Give both frames the space-free column name "Market_Cap"
# (withColumnRenamed does nothing when "Market Cap" is not present).
df_initial = df_initial.withColumnRenamed("Market Cap", "Market_Cap")
new_changes_df = new_changes_df.withColumnRenamed("Market Cap", "Market_Cap")

# Handle on the Delta table stored at the "load" path.
deltaTable = DeltaTable.forPath(spark, "dbfs:/user/hive/warehouse/load")

# Upsert keyed on Timestamp: matching target rows are overwritten column for
# column from the source; source rows with no match are inserted.
upsert = (
    deltaTable.alias("target")
    .merge(new_changes_df.alias("source"), "target.Timestamp = source.Timestamp")
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
)
upsert.execute()



In [0]:
# Newest Timestamp in the batch that was just merged (None if the batch was
# empty).
new_last_modified = new_changes_df.selectExpr("max(Timestamp) as max_last_modified").collect()[0][0]

# Advance the watermark only when the batch had rows. With an empty batch the
# value is None: createDataFrame cannot infer a column type from a lone None,
# and overwriting would discard the existing watermark anyway.
if new_last_modified is not None:
    (
        spark.createDataFrame([(new_last_modified,)], ["max_last_modified"])
        .write.format("delta")
        .mode("overwrite")
        .save("dbfs:/user/hive/warehouse/watermark_table")
    )


In [0]:
# End-to-end watermark-based incremental load for the Bitcoin CSV:
# initial load -> store watermark -> filter new batch -> merge -> advance watermark.
# DeltaTable is imported here so the cell does not depend on an earlier cell
# having been run in the same kernel.
from delta.tables import DeltaTable
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Creating a SparkSession
spark = SparkSession.builder.appName("Inc Load").getOrCreate()

# Reading the CSV file into a DataFrame (first line is the header)
file_path = "/FileStore/tables/Bitcoin_Merged_Data.csv"
df_initial = spark.read.option("header", True).csv(file_path)
df_initial.show()

# Rename "Market Cap" so the column name contains no space before writing to Delta
df_initial = df_initial.withColumnRenamed("Market Cap", "Market_Cap")

# Cast the Timestamp column (read from the CSV as text) to a timestamp type
df_initial = df_initial.withColumn("Timestamp", col("Timestamp").cast("timestamp"))

# Write the DataFrame to a Delta table
df_initial.write.format("delta").mode("overwrite").save("dbfs:/user/hive/warehouse/load")

# Calculate the watermark (latest timestamp)
watermark_df = df_initial.selectExpr("max(Timestamp) as max_last_modified")

# Write the watermark value to another Delta table
watermark_df.write.format("delta").mode("overwrite").save("dbfs:/user/hive/warehouse/watermark_table")

# Define the incremental data: (Timestamp text, Price, Volume, Market Cap)
incremental_data = [
    ("2024-08-06 11:30:00", 15000, 1.5, 300000),
    ("2024-08-07 13:00:00", 20000, 2.0, 400000),
    ("2024-08-07 15:00:00", 30000, 3.0, 600000),
    ("2024-08-07 16:00:00", 25000, 2.5, 500000)
]

columns = ["Timestamp", "Price", "Volume", "Market Cap"]

# Create a DataFrame with the incremental data
df_incremental = spark.createDataFrame(incremental_data, columns)

# Cast the Timestamp column to a timestamp type
df_incremental = df_incremental.withColumn("Timestamp", col("Timestamp").cast("timestamp"))

# Read the last watermark (None when the table holds no non-null value)
last_load_timestamp = (
    spark.read.format("delta")
    .load("dbfs:/user/hive/warehouse/watermark_table")
    .selectExpr("max(max_last_modified)")
    .collect()[0][0]
)

# Filter the incremental data based on the last load timestamp. Comparing
# against None yields NULL for every row and would drop the whole batch, so a
# missing watermark is treated as "load everything".
if last_load_timestamp is None:
    new_changes_df = df_incremental
else:
    new_changes_df = df_incremental.filter(col("Timestamp") > last_load_timestamp)

# Rename the incremental frame's "Market Cap" to match the Delta table's
# column name (df_initial was already renamed above).
new_changes_df = new_changes_df.withColumnRenamed("Market Cap", "Market_Cap")

# Load the Delta table
deltaTable = DeltaTable.forPath(spark, "dbfs:/user/hive/warehouse/load")

# Merge new changes into the existing Delta table: rows matching on Timestamp
# are updated in full, unmatched source rows are inserted.
deltaTable.alias("target").merge(
    new_changes_df.alias("source"),
    "target.Timestamp = source.Timestamp"
).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

# Update the watermark with the new last_modified timestamp (None if the batch
# was empty). Skip the write in that case: createDataFrame cannot infer a
# column type from a lone None, and overwriting would discard the watermark.
new_last_modified = new_changes_df.selectExpr("max(Timestamp) as max_last_modified").collect()[0][0]
if new_last_modified is not None:
    (
        spark.createDataFrame([(new_last_modified,)], ["max_last_modified"])
        .write.format("delta")
        .mode("overwrite")
        .save("dbfs:/user/hive/warehouse/watermark_table")
    )

# Show the merged table contents and its schema
deltaTable.toDF().show()

deltaTable.toDF().printSchema()



+--------------------+-------+------+-------------+
|           Timestamp|  Price|Volume|   Market Cap|
+--------------------+-------+------+-------------+
|2013-11-30 00:00:...|1127.45|   0.0|13593664650.0|
|2013-12-01 00:00:...|1033.39|   0.0|12464724345.0|
|2013-12-02 00:00:...| 974.03|   0.0|11753157346.0|
|2013-12-03 00:00:...|1078.64|   0.0|13020236474.0|
|2013-12-04 00:00:...|1121.48|   0.0|13542487814.0|
|2013-12-05 00:00:...| 989.04|   0.0|11947405392.0|
|2013-12-06 00:00:...|1019.66|   0.0|12322234219.0|
|2013-12-07 00:00:...| 811.98|   0.0| 9816777302.0|
|2013-12-08 00:00:...| 743.11|   0.0| 8987358118.0|
|2013-12-09 00:00:...| 897.89|   0.0|10863885372.0|
|2013-12-10 00:00:...| 927.78|   0.0|11230173843.0|
|2013-12-11 00:00:...| 887.08|   0.0|10741252534.0|
|2013-12-12 00:00:...| 878.17|   0.0|10636987805.0|
|2013-12-13 00:00:...| 897.27|   0.0|10872198158.0|
|2013-12-14 00:00:...| 866.99|   0.0|10509219285.0|
|2013-12-15 00:00:...| 854.62|   0.0|10363485334.0|
|2013-12-16 