In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum, year, lag
from pyspark.sql.window import Window

In [2]:
# Initialize Spark session
#.config("spark.jars.packages", "net.snowflake:spark-snowflake_2.12:2.10.1-spark_3.1") \
# .config("spark.port.maxRetries", "50") \ # Avoid port conflicts \
# .config("spark.driver.memory", "8g")
spark = SparkSession.builder \
    .appName("TPC-DS Revenue and Net Profit Analysis") \
    .config("spark.driver.extraJavaOptions", "-Dio.netty.noPreferDirect=true") \
    .config("spark.executor.extraJavaOptions", "-Dio.netty.noPreferDirect=true") \
    .config("spark.jars.packages", "net.snowflake:spark-snowflake_2.13:3.0.0") \
    .config("spark.jars.packages", "net.snowflake:snowflake-jdbc:3.13.33") \
    .config("spark.driver.memory", "4g") \
    .config("spark.port.maxRetries", "50") \
    .getOrCreate()

# Snowflake connection options
sfOptions = {
    "sfURL" : "https://pdxuwmc-wb15506.snowflakecomputing.com",
    "sfUser" : "Bob35",
    "sfPassword" : "Box35@snowf",
    "sfDatabase" : "SNOWFLAKE_SAMPLE_DATA",
    "sfSchema" : "TPCDS_SF10TCL",
    "sfRole" : "ACCOUNTADMIN", 
    
}

# Function to read Snowflake tables
'''
def read_snowflake_table(table_name):
    return spark.read \
        .format("snowflake") \
        .options(**sfOptions) \
        .option("dbtable", table_name) \
        .load()
'''
# Enhanced function to read Snowflake tables
# Enhanced function to read Snowflake tables
def read_snowflake_table(table_name=None, sql_string=None):
    options = sfOptions.copy()
    if sql_string:
        options["query"] = sql_string
    elif table_name:
        options["dbtable"] = table_name
    else:
        raise ValueError("Either table_name or sql_string must be provided.")
    
    return spark.read.format("snowflake").options(**options).load()

24/10/21 19:41:16 WARN Utils: Your hostname, B560F resolves to a loopback address: 127.0.1.1; using 192.168.68.56 instead (on interface enp4s0)
24/10/21 19:41:16 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/bob35/.ivy2/cache
The jars for the packages stored in: /home/bob35/.ivy2/jars
net.snowflake#snowflake-jdbc added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-99542331-265a-4d40-8842-c91621ecee4d;1.0
	confs: [default]
	found net.snowflake#snowflake-jdbc;3.13.33 in central
:: resolution report :: resolve 94ms :: artifacts dl 3ms
	:: modules in use:
	net.snowflake#snowflake-jdbc;3.13.33 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   1   |   0   |   0   |   0   ||   1   |   0   |
	---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent-99542331-265a-4d40-8842-c91621ecee4d
	

# Load tables from TPCDS_SF10TCL

store_sales = read_snowflake_table("store_sales")
web_sales = read_snowflake_table("web_sales")
catalog_sales = read_snowflake_table("catalog_sales")
date_dim = read_snowflake_table("date_dim")

In [3]:
# Find the latest year with data in store_sales
# Load date_dim table to get the most recent year
date_dim = read_snowflake_table(table_name="date_dim")

# Find the latest year with data in store_sales
store_sales_latest_year = read_snowflake_table(sql_string="""
    SELECT max(d_year) as max_year
    FROM store_sales
    JOIN date_dim ON store_sales.ss_sold_date_sk = date_dim.d_date_sk
""").collect()[0]["MAX_YEAR"]

# Find the latest year with data in web_sales
web_sales_latest_year = read_snowflake_table(sql_string="""
    SELECT max(d_year) as max_year
    FROM web_sales
    JOIN date_dim ON web_sales.ws_sold_date_sk = date_dim.d_date_sk
""").collect()[0]["MAX_YEAR"]

# Find the latest year with data in catalog_sales
catalog_sales_latest_year = read_snowflake_table(sql_string="""
    SELECT max(d_year) as max_year
    FROM catalog_sales
    JOIN date_dim ON catalog_sales.cs_sold_date_sk = date_dim.d_date_sk
""").collect()[0]["MAX_YEAR"]

# Determine the overall latest year with data
latest_year_with_data = min(store_sales_latest_year, web_sales_latest_year, catalog_sales_latest_year)
print(f"Latest Year with Data: {latest_year_with_data}")

24/10/21 19:41:23 WARN SnowflakeConnectorUtils$: Query pushdown is not supported because you are using Spark 3.5.3 with a connector designed to support Spark 3.1. Either use the version of Spark supported by the connector or install a version of the connector that supports your version of Spark.
24/10/21 19:41:23 WARN DefaultJDBCWrapper$: JDBC 3.13.33 is being used. But the certified JDBC version 3.13.14 is recommended.
24/10/21 19:41:25 WARN SnowflakeConnectorUtils$: Query pushdown is not supported because you are using Spark 3.5.3 with a connector designed to support Spark 3.1. Either use the version of Spark supported by the connector or install a version of the connector that supports your version of Spark.
24/10/21 19:41:25 WARN DefaultJDBCWrapper$: JDBC 3.13.33 is being used. But the certified JDBC version 3.13.14 is recommended.
24/10/21 19:41:26 WARN DefaultJDBCWrapper$: JDBC 3.13.33 is being used. But the certified JDBC version 3.13.14 is recommended.
24/10/21 19:41:38 WARN Sn

Latest Year with Data: 2003


In [None]:
# Filter data to include only the nearest 3 or 5 years
years_to_load = 2  # Change to 3 if you want to load the nearest 3 years
date_dim_filtered = date_dim.filter(col("d_year") >= (latest_year_with_data - years_to_load + 1))

# Load store_sales, web_sales, and catalog_sales tables with filtering
store_sales_query = f"""
    SELECT * FROM store_sales 
    WHERE ss_sold_date_sk IN 
    (SELECT d_date_sk FROM date_dim WHERE d_year >= {latest_year_with_data - years_to_load + 1})
"""
store_sales = read_snowflake_table(sql_string=store_sales_query).alias("store_sales")

web_sales_query = f"""
    SELECT * FROM web_sales 
    WHERE ws_sold_date_sk IN 
    (SELECT d_date_sk FROM date_dim WHERE d_year >= {latest_year_with_data - years_to_load + 1})
"""
web_sales = read_snowflake_table(sql_string=web_sales_query).alias("web_sales")

catalog_sales_query = f"""
    SELECT * FROM catalog_sales 
    WHERE cs_sold_date_sk IN 
    (SELECT d_date_sk FROM date_dim WHERE d_year >= {latest_year_with_data - years_to_load + 1})
"""
catalog_sales = read_snowflake_table(sql_string=catalog_sales_query).alias("catalog_sales")

# Verify the data loading
print("Store Sales Data:")
store_sales.show(5)
print("Web Sales Data:")
web_sales.show(5)
print("Catalog Sales Data:")
catalog_sales.show(5)
print("Date Dimension Data:")
date_dim_filtered.show(5)

24/10/21 19:41:59 WARN SnowflakeConnectorUtils$: Query pushdown is not supported because you are using Spark 3.5.3 with a connector designed to support Spark 3.1. Either use the version of Spark supported by the connector or install a version of the connector that supports your version of Spark.
24/10/21 19:41:59 WARN DefaultJDBCWrapper$: JDBC 3.13.33 is being used. But the certified JDBC version 3.13.14 is recommended.
24/10/21 19:41:59 WARN SnowflakeConnectorUtils$: Query pushdown is not supported because you are using Spark 3.5.3 with a connector designed to support Spark 3.1. Either use the version of Spark supported by the connector or install a version of the connector that supports your version of Spark.
24/10/21 19:41:59 WARN DefaultJDBCWrapper$: JDBC 3.13.33 is being used. But the certified JDBC version 3.13.14 is recommended.
24/10/21 19:42:00 WARN SnowflakeConnectorUtils$: Query pushdown is not supported because you are using Spark 3.5.3 with a connector designed to support 

Store Sales Data:


24/10/21 19:42:00 WARN DefaultJDBCWrapper$: JDBC 3.13.33 is being used. But the certified JDBC version 3.13.14 is recommended.


In [5]:
date_dim.printSchema()

root
 |-- D_DATE_SK: decimal(38,0) (nullable = false)
 |-- D_DATE_ID: string (nullable = false)
 |-- D_DATE: date (nullable = true)
 |-- D_MONTH_SEQ: decimal(38,0) (nullable = true)
 |-- D_WEEK_SEQ: decimal(38,0) (nullable = true)
 |-- D_QUARTER_SEQ: decimal(38,0) (nullable = true)
 |-- D_YEAR: decimal(38,0) (nullable = true)
 |-- D_DOW: decimal(38,0) (nullable = true)
 |-- D_MOY: decimal(38,0) (nullable = true)
 |-- D_DOM: decimal(38,0) (nullable = true)
 |-- D_QOY: decimal(38,0) (nullable = true)
 |-- D_FY_YEAR: decimal(38,0) (nullable = true)
 |-- D_FY_QUARTER_SEQ: decimal(38,0) (nullable = true)
 |-- D_FY_WEEK_SEQ: decimal(38,0) (nullable = true)
 |-- D_DAY_NAME: string (nullable = true)
 |-- D_QUARTER_NAME: string (nullable = true)
 |-- D_HOLIDAY: string (nullable = true)
 |-- D_WEEKEND: string (nullable = true)
 |-- D_FOLLOWING_HOLIDAY: string (nullable = true)
 |-- D_FIRST_DOM: decimal(38,0) (nullable = true)
 |-- D_LAST_DOM: decimal(38,0) (nullable = true)
 |-- D_SAME_DAY_LY: d

In [6]:
# Calculate Revenue and Net Profit

# Join sales data with date_dim to get the year
# store_sales = store_sales.join(date_dim, store_sales["ss_sold_date_sk"] == date_dim["d_date_sk"])
# web_sales = web_sales.join(date_dim, web_sales["ws_sold_date_sk"] == date_dim["d_date_sk"])
# catalog_sales = catalog_sales.join(date_dim, catalog_sales["cs_sold_date_sk"] == date_dim["d_date_sk"])

# Alias the DataFrames to avoid ambiguity
store_sales_alias = store_sales.alias("ss")
web_sales_alias = web_sales.alias("ws")
catalog_sales_alias = catalog_sales.alias("cs")
date_dim_alias = date_dim.alias("dd")

# Perform the join on store_sales with date_dim
store_sales = store_sales_alias.join(
    date_dim_alias,
    store_sales_alias["ss_sold_date_sk"] == date_dim_alias["d_date_sk"],
    "inner"
).select("ss.*", "dd.d_year")

# Perform the join on web_sales with date_dim
web_sales = web_sales_alias.join(
    date_dim_alias,
    web_sales_alias["ws_sold_date_sk"] == date_dim_alias["d_date_sk"],
    "inner"
).select("ws.*", "dd.d_year")

# Perform the join on catalog_sales with date_dim
catalog_sales = catalog_sales_alias.join(
    date_dim_alias,
    catalog_sales_alias["cs_sold_date_sk"] == date_dim_alias["d_date_sk"],
    "inner"
).select("cs.*", "dd.d_year")

store_sales = store_sales.repartition(col("d_year"))
web_sales = web_sales.repartition(col("d_year"))
catalog_sales = catalog_sales.repartition(col("d_year"))


# Calculate yearly revenue and net profit
store_revenue = store_sales.groupBy(col("dd.d_year").alias("year")).agg(
    spark_sum("ss_sales_price").alias("store_revenue"),
    spark_sum("ss_net_profit").alias("store_net_profit")
)

web_revenue = web_sales.groupBy(col("dd.d_year").alias("year")).agg(
    spark_sum("ws_sales_price").alias("web_revenue"),
    spark_sum("ws_net_profit").alias("web_net_profit")
)

catalog_revenue = catalog_sales.groupBy(col("dd.d_year").alias("year")).agg(
    spark_sum("cs_sales_price").alias("catalog_revenue"),
    spark_sum("cs_net_profit").alias("catalog_net_profit")
)

# Combine all revenue and net profit data
total_revenue = store_revenue.join(web_revenue, "year").join(catalog_revenue, "year")
total_revenue = total_revenue.withColumn("total_revenue", 
    total_revenue["store_revenue"] + total_revenue["web_revenue"] + total_revenue["catalog_revenue"])
total_revenue = total_revenue.withColumn("total_net_profit", 
    total_revenue["store_net_profit"] + total_revenue["web_net_profit"] + total_revenue["catalog_net_profit"])


In [7]:
# Define a window specification  truncate concatenate
window_spec = Window.partitionBy("year").orderBy("year")

# Calculate the previous year's revenue and net profit
# total_revenue = total_revenue.withColumn("prev_year_revenue", lag("total_revenue").over(window_spec))
total_revenue = total_revenue.withColumn("prev_year_net_profit", lag("total_net_profit").over(window_spec))

# Calculate the growth rates
# total_revenue = total_revenue.withColumn("revenue_growth_rate", 
#    (col("total_revenue") - col("prev_year_revenue")) / col("prev_year_revenue") * 100)
total_revenue = total_revenue.withColumn("net_profit_growth_rate", 
    (col("total_net_profit") - col("prev_year_net_profit")) / col("prev_year_net_profit") * 100)

# Show the results
# total_revenue.select("year", "total_revenue", "total_net_profit", "revenue_growth_rate", "net_profit_growth_rate").show()
total_revenue.select("year", "total_revenue", "total_net_profit", "net_profit_growth_rate").show()

24/10/21 13:58:31 WARN DefaultJDBCWrapper$: JDBC 3.13.33 is being used. But the certified JDBC version 3.13.14 is recommended.
24/10/21 13:58:32 WARN DefaultJDBCWrapper$: JDBC 3.13.33 is being used. But the certified JDBC version 3.13.14 is recommended.
24/10/21 13:58:33 WARN DefaultJDBCWrapper$: JDBC 3.13.33 is being used. But the certified JDBC version 3.13.14 is recommended.
24/10/21 13:58:34 WARN DefaultJDBCWrapper$: JDBC 3.13.33 is being used. But the certified JDBC version 3.13.14 is recommended.
24/10/21 13:58:35 WARN DefaultJDBCWrapper$: JDBC 3.13.33 is being used. But the certified JDBC version 3.13.14 is recommended.
24/10/21 13:58:37 WARN DefaultJDBCWrapper$: JDBC 3.13.33 is being used. But the certified JDBC version 3.13.14 is recommended.
24/10/21 14:01:38 ERROR SnowflakeChunkDownloader: downloader encountered error: Max retry reached for the download of #chunk0 (Total chunks: 2) retry=10, error=net.snowflake.client.jdbc.SnowflakeSQLLoggedException: JDBC driver internal er

Py4JJavaError: An error occurred while calling o238.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 8 in stage 7.0 failed 1 times, most recent failure: Lost task 8.0 in stage 7.0 (TID 15) (192.168.68.56 executor driver): java.lang.NullPointerException
	at net.snowflake.client.jdbc.telemetryOOB.TelemetryService$TELEMETRY_SERVER_DEPLOYMENT.access$000(TelemetryService.java:270)
	at net.snowflake.client.jdbc.telemetryOOB.TelemetryService.getServerDeploymentName(TelemetryService.java:309)
	at net.snowflake.client.jdbc.telemetryOOB.TelemetryEvent$Builder.<init>(TelemetryEvent.java:126)
	at net.snowflake.client.jdbc.telemetryOOB.TelemetryEvent$LogBuilder.<init>(TelemetryEvent.java:61)
	at net.snowflake.client.jdbc.SnowflakeSQLLoggedException.sendOutOfBandTelemetryMessage(SnowflakeSQLLoggedException.java:53)
	at net.snowflake.client.jdbc.SnowflakeSQLLoggedException.sendTelemetryData(SnowflakeSQLLoggedException.java:220)
	at net.snowflake.client.jdbc.SnowflakeSQLLoggedException.<init>(SnowflakeSQLLoggedException.java:245)
	at net.snowflake.client.jdbc.SnowflakeChunkDownloader.getNextChunkToConsume(SnowflakeChunkDownloader.java:599)
	at net.snowflake.client.core.SFArrowResultSet.fetchNextRowUnsorted(SFArrowResultSet.java:232)
	at net.snowflake.client.core.SFArrowResultSet.fetchNextRow(SFArrowResultSet.java:209)
	at net.snowflake.client.core.SFArrowResultSet.next(SFArrowResultSet.java:344)
	at net.snowflake.client.jdbc.SnowflakeResultSetV1.next(SnowflakeResultSetV1.java:92)
	at net.snowflake.spark.snowflake.io.ResultIterator.hasNext(SnowflakeResultSetRDD.scala:152)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: java.lang.NullPointerException
	at net.snowflake.client.jdbc.telemetryOOB.TelemetryService$TELEMETRY_SERVER_DEPLOYMENT.access$000(TelemetryService.java:270)
	at net.snowflake.client.jdbc.telemetryOOB.TelemetryService.getServerDeploymentName(TelemetryService.java:309)
	at net.snowflake.client.jdbc.telemetryOOB.TelemetryEvent$Builder.<init>(TelemetryEvent.java:126)
	at net.snowflake.client.jdbc.telemetryOOB.TelemetryEvent$LogBuilder.<init>(TelemetryEvent.java:61)
	at net.snowflake.client.jdbc.SnowflakeSQLLoggedException.sendOutOfBandTelemetryMessage(SnowflakeSQLLoggedException.java:53)
	at net.snowflake.client.jdbc.SnowflakeSQLLoggedException.sendTelemetryData(SnowflakeSQLLoggedException.java:220)
	at net.snowflake.client.jdbc.SnowflakeSQLLoggedException.<init>(SnowflakeSQLLoggedException.java:245)
	at net.snowflake.client.jdbc.SnowflakeChunkDownloader.getNextChunkToConsume(SnowflakeChunkDownloader.java:599)
	at net.snowflake.client.core.SFArrowResultSet.fetchNextRowUnsorted(SFArrowResultSet.java:232)
	at net.snowflake.client.core.SFArrowResultSet.fetchNextRow(SFArrowResultSet.java:209)
	at net.snowflake.client.core.SFArrowResultSet.next(SFArrowResultSet.java:344)
	at net.snowflake.client.jdbc.SnowflakeResultSetV1.next(SnowflakeResultSetV1.java:92)
	at net.snowflake.spark.snowflake.io.ResultIterator.hasNext(SnowflakeResultSetRDD.scala:152)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)


24/10/21 14:01:45 WARN TaskSetManager: Lost task 2.0 in stage 7.0 (TID 9) (192.168.68.56 executor driver): TaskKilled (Stage cancelled: Job aborted due to stage failure: Task 8 in stage 7.0 failed 1 times, most recent failure: Lost task 8.0 in stage 7.0 (TID 15) (192.168.68.56 executor driver): java.lang.NullPointerException
	at net.snowflake.client.jdbc.telemetryOOB.TelemetryService$TELEMETRY_SERVER_DEPLOYMENT.access$000(TelemetryService.java:270)
	at net.snowflake.client.jdbc.telemetryOOB.TelemetryService.getServerDeploymentName(TelemetryService.java:309)
	at net.snowflake.client.jdbc.telemetryOOB.TelemetryEvent$Builder.<init>(TelemetryEvent.java:126)
	at net.snowflake.client.jdbc.telemetryOOB.TelemetryEvent$LogBuilder.<init>(TelemetryEvent.java:61)
	at net.snowflake.client.jdbc.SnowflakeSQLLoggedException.sendOutOfBandTelemetryMessage(SnowflakeSQLLoggedException.java:53)
	at net.snowflake.client.jdbc.SnowflakeSQLLoggedException.sendTelemetryData(SnowflakeSQLLoggedException.java:220)

In [None]:
import matplotlib.pyplot as plt

# Convert the Spark DataFrame to a Pandas DataFrame for visualization
total_revenue_pd = total_revenue.toPandas()

# Plot revenue growth rate
plt.figure(figsize=(14, 7))
plt.plot(total_revenue_pd['year'], total_revenue_pd['revenue_growth_rate'], marker='o', label='Revenue Growth Rate')
plt.plot(total_revenue_pd['year'], total_revenue_pd['net_profit_growth_rate'], marker='o', label='Net Profit Growth Rate')
plt.xlabel('Year')
plt.ylabel('Growth Rate (%)')
plt.title('Revenue and Net Profit Growth Rates Over the Years')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# Stop the Spark session
spark.stop()