In [0]:
from pyspark.sql.functions import avg
from pyspark.sql.window import Window


In [0]:
from pyspark.sql import SparkSession

spark=SparkSession.builder.appName("stockanalysis").getOrCreate()


data = [
("AAPL", "2023-08-26 09:00:00", 150.0),
("AAPL", "2023-08-26 09:30:00", 152.0),
("AAPL", "2023-08-26 10:00:00", 153.5),
("AAPL", "2023-08-26 10:30:00", 155.0),
("AAPL", "2023-08-26 11:00:00", 154.5),
("AAPL", "2023-08-26 11:30:00", 156.0),
("AAPL", "2023-08-26 12:00:00", 157.0),
("AAPL", "2023-08-26 12:30:00", 158.0),
("AAPL", "2023-08-26 13:00:00", 158.5),
("AAPL", "2023-08-26 13:30:00", 160.0),
("AAPL", "2023-08-26 14:00:00", 159.0),
]
schema = ["symbol", "timestamp", "price"]

df=spark.createDataFrame(data,schema)
df.show()


+------+-------------------+-----+
|symbol|          timestamp|price|
+------+-------------------+-----+
|  AAPL|2023-08-26 09:00:00|150.0|
|  AAPL|2023-08-26 09:30:00|152.0|
|  AAPL|2023-08-26 10:00:00|153.5|
|  AAPL|2023-08-26 10:30:00|155.0|
|  AAPL|2023-08-26 11:00:00|154.5|
|  AAPL|2023-08-26 11:30:00|156.0|
|  AAPL|2023-08-26 12:00:00|157.0|
|  AAPL|2023-08-26 12:30:00|158.0|
|  AAPL|2023-08-26 13:00:00|158.5|
|  AAPL|2023-08-26 13:30:00|160.0|
|  AAPL|2023-08-26 14:00:00|159.0|
+------+-------------------+-----+



<h3>convert timestamp from sting to timestamp type</h3>

In [0]:
df=df.withColumn("timestamp",df.timestamp.cast("timestamp"))
df.show()

+------+-------------------+-----+
|symbol|          timestamp|price|
+------+-------------------+-----+
|  AAPL|2023-08-26 09:00:00|150.0|
|  AAPL|2023-08-26 09:30:00|152.0|
|  AAPL|2023-08-26 10:00:00|153.5|
|  AAPL|2023-08-26 10:30:00|155.0|
|  AAPL|2023-08-26 11:00:00|154.5|
|  AAPL|2023-08-26 11:30:00|156.0|
|  AAPL|2023-08-26 12:00:00|157.0|
|  AAPL|2023-08-26 12:30:00|158.0|
|  AAPL|2023-08-26 13:00:00|158.5|
|  AAPL|2023-08-26 13:30:00|160.0|
|  AAPL|2023-08-26 14:00:00|159.0|
+------+-------------------+-----+



<h3>calculate the rolling avg price within 3 hour </h3>

In [0]:
win=Window.partitionBy("symbol").orderBy("timestamp").rowsBetween(-3,0)
df.withColumn("roling_avg_price",avg("price").over(win)).show()

+------+-------------------+-----+------------------+
|symbol|          timestamp|price|  roling_avg_price|
+------+-------------------+-----+------------------+
|  AAPL|2023-08-26 09:00:00|150.0|             150.0|
|  AAPL|2023-08-26 09:30:00|152.0|             151.0|
|  AAPL|2023-08-26 10:00:00|153.5|151.83333333333334|
|  AAPL|2023-08-26 10:30:00|155.0|           152.625|
|  AAPL|2023-08-26 11:00:00|154.5|            153.75|
|  AAPL|2023-08-26 11:30:00|156.0|            154.75|
|  AAPL|2023-08-26 12:00:00|157.0|           155.625|
|  AAPL|2023-08-26 12:30:00|158.0|           156.375|
|  AAPL|2023-08-26 13:00:00|158.5|           157.375|
|  AAPL|2023-08-26 13:30:00|160.0|           158.375|
|  AAPL|2023-08-26 14:00:00|159.0|           158.875|
+------+-------------------+-----+------------------+

