# Dates and Timestamps

In [None]:
# findspark.init() is called before the pyspark import so that the Spark
# installation can be located first.
import findspark
findspark.init()
from pyspark.sql import SparkSession

# Reuse an existing session if there is one, otherwise create a new one
# under the application name "dates".
spark = (
    SparkSession.builder
    .appName("dates")
    .getOrCreate()
)

In [None]:
# Load the stock CSV: the first line is treated as the header and column
# types are inferred from the data.
df = spark.read.csv(
    "../datasets/appl_stock.csv",
    inferSchema=True,
    header=True,
)

In [None]:
# Preview the first five rows without shortening long cell values.
df.show(n=5, truncate=False)

Let's walk through how to extract individual parts (day, hour, month, year, ...) from the timestamp data.

In [None]:
from pyspark.sql.functions import format_number,dayofmonth,hour,dayofyear,month,year,weekofyear,date_format

In [None]:
# Day-of-month component of every value in the "Date" column.
df.select(dayofmonth("Date")).show()

In [None]:
# Hour component of every value in the "Date" column.
df.select(hour("Date")).show()

In [None]:
# Day-of-year component of every value in the "Date" column.
df.select(dayofyear("Date")).show()

In [None]:
# Month component of every value in the "Date" column.
df.select(month("Date")).show()

For example, suppose we want to know the average closing price per year. This is easy to compute by combining a groupBy with the year() function:

In [None]:
# Year component of the "Date" column, first five rows only.
df.select(year("Date")).show(n=5, truncate=False)

In [None]:
# Preview df with an extra "Year" column derived from "Date"; df itself is
# not reassigned here.
df.withColumn("Year", year("Date")).show(n=5, truncate=False)

In [None]:
# Keep the DataFrame with the derived "Year" column for the cells below.
year_of_date = year("Date")
newdf = df.withColumn("Year", year_of_date)

In [None]:
# Preview the first five rows of newdf without shortening long cell values.
newdf.show(n=5, truncate=False)

In [None]:
import pyspark.sql.functions as F

In [None]:
# Two columns are shown:
#   current_date      -> YYYY-mm-dd
#   current_timestamp -> YYYY-mm-dd HH:MM:SS.ssssss
# The expressions are selected from newdf, so they are evaluated per row of
# newdf rather than once.
newdf.select(
    F.current_date().alias("current_date"),
    F.current_timestamp().alias("current_timestamp"),
).show(truncate=False)

In [None]:
# Reduce the "High" column to its maximum (aliased "high") and its minimum
# (aliased "low"), then collect the result into a local Python object.
max_high = newdf.select(
    F.max("High").alias("high"),
    F.min("High").alias("low"),
).collect()

In [None]:
# Inspect the Python type of the collected result.
type(max_high)

In [None]:
# First (index 0) element of the collected result.
max_high[0]

In [None]:
# Look up the two aliased aggregates ("high" and "low") on the first element;
# the cell evaluates to a (high, low) tuple.
max_high[0]["high"], max_high[0]["low"]

In [None]:
# List each distinct year once, in ascending order. The column was created as
# "Year", so it is referenced with that exact spelling: the lowercase "year"
# only resolves while Spark's column resolution is case-insensitive.
newdf.select("Year").distinct().sort("Year").show()

In [None]:
# Column names of newdf, kept in a plain Python variable for slicing below.
columns = newdf.columns

In [None]:
# Keep only the first four column names.
needed_columns = columns[0:4]

In [None]:
# Display the selected column names.
needed_columns

In [None]:
# Show only the columns named in needed_columns; select() also accepts the
# list directly, so no unpacking is needed.
newdf.select(needed_columns).show()

In [None]:
# Select every column of newdf by name, passing the names as separate
# arguments, and show the result.
newdf.select(*newdf.columns).show()

In [None]:
# One-row, one-column DataFrame holding a date written as a string in "dt".
df1 = spark.createDataFrame(
    data=[('2022-08-16',)],
    schema=['dt'],
)

In [None]:
# Print the contents of df1.
df1.show()

In [None]:
# Re-render the "dt" value using the pattern MM/dd/yyyy, expose it under the
# name "date", and collect the result locally.
df1.select(
    date_format(df1['dt'], 'MM/dd/yyyy').alias('date')
).collect()

In [None]:
# Per-year maxima: aggregate with max() per "Year" group, then keep only the
# year and the maximum of "High".
newdf.groupBy("Year").max().select("Year", "max(High)").show(truncate=False)

In [None]:
# Per-year means: aggregate with mean() per "Year" group, then keep the
# averaged year and the average of "Close".
newdf.groupBy("Year").mean().select('avg(Year)', 'avg(Close)').show()

In [None]:
# Same per-year means as displayed above, kept as a DataFrame for further
# shaping instead of being printed.
yearly_means = newdf.groupBy("Year").mean()
result = yearly_means.select('avg(Year)', 'avg(Close)')

In [None]:
# Give the averaged-year column the plain name "Year".
result = result.withColumnRenamed(existing="avg(Year)", new="Year")

In [None]:
# Format the average close to two decimal places under the name "Mean Close".
# show() only prints and returns None, so it is called separately; chaining it
# into the assignment would overwrite `result` with None and lose the
# DataFrame.
result = result.select('Year', format_number('avg(Close)', 2).alias("Mean Close"))
result.show()

In [None]:
# Stop the Spark session created at the top of the notebook.
spark.stop()