# Spark DataFrame

## Dates and Timestamps

In [1]:
import findspark
# Point findspark at the local Spark installation. The path is a raw string so
# the backslashes in the Windows path are taken literally instead of being
# parsed as (invalid) escape sequences such as '\S' and '\s'.
findspark.init(r'C:\Spark\spark-3.0.1-bin-hadoop2.7')
from pyspark.sql import SparkSession

In [2]:
# Obtain the Spark session for this notebook, registered under the app name 'dates'.
spark = (
    SparkSession.builder
    .appName('dates')
    .getOrCreate()
)

In [3]:
# Load the stock CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('appl_stock.csv')
)

In [4]:
# Preview the first five rows of the Date and Open columns.
df.select('Date', 'Open').show(5)

+----------+----------+
|      Date|      Open|
+----------+----------+
|2010-01-04|213.429998|
|2010-01-05|214.599998|
|2010-01-06|214.379993|
|2010-01-07|    211.75|
|2010-01-08|210.299994|
+----------+----------+
only showing top 5 rows



In [5]:
from pyspark.sql.functions import (dayofmonth, hour, 
                                   dayofyear, month, 
                                   year, weekofyear, 
                                   format_number, date_format)

In [6]:
# Extract the day-of-month component of each Date value (first five rows).
df.select(dayofmonth('Date')).show(5)

+----------------+
|dayofmonth(Date)|
+----------------+
|               4|
|               5|
|               6|
|               7|
|               8|
+----------------+
only showing top 5 rows



In [7]:
# Extract the month component of each Date value (first five rows).
df.select(month('Date')).show(5)

+-----------+
|month(Date)|
+-----------+
|          1|
|          1|
|          1|
|          1|
|          1|
+-----------+
only showing top 5 rows



In [8]:
# Append a 'Year' column derived from Date so rows can be grouped by year.
newdf = df.withColumn('Year', year('Date'))

In [9]:
# Average closing price per year. Only 'Close' is aggregated: a bare .mean()
# would average every numeric column just to have all but avg(Close) dropped.
# The resulting columns are 'Year' and 'avg(Close)'.
result = newdf.groupBy('Year').mean('Close')

In [10]:
# Give the aggregate column a human-readable name.
new_result = result.withColumnRenamed(
    existing='avg(Close)',
    new='Average Closing Price',
)

In [11]:
# Display each year's average closing price formatted to two decimal places.
new_result.select(
    'Year',
    format_number('Average Closing Price', 2).alias('Avg Close'),
).show()

+----+---------+
|Year|Avg Close|
+----+---------+
|2015|   120.04|
|2013|   472.63|
|2014|   295.40|
|2012|   576.05|
|2016|   104.60|
|2010|   259.84|
|2011|   364.00|
+----+---------+

