# DataFrames

## Dates and Timestamps

In [1]:
import os

import findspark

# findspark.init() points this kernel at a Spark installation so that the
# `pyspark` imports in the following cells can be resolved.
# Prefer the SPARK_HOME environment variable when it is set, so the notebook
# also runs on machines with a different install location; otherwise fall
# back to the path this notebook was originally written against.
findspark.init(os.environ.get('SPARK_HOME', '/home/ubuntu/spark-2.1.1-bin-hadoop2.7/'))

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Obtain a SparkSession for this notebook, registered under the app name 'time'.
spark = (
    SparkSession.builder
    .appName('time')
    .getOrCreate()
)

In [4]:
# Load the Apple stock prices. The first row holds the column names, and
# inferSchema asks Spark to detect column types instead of reading everything
# as strings.
df = spark.read.csv(
    '/home/ubuntu/DataScience/pySpark/Spark_DataFrames/appl_stock.csv',
    header=True,
    inferSchema=True
)

In [5]:
# Check which type was inferred for each column of the loaded CSV.
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [7]:
# Peek at the first record; head(1) returns a list containing a single Row.
df.head(1)

[Row(Date=datetime.datetime(2010, 1, 4, 0, 0), Open=213.429998, High=214.499996, Low=212.38000099999996, Close=214.009998, Volume=123432400, Adj Close=27.727039)]

In [9]:
# Display only the date and the opening price.
df.select('Date', 'Open').show()

+--------------------+------------------+
|                Date|              Open|
+--------------------+------------------+
|2010-01-04 00:00:...|        213.429998|
|2010-01-05 00:00:...|        214.599998|
|2010-01-06 00:00:...|        214.379993|
|2010-01-07 00:00:...|            211.75|
|2010-01-08 00:00:...|        210.299994|
|2010-01-11 00:00:...|212.79999700000002|
|2010-01-12 00:00:...|209.18999499999998|
|2010-01-13 00:00:...|        207.870005|
|2010-01-14 00:00:...|210.11000299999998|
|2010-01-15 00:00:...|210.92999500000002|
|2010-01-19 00:00:...|        208.330002|
|2010-01-20 00:00:...|        214.910006|
|2010-01-21 00:00:...|        212.079994|
|2010-01-22 00:00:...|206.78000600000001|
|2010-01-25 00:00:...|202.51000200000001|
|2010-01-26 00:00:...|205.95000100000001|
|2010-01-27 00:00:...|        206.849995|
|2010-01-28 00:00:...|        204.930004|
|2010-01-29 00:00:...|        201.079996|
|2010-02-01 00:00:...|192.36999699999998|
+--------------------+------------------+
only showing top 20 rows

### Extract Date information

In [13]:
from pyspark.sql.functions import (dayofmonth,
                                   hour,dayofyear,
                                   month,year,
                                  format_number, date_format)

In [14]:
# Day-of-month component of each trading date.
df.select(dayofmonth(df.Date)).show()

+----------------+
|dayofmonth(Date)|
+----------------+
|               4|
|               5|
|               6|
|               7|
|               8|
|              11|
|              12|
|              13|
|              14|
|              15|
|              19|
|              20|
|              21|
|              22|
|              25|
|              26|
|              27|
|              28|
|              29|
|               1|
+----------------+
only showing top 20 rows



In [15]:
# Hour component of each timestamp. Every Date value sits at midnight
# (00:00), so this column is 0 for the rows shown.
df.select(hour(df.Date)).show()

+----------+
|hour(Date)|
+----------+
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
+----------+
only showing top 20 rows



In [16]:
# Month number of each trading date.
df.select(month(df.Date)).show()

+-----------+
|month(Date)|
+-----------+
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          2|
+-----------+
only showing top 20 rows



In [20]:
# Attach the calendar year of each row as a new "Year" column so the data can
# be grouped by year below. The original frame `df` is left untouched; the
# extended frame is kept as `newdf`.
newdf = df.withColumn("Year", year(df.Date))

In [22]:
# Per-year average of every numeric column. Because mean() is called without
# arguments, the grouping key itself is averaged too, which is where the
# redundant avg(Year) column in the output comes from.
(
    newdf
    .groupBy("Year")
    .mean()
    .show()
)

+----+------------------+------------------+------------------+------------------+--------------------+------------------+---------+
|Year|         avg(Open)|         avg(High)|          avg(Low)|        avg(Close)|         avg(Volume)|    avg(Adj Close)|avg(Year)|
+----+------------------+------------------+------------------+------------------+--------------------+------------------+---------+
|2015|120.17575393253965|121.24452385714291| 118.8630954325397|120.03999980555547|  5.18378869047619E7|115.96740080555561|   2015.0|
|2013| 473.1281355634922| 477.6389272301587|468.24710264682557| 472.6348802857143|          1.016087E8| 62.61798788492063|   2013.0|
|2014| 295.1426195357143|297.56103184523823| 292.9949599801587| 295.4023416507935| 6.315273055555555E7| 87.63583323809523|   2014.0|
|2012|     576.652720788| 581.8254008040001| 569.9211606079999| 576.0497195640002|       1.319642044E8| 74.81383696800002|   2012.0|
|2016|104.50777772619044| 105.4271825436508|103.69027771825397|104.60

In [23]:
# Per-year averages, narrowed down to the average closing price.
(
    newdf
    .groupBy("Year")
    .mean()
    .select("Year", "avg(Close)")
    .show()
)

+----+------------------+
|Year|        avg(Close)|
+----+------------------+
|2015|120.03999980555547|
|2013| 472.6348802857143|
|2014| 295.4023416507935|
|2012| 576.0497195640002|
|2016|104.60400786904763|
|2010| 259.8424600000002|
|2011|364.00432532142867|
+----+------------------+



In [24]:
# Keep the (Year, avg(Close)) table around for renaming and formatting below.
result = (
    newdf
    .groupBy("Year")
    .mean()
    .select("Year", "avg(Close)")
)

In [26]:
# Replace the auto-generated aggregate name with a readable column title.
new = result.withColumnRenamed(
    "avg(Close)",
    "Average Closing Price"
)
new.show()

+----+---------------------+
|Year|Average Closing Price|
+----+---------------------+
|2015|   120.03999980555547|
|2013|    472.6348802857143|
|2014|    295.4023416507935|
|2012|    576.0497195640002|
|2016|   104.60400786904763|
|2010|    259.8424600000002|
|2011|   364.00432532142867|
+----+---------------------+



In [29]:
# Present the yearly average close with two decimal places under a short
# "Avg Close" heading.
new.select(
    'Year',
    format_number("Average Closing Price", 2).alias("Avg Close")
).show()

+----+---------+
|Year|Avg Close|
+----+---------+
|2015|   120.04|
|2013|   472.63|
|2014|   295.40|
|2012|   576.05|
|2016|   104.60|
|2010|   259.84|
|2011|   364.00|
+----+---------+

