## Explore Stock Prices using Spark SQL

In [29]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
import pyspark.sql.functions as F
import pandas 

In [24]:
#initializing SparkContext

sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

### Load data into Spark

In [27]:
tesla_file = 'TSLA.csv'
tesla_rdd = sc.textFile(tesla_file)

amazon_file = 'AMZN.csv'
google_file = 'GOOG.csv'

### RDD

In [6]:
tesla_rdd.take(5)

['Date,Open,High,Low,Close,Adj Close,Volume',
 '2020-02-10,160.000000,163.998001,150.479996,154.255997,154.255997,123446000',
 '2020-02-11,153.757996,156.701996,151.600006,154.876007,154.876007,58487500',
 '2020-02-12,155.574005,157.949997,152.673996,153.457993,153.457993,60112500',
 '2020-02-13,148.367996,163.600006,147.000000,160.800003,160.800003,131446500']

In [7]:
csv_rdd = tesla_rdd.map(lambda row: row.split(","))

In [20]:
csv_rdd.take(2)

[['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'],
 ['2020-02-10',
  '160.000000',
  '163.998001',
  '150.479996',
  '154.255997',
  '154.255997',
  '123446000']]

### DataFrame

In [11]:
tesla_df = csv_rdd.toDF(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'])

In [15]:
tesla_df.show(5)

+----------+----------+----------+----------+----------+----------+---------+
|      Date|      Open|      High|       Low|     Close| Adj Close|   Volume|
+----------+----------+----------+----------+----------+----------+---------+
|      Date|      Open|      High|       Low|     Close| Adj Close|   Volume|
|2020-02-10|160.000000|163.998001|150.479996|154.255997|154.255997|123446000|
|2020-02-11|153.757996|156.701996|151.600006|154.876007|154.876007| 58487500|
|2020-02-12|155.574005|157.949997|152.673996|153.457993|153.457993| 60112500|
|2020-02-13|148.367996|163.600006|147.000000|160.800003|160.800003|131446500|
+----------+----------+----------+----------+----------+----------+---------+
only showing top 5 rows



In [14]:
amazon_df = sqlContext.read.load(amazon_file,
                                format='com.databricks.spark.csv',
                                header='true',
                                inferSchema='true')

In [16]:
amazon_df.show(5)

+----------+-----------+-----------+-----------+-----------+-----------+-------+
|      Date|       Open|       High|        Low|      Close|  Adj Close| Volume|
+----------+-----------+-----------+-----------+-----------+-----------+-------+
|2020-02-10| 2085.01001|2135.600098|2084.959961|2133.909912|2133.909912|5056200|
|2020-02-11|2150.899902|2185.949951|     2136.0|2150.800049|2150.800049|5746000|
|2020-02-12|2163.199951|    2180.25|2155.290039|     2160.0|     2160.0|3334300|
|2020-02-13| 2144.98999|2170.280029|     2142.0|2149.870117|2149.870117|3031800|
|2020-02-14|2155.679932|2159.040039|2125.889893|2134.870117|2134.870117|2606200|
+----------+-----------+-----------+-----------+-----------+-----------+-------+
only showing top 5 rows



In [21]:
amazon_df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Adj Close: double (nullable = true)
 |-- Volume: integer (nullable = true)



In [25]:
amazon_df.count()

253

In [26]:
amazon_df.toPandas().head(5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2020-02-10,2085.01001,2135.600098,2084.959961,2133.909912,2133.909912,5056200
1,2020-02-11,2150.899902,2185.949951,2136.0,2150.800049,2150.800049,5746000
2,2020-02-12,2163.199951,2180.25,2155.290039,2160.0,2160.0,3334300
3,2020-02-13,2144.98999,2170.280029,2142.0,2149.870117,2149.870117,3031800
4,2020-02-14,2155.679932,2159.040039,2125.889893,2134.870117,2134.870117,2606200


### Explore and Query Data

#### DataFrames operations

In [28]:
google_df = sqlContext.read.load(google_file,
                                format='com.databricks.spark.csv',
                                header='true',
                                inferSchema='true')

In [32]:
google_df.select(F.year('Date').alias('yr'), 'Close').groupby('yr').avg('Close').sort('yr').show()

+----+------------------+
|  yr|        avg(Close)|
+----+------------------+
|2020|1485.8534564933911|
|2021|1869.0388558461536|
+----+------------------+



In [34]:
amazon_df.select(F.year('Date').alias('year'),
                F.month('Date').alias('month'),
                'Low',)\
         .groupby('year', 'month').avg('Low').sort('year', 'month')\
         .show()

+----+-----+------------------+
|year|month|          avg(Low)|
+----+-----+------------------+
|2020|    2| 2053.253574928571|
|2020|    3|1825.1590964090908|
|2020|    4|2185.9347679523808|
|2020|    5|     2364.48549805|
|2020|    6| 2579.099553909091|
|2020|    7|2991.4899903636356|
|2020|    8|3207.5033250476195|
|2020|    9|3109.7995023809526|
|2020|   10| 3185.981367681817|
|2020|   11|3102.2170043999995|
|2020|   12|3166.3018133636365|
|2021|    1|3171.6500051052626|
|2021|    2|3298.1542968571425|
+----+-----+------------------+

