# Connect to Local Spark Cluster using Spark Context (running via Docker)

In [7]:
# Set JAVA_HOME for this kernel. The path is machine-specific (a JDK 8 build,
# judging by the directory name) and must be adjusted on any other machine.
%set_env JAVA_HOME=/Users/akshayiyer/Library/Java/JavaVirtualMachines/jdk8u222-b10/Contents/Home

env: JAVA_HOME=/Users/akshayiyer/Library/Java/JavaVirtualMachines/jdk8u222-b10/Contents/Home


In [None]:
%env

In [None]:
# https://elmiko.github.io/2018/08/05/attaching-notebooks-with-radanalytics.html
import pyspark

# Target the standalone master on localhost; use setMaster('local') instead to
# run Spark inside this process without a cluster.
conf = pyspark.SparkConf().setMaster('spark://127.0.0.1:7077')

# getOrCreate() returns the context that is already running, if any, so this
# cell can be re-executed without raising
# "Cannot run multiple SparkContexts at once". `conf` only takes effect when a
# new context is actually created.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

In [None]:
# Evaluate the context as the cell's last expression so the notebook displays it.
sc

## Program to calculate Pi on Spark Cluster

In [None]:
%%time
import random
# Number of random points to draw: one hundred million.
num_samples = 10 ** 8

def inside(p):
    """Draw one random point in the unit square and test it against the unit circle.

    Args:
        p: The RDD element handed in by ``filter``; it is not used, each call
            simply draws a fresh point.

    Returns:
        True when the point (x, y) satisfies x^2 + y^2 < 1, otherwise False.
    """
    x = random.random()
    y = random.random()
    return x * x + y * y < 1

# Monte Carlo estimate: the share of points that land inside the quarter
# circle approximates pi / 4.
count = (
    sc.parallelize(range(num_samples))
    .filter(inside)
    .count()
)
pi = count * 4 / num_samples

print(pi)

# Connect to Local Spark Cluster using Spark Session (running via Docker)

In [1]:
# Set JAVA_HOME for this kernel. The path is machine-specific (a JDK 8 build,
# judging by the directory name) and must be adjusted on any other machine.
%set_env JAVA_HOME=/Users/akshayiyer/Library/Java/JavaVirtualMachines/jdk8u222-b10/Contents/Home

env: JAVA_HOME=/Users/akshayiyer/Library/Java/JavaVirtualMachines/jdk8u222-b10/Contents/Home


In [2]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format

In [3]:
# Load the AWS credentials from a config file kept outside the notebook.
config = configparser.ConfigParser()

# read_file() on an explicitly opened file fails immediately when the file is
# missing. ConfigParser.read() would silently skip it, and the problem would
# only surface below as a confusing NoSectionError.
with open('/Users/akshayiyer/Dev/GitHub/udacity-dend/udacity-dend-capstone-etl/.aws/access_keys.cfg') as config_file:
    config.read_file(config_file)

# Export the keys under the standard AWS environment variable names
# (presumably for the s3a:// read later in the notebook - TODO confirm).
os.environ['AWS_ACCESS_KEY_ID'] = config.get('AWS', 'KEY')
os.environ['AWS_SECRET_ACCESS_KEY'] = config.get('AWS', 'SECRET')

In [4]:
def create_spark_session(master='spark://127.0.0.1:7077'):
    """Build the SparkSession for this notebook, or return the active one.

    Args:
        master: Spark master URL to connect to. Defaults to the standalone
            master at spark://127.0.0.1:7077 that used to be hard-coded, so
            existing zero-argument calls behave exactly as before.

    Returns:
        The SparkSession produced by ``getOrCreate()``.
    """
    return (
        SparkSession.builder
        .master(master)
        # Request the hadoop-aws package (presumably needed for the s3a://
        # read later in the notebook - TODO confirm the version matches the
        # cluster's Hadoop build).
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
        .getOrCreate()
    )

In [5]:
# Create (or reuse) the session and keep it in `spark` for the cells below.
spark = create_spark_session()

## Read JSON file into Spark dataframe 

In [None]:
# A single song JSON file, addressed through the s3a:// scheme.
song_data_file_path = 's3a://udacity-dend/song_data/A/B/C/TRABCEI128F424C983.json'

# Read the file into a DataFrame using Spark's JSON reader.
song_df = spark.read.json(song_data_file_path)

In [None]:
# Print the schema of the song DataFrame.
song_df.printSchema()

# Testing Delta Lake

In [9]:
# Set JAVA_HOME for this kernel. The path is machine-specific (a JDK 8 build,
# judging by the directory name) and must be adjusted on any other machine.
%set_env JAVA_HOME=/Users/akshayiyer/Library/Java/JavaVirtualMachines/jdk8u222-b10/Contents/Home

env: JAVA_HOME=/Users/akshayiyer/Library/Java/JavaVirtualMachines/jdk8u222-b10/Contents/Home


In [10]:
from pyspark.sql import SparkSession

In [11]:
def create_spark_session(master):
    """Build the "test-delta-lake" SparkSession, or return the active one.

    Args:
        master: Spark master URL passed to the session builder.

    Returns:
        The SparkSession produced by ``getOrCreate()``.
    """
    # Packages requested for the session: hadoop-aws and Delta Lake core.
    packages = "org.apache.hadoop:hadoop-aws:2.7.0,io.delta:delta-core_2.11:0.3.0"

    builder = SparkSession.builder.master(master)
    builder = builder.config("spark.jars.packages", packages)
    builder = builder.appName("test-delta-lake")
    return builder.getOrCreate()

In [12]:
# Connect to the master at spark://127.0.0.1:7077 and keep the session in `spark`.
spark = create_spark_session("spark://127.0.0.1:7077")

In [13]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

In [17]:
# Location of the Delta table used by the cells below (absolute, machine-specific path).
filepath = "/Users/akshayiyer/Dev/GitHub/udacity-dend-capstone-etl/data/delta-table"

In [15]:
# Single-column DataFrame holding the ids 0 through 4.
data = spark.range(0, 5)
# Write it to `filepath` in Delta format.
# NOTE(review): no save mode is given, so re-running this cell once the table
# exists presumably fails - confirm.
data.write.format("delta").save(filepath)

In [18]:
# Load the table at `filepath` with no version option and print its rows.
df = spark.read.format("delta").load(filepath)
df.show()

+---+
| id|
+---+
|  0|
|  3|
|  1|
|  4|
|  2|
+---+



In [19]:
# New contents: the ids 5 through 9.
data = spark.range(5, 10)
# Overwrite the table at `filepath`; the versioned reads that follow show the
# earlier rows are still reachable as version 0.
data.write.format("delta").mode("overwrite").save(filepath)

In [20]:
# Time travel: read the table as of version 0 (the output shows the original ids 0-4).
df = spark.read.format("delta").option("versionAsOf", 0).load(filepath)
df.show()

+---+
| id|
+---+
|  0|
|  3|
|  1|
|  4|
|  2|
+---+



In [22]:
# Time travel: read the table as of version 1 (the output shows the overwritten ids 5-9).
df = spark.read.format("delta").option("versionAsOf", 1).load(filepath)
df.show()

+---+
| id|
+---+
|  7|
|  5|
|  6|
|  8|
|  9|
+---+

