In [None]:
# Open an SSH session to the EC2 host as user `ubuntu`, authenticating with the
# private key file at ~/Downloads/practive.pem.
# NOTE(review): this starts an interactive remote shell, so it is presumably
# meant to be run from a local terminal rather than executed as a cell — confirm.
! ssh -i ~/Downloads/practive.pem ubuntu@ec2-54-67-13-242.us-west-1.compute.amazonaws.com


#  Spark Setup ######################


In [None]:
# Install the Java runtime, pip and PySpark needed to run Spark on this machine.

# Refresh the package index BEFORE installing anything so apt resolves current
# package versions; `-y` answers the confirmation prompt, which a notebook cell
# has no way to answer interactively.
! sudo apt-get update
! sudo apt-get upgrade -y
! sudo apt install -y openjdk-11-jre-headless python3-pip

# Each `!` line runs in its own throw-away subshell, so `! export ...` is lost
# as soon as that line finishes. `%env` sets the variable on the kernel process
# itself, where the JVM launched by PySpark can inherit it.
%env JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64

# `%pip` installs into the environment of the running kernel. The version is
# pinned to match the 3.5.0 Kafka connector packages requested in the Kafka
# verification cell of this notebook.
# NOTE(review): the Snowflake connector requested in the Spark verification
# cell is the `spark_3.4` build — confirm which Spark version is intended.
%pip install pyspark==3.5.0


## Spark Verification

In [None]:
# Smoke test for the Spark install: start a SparkSession, build a one-row
# DataFrame and run a Python UDF over it.
# (The original `! python3` line was dropped: it starts an interactive
# interpreter that a notebook cell cannot feed input to, and the code below
# already runs inside the notebook kernel.)
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.functions import collect_list
from pyspark.sql.functions import explode, create_map
from pyspark.sql.functions import size
from pyspark.sql.types import IntegerType, MapType
from pyspark.sql.types import StringType

# `--packages` takes ONE comma-separated list of Maven coordinates, followed by
# the `pyspark-shell` resource. Must be set before the first SparkSession is
# created, because that is when the arguments are read.
SNOWFLAKE_PACKAGES = [
    'net.snowflake:spark-snowflake_2.12:2.12.0-spark_3.4',
    'net.snowflake:snowflake-jdbc:3.14.0',
]
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    f"--packages {','.join(SNOWFLAKE_PACKAGES)} pyspark-shell"
)

spark = SparkSession.builder.appName("example").getOrCreate()

# With no schema given, toDF() names the columns _1, _2, _3.
simple_data = spark.sparkContext.parallelize([[1, "Alice", 50]]).toDF()
# Trigger an action so the job really executes on the cluster.
simple_data.count()


@udf(StringType())
def test_udf(s):
    """Return `s` with "_test_udf" appended; null input yields null.

    The None guard keeps a null cell from raising TypeError inside the
    Python worker.
    """
    if s is None:
        return None
    return s + "_test_udf"


simple_data.withColumn("test", test_udf(col("_2"))).show()


# Kafka Setup ######################


In [None]:
# Download Kafka 3.5.0, start ZooKeeper and a single broker, and install the
# Python client used by the verification cell.

# `-nc` skips the download when the archive is already present, so re-running
# the cell is cheap; extraction is quiet to avoid flooding the cell output.
! wget -nc https://archive.apache.org/dist/kafka/3.5.0/kafka_2.12-3.5.0.tgz
! tar -zxf kafka_2.12-3.5.0.tgz

# Every `!` line runs in its own subshell, so a standalone `! cd` has no effect
# on the following lines; the directory change is chained onto each command.
# The notebook kernel also rejects commands ending in `&`, so the scripts'
# own `-daemon` flag is used to run the servers in the background.
! cd kafka_2.12-3.5.0 && ./bin/zookeeper-server-start.sh -daemon config/zookeeper.properties
# Give ZooKeeper a moment to come up before the broker tries to connect.
! sleep 5
! cd kafka_2.12-3.5.0 && ./bin/kafka-server-start.sh -daemon config/server.properties

# The verification cell does `from kafka import KafkaProducer`, which is the
# module provided by kafka-python, so the package has to be installed here.
# NOTE(review): the original line was `pip uninstall kafka-python` — confirm
# that was a typo and not a deliberate switch to another client package.
%pip install kafka-python

## Kafka Verification

In [None]:

# Round-trip check for Kafka: publish two messages with kafka-python, then read
# the topic back as a batch DataFrame through Spark's Kafka source.
import datetime
import os

from kafka import KafkaProducer
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType

##### Producer ########

producer = KafkaProducer(bootstrap_servers='localhost:9092')


def publish(topic, message):
    """Log and send `message` (a str, encoded as UTF-8) to Kafka topic `topic`.

    `send` only queues the record; call `producer.flush()` to wait for
    delivery.
    """
    print(f"{datetime.datetime.now()} publishing to {topic = }, {message = }")
    producer.send(topic, bytes(message, encoding='utf-8'))


publish("test", "test_message1")
publish("test", "test_message2")
# Block until the queued records are actually delivered; otherwise the batch
# read below can run before the messages reach the broker.
producer.flush()

##### Consumer ########

# Single source of truth for the connector coordinates used below.
KAFKA_PACKAGES = [
    'org.apache.spark:spark-streaming-kafka-0-10_2.12:3.5.0',
    'org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0',
]
KAFKA_PACKAGES_CSV = ','.join(KAFKA_PACKAGES)

os.environ['PYSPARK_SUBMIT_ARGS'] = f"--packages {KAFKA_PACKAGES_CSV} pyspark-shell"

# NOTE(review): getOrCreate() returns an already-running session if one exists,
# and connector packages are presumably only resolved when the JVM is first
# launched. If a session was started earlier in this kernel without these
# packages, restart the kernel before running this cell — confirm.
spark = (
    SparkSession.builder
    .appName("Read Kafka")
    .config('spark.jars.packages', KAFKA_PACKAGES_CSV)
    .getOrCreate()
)

# Batch (non-streaming) read of the whole topic; Kafka delivers `value` as
# binary, so it is cast to a string for display.
df = (
    spark.read.format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("startingOffsets", "earliest")
    .option("subscribe", "test")
    .load()
    .withColumn("value", col("value").cast("string"))
)

df.show()

# Airflow

In [None]:
# Install Airflow 2.5.0, move its metadata DB from SQLite to a local PostgreSQL,
# switch to the LocalExecutor, create an admin user and start the services.
import getpass
import os
import sys

# `-y` answers apt's confirmation prompt, which a notebook cell cannot do.
! sudo apt install -y sqlite3

! sudo apt-get install -y libpq-dev

# The constraints file must match the Python version Airflow is installed for;
# it is derived from the running interpreter instead of being hard-coded to 3.7.
# NOTE(review): assumes `sudo pip` targets the same Python as this kernel — confirm.
AIRFLOW_PYTHON_VERSION = f"{sys.version_info.major}.{sys.version_info.minor}"
! sudo pip install "apache-airflow[postgres]==2.5.0" --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.5.0/constraints-{AIRFLOW_PYTHON_VERSION}.txt"

# First init runs against the default configuration; the sed commands below
# then edit the airflow.cfg expected under ~/airflow.
! airflow db init

! sudo apt-get install -y postgresql postgresql-contrib

# Each `!` line is a separate subshell, so an interactive `sudo -i` / `psql`
# session cannot span several lines; every SQL statement is passed to psql
# directly, running as the `postgres` OS user.
! sudo -u postgres psql -c "CREATE DATABASE airflow;"
! sudo -u postgres psql -c "CREATE USER airflow WITH PASSWORD 'airflow';"
! sudo -u postgres psql -c "GRANT ALL PRIVILEGES ON DATABASE airflow TO airflow;"

# A standalone `! cd airflow` would not persist, so the config file is
# addressed by its full path.
! sed -i 's#sqlite:////home/ubuntu/airflow/airflow.db#postgresql+psycopg2://airflow:airflow@localhost/airflow#g' ~/airflow/airflow.cfg

! sed -i 's#SequentialExecutor#LocalExecutor#g' ~/airflow/airflow.cfg

# Second init creates the schema in the PostgreSQL database configured above.
! airflow db init

# Without a password option the CLI prompts interactively, which would hang the
# cell. The password is read without echo and handed over through the
# environment; `$$` makes IPython pass a literal `$` to the shell.
os.environ["AIRFLOW_ADMIN_PASSWORD"] = getpass.getpass("Password for Airflow admin user 'airflow': ")
! airflow users create -u airflow -f airflow -l airflow -r Admin -e airflow@gmail.com -p "$$AIRFLOW_ADMIN_PASSWORD"

# The notebook kernel rejects commands ending in `&`; `-D` daemonizes instead.
! airflow webserver -D

# Runs in the foreground: this cell keeps executing until it is interrupted.
! airflow scheduler