# Computation

In [49]:
#import libraries cell

# Standard library
import json
import os
import time

# Third-party
import findspark
import numpy as np
import pandas as pd
import pyspark
import pyspark.sql.functions as f
from kafka import KafkaProducer
from kafka.admin import KafkaAdminClient, NewTopic
from numpy import arange
from numpy import linspace
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
# NOTE: importing max/min here shadows the Python builtins of the same name for the rest of the notebook.
from pyspark.sql.functions import from_json, col, max, min
from pyspark.sql.types import StructField, StructType, StringType, DoubleType, IntegerType
from pyspark.streaming import StreamingContext

## Spark setup

In [50]:
# List the scripts available in the sbin directory of the Spark installation ($SPARK_HOME).
!ls $SPARK_HOME/sbin/

decommission-slave.sh		start-worker.sh
decommission-worker.sh		start-workers.sh
slaves.sh			stop-all.sh
spark-config.sh			stop-history-server.sh
spark-daemon.sh			stop-master.sh
spark-daemons.sh		stop-mesos-dispatcher.sh
start-all.sh			stop-mesos-shuffle-service.sh
start-history-server.sh		stop-slave.sh
start-master.sh			stop-slaves.sh
start-mesos-dispatcher.sh	stop-thriftserver.sh
start-mesos-shuffle-service.sh	stop-worker.sh
start-slave.sh			stop-workers.sh
start-slaves.sh			workers.sh
start-thriftserver.sh


In [51]:
# Stop the Spark master and worker daemons via the stop-all.sh helper script.
# NOTE(review): --host/--port/--webui-port look like master start-up options;
# confirm that stop-all.sh actually consumes them rather than ignoring them.
!$SPARK_HOME/sbin/stop-all.sh --host localhost --port 7077 --webui-port 8080

pd-slave1: stopping org.apache.spark.deploy.worker.Worker
pd-slave2: stopping org.apache.spark.deploy.worker.Worker
pd-master: stopping org.apache.spark.deploy.worker.Worker
stopping org.apache.spark.deploy.master.Master


In [52]:
# Start the Spark master and worker daemons via the start-all.sh helper script.
# NOTE(review): confirm that start-all.sh forwards --host/--port/--webui-port
# to the master; if it does not, these options have no effect here.
!$SPARK_HOME/sbin/start-all.sh --host localhost --port 7077 --webui-port 8080

starting org.apache.spark.deploy.master.Master, logging to /usr/local/spark/logs/spark-root-org.apache.spark.deploy.master.Master-1-mapd-b-gr05-1.out
pd-slave2: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark/logs/spark-root-org.apache.spark.deploy.worker.Worker-1-mapd-b-gr05-3.out
pd-slave1: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark/logs/spark-root-org.apache.spark.deploy.worker.Worker-1-mapd-b-gr05-4.out
pd-master: starting org.apache.spark.deploy.worker.Worker, logging to /usr/local/spark/logs/spark-root-org.apache.spark.deploy.worker.Worker-1-mapd-b-gr05-1.out


In [57]:
# Stop any Spark context/session left over from a previous run of this notebook
# so the session cell below starts from a clean state.
# On a fresh kernel neither `sc` nor `spark` exists yet (both are created in
# later cells), so they are looked up by name instead of referenced directly,
# which would raise NameError. The context is stopped first, then the session.
for _leftover_name in ("sc", "spark"):
    _leftover = globals().get(_leftover_name)
    if _leftover is not None:
        _leftover.stop()
del _leftover_name, _leftover

In [58]:
# Point findspark at the Spark installation so that pyspark can be located.
# $SPARK_HOME is preferred because the shell cells of this notebook already rely
# on it; the original install path is kept as the fallback when it is not set.
findspark.init(os.environ.get('SPARK_HOME', '/usr/local/spark'))

In [59]:
# Build (or reuse) the SparkSession: application name plus configuration options.
# No master URL or port is set in this cell.
spark = (
    SparkSession.builder
    .appName("Project_MAPDB_application")
    # Arrow options for pyspark, with the fallback switched off.
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.sql.execution.arrow.pyspark.fallback.enabled", "false")
    # Force deletion of temporary streaming checkpoint locations.
    .config("spark.sql.streaming.forceDeleteTempCheckpointLocation", "true")
    # Kafka connector package for Spark SQL.
    # NOTE(review): the coordinates must match the installed Spark/Scala version — confirm.
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2")
    .getOrCreate()
)

# Optional, currently disabled: use the default parallelism as the number of
# shuffle partitions, so that partitions are shuffled between workers for the join.
# spark.conf.set("spark.sql.shuffle.partitions", spark.sparkContext.defaultParallelism)  # 15 partitions

In [60]:
# Display the session object as the cell output.
spark

In [61]:
# Take the SparkContext that belongs to the session created above and display it.
# The pyspark imports that used to sit in this cell now live in the import cell
# at the top of the notebook.
sc = spark.sparkContext
sc

In [62]:
# Small in-memory sample: the integers 1 through 8.
data = list(range(1, 9))

# Distribute the list through the SparkContext to obtain an RDD.
dist_data = sc.parallelize(data)

In [63]:
# Count the elements of the distributed dataset.
dist_data.count()

                                                                                

8

In [64]:
# Distribute the same list again, this time explicitly requesting 8 slices, and count its elements.
sc.parallelize(data, numSlices=8).count()

8

In [65]:
# Check how many partitions result when 8 slices are requested explicitly.
sc.parallelize(data, numSlices=8).getNumPartitions()

8

In [46]:
# Partition count of dist_data, which was created without an explicit numSlices.
dist_data.getNumPartitions()

4

In [48]:
# Add one to every element, then sum the shifted values pairwise.
dist_data.map(lambda value: value + 1).reduce(lambda left, right: left + right)

44

## Kafka setup

In [6]:
# Broker address shared by the Kafka producer and the admin client.
# KafkaProducer, KafkaAdminClient and NewTopic are imported in the import cell
# at the top of the notebook.
KAFKA_BOOTSTRAP_SERVERS = 'localhost:9092'

# Producer connected to the broker above.
producer = KafkaProducer(bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS)


In [2]:
# Admin client on the same bootstrap servers; used below to delete, create and list topics.
kafka_admin = KafkaAdminClient(bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS)


In [3]:
# Reset the topics used by this notebook.
# Only topics the broker currently lists are deleted: a delete request that names
# an unknown topic comes back with an error, which would make this cell fail on a
# broker where the topics were never created (or were already removed).
topics_to_reset = ['stream', 'results']
existing_topics = set(kafka_admin.list_topics())
stale_topics = [topic for topic in topics_to_reset if topic in existing_topics]

# Last expression of the cell: the broker's response, or nothing when there was nothing to delete.
kafka_admin.delete_topics(stale_topics) if stale_topics else None


DeleteTopicsResponse_v3(throttle_time_ms=0, topic_error_codes=[(topic='stream', error_code=0), (topic='results', error_code=0)])

In [6]:
# Show the topics the broker currently reports.
kafka_admin.list_topics()

['stream2', 'stream', 'results']

In [5]:
# Definition of the 'results' topic: a single partition, replication factor 1.
results_topic = NewTopic(name='results',
                         num_partitions=1,
                         replication_factor=1)

# Create the topic only when the broker does not list it yet, so re-running the
# cell does not fail because the topic already exists.
# NOTE(review): topic deletion may complete asynchronously on the broker; if this
# cell runs right after the delete cell the topic may still be listed — confirm.
results_exists = 'results' in kafka_admin.list_topics()

# Last expression of the cell: the broker's response, or nothing when the topic was already there.
kafka_admin.create_topics(new_topics=[results_topic]) if not results_exists else None


CreateTopicsResponse_v3(throttle_time_ms=0, topic_errors=[(topic='results', error_code=0, error_message=None)])

In [7]:
# Show the topics the broker reports after the create step.
kafka_admin.list_topics()

['stream2', 'stream', 'results']