In [1]:
from pyspark.sql import SparkSession
import getpass
# Resolve the OS login name so the Hive warehouse location follows whoever
# runs the notebook instead of being pinned to one hard-coded account.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled SparkSession on YARN.
#   - 'spark.ui.port' = '0' is passed so the UI port is not fixed to one value.
#   - The warehouse directory lives under the current user's HDFS home.
spark = (
    SparkSession.builder
    .appName('repartition-coalesce')
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [2]:
# Echo the session object so the notebook renders it and confirms the session exists.
spark

In [3]:
### Order Dataset ####
# Load the orders CSV as an RDD of raw text lines (one string per record);
# fields are parsed later with str.split(',').
order_base = spark.sparkContext.textFile('/public/trendytech/orders/orders_1gb.csv')

In [4]:
# Peek at the first record to check the layout; the recorded sample has four
# comma-separated fields, with the status in the last one.
order_base.take(1)

['1,2013-07-25 00:00:00.0,11599,CLOSED']

In [5]:
### Customer Dataset ###
# Load the customers data as an RDD of raw text lines; the trailing '*' glob
# matches every file under the customers directory.
customer_base = spark.sparkContext.textFile('/public/trendytech/retail_db/customers/*')

In [6]:
# Peek at the first customer record to check the comma-separated layout.
customer_base.take(1)

['1,Richard,Hernandez,XXXXXXXXX,XXXXXXXXX,6303 Heather Plaza,Brownsville,TX,78521']

In [7]:
# Baseline: number of partitions of the freshly loaded orders RDD (9 in the recorded run).
order_base.getNumPartitions()

9

In [8]:
# Ask for more partitions than the baseline (9 -> 15) using repartition.
repartitioned_orders =  order_base.repartition(15)

In [9]:
# Confirm the increase took effect (expected: 15).
repartitioned_orders.getNumPartitions()

15

In [10]:
# repartition can also shrink the partition count (9 -> 4 here).
repartitioned_orders_new = order_base.repartition(4)

In [11]:
# Confirm the decrease took effect (expected: 4).
repartitioned_orders_new.getNumPartitions()

4

In [12]:
# Try to *increase* partitions with coalesce (30 > the current 9). With the
# default arguments this has no effect: the recorded count stays at 9.
new_rdd = order_base.coalesce(30)

In [13]:
# Check the result of coalesce(30): still the original partition count.
new_rdd.getNumPartitions()

9

In [14]:
# coalesce does reduce the partition count when asked for fewer (9 -> 5).
new_rdd1 = order_base.coalesce(5)

In [15]:
# Confirm the reduction took effect (expected: 5).
new_rdd1.getNumPartitions()

5

In [24]:
### Caching ####
# Drop every order whose status (4th CSV field) is PENDING_PAYMENT.
orders_filtered = order_base.filter(
    lambda line: line.split(',')[3] != 'PENDING_PAYMENT'
)

In [25]:
# Count orders per value of the 3rd CSV field: emit (field, 1) for each line,
# then sum the ones for each key.
orders_reduced_1 = (
    orders_filtered
    .map(lambda line: (line.split(',')[2], 1))
    .reduceByKey(lambda left, right: left + right)
)

In [26]:
# Keep only the (key, count) pairs whose key, read as an integer, is at most 500.
result_1 = orders_reduced_1.filter(lambda pair: int(pair[0]) <= 500)

In [27]:
# First action on result_1: runs the filter -> map -> reduceByKey -> filter
# pipeline and returns every surviving (key, count) pair to the driver.
result_1.collect()

[('448', 1500),
 ('295', 750),
 ('309', 1125),
 ('275', 1500),
 ('326', 2625),
 ('100', 2250),
 ('228', 1875),
 ('212', 1500),
 ('99', 2250),
 ('54', 1875),
 ('124', 1500),
 ('40', 1875),
 ('260', 1500),
 ('476', 375),
 ('202', 2250),
 ('294', 1125),
 ('441', 1125),
 ('2', 1125),
 ('285', 2625),
 ('289', 2250),
 ('486', 2625),
 ('446', 1500),
 ('225', 1875),
 ('33', 750),
 ('22', 1875),
 ('118', 1500),
 ('134', 1500),
 ('424', 2625),
 ('247', 2625),
 ('293', 750),
 ('184', 1875),
 ('199', 1500),
 ('328', 2250),
 ('352', 2250),
 ('178', 375),
 ('97', 1875),
 ('348', 1500),
 ('256', 2625),
 ('389', 1125),
 ('452', 1500),
 ('281', 1125),
 ('53', 1125),
 ('227', 1500),
 ('218', 1125),
 ('498', 1500),
 ('31', 1125),
 ('315', 1125),
 ('291', 1500),
 ('238', 2250),
 ('43', 750),
 ('358', 1125),
 ('151', 1500),
 ('171', 750),
 ('91', 1875),
 ('30', 375),
 ('106', 750),
 ('173', 3000),
 ('93', 1875),
 ('332', 2250),
 ('327', 1875),
 ('208', 1500),
 ('464', 1875),
 ('229', 2250),
 ('321', 3000),

In [29]:
# Same action again, still before cache() is called. Note that the order of
# the returned pairs differs from the previous run in the recorded output.
result_1.collect()

[('289', 2250),
 ('486', 2625),
 ('446', 1500),
 ('225', 1875),
 ('54', 1875),
 ('33', 750),
 ('22', 1875),
 ('118', 1500),
 ('134', 1500),
 ('424', 2625),
 ('247', 2625),
 ('202', 2250),
 ('441', 1125),
 ('293', 750),
 ('326', 2625),
 ('294', 1125),
 ('184', 1875),
 ('199', 1500),
 ('99', 2250),
 ('100', 2250),
 ('328', 2250),
 ('352', 2250),
 ('178', 375),
 ('97', 1875),
 ('348', 1500),
 ('256', 2625),
 ('389', 1125),
 ('452', 1500),
 ('281', 1125),
 ('53', 1125),
 ('124', 1500),
 ('275', 1500),
 ('227', 1500),
 ('285', 2625),
 ('218', 1125),
 ('498', 1500),
 ('31', 1125),
 ('315', 1125),
 ('228', 1875),
 ('40', 1875),
 ('448', 1500),
 ('291', 1500),
 ('238', 2250),
 ('212', 1500),
 ('43', 750),
 ('358', 1125),
 ('151', 1500),
 ('171', 750),
 ('91', 1875),
 ('2', 1125),
 ('309', 1125),
 ('30', 375),
 ('260', 1500),
 ('106', 750),
 ('295', 750),
 ('476', 375),
 ('261', 750),
 ('257', 2625),
 ('329', 2250),
 ('231', 2625),
 ('465', 2625),
 ('435', 1875),
 ('84', 750),
 ('359', 1500),
 

In [30]:
# Mark result_1 for caching. cache() is lazy: it returns the RDD itself and
# stores nothing until the next action computes the data.
result_1.cache()

PythonRDD[29] at collect at <ipython-input-27-1c2943c009f8>:1

In [31]:
# First action after cache(): this run still computes the result and is the
# one that populates the cache; only later actions on result_1 can reuse it.
result_1.collect()

[('348', 1500),
 ('247', 2625),
 ('91', 1875),
 ('184', 1875),
 ('424', 2625),
 ('446', 1500),
 ('151', 1500),
 ('256', 2625),
 ('199', 1500),
 ('389', 1125),
 ('352', 2250),
 ('100', 2250),
 ('291', 1500),
 ('227', 1500),
 ('328', 2250),
 ('238', 2250),
 ('486', 2625),
 ('358', 1125),
 ('289', 2250),
 ('2', 1125),
 ('260', 1500),
 ('498', 1500),
 ('202', 2250),
 ('97', 1875),
 ('40', 1875),
 ('285', 2625),
 ('124', 1500),
 ('326', 2625),
 ('452', 1500),
 ('43', 750),
 ('134', 1500),
 ('448', 1500),
 ('295', 750),
 ('309', 1125),
 ('275', 1500),
 ('228', 1875),
 ('212', 1500),
 ('99', 2250),
 ('54', 1875),
 ('476', 375),
 ('294', 1125),
 ('441', 1125),
 ('225', 1875),
 ('33', 750),
 ('22', 1875),
 ('118', 1500),
 ('293', 750),
 ('178', 375),
 ('281', 1125),
 ('53', 1125),
 ('218', 1125),
 ('31', 1125),
 ('315', 1125),
 ('171', 750),
 ('30', 375),
 ('106', 750),
 ('173', 3000),
 ('457', 2625),
 ('460', 1875),
 ('444', 1875),
 ('48', 3000),
 ('464', 1875),
 ('478', 2250),
 ('109', 1875),

In [32]:
# Stop the Spark session now that the demo is finished.
spark.stop()