In [1]:
from pyspark.sql import SparkSession
import getpass
# Resolve the login name of whoever runs the notebook so the Hive warehouse
# directory follows that account instead of one hard-coded user.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled SparkSession that submits to YARN.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")  # port 0: let Spark bind any free UI port
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)

In [2]:
# Column names and types for the orders CSV, assembled into the
# comma-separated DDL string that is later handed to `.schema(...)`.
order_schema = ", ".join(
    [
        "order_id long",
        "order_date string",
        "customer_id long",
        "order_status string",
    ]
)

In [3]:
# Read the orders CSV with the explicit schema declared in `order_schema`,
# so column types come from that declaration rather than from inference.
orders_df = (
    spark.read
    .format("csv")
    .schema(order_schema)
    .load("/public/trendytech/orders/orders_1gb.csv")
)

In [25]:
# Preview the orders data using show()'s defaults spelled out explicitly:
# the first 20 rows, with long cell values truncated.
orders_df.show(n=20, truncate=True)

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
|       6|2013-07-25 00:00:...|       7130|       COMPLETE|
|       7|2013-07-25 00:00:...|       4530|       COMPLETE|
|       8|2013-07-25 00:00:...|       2911|     PROCESSING|
|       9|2013-07-25 00:00:...|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|       1837|         CLOSED|
|      13|2013-07-25 00:00:...|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:...|       98

In [26]:
# Number of partitions in the RDD backing orders_df
# (the recorded run of this cell printed 9).
orders_df.rdd.getNumPartitions()

9

In [17]:
# Count orders per order_status and save the result as CSV,
# replacing whatever a previous run left at the target path.
(
    orders_df
    .groupby("order_status")
    .count()
    .write
    .format("csv")
    .mode("overwrite")
    .save("/user/itv015970/output101")
)

In [4]:
# Column names and types for the customers CSV, assembled into the
# comma-separated DDL string that is later handed to `.schema(...)`.
customers_schema = ", ".join(
    [
        "customerid long",
        "customer_fname string",
        "customer_lname string",
        "username string",
        "password string",
        "address string",
        "city string",
        "state string",
        "pincode long",
    ]
)

In [5]:
# Read the customers CSV data with the explicit schema declared in
# `customers_schema` instead of relying on type inference.
customers_df = (
    spark.read
    .format("csv")
    .schema(customers_schema)
    .load("/public/trendytech/retail_db/customers")
)

In [6]:
# Preview the customers data using show()'s defaults spelled out explicitly:
# the first 20 rows, with long cell values truncated.
customers_df.show(n=20, truncate=True)

+----------+--------------+--------------+---------+---------+--------------------+-------------+-----+-------+
|customerid|customer_fname|customer_lname| username| password|             address|         city|state|pincode|
+----------+--------------+--------------+---------+---------+--------------------+-------------+-----+-------+
|         1|       Richard|     Hernandez|XXXXXXXXX|XXXXXXXXX|  6303 Heather Plaza|  Brownsville|   TX|  78521|
|         2|          Mary|       Barrett|XXXXXXXXX|XXXXXXXXX|9526 Noble Embers...|    Littleton|   CO|  80126|
|         3|           Ann|         Smith|XXXXXXXXX|XXXXXXXXX|3422 Blue Pioneer...|       Caguas|   PR|    725|
|         4|          Mary|         Jones|XXXXXXXXX|XXXXXXXXX|  8324 Little Common|   San Marcos|   CA|  92069|
|         5|        Robert|        Hudson|XXXXXXXXX|XXXXXXXXX|10 Crystal River ...|       Caguas|   PR|    725|
|         6|          Mary|         Smith|XXXXXXXXX|XXXXXXXXX|3151 Sleepy Quail...|      Passaic|   NJ| 

In [30]:
# Number of partitions in the RDD backing customers_df
# (the recorded run of this cell printed 1).
customers_df.rdd.getNumPartitions()

1

In [37]:
# Look up the size limit under which Spark automatically broadcasts one side of a join.
spark.conf.get("spark.sql.autoBroadcastJoinThreshold")

# Broadcast join is enabled here: when either table being joined is smaller than
# the threshold (10485760 bytes, i.e. 10 MB, in this run), Spark uses a broadcast join.

'10485760b'

In [38]:
# Convert the broadcast threshold from bytes to megabytes (1 MB = 1024**2 bytes).
10485760 / 1024 ** 2

10.0

In [39]:
# Switch automatic broadcast joins off for this session: a threshold of -1
# means no table is ever small enough to be broadcast.
spark.conf.set(
    "spark.sql.autoBroadcastJoinThreshold",
    "-1",
)

In [40]:
# Inner-join orders to customers on the customer key and send the result to
# the "noop" sink, so the full join executes without any output being stored.
(
    orders_df
    .join(customers_df, orders_df.customer_id == customers_df.customerid, "inner")
    .write
    .format("noop")
    .mode("overwrite")
    .save()
)