In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import col
from pyspark.sql.functions import col,sum,avg,max

In [0]:
#Creating the schema for the Orders csv files
order_schema = StructType([
    StructField("O_ORDERKEY", IntegerType()),
    StructField("O_CUSTKEY", StringType()),
    StructField("O_ORDERSTATUS", StringType()),
    StructField("O_TOTALPRICE", DoubleType()),
    StructField("O_ORDERDATE", DateType()),
    StructField("O_ORDERPRIORITY", StringType()),
    StructField("O_CLERK", StringType()),
    StructField("O_SHIPPRIORITY", IntegerType()),
    StructField("O_COMMENT", StringType())
])


In [0]:
#Creating the schema for the Customer csv files
cust_schema = StructType([
    StructField("C_CUSTKEY", IntegerType()),
    StructField("C_NAME", StringType()),
    StructField("C_ADDRESS", StringType()),
    StructField("C_NATIONKEY", ShortType()),
    StructField("C_PHONE", StringType()),
    StructField("C_ACCTBAL", DoubleType()),
    StructField("C_MKTSEGMENT", StringType()),
    StructField("C_COMMENT", StringType())
])


In [0]:
#Creating the Orders dataframe by specifying the schema 
df_ord_sch= spark.read.format("csv").option("header",True).schema(order_schema).load("/mnt/Gen2/orders/csvFiles/")

display(df_ord_sch.count())

display(df_ord_sch.limit(10))

1500000

O_ORDERKEY,O_CUSTKEY,O_ORDERSTATUS,O_TOTALPRICE,O_ORDERDATE,O_ORDERPRIORITY,O_CLERK,O_SHIPPRIORITY,O_COMMENT
888455,47608,O,156630.33,1998-04-02,1-URGENT,Clerk#000000656,0,furiously special f
1499747,118027,F,364452.69,1995-01-22,2-HIGH,Clerk#000000947,0,furiously special f
1015810,96658,O,301267.87,1996-09-01,3-MEDIUM,Clerk#000000389,0,furiously special f
962149,120001,O,106121.36,1997-11-04,5-LOW,Clerk#000000727,0,furiously special f
851648,46747,F,104150.25,1994-06-11,4-NOT SPECIFIED,Clerk#000000001,0,furiously special f
1105671,139361,F,170629.76,1993-03-06,2-HIGH,Clerk#000000542,0,furiously special f
1144519,140470,O,174847.74,1996-07-29,5-LOW,Clerk#000000194,0,furiously special f
858245,59417,O,164479.31,1996-10-05,1-URGENT,Clerk#000000765,0,furiously special f
869286,68644,O,16571.72,1997-08-20,2-HIGH,Clerk#000000967,0,furiously special f
1245348,34657,O,198960.95,1997-11-01,3-MEDIUM,Clerk#000000706,0,furiously special f


In [0]:
#Creating the dataframe by specifying the schema using .schema option. We don't see any DAG getting created with schema is specified
df_cust_sch= spark.read.format("csv").option("header",True).schema(cust_schema).load("/mnt/Gen2/customer/csvFiles/")
display(df_cust_sch.count())

display(df_cust_sch.limit(10))

150000

C_CUSTKEY,C_NAME,C_ADDRESS,C_NATIONKEY,C_PHONE,C_ACCTBAL,C_MKTSEGMENT,C_COMMENT
35165,Customer#000035165,eNQSvDTld1 f7JmY,0,10-173-541-5438,4767.46,AUTOMOBILE,special excuses. furiously pending packages
30597,Customer#000030597,S9s1dDut8Q,0,10-607-243-5581,-639.62,FURNITURE,lithely ruthless packages alongside of the blithely final p
42279,Customer#000042279,ABcVdNnA3JFB7bK5,0,10-934-981-2863,2236.39,MACHINERY,the even deposits sleep blithely packages. quickly express packages should have to detec
42578,Customer#000042578,l6VNaE7iSZFtkSC5fSuLeaoWTJgx5,0,10-281-998-8028,6429.8,BUILDING,y alongside of the platelets. regular deposits sleep fluffily blithely silent pinto beans: re
37854,Customer#000037854,dL6LCTLpY9hjLTrZ7g,0,10-909-820-4270,9549.78,BUILDING,inder blithely deposits. instructions nag quickly regular packages. regular requests
40053,Customer#000040053,qh8Q6gaffF73cm73K2R,0,10-593-423-2533,209.4,MACHINERY,c pinto beans. special instructions cajole fluffy
44060,Customer#000044060,"XXYMZ4Jd4PY3WJZA5bok4u 7oknfVG,rNobaef",0,10-519-920-9801,1150.68,BUILDING,according to the furiously bold instructions. regular ideas after th
49988,Customer#000049988,"oYYSmhl,K3t AwKzm5FmeEwz5lw hLf7z9m",0,10-704-487-3360,3708.73,BUILDING,"t quickly. pending, special accounts cajole furiously quick pinto beans. fluffily"
43569,Customer#000043569,"ANmTNESWDI17e2pG7j7min2Jm,vHJ",0,10-911-273-3629,3239.41,MACHINERY,"of the slyly even deposits. unusual, even theodolites about the slyly silent accounts bo"
27443,Customer#000027443,xOgk0us699smqWP3US4ufY MhkfbwNdJvCv,0,10-492-101-8357,4304.73,MACHINERY,ly bold accounts at the carefully final ideas nag slyly final accounts. express requests are fluffily


In [0]:
# check for parameter value and the default value is 10 MB
spark.conf.get("spark.sql.autoBroadcastJoinThreshold")

Out[19]: '2194304'

In [0]:
#Setting to 2 MB
spark.conf.set("spark.sql.autoBroadcastJoinThreshold",2194304)

In [0]:
# Check the execution plan and you will find sort merge join being used 
df_ord_sch.join(df_cust_sch, df_ord_sch.O_CUSTKEY == df_cust_sch.C_CUSTKEY, "inner").count()

Out[21]: 1500000

In [0]:
df_ord_cust=df_ord_sch.join(df_cust_sch.hint("broadcast"), df_ord_sch.O_CUSTKEY == df_cust_sch.C_CUSTKEY, "inner")
display(df_ord_cust.limit(10))
display(df_ord_cust.count())

O_ORDERKEY,O_CUSTKEY,O_ORDERSTATUS,O_TOTALPRICE,O_ORDERDATE,O_ORDERPRIORITY,O_CLERK,O_SHIPPRIORITY,O_COMMENT,C_CUSTKEY,C_NAME,C_ADDRESS,C_NATIONKEY,C_PHONE,C_ACCTBAL,C_MKTSEGMENT,C_COMMENT
888455,47608,O,156630.33,1998-04-02,1-URGENT,Clerk#000000656,0,furiously special f,47608,Customer#000047608,BHPyeI9EXq7UKqGKKWmYfqPzh Rholwge,0,10-227-343-8646,1795.93,MACHINERY,"posits about the pending pinto beans sleep ironic, final theodolites. ideas are fur"
1499747,118027,F,364452.69,1995-01-22,2-HIGH,Clerk#000000947,0,furiously special f,118027,Customer#000118027,JEGV uSntJj3CBzlThpLvKsBZJ,0,10-418-159-2340,7906.39,BUILDING,lites are quickly. deposits sleep furiously. furiously unusual packages haggle. slyly stealthy platelets sh
1015810,96658,O,301267.87,1996-09-01,3-MEDIUM,Clerk#000000389,0,furiously special f,96658,Customer#000096658,"XQyJM,Z,,QAVN",0,10-660-499-4544,6027.13,FURNITURE,the fluffily final instructions. carefully ironic decoys nod pending requ
962149,120001,O,106121.36,1997-11-04,5-LOW,Clerk#000000727,0,furiously special f,120001,Customer#000120001,7qM24YAWXOutG56bS3o,0,10-512-286-6075,2404.6,HOUSEHOLD,"deposits haggle according to the blithely ruthless requests. carefully unusual foxes are fluffily bold, special re"
851648,46747,F,104150.25,1994-06-11,4-NOT SPECIFIED,Clerk#000000001,0,furiously special f,46747,Customer#000046747,NqcP8hfbE2cTh6bvCR0uS,0,10-281-614-5336,4341.89,BUILDING,furiously final requests would nag closely quickly bold deposits. slyly final deposits about the evenly final depe
1105671,139361,F,170629.76,1993-03-06,2-HIGH,Clerk#000000542,0,furiously special f,139361,Customer#000139361,"rv2GqTu8rs9Wygp1En38yfgQH6,rXklDkpbTRD",0,10-165-989-9143,2936.56,FURNITURE,structions alongside of the furiously ironic requests detect carefull
1144519,140470,O,174847.74,1996-07-29,5-LOW,Clerk#000000194,0,furiously special f,140470,Customer#000140470,QWIcKyQSNPsp8Ar3Vc0P,0,10-372-959-8028,1106.28,BUILDING,ndencies. blithely special requests wake. blithely special dolphins detect slyly. fur
858245,59417,O,164479.31,1996-10-05,1-URGENT,Clerk#000000765,0,furiously special f,59417,Customer#000059417,I2ZE7Q3RzuIeB9nkPrmpVw5ziOZ0YWvSDck,0,10-722-468-3192,7034.06,MACHINERY,uests. idly regular accounts cajole bold requests. pending accounts use. even instructio
869286,68644,O,16571.72,1997-08-20,2-HIGH,Clerk#000000967,0,furiously special f,68644,Customer#000068644,3rRZ9IZP5oz9Yry,0,10-340-779-5010,6499.86,MACHINERY,"inst the regular ideas. bold, special deposits about t"
1245348,34657,O,198960.95,1997-11-01,3-MEDIUM,Clerk#000000706,0,furiously special f,34657,Customer#000034657,w7gTSzgvglmXJ0tlFACGH3cjEXAp3JLaUQZAYO,0,10-339-112-6016,4192.25,AUTOMOBILE,ies haggle fluffily along the slyly regular accounts. final platelets need to use blithely special requests


1500000

In [0]:
df_ord_cust=df_ord_sch.join(df_cust_sch.hint("merge"), df_ord_sch.O_CUSTKEY == df_cust_sch.C_CUSTKEY, "inner")
display(df_ord_cust.limit(10))
display(df_ord_cust.count())

O_ORDERKEY,O_CUSTKEY,O_ORDERSTATUS,O_TOTALPRICE,O_ORDERDATE,O_ORDERPRIORITY,O_CLERK,O_SHIPPRIORITY,O_COMMENT,C_CUSTKEY,C_NAME,C_ADDRESS,C_NATIONKEY,C_PHONE,C_ACCTBAL,C_MKTSEGMENT,C_COMMENT
4273923,1,O,81245.06,1997-03-23,3-MEDIUM,Clerk#000000381,0,furiously special f,1,Customer#000000001,"IVhzIApeRb ot,c,E",0,10-989-741-2988,711.56,BUILDING,"to the even, regular platelets. regular, ironic epitaphs nag e"
5133509,1,O,211713.58,1996-07-01,1-URGENT,Clerk#000000463,0,furiously special f,1,Customer#000000001,"IVhzIApeRb ot,c,E",0,10-989-741-2988,711.56,BUILDING,"to the even, regular platelets. regular, ironic epitaphs nag e"
3868359,1,F,108622.27,1992-08-22,5-LOW,Clerk#000000536,0,furiously special f,1,Customer#000000001,"IVhzIApeRb ot,c,E",0,10-989-741-2988,711.56,BUILDING,"to the even, regular platelets. regular, ironic epitaphs nag e"
4808192,1,O,101011.54,1996-06-29,2-HIGH,Clerk#000000473,0,furiously special f,1,Customer#000000001,"IVhzIApeRb ot,c,E",0,10-989-741-2988,711.56,BUILDING,"to the even, regular platelets. regular, ironic epitaphs nag e"
579908,1,O,34137.49,1996-12-09,5-LOW,Clerk#000000783,0,furiously special f,1,Customer#000000001,"IVhzIApeRb ot,c,E",0,10-989-741-2988,711.56,BUILDING,"to the even, regular platelets. regular, ironic epitaphs nag e"
454791,1,F,36659.79,1992-04-19,1-URGENT,Clerk#000000815,0,furiously special f,1,Customer#000000001,"IVhzIApeRb ot,c,E",0,10-989-741-2988,711.56,BUILDING,"to the even, regular platelets. regular, ironic epitaphs nag e"
4243392,13,O,65546.41,1996-03-19,2-HIGH,Clerk#000000059,0,furiously special f,13,Customer#000000013,DM1A9KfdQq6LrEaV6K,0,10-761-547-5974,3857.34,BUILDING,ounts sleep carefully after the close frays. carefully bold notornis use ironic requests. blithely
1991461,13,F,202895.61,1993-08-20,3-MEDIUM,Clerk#000000415,0,furiously special f,13,Customer#000000013,DM1A9KfdQq6LrEaV6K,0,10-761-547-5974,3857.34,BUILDING,ounts sleep carefully after the close frays. carefully bold notornis use ironic requests. blithely
3540450,13,F,132222.61,1994-08-21,2-HIGH,Clerk#000000374,0,furiously special f,13,Customer#000000013,DM1A9KfdQq6LrEaV6K,0,10-761-547-5974,3857.34,BUILDING,ounts sleep carefully after the close frays. carefully bold notornis use ironic requests. blithely
1718016,13,F,240521.0,1994-08-30,5-LOW,Clerk#000000900,0,furiously special f,13,Customer#000000013,DM1A9KfdQq6LrEaV6K,0,10-761-547-5974,3857.34,BUILDING,ounts sleep carefully after the close frays. carefully bold notornis use ironic requests. blithely


1500000

In [0]:
display(df_ord_cust.limit(3))

O_ORDERKEY,O_CUSTKEY,O_ORDERSTATUS,O_TOTALPRICE,O_ORDERDATE,O_ORDERPRIORITY,O_CLERK,O_SHIPPRIORITY,O_COMMENT,C_CUSTKEY,C_NAME,C_ADDRESS,C_NATIONKEY,C_PHONE,C_ACCTBAL,C_MKTSEGMENT,C_COMMENT
4273923,1,O,81245.06,1997-03-23,3-MEDIUM,Clerk#000000381,0,furiously special f,1,Customer#000000001,"IVhzIApeRb ot,c,E",0,10-989-741-2988,711.56,BUILDING,"to the even, regular platelets. regular, ironic epitaphs nag e"
5133509,1,O,211713.58,1996-07-01,1-URGENT,Clerk#000000463,0,furiously special f,1,Customer#000000001,"IVhzIApeRb ot,c,E",0,10-989-741-2988,711.56,BUILDING,"to the even, regular platelets. regular, ironic epitaphs nag e"
3868359,1,F,108622.27,1992-08-22,5-LOW,Clerk#000000536,0,furiously special f,1,Customer#000000001,"IVhzIApeRb ot,c,E",0,10-989-741-2988,711.56,BUILDING,"to the even, regular platelets. regular, ironic epitaphs nag e"
