In [21]:
from pyspark.sql import SparkSession
import getpass
# Login name of whoever runs the notebook; used to derive a per-user
# Hive warehouse directory instead of pinning it to one account.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled SparkSession running on YARN.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")  # 0 = bind the UI to any free port
    # Previously an f-string with no placeholder and a hard-coded user id,
    # which left `username` unused; interpolate it so the path follows the user.
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)

In [22]:
# DDL-style schema string for the orders CSV, one column per line.
# order_date is declared as a plain string here, not a timestamp.
order_schema = (
    "order_id long, "
    "order_date string, "
    "customer_id long, "
    "order_status string"
)

In [23]:
# Read the orders dataset as CSV, applying the explicit `order_schema`.
orders_new_df = (
    spark.read
    .format("csv")
    .schema(order_schema)
    .load("/public/trendytech/retail_db/ordersnew")
)

In [24]:
 # Preview the first 20 rows, truncating long cell values (the defaults, spelled out).
 orders_new_df.show(n=20, truncate=True)

+--------+--------------------+-----------+------------+
|order_id|          order_date|customer_id|order_status|
+--------+--------------------+-----------+------------+
|    2480|2013-08-07 00:00:...|       3807|    COMPLETE|
|   30479|2014-01-30 00:00:...|       9265|    COMPLETE|
|    2481|2013-08-07 00:00:...|       2476|    COMPLETE|
|   30481|2014-01-30 00:00:...|       9240|    COMPLETE|
|    2483|2013-08-07 00:00:...|      10453|    COMPLETE|
|   30484|2014-01-30 00:00:...|       2876|    COMPLETE|
|    2484|2013-08-07 00:00:...|       9256|    COMPLETE|
|   30485|2014-01-30 00:00:...|       1069|    COMPLETE|
|    2488|2013-08-07 00:00:...|       1255|    COMPLETE|
|   30486|2014-01-30 00:00:...|       1151|    COMPLETE|
|    2491|2013-08-07 00:00:...|        247|    COMPLETE|
|   30487|2014-01-30 00:00:...|       6772|    COMPLETE|
|    2495|2013-08-07 00:00:...|       9011|    COMPLETE|
|   30489|2014-01-30 00:00:...|       5717|    COMPLETE|
|    2498|2013-08-07 00:00:...|

In [25]:
# Number of orders per status, collected to the driver as a list of Rows.
(
    orders_new_df
    .groupBy("order_status")
    .count()
    .collect()
)

[Row(order_status='PENDING_PAYMENT', count=5636250),
 Row(order_status='COMPLETE', count=46008801),
 Row(order_status='ON_HOLD', count=1424250),
 Row(order_status='PAYMENT_REVIEW', count=273375),
 Row(order_status='PROCESSING', count=3103125),
 Row(order_status='CLOSED', count=2833500),
 Row(order_status='SUSPECTED_FRAUD', count=584250),
 Row(order_status='PENDING', count=2853750),
 Row(order_status='CANCELED', count=535500)]

In [26]:
# DDL-style schema string for the customers CSV, assembled from one
# "<name> <type>" entry per column.
customers_schema = ", ".join(
    (
        "customerid long",
        "customer_fname string",
        "customer_lname string",
        "username string",
        "password string",
        "address string",
        "city string",
        "state string",
        "pincode long",
    )
)

In [27]:
# Read the customers dataset as CSV, applying the explicit `customers_schema`.
customers_df = (
    spark.read
    .format("csv")
    .schema(customers_schema)
    .load("/public/trendytech/retail_db/customers")
)

In [28]:
# Preview the first 20 customers, truncating long cell values (the defaults, spelled out).
customers_df.show(n=20, truncate=True)

+----------+--------------+--------------+---------+---------+--------------------+-------------+-----+-------+
|customerid|customer_fname|customer_lname| username| password|             address|         city|state|pincode|
+----------+--------------+--------------+---------+---------+--------------------+-------------+-----+-------+
|         1|       Richard|     Hernandez|XXXXXXXXX|XXXXXXXXX|  6303 Heather Plaza|  Brownsville|   TX|  78521|
|         2|          Mary|       Barrett|XXXXXXXXX|XXXXXXXXX|9526 Noble Embers...|    Littleton|   CO|  80126|
|         3|           Ann|         Smith|XXXXXXXXX|XXXXXXXXX|3422 Blue Pioneer...|       Caguas|   PR|    725|
|         4|          Mary|         Jones|XXXXXXXXX|XXXXXXXXX|  8324 Little Common|   San Marcos|   CA|  92069|
|         5|        Robert|        Hudson|XXXXXXXXX|XXXXXXXXX|10 Crystal River ...|       Caguas|   PR|    725|
|         6|          Mary|         Smith|XXXXXXXXX|XXXXXXXXX|3151 Sleepy Quail...|      Passaic|   NJ| 

In [29]:
# DDL-style schema string for the status -> numeric code lookup file.
mapping_schema = (
    "status string, "
    "code int"
)

In [30]:
# Read the status/code lookup as CSV; this file is pipe-delimited, so the
# delimiter is overridden before applying `mapping_schema`.
mapping_df = (
    spark.read
    .format("csv")
    .option("delimiter", "|")
    .schema(mapping_schema)
    .load("/public/trendytech/datasets/mapping_data")
)

In [31]:
# Display the lookup table (first 20 rows, long values truncated: the defaults, spelled out).
mapping_df.show(n=20, truncate=True)

+---------------+----+
|         status|code|
+---------------+----+
|PENDING_PAYMENT|   1|
|       COMPLETE|   2|
|        ON_HOLD|   3|
| PAYMENT_REVIEW|   4|
|     PROCESSING|   5|
|         CLOSED|   6|
|SUSPECTED_FRAUD|   7|
|        PENDING|   8|
|       CANCELED|   9|
+---------------+----+



In [32]:
# Turn off automatic broadcast joins for this session: a threshold of -1
# disables the size-based broadcast decision, so the join run later in
# this notebook is not planned as a broadcast join.
spark.conf.set(
    "spark.sql.autoBroadcastJoinThreshold",
    "-1",
)

In [34]:
# Equi-join orders to the status lookup on the status text and execute the
# whole plan. The "noop" sink runs the job without persisting any output,
# which makes it useful for inspecting the join's execution.
(
    orders_new_df
    .join(mapping_df, on=orders_new_df.order_status == mapping_df.status)
    .write
    .format("noop")
    .mode("overwrite")
    .save()
)