In [1]:
from pyspark.sql import SparkSession
import getpass
# Build (or reuse) a Hive-enabled Spark session on YARN. The SQL warehouse
# directory is placed under the current OS user's HDFS home.
username = getpass.getuser()
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [2]:
# Bare expression: shows the session object as this cell's result.
spark

In [8]:
# List the order_items directory on HDFS to see which data files it holds.
!hdfs dfs -ls /public/trendytech/retail_db/order_items

Found 1 items
-rw-r--r--   3 itv005857 supergroup    5408880 2023-04-26 16:47 /public/trendytech/retail_db/order_items/part-00000


In [9]:
# Peek at the first lines of the raw file: comma-separated values, no header row.
# NOTE(review): the trailing "cat: Unable to write to output stream." message
# presumably comes from `head` closing the pipe early -- confirm it is harmless.
!hdfs dfs -cat /public/trendytech/retail_db/order_items/part-00000|head

1,1,957,1,299.98,299.98
2,2,1073,1,199.99,199.99
3,2,502,5,250.0,50.0
4,2,403,1,129.99,129.99
5,4,897,2,49.98,24.99
6,4,365,5,299.95,59.99
7,4,502,3,150.0,50.0
8,4,1014,4,199.92,49.98
9,5,957,1,299.98,299.98
10,5,365,5,299.95,59.99
cat: Unable to write to output stream.


In [11]:
# Read every file under order_items as CSV and let Spark infer column types.
# No column names are supplied, so the columns come back as _c0, _c1, ...
raw_df = (
    spark.read
    .option("inferSchema", "true")
    .csv("/public/trendytech/retail_db/order_items/*")
)

In [12]:
# Check the column names and the types that schema inference picked.
raw_df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: integer (nullable = true)
 |-- _c2: integer (nullable = true)
 |-- _c3: integer (nullable = true)
 |-- _c4: double (nullable = true)
 |-- _c5: double (nullable = true)



In [15]:
# Replace the positional _cN names with descriptive ones, in file column order.
refined_df = raw_df.toDF(
    "order_item_id",
    "order_id",
    "product_id",
    "quantity",
    "subtotal",
    "product_price",
)

In [16]:
# Preview the order-items frame with its new column names.
refined_df.show()

+-------------+--------+----------+--------+--------+-------------+
|order_item_id|order_id|product_id|quantity|subtotal|product_price|
+-------------+--------+----------+--------+--------+-------------+
|            1|       1|       957|       1|  299.98|       299.98|
|            2|       2|      1073|       1|  199.99|       199.99|
|            3|       2|       502|       5|   250.0|         50.0|
|            4|       2|       403|       1|  129.99|       129.99|
|            5|       4|       897|       2|   49.98|        24.99|
|            6|       4|       365|       5|  299.95|        59.99|
|            7|       4|       502|       3|   150.0|         50.0|
|            8|       4|      1014|       4|  199.92|        49.98|
|            9|       5|       957|       1|  299.98|       299.98|
|           10|       5|       365|       5|  299.95|        59.99|
|           11|       5|      1014|       2|   99.96|        49.98|
|           12|       5|       957|       1|  29

In [18]:
# Drop the stored subtotal; later cells recompute it as quantity * product_price.
df1 = refined_df.drop("subtotal")

In [19]:
# Confirm the subtotal column is gone.
df1.show()

+-------------+--------+----------+--------+-------------+
|order_item_id|order_id|product_id|quantity|product_price|
+-------------+--------+----------+--------+-------------+
|            1|       1|       957|       1|       299.98|
|            2|       2|      1073|       1|       199.99|
|            3|       2|       502|       5|         50.0|
|            4|       2|       403|       1|       129.99|
|            5|       4|       897|       2|        24.99|
|            6|       4|       365|       5|        59.99|
|            7|       4|       502|       3|         50.0|
|            8|       4|      1014|       4|        49.98|
|            9|       5|       957|       1|       299.98|
|           10|       5|       365|       5|        59.99|
|           11|       5|      1014|       2|        49.98|
|           12|       5|       957|       1|       299.98|
|           13|       5|       403|       1|       129.99|
|           14|       7|      1073|       1|       199.9

In [21]:
# Import only what is used: a wildcard import of pyspark.sql.functions would
# shadow Python builtins such as sum, min, max and round for the rest of the
# notebook.
from pyspark.sql.functions import expr

# Keep every existing column and add the line total, written as a SQL
# expression wrapped with expr().
df1.select("*", expr("quantity * product_price as total")).show()

+-------------+--------+----------+--------+-------------+------+
|order_item_id|order_id|product_id|quantity|product_price| total|
+-------------+--------+----------+--------+-------------+------+
|            1|       1|       957|       1|       299.98|299.98|
|            2|       2|      1073|       1|       199.99|199.99|
|            3|       2|       502|       5|         50.0| 250.0|
|            4|       2|       403|       1|       129.99|129.99|
|            5|       4|       897|       2|        24.99| 49.98|
|            6|       4|       365|       5|        59.99|299.95|
|            7|       4|       502|       3|         50.0| 150.0|
|            8|       4|      1014|       4|        49.98|199.92|
|            9|       5|       957|       1|       299.98|299.98|
|           10|       5|       365|       5|        59.99|299.95|
|           11|       5|      1014|       2|        49.98| 99.96|
|           12|       5|       957|       1|       299.98|299.98|
|         

In [22]:
# Same projection as the select/expr cell, but selectExpr takes the SQL
# expression strings directly.
(
    df1
    .selectExpr("*", "quantity * product_price as total")
    .show()
)

+-------------+--------+----------+--------+-------------+------+
|order_item_id|order_id|product_id|quantity|product_price| total|
+-------------+--------+----------+--------+-------------+------+
|            1|       1|       957|       1|       299.98|299.98|
|            2|       2|      1073|       1|       199.99|199.99|
|            3|       2|       502|       5|         50.0| 250.0|
|            4|       2|       403|       1|       129.99|129.99|
|            5|       4|       897|       2|        24.99| 49.98|
|            6|       4|       365|       5|        59.99|299.95|
|            7|       4|       502|       3|         50.0| 150.0|
|            8|       4|      1014|       4|        49.98|199.92|
|            9|       5|       957|       1|       299.98|299.98|
|           10|       5|       365|       5|        59.99|299.95|
|           11|       5|      1014|       2|        49.98| 99.96|
|           12|       5|       957|       1|       299.98|299.98|
|         

In [23]:
# Read every file under products as CSV and let Spark infer column types.
products_df = (
    spark.read
    .option("inferSchema", "true")
    .csv("/public/trendytech/retail_db/products/*")
)

In [24]:
# Preview the raw products frame (columns still carry the default _cN names).
products_df.show()

+---+---+--------------------+----+------+--------------------+
|_c0|_c1|                 _c2| _c3|   _c4|                 _c5|
+---+---+--------------------+----+------+--------------------+
|  1|  2|Quest Q64 10 FT. ...|null| 59.98|http://images.acm...|
|  2|  2|Under Armour Men'...|null|129.99|http://images.acm...|
|  3|  2|Under Armour Men'...|null| 89.99|http://images.acm...|
|  4|  2|Under Armour Men'...|null| 89.99|http://images.acm...|
|  5|  2|Riddell Youth Rev...|null|199.99|http://images.acm...|
|  6|  2|Jordan Men's VI R...|null|134.99|http://images.acm...|
|  7|  2|Schutt Youth Recr...|null| 99.99|http://images.acm...|
|  8|  2|Nike Men's Vapor ...|null|129.99|http://images.acm...|
|  9|  2|Nike Adult Vapor ...|null|  50.0|http://images.acm...|
| 10|  2|Under Armour Men'...|null|129.99|http://images.acm...|
| 11|  2|Fitness Gear 300 ...|null|209.99|http://images.acm...|
| 12|  2|Under Armour Men'...|null|139.99|http://images.acm...|
| 13|  2|Under Armour Men'...|null| 89.9

In [26]:
# Assign descriptive names to the six product columns, in file column order.
refined_product_df = products_df.toDF(
    "product_id",
    "product_category_id",
    "product_name",
    "product_description",
    "product_price",
    "product_image",
)

In [29]:
# Preview the products frame with its new column names.
refined_product_df.show()

+----------+-------------------+--------------------+-------------------+-------------+--------------------+
|product_id|product_category_id|        product_name|product_description|product_price|       product_image|
+----------+-------------------+--------------------+-------------------+-------------+--------------------+
|         1|                  2|Quest Q64 10 FT. ...|               null|        59.98|http://images.acm...|
|         2|                  2|Under Armour Men'...|               null|       129.99|http://images.acm...|
|         3|                  2|Under Armour Men'...|               null|        89.99|http://images.acm...|
|         4|                  2|Under Armour Men'...|               null|        89.99|http://images.acm...|
|         5|                  2|Riddell Youth Rev...|               null|       199.99|http://images.acm...|
|         6|                  2|Jordan Men's VI R...|               null|       134.99|http://images.acm...|
|         7|       

In [31]:
# Overwrite product_price with the price multiplied by 1.2 for every product.
# The result is only displayed; refined_product_df itself is not reassigned.
(
    refined_product_df
    .withColumn("product_price", expr("product_price * 1.2"))
    .show()
)

+----------+-------------------+--------------------+-------------------+------------------+--------------------+
|product_id|product_category_id|        product_name|product_description|     product_price|       product_image|
+----------+-------------------+--------------------+-------------------+------------------+--------------------+
|         1|                  2|Quest Q64 10 FT. ...|               null|            71.976|http://images.acm...|
|         2|                  2|Under Armour Men'...|               null|           155.988|http://images.acm...|
|         3|                  2|Under Armour Men'...|               null|107.98799999999999|http://images.acm...|
|         4|                  2|Under Armour Men'...|               null|107.98799999999999|http://images.acm...|
|         5|                  2|Riddell Youth Rev...|               null|           239.988|http://images.acm...|
|         6|                  2|Jordan Men's VI R...|               null|           161.

In [33]:
# Brand-dependent price adjustment via a SQL CASE expression:
#   product_name contains 'Nike'   -> price * 1.2
#   product_name contains 'Armour' -> price * 1.1
#   anything else                  -> price unchanged
# Branches are evaluated in order, so a name matching both patterns takes the
# Nike branch.
df2 = refined_product_df.withColumn(
    "product_price",
    expr(
        "CASE "
        "WHEN product_name like '%Nike%' THEN product_price * 1.2 "
        "WHEN product_name like '%Armour%' THEN product_price * 1.1 "
        "ELSE product_price END"
    ),
)

In [34]:
# Show the products with the brand-dependent price adjustment applied.
df2.show()

+----------+-------------------+--------------------+-------------------+------------------+--------------------+
|product_id|product_category_id|        product_name|product_description|     product_price|       product_image|
+----------+-------------------+--------------------+-------------------+------------------+--------------------+
|         1|                  2|Quest Q64 10 FT. ...|               null|             59.98|http://images.acm...|
|         2|                  2|Under Armour Men'...|               null|142.98900000000003|http://images.acm...|
|         3|                  2|Under Armour Men'...|               null|            98.989|http://images.acm...|
|         4|                  2|Under Armour Men'...|               null|            98.989|http://images.acm...|
|         5|                  2|Riddell Youth Rev...|               null|            199.99|http://images.acm...|
|         6|                  2|Jordan Men's VI R...|               null|            134

In [3]:
#### Removing duplicates from a DataFrame ####
# Sample rows as (id, name, age), chosen to exercise the de-duplication variants.
myList = [
    (1, "Kapil", 34),
    (1, "Kapil", 34),    # exact copy of the previous row
    (1, "Satish", 26),   # shares id 1 with Kapil
    (2, "Satish", 26),   # same name and age as the previous row, different id
]

In [4]:
# Turn the sample tuples into a DataFrame, then name its three columns.
df = (
    spark.createDataFrame(myList)
    .toDF("id", "name", "age")
)

In [5]:
# Display the sample rows, including the exact duplicate.
df.show()

+---+------+---+
| id|  name|age|
+---+------+---+
|  1| Kapil| 34|
|  1| Kapil| 34|
|  1|Satish| 26|
|  2|Satish| 26|
+---+------+---+



In [6]:
# Remove rows that are identical across all columns.
# NOTE(review): this rebinds df1, which an earlier cell used for the
# order-items frame without subtotal; re-running that section afterwards
# would see this frame instead.
df1= df.distinct()

In [7]:
# Display the rows that remain after distinct().
df1.show()

+---+------+---+
| id|  name|age|
+---+------+---+
|  1| Kapil| 34|
|  2|Satish| 26|
|  1|Satish| 26|
+---+------+---+



In [8]:
# dropDuplicates() with no arguments compares all columns.
# show() only prints and returns None, so the DataFrame is bound first and
# displayed separately; otherwise new_df1 would be None.
new_df1 = df.dropDuplicates()
new_df1.show()

+---+------+---+
| id|  name|age|
+---+------+---+
|  1| Kapil| 34|
|  2|Satish| 26|
|  1|Satish| 26|
+---+------+---+



In [10]:
# Keep one row per distinct (name, age) pair; the id column is ignored when
# comparing rows.
# show() only prints and returns None, so the DataFrame is bound first and
# displayed separately; otherwise new_df2 would be None.
new_df2 = df.dropDuplicates(["name", "age"])
new_df2.show()

+---+------+---+
| id|  name|age|
+---+------+---+
|  1| Kapil| 34|
|  1|Satish| 26|
+---+------+---+



In [11]:
# Keep one row per distinct id; name and age are ignored when comparing rows.
# NOTE(review): which row survives for a repeated id should not be relied on --
# confirm against the dropDuplicates documentation if order matters.
# show() only prints and returns None, so the DataFrame is bound first and
# displayed separately; otherwise new_df3 would be None.
new_df3 = df.dropDuplicates(["id"])
new_df3.show()

+---+------+---+
| id|  name|age|
+---+------+---+
|  1|Satish| 26|
|  2|Satish| 26|
+---+------+---+

