# 1. What are the different ways to handle row duplication in a PySpark DataFrame?

In [1]:
# import pyspark
# from pyspark.sql import SparkSession
from pyspark.sql.functions import expr
from pyspark_start import *
# spark = SparkSession.builder.appName('ProjectPro').getOrCreate()

# Sample employee rows. ("James", "Sales", 3000) appears twice, which gives the
# de-duplication cells below an exact duplicate row to work on.
data = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100),
]

# Column names, in the same order as the tuple fields above.
column = ["employee_name", "department", "salary"]

# `spark` is not defined in this cell; presumably it is provided by the
# wildcard import from pyspark_start -- verify against that module.
df = spark.createDataFrame(data=data, schema=column)

In [None]:
# Print the DataFrame's column names, types and nullability as a tree.
df.printSchema()

In [2]:
# Display the rows as a text table; truncate=False keeps long cell values intact.
df.show(truncate=False)

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|James        |Sales     |3000  |
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|James        |Sales     |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Jeff         |Marketing |3000  |
|Kumar        |Marketing |2000  |
|Saif         |Sales     |4100  |
+-------------+----------+------+



In [3]:
 #Distinct

# distinct() compares whole rows, so only rows that match in every column
# are collapsed into one.
distinctDF = df.distinct()

print(f"Distinct count: {distinctDF.count()}")

distinctDF.show(truncate=False)

Distinct count: 9
+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|Michael      |Sales     |4600  |
|James        |Sales     |3000  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Kumar        |Marketing |2000  |
|Jeff         |Marketing |3000  |
|Saif         |Sales     |4100  |
+-------------+----------+------+



In [4]:
# Drop duplicates: with no arguments, dropDuplicates() considers every column.
df2 = df.dropDuplicates()

print(f"Distinct count: {df2.count()}")

df2.show(truncate=False)

Distinct count: 9
+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|Michael      |Sales     |4600  |
|James        |Sales     |3000  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Kumar        |Marketing |2000  |
|Jeff         |Marketing |3000  |
|Saif         |Sales     |4100  |
+-------------+----------+------+



In [5]:
# Drop duplicates on selected columns: rows are compared on (department,
# salary) only, so one row is kept for each distinct pair.
dropDisDF = df.dropDuplicates(["department", "salary"])

print(f"Distinct count of department salary : {dropDisDF.count()}")

dropDisDF.show(truncate=False)


Distinct count of department salary : 8
+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|Maria        |Finance   |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Kumar        |Marketing |2000  |
|Jeff         |Marketing |3000  |
|James        |Sales     |3000  |
|Robert       |Sales     |4100  |
|Michael      |Sales     |4600  |
+-------------+----------+------+



# 2. Explain PySpark UDF with the help of an example

In [6]:
# Small name table used as input for the UDF example. Seqno is stored as a
# string, not a number.
column = ["Seqno", "Name"]
data = [
    ("1", "john jones"),
    ("2", "tracey smith"),
    ("3", "amy sanders"),
]

df = spark.createDataFrame(data=data, schema=column)
df.show(truncate=False)

+-----+------------+
|Seqno|Name        |
+-----+------------+
|1    |john jones  |
|2    |tracey smith|
|3    |amy sanders |
+-----+------------+



In [9]:
def convertCase(str):
    """Upper-case the first character of every space-separated word.

    Args:
        str: The text to convert, or None. The parameter name shadows the
            built-in ``str`` inside this function; it is kept unchanged so
            existing callers (including keyword callers) keep working.

    Returns:
        A new string in which the first character of each word is
        upper-cased and the rest of the word is left untouched. The spacing
        between words is preserved and no trailing space is appended.
        Returns None when the input is None, so a null column value passes
        through instead of raising inside a UDF.
    """
    if str is None:
        return None

    # split(" ") keeps empty strings for consecutive spaces, and joining on
    # " " restores them, so the internal spacing survives the round trip.
    # word[:1] is safe on those empty strings, unlike word[0].
    return " ".join(word[:1].upper() + word[1:] for word in str.split(" "))

In [11]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType

# Wrap convertCase as a Spark UDF that returns a string column. The lambda
# defers the lookup of convertCase until the UDF is actually invoked.
convertUDF = udf(f=lambda value: convertCase(value), returnType=StringType())


In [12]:
# Pass Seqno through unchanged and replace Name with the UDF's result,
# keeping the original column name via alias().
(
    df.select(
        col("Seqno"),
        convertUDF(col("Name")).alias("Name"),
    )
    .show(truncate=False)
)

+-----+-------------+
|Seqno|Name         |
+-----+-------------+
|1    |John Jones   |
|2    |Tracey Smith |
|3    |Amy Sanders  |
+-----+-------------+



# What is PySpark SQL?

In [13]:
# Register the DataFrame under the name STUDENTS so it can be queried with SQL.
df.createOrReplaceTempView("STUDENTS")
# spark.sql() returns the query result as a new DataFrame.
df_new = spark.sql("SELECT * from STUDENTS")
df_new.printSchema()

root
 |-- Seqno: string (nullable = true)
 |-- Name: string (nullable = true)



In [14]:
# Display the rows returned by the SQL query.
df_new.show()

+-----+------------+
|Seqno|        Name|
+-----+------------+
|    1|  john jones|
|    2|tracey smith|
|    3| amy sanders|
+-----+------------+



# How can we create DataFrames in PySpark?

In [None]:
# Build a DataFrame from a list of (Name, Age) tuples plus a list of column names.
columns = ["Name", "Age"]
data = [
    ("Harry", 20),
    ("Ron", 20),
    ("Hermoine", 20),
]

df = spark.createDataFrame(data=data, schema=columns)

# What is PySpark Architecture?

![image.png](attachment:image.png)

# What is PySpark DAGScheduler?

![image.png](attachment:image.png)

# What are the key advantages of PySpark RDD?

Following is the list of key advantages of PySpark RDD:

Immutability: PySpark RDDs are immutable. Once an RDD has been created, it cannot be modified; applying a transformation to an RDD always produces a new RDD.

Fault Tolerance: PySpark RDDs are fault tolerant. Each RDD tracks the lineage of transformations used to build it, so when a partition is lost or a task fails, Spark automatically recomputes the missing data from that lineage. This provides a seamless experience of execution of PySpark applications.

Partitioning: When we create an RDD from any data, its elements are, by default, partitioned across the available cores.

Lazy Evaluation: PySpark RDDs are evaluated lazily. Transformations are not executed as soon as they are encountered; instead, they are recorded in the DAG and are evaluated only when the first action is called on the RDD.

In-Memory Processing: PySpark RDDs help load data from disk into memory. You can persist RDDs in memory to reuse their computations.