# Manipulating, Droping, Sorting, Aggregations, Joining, GroupBy  DataFrame

In [0]:
from pyspark.sql import SparkSession


In [0]:
spark = SparkSession.builder \
    .appName("DataFrame Coding challenge") \
    .getOrCreate()

In [0]:
data = [("John", "Doe", 28),
        ("Jane", "Smith", 35),
        ("Bob", "Johnson", 40),
        ("Alice", "Williams", 30)]
columns = ["First_Name", "Last_Name", "Age"]
df = spark.createDataFrame(data, columns)

In [0]:
print("Original DataFrame:")
df.show()

Original DataFrame:
+----------+---------+---+
|First_Name|Last_Name|Age|
+----------+---------+---+
|      John|      Doe| 28|
|      Jane|    Smith| 35|
|       Bob|  Johnson| 40|
|     Alice| Williams| 30|
+----------+---------+---+



# Manipulating data

In [0]:
from pyspark.sql.functions import col

In [0]:
df = df.withColumn("Full_Name", col("First_Name") + " " + col("Last_Name"))

In [0]:
df.show()

+----------+---------+---+---------+
|First_Name|Last_Name|Age|Full_Name|
+----------+---------+---+---------+
|      John|      Doe| 28|     NULL|
|      Jane|    Smith| 35|     NULL|
|       Bob|  Johnson| 40|     NULL|
|     Alice| Williams| 30|     NULL|
+----------+---------+---+---------+



# Dropping columns

In [0]:
df = df.drop("Full_Name")
df.show()

+----------+---------+---+
|First_Name|Last_Name|Age|
+----------+---------+---+
|      John|      Doe| 28|
|      Jane|    Smith| 35|
|       Bob|  Johnson| 40|
|     Alice| Williams| 30|
+----------+---------+---+



# Sorting DataFrame by Age in descending order


In [0]:
df = df.orderBy(col("Age").desc()).show()

+----------+---------+---+
|First_Name|Last_Name|Age|
+----------+---------+---+
|       Bob|  Johnson| 40|
|      Jane|    Smith| 35|
|     Alice| Williams| 30|
|      John|      Doe| 28|
+----------+---------+---+



# Joining the DataFrame 

In [0]:
data2 = [("John", 1000),
         ("Jane", 2000),
         ("Bob", 1500),
         ("Alice", 3000)]
columns2 = ["First_Name", "Salary"]
df2 = spark.createDataFrame(data2, columns2)

print("\nSecond DataFrame:")
df2.show()




Second DataFrame:
+----------+------+
|First_Name|Salary|
+----------+------+
|      John|  1000|
|      Jane|  2000|
|       Bob|  1500|
|     Alice|  3000|
+----------+------+



In [0]:
joined_df = df.join(df2, on="First_Name", how="inner")


In [0]:
print("\nJoined DataFrame with another DataFrame:")
joined_df.show()


Joined DataFrame with another DataFrame:
+----------+---------+---+------+
|First_Name|Last_Name|Age|Salary|
+----------+---------+---+------+
|     Alice| Williams| 30|  3000|
|       Bob|  Johnson| 40|  1500|
|      Jane|    Smith| 35|  2000|
|      John|      Doe| 28|  1000|
+----------+---------+---+------+



# Aggregations


In [0]:
average_age = df.agg({"Age": "avg"}).collect()[0][0]

In [0]:
average_age

33.25

# Grouping by Age and counting occurrences


In [0]:
grouped_df = df.groupBy("Age").count()

In [0]:
grouped_df.show()

+---+-----+
|Age|count|
+---+-----+
| 28|    1|
| 35|    1|
| 40|    1|
| 30|    1|
+---+-----+

