<a href="https://colab.research.google.com/github/ARYANJATHAR/InfosysSpringboardProject/blob/main/Pyspark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PySpark

In [None]:
# 1. Creating a PySpark session
from pyspark.sql import SparkSession

# getOrCreate() hands back the already-running session when there is one,
# otherwise it starts a new session with the application name given here.
spark = (
    SparkSession.builder
    .appName("MyPySparkApp")
    .getOrCreate()
)

In [None]:
# 2. Creating a DataFrame in PySpark from in-memory rows
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("CreateDataFrame")
    .getOrCreate()
)

# Column names first, then one tuple per row in the same order.
columns = ["Name", "Age"]
data = [
    ("Aryan", 20),
    ("Kunal", 34),
    ("Rajesh", 28),
]

df = spark.createDataFrame(data, schema=columns)
df.show()  # prints the rows as a text table

spark.stop()

+------+---+
|  Name|Age|
+------+---+
| Aryan| 20|
| Kunal| 34|
|Rajesh| 28|
+------+---+



In [None]:
# 3. Selecting specific columns
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("SelectColumns")
    .getOrCreate()
)

columns = ["Name", "Age"]
data = [
    ("Aryan", 20),
    ("Kunal", 34),
    ("Rajesh", 28),
]
df = spark.createDataFrame(data, schema=columns)

# Keep only the "Name" column; select() returns a new DataFrame and
# leaves df untouched.
name_df = df.select(df["Name"])
name_df.show()

spark.stop()

+------+
|  Name|
+------+
| Aryan|
| Kunal|
|Rajesh|
+------+



In [None]:
# 4. Filtering rows
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("FilterData")
    .getOrCreate()
)

columns = ["Name", "Age"]
data = [
    ("Aryan", 20),
    ("Kunal", 34),
    ("Rajesh", 28),
]
df = spark.createDataFrame(data, schema=columns)

# Build the row condition as a Column expression, then keep only the
# rows that satisfy it (where() is an alias of filter()).
older_than_27 = df["Age"] > 27
filtered_df = df.where(older_than_27)
filtered_df.show()

spark.stop()

+------+---+
|  Name|Age|
+------+---+
| Kunal| 34|
|Rajesh| 28|
+------+---+



In [None]:
# 5. Basic aggregate functions: average, maximum and row count
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = (
    SparkSession.builder
    .appName("Aggregations")
    .getOrCreate()
)

columns = ["Name", "Age"]
data = [
    ("Aryan", 20),
    ("Kunal", 34),
    ("Rajesh", 28),
    ("Karan", 30),
    ("Sumit", 12),
]
df = spark.createDataFrame(data, schema=columns)

# Each agg() call without a groupBy collapses the whole DataFrame into a
# single-row result.
age = F.col("Age")
average_age = df.agg(F.avg(age))
max_age = df.agg(F.max(age))
count_all = df.agg(F.count("*"))  # counts every row

# Print the three one-row results in the same order as before.
for result in (average_age, max_age, count_all):
    result.show()

spark.stop()

+--------+
|avg(Age)|
+--------+
|    24.8|
+--------+

+--------+
|max(Age)|
+--------+
|      34|
+--------+

+--------+
|count(1)|
+--------+
|       5|
+--------+



In [None]:
# 6. Grouping data: number of people per city
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = (
    SparkSession.builder
    .appName("GroupBy")
    .getOrCreate()
)

columns = ["Name", "Age", "City"]
data = [
    ("Aryan", 20, "Pune"),
    ("Kunal", 34, "Mumbai"),
    ("Rajesh", 28, "Pune"),
    ("Karan", 30, "Mumbai"),
    ("Sumit", 12, "Kolkata"),
]
df = spark.createDataFrame(data, schema=columns)

# Count the rows in each city group, label the count column, and order
# the result alphabetically by city so the printed table is stable.
people_per_city = F.count("*").alias("Total people")
grouped_by_city = df.groupBy("City")
city_count = grouped_by_city.agg(people_per_city).orderBy("City")
city_count.show()

spark.stop()


+-------+------------+
|   City|Total people|
+-------+------------+
|Kolkata|           1|
| Mumbai|           2|
|   Pune|           2|
+-------+------------+



In [None]:
# 7. Reading data: write a small CSV file, then load it back with Spark
import csv

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("ReadCSV").getOrCreate()

data = [
    ("Aryan", 20, "Pune"),
    ("Kunal", 34, "Mumbai"),
    ("Rajesh", 28, "Pune"),
    ("Karan", 30, "Mumbai"),
    ("Sumit", 12, "Kolkata"),
]

# Write the file with the csv module instead of hand-built f-strings so a
# field containing a comma, quote or newline would be quoted correctly.
# newline="" leaves line endings to the csv writer, and lineterminator="\n"
# keeps the "\n" endings the cell wrote before. The encoding is explicit so
# the file does not depend on the platform default (Spark's CSV reader is
# expected to decode as UTF-8 by default).
with open("data.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(["Name", "Age", "City"])  # header row
    writer.writerows(data)

# header=True takes the column names from the first line; inferSchema=True
# asks Spark to infer the column types instead of reading every column as
# a string.
df = spark.read.csv("data.csv", header=True, inferSchema=True)
df.show()

spark.stop()

+------+---+-------+
|  Name|Age|   City|
+------+---+-------+
| Aryan| 20|   Pune|
| Kunal| 34| Mumbai|
|Rajesh| 28|   Pune|
| Karan| 30| Mumbai|
| Sumit| 12|Kolkata|
+------+---+-------+



In [None]:
# 8. Adding a column derived from an existing one
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = (
    SparkSession.builder
    .appName("AddColumn")
    .getOrCreate()
)

columns = ["Name", "Age"]
data = [
    ("Aryan", 20),
    ("Kunal", 34),
    ("Rajesh", 28),
    ("Karan", 30),
    ("Sumit", 12),
]
df = spark.createDataFrame(data, schema=columns)

# withColumn() returns a new DataFrame with the extra column appended;
# the new value is computed per row from the existing "Age" column.
age_plus_10 = df["Age"] + 10
df_with_age_plus_10 = df.withColumn("Age_plus_10", age_plus_10)
df_with_age_plus_10.show()

spark.stop()

+------+---+-----------+
|  Name|Age|Age_plus_10|
+------+---+-----------+
| Aryan| 20|         30|
| Kunal| 34|         44|
|Rajesh| 28|         38|
| Karan| 30|         40|
| Sumit| 12|         22|
+------+---+-----------+



In [None]:
# 9. Sorting data by age, ascending and then descending
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("SortData")
    .getOrCreate()
)

columns = ["Name", "Age"]
data = [
    ("Aryan", 20),
    ("Kunal", 34),
    ("Rajesh", 28),
    ("Karan", 30),
    ("Sumit", 12),
]
df = spark.createDataFrame(data, schema=columns)

# sort() is an alias of orderBy(); the direction is spelled out on the
# column expression instead of through the `ascending` keyword.
df_sorted_age = df.sort(df["Age"].asc())
df_sorted_age.show()

df_sorted_age_desc = df.sort(df["Age"].desc())
df_sorted_age_desc.show()

spark.stop()

+------+---+
|  Name|Age|
+------+---+
| Sumit| 12|
| Aryan| 20|
|Rajesh| 28|
| Karan| 30|
| Kunal| 34|
+------+---+

+------+---+
|  Name|Age|
+------+---+
| Kunal| 34|
| Karan| 30|
|Rajesh| 28|
| Aryan| 20|
| Sumit| 12|
+------+---+



In [None]:
# 10. Renaming columns
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("RenameColumn").getOrCreate()

# Define the input rows and column names in this cell. Previously the row
# list was a bare expression (never assigned) and `columns` was not defined
# here, so the cell only worked when an earlier cell had left matching
# `data` / `columns` variables behind.
data = [
    ("Aryan", 20),
    ("Kunal", 34),
    ("Rajesh", 28),
    ("Karan", 30),
    ("Sumit", 12),
]
columns = ["Name", "Age"]

df = spark.createDataFrame(data, columns)

# withColumnRenamed() returns a new DataFrame in which "Age" is called
# "Person_Age"; df itself keeps its original column names.
df_renamed = df.withColumnRenamed("Age", "Person_Age")
df_renamed.show()

spark.stop()

+------+----------+
|  Name|Person_Age|
+------+----------+
| Aryan|        20|
| Kunal|        34|
|Rajesh|        28|
| Karan|        30|
| Sumit|        12|
+------+----------+

