<a href="https://colab.research.google.com/github/ajinkyagh/Python_Practice/blob/main/Python_PySpark_Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [44]:
# EDA & Data Manipulation
# Build a small DataFrame from an in-memory list of (Name, Age) tuples.
from pyspark.sql import SparkSession

# getOrCreate() hands back the already-running session if there is one.
spark = (
    SparkSession.builder
    .appName("EDA Methods")
    .getOrCreate()
)

columns = ["Name", "Age"]
data = [
    ("Alice", 25),
    ("Bob", 30),
    ("Charlie", None),  # None is stored as a NULL Age
    ("John", 30),
]
df = spark.createDataFrame(data, columns)
df.show()

+-------+----+
|   Name| Age|
+-------+----+
|  Alice|  25|
|    Bob|  30|
|Charlie|NULL|
|   John|  30|
+-------+----+



In [45]:
# Read a CSV file stored on Google Drive (requires the Colab runtime).
from google.colab import drive

drive.mount('/content/gdrive')

# header=True takes column names from the first line;
# inferSchema=True lets Spark infer each column's data type.
customers_csv = '/content/gdrive/My Drive/Colab Notebooks/dataset/customers.csv'
df_customers = spark.read.csv(customers_csv, header=True, inferSchema=True)
df_customers.show()


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
+-----------+-------+-----------+------------------+
|customer_id|   name|signup_date|             email|
+-----------+-------+-----------+------------------+
|          1|  Alice| 2021-01-01| user1@example.com|
|          2|    Bob| 2021-04-01| user2@example.com|
|          3|Charlie| 2021-06-30| user3@example.com|
|          4|  David| 2021-09-28|              NULL|
|          5|    Eva| 2021-12-27| user5@example.com|
|          6|  Frank| 2022-03-27| user6@example.com|
|          7|  Grace| 2022-06-25| user7@example.com|
|          8|  Helen| 2022-09-23| user8@example.com|
|          9|    Ian| 2022-12-22| user9@example.com|
|         10|   Jane| 2023-03-22|user10@example.com|
+-----------+-------+-----------+------------------+



In [46]:
#Read a Parquet file into a DataFrame
# NOTE(review): the example path below ends in .csv -- confirm it points at Parquet data before enabling.
# df_orders = spark.read.parquet('/content/gdrive/My Drive/Colab Notebooks/dataset/orders.csv')

#Read a JSON file into a DataFrame
#df = spark.read.json("path")

In [47]:
#Basic EDA Methods
#Displays at most the first 5 rows (this DataFrame has only 4, so all of them are shown)
df.show(5)

+-------+----+
|   Name| Age|
+-------+----+
|  Alice|  25|
|    Bob|  30|
|Charlie|NULL|
|   John|  30|
+-------+----+



In [48]:
#Lists the column names of the DataFrame
df.columns

['Name', 'Age']

In [49]:
#Counts the number of rows in the DataFrame
df.count()

4

In [50]:
#Displays the schema: each column's name, data type, and nullability
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: long (nullable = true)



In [51]:
#Computes summary statistics (count, mean, stddev, min, max) for every column.
#String columns such as Name are included too: their mean/stddev come back as NULL,
#and count only counts non-null values (Age shows 3, not 4).
df.describe().show()

+-------+-----+------------------+
|summary| Name|               Age|
+-------+-----+------------------+
|  count|    4|                 3|
|   mean| NULL|28.333333333333332|
| stddev| NULL| 2.886751345948129|
|    min|Alice|                25|
|    max| John|                30|
+-------+-----+------------------+



In [52]:
#Computes column-wise missing (NULL) value counts.
# Use the functions module through an alias rather than `from ... import sum`,
# which would replace Python's built-in sum() for every later cell.
from pyspark.sql import functions as F
from pyspark.sql.functions import col  # no built-in clash, so the bare name stays available

# isNull() yields a boolean per row; casting to int (1/0) and summing gives the
# number of NULLs in that column. alias(c) keeps the original column name.
null_counts = [F.sum(col(c).isNull().cast("int")).alias(c) for c in df.columns]
df.select(*null_counts).show()

+----+---+
|Name|Age|
+----+---+
|   0|  1|
+----+---+



In [53]:
#Filtering and Selecting Data
# Show the unfiltered rows first for comparison.
df.show(5)

# Keep only rows whose Age is greater than 25.
# The row with a NULL Age does not appear in the result.
older_than_25 = df["Age"] > 25
df.where(older_than_25).show()

+-------+----+
|   Name| Age|
+-------+----+
|  Alice|  25|
|    Bob|  30|
|Charlie|NULL|
|   John|  30|
+-------+----+

+----+---+
|Name|Age|
+----+---+
| Bob| 30|
|John| 30|
+----+---+



In [54]:
#Filter using multiple conditions (Column predicates combined with &)
at_least_25 = df["Age"] >= 25

# Age >= 25 AND Name begins with "A"
name_starts_with_a = df["Name"].startswith("A")
df.where(at_least_25 & name_starts_with_a).show()

# Age >= 25 AND Name is exactly "Bob"
name_is_bob = df["Name"] == "Bob"
df.where(at_least_25 & name_is_bob).show()

+-----+---+
| Name|Age|
+-----+---+
|Alice| 25|
+-----+---+

+----+---+
|Name|Age|
+----+---+
| Bob| 30|
+----+---+



In [55]:
#Select specific columns (here only Name)
name_only = df.select(df["Name"])
name_only.show()

+-------+
|   Name|
+-------+
|  Alice|
|    Bob|
|Charlie|
|   John|
+-------+



In [56]:
#Add a new column that holds the same literal value on every row
from pyspark.sql.functions import lit

country_literal = lit("USA")
df = df.withColumn("Country", country_literal)
df.show()

+-------+----+-------+
|   Name| Age|Country|
+-------+----+-------+
|  Alice|  25|    USA|
|    Bob|  30|    USA|
|Charlie|NULL|    USA|
|   John|  30|    USA|
+-------+----+-------+



In [57]:
#Rename a column: Country -> Nationality
old_name, new_name = "Country", "Nationality"
df = df.withColumnRenamed(old_name, new_name)
df.show()

+-------+----+-----------+
|   Name| Age|Nationality|
+-------+----+-----------+
|  Alice|  25|        USA|
|    Bob|  30|        USA|
|Charlie|NULL|        USA|
|   John|  30|        USA|
+-------+----+-----------+



In [58]:
#Drop a column
#df = df.drop("Nationality")
#df.show()

In [59]:
#Aggregations and Grouping
# Count how many rows share each Age value; NULL forms its own group.
rows_per_age = df.groupBy("Age").count()
rows_per_age.show()

+----+-----+
| Age|count|
+----+-----+
|  25|    1|
|  30|    2|
|NULL|    1|
+----+-----+



In [60]:
#Finds the average Age within each Nationality group
# The dict form maps a column name to the name of the aggregate to apply.
avg_age_spec = {"Age": "avg"}
df.groupBy("Nationality").agg(avg_age_spec).show()

+-----------+------------------+
|Nationality|          avg(Age)|
+-----------+------------------+
|        USA|28.333333333333332|
+-----------+------------------+



In [61]:
#Computes multiple aggregations (average, minimum, maximum Age) per Nationality
# min and max are reached through the F alias rather than imported by name:
# `from pyspark.sql.functions import min, max` would replace Python's built-in
# min()/max() for every later cell.
from pyspark.sql import functions as F
from pyspark.sql.functions import avg  # no built-in clash, so the bare name stays available

df.groupBy("Nationality").agg(avg("Age"), F.min("Age"), F.max("Age")).show()

+-----------+------------------+--------+--------+
|Nationality|          avg(Age)|min(Age)|max(Age)|
+-----------+------------------+--------+--------+
|        USA|28.333333333333332|      25|      30|
+-----------+------------------+--------+--------+



In [62]:
#Sorting and Ranking
# Show the DataFrame in its current order for comparison.
df.show()

# Sorts the DataFrame by Age in ascending order; the NULL Age row is listed first.
age_ascending = df["Age"].asc()
df.sort(age_ascending).show()

+-------+----+-----------+
|   Name| Age|Nationality|
+-------+----+-----------+
|  Alice|  25|        USA|
|    Bob|  30|        USA|
|Charlie|NULL|        USA|
|   John|  30|        USA|
+-------+----+-----------+

+-------+----+-----------+
|   Name| Age|Nationality|
+-------+----+-----------+
|Charlie|NULL|        USA|
|  Alice|  25|        USA|
|   John|  30|        USA|
|    Bob|  30|        USA|
+-------+----+-----------+



In [63]:
#Sorts the DataFrame by Age in descending order; the NULL Age row is listed last.
age_descending = df["Age"].desc()
df.sort(age_descending).show()

+-------+----+-----------+
|   Name| Age|Nationality|
+-------+----+-----------+
|   John|  30|        USA|
|    Bob|  30|        USA|
|  Alice|  25|        USA|
|Charlie|NULL|        USA|
+-------+----+-----------+



In [64]:
#Adds a row-number column (ranking)
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# Order by Age, then Name. Age alone has ties (Bob and John are both 30), and
# row_number() numbers tied rows in an arbitrary order unless a tie-breaker
# makes the ordering total.
# NOTE: with no partitionBy, Spark evaluates the window over a single partition,
# which is fine for this tiny demo but does not scale to large data.
windowSpec = Window.orderBy("Age", "Name")
df.withColumn("row_number", row_number().over(windowSpec)).show()

+-------+----+-----------+----------+
|   Name| Age|Nationality|row_number|
+-------+----+-----------+----------+
|Charlie|NULL|        USA|         1|
|  Alice|  25|        USA|         2|
|    Bob|  30|        USA|         3|
|   John|  30|        USA|         4|
+-------+----+-----------+----------+



In [65]:
#Handling Missing Data
# Drop every row that contains at least one NULL value.
rows_without_nulls = df.dropna()
rows_without_nulls.show()

+-----+---+-----------+
| Name|Age|Nationality|
+-----+---+-----------+
|Alice| 25|        USA|
|  Bob| 30|        USA|
| John| 30|        USA|
+-----+---+-----------+



In [66]:
#Fills NULL values with a default value, given per column (Age -> 30).
age_default = {"Age": 30}
df.fillna(age_default).show()

+-------+---+-----------+
|   Name|Age|Nationality|
+-------+---+-----------+
|  Alice| 25|        USA|
|    Bob| 30|        USA|
|Charlie| 30|        USA|
|   John| 30|        USA|
+-------+---+-----------+



In [67]:
#Replaces specific values.
# subset limits the replacement to the Nationality column; without it, a
# matching value in any other string column (e.g. a Name of "USA") would be
# rewritten as well.
df = df.replace("USA", "United States", subset=["Nationality"])

In [68]:
#Shows the current contents of df
df.show()

+-------+----+-------------+
|   Name| Age|  Nationality|
+-------+----+-------------+
|  Alice|  25|United States|
|    Bob|  30|United States|
|Charlie|NULL|United States|
|   John|  30|United States|
+-------+----+-------------+



Feature Engineering