In [None]:
# This file covers basic steps to get started with pyspark

In [None]:
# Basic Tasks

In [12]:
# Task 1: Read and display data

from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('first').getOrCreate()
data = [("Alice", 25), ("Bob", 30), ("Charlie", 35),("Tony",50)]
df=spark.createDataFrame(data,["Name","Age"])
# df.show()

In [2]:
# Task 2 : Filtering Data
df.filter(df.Age>30).show()

AttributeError: 'DataFrame' object has no attribute 'Age'

In [19]:
# Task : Grouping and Aggregation
df.groupBy(df.Age).count().show()

+---+-----+
|Age|count|
+---+-----+
| 25|    1|
| 30|    1|
| 35|    1|
| 50|    1|
+---+-----+



In [None]:
# Intermediate Tasks:

In [20]:
# Task Joining dataframes
data2 = [("Alice", "Engineer"), ("Bob", "Doctor"), ("Charlie", "Teacher")]
df2 = spark.createDataFrame(data2, ["Name", "Occupation"])
df.join(df2,"Name").show()
# Alice,(25,Engineer)



+-------+---+----------+
|   Name|Age|Occupation|
+-------+---+----------+
|  Alice| 25|  Engineer|
|    Bob| 30|    Doctor|
|Charlie| 35|   Teacher|
+-------+---+----------+



                                                                                

In [22]:
# Task : Handling Missing Data or remove the record that contain null value
data_with_null = [("Alice", 25), ("Bob", None), ("Charlie", 35)]
df_with_null = spark.createDataFrame(data_with_null, ["Name", "Age"])
df_with_null.na.drop().show()

+-------+---+
|   Name|Age|
+-------+---+
|  Alice| 25|
|Charlie| 35|
+-------+---+



In [None]:
# Advanced Tasks:

In [25]:
# Task: User-Defined Functions (UDFs): 
# Convert & Create upperCase column for Name column

from pyspark.sql.functions import udf
upper_udf=udf(lambda name:name.upper())
df.withColumn("UpperName",upper_udf(df["Name"])).show()
# df.show()

                                                                                

+-------+---+---------+
|   Name|Age|UpperName|
+-------+---+---------+
|  Alice| 25|    ALICE|
|    Bob| 30|      BOB|
|Charlie| 35|  CHARLIE|
|   Tony| 50|     TONY|
+-------+---+---------+



In [30]:
# Window Functions : generate rownumber based on Age class
from pyspark.sql.functions import row_number
from pyspark.sql import Window
orderBySpec=Window.orderBy("Age")
df.withColumn("Row_num",row_number().over(orderBySpec)).show()

24/02/01 22:39:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/02/01 22:39:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/02/01 22:39:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------+---+-------+
|   Name|Age|Row_num|
+-------+---+-------+
|  Alice| 25|      1|
|    Bob| 30|      2|
|Charlie| 35|      3|
|   Tony| 50|      4|
+-------+---+-------+



In [2]:
# Groupby on multiple columns
simpleData = [("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100)
  ]
schema = ["employee_name", "department", "salary"]
df = spark.createDataFrame(data=simpleData, schema = schema)
# df.printSchema()
# df.show(truncate=False)
df.groupBy("employee_name", "department").sum("salary").show()


[Stage 2:>                                                          (0 + 4) / 4]

+-------------+----------+-----------+
|employee_name|department|sum(salary)|
+-------------+----------+-----------+
|      Michael|     Sales|       4600|
|        James|     Sales|       6000|
|        Maria|   Finance|       3000|
|       Robert|     Sales|       4100|
|        Scott|   Finance|       3300|
|          Jen|   Finance|       3900|
|         Jeff| Marketing|       3000|
|         Saif|     Sales|       4100|
|        Kumar| Marketing|       2000|
+-------------+----------+-----------+



                                                                                

In [6]:
# Define nested structType
# Defining schema using nested StructType
from pyspark.sql.types import *

structureData = [
    (("James","","Smith"),"36636","M",3100),
    (("Michael","Rose",""),"40288","M",4300),
    (("Robert","","Williams"),"42114","M",1400),
    (("Maria","Anne","Jones"),"39192","F",5500),
    (("Jen","Mary","Brown"),None,"F",-1)
  ]
structSchema=StructType([
    StructField("fullname",
            StructType([
                StructField("firstname",StringType(),True),
                StructField("middlename",StringType(),True),
                StructField("lastname",StringType(),True)
    ])),
    StructField("regno",StringType(),True),
    StructField("sex",StringType(),True),
    StructField("salary",IntegerType(),True)
])

df = spark.createDataFrame(data=structureData,schema=structSchema)
df.select("fullname.firstname","fullname.middlename","fullname.lastname","regno","sex").show(truncate=False)
#df.printSchema()

+---------+----------+--------+-----+---+
|firstname|middlename|lastname|regno|sex|
+---------+----------+--------+-----+---+
|James    |          |Smith   |36636|M  |
|Michael  |Rose      |        |40288|M  |
|Robert   |          |Williams|42114|M  |
|Maria    |Anne      |Jones   |39192|F  |
|Jen      |Mary      |Brown   |NULL |F  |
+---------+----------+--------+-----+---+



In [1]:
# Read and write into parquet file
data = [("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100)
  ]
schema = ["employee_name", "department", "salary"]
df = spark.createDataFrame(data=data, schema = schema)
'''
Write the data into different file formats into HDFS Location
'''
# df.write.parquet("output.parquet", compression="snappy")
# df.write.csv('Files/output.csv')
df.write.orc('output.orc')
df.write.json('output.json')

                                                                                

In [13]:
# add constant columns into dataframe using lit()
from pyspark.sql.functions import lit

data = [("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000)
  ]
schema = ["employee_name", "department", "salary"]
df=spark.createDataFrame(data,schema)
df.select("employee_name", "department", "salary",lit("Salary").alias("Test_Col")).show()

+-------------+----------+------+--------+
|employee_name|department|salary|Test_Col|
+-------------+----------+------+--------+
|        James|     Sales|  3000|  Salary|
|      Michael|     Sales|  4600|  Salary|
|       Robert|     Sales|  4100|  Salary|
|        Maria|   Finance|  3000|  Salary|
+-------------+----------+------+--------+

