In [None]:
# This file covers basic steps to get started with pyspark
# Basic Tasks

In [12]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('Fundementals').getOrCreate()

In [61]:
# Task 1: Read and display data

from pyspark.sql import SparkSession
spark=SparkSession.builder.appName('first').getOrCreate()
data = [("Alice", 25), ("Bob", 30), ("Charlie", 35),("Tony",50)]
df=spark.createDataFrame(data,["Name","Age"])
df.filter(df.Age>30).show()

+-------+---+
|   Name|Age|
+-------+---+
|Charlie| 35|
|   Tony| 50|
+-------+---+



In [4]:
# Task 2 : Filtering Data
# df.filter(df.Age>30).show()
;

''

In [6]:
# Task : Grouping and Aggregation
# df.groupBy(df.Age).count().show()
;

''

In [None]:
# Intermediate Tasks:

In [3]:
# Task Joining dataframes
data = [("Alice", 25), ("Bob", 30), ("Charlie", 35),("Tony",50)]
data2 = [("Alice", "Engineer"), ("Bob", "Doctor"), ("Charlie", "Teacher")]

df=spark.createDataFrame(data,["Name","Age"])
df2 = spark.createDataFrame(data2, ["Name", "Occupation"])

df.join(df2,"Name").show()
df.join(df2,df.Name==df2.Name,"left").show()
df.join(df2,df.Name==df2.Name,"right").show()
df.join(df2,df.Name==df2.Name,"full").show()
df.join(df2,df.Name==df2.Name,"leftanti").show()


                                                                                

+-------+---+----------+
|   Name|Age|Occupation|
+-------+---+----------+
|  Alice| 25|  Engineer|
|    Bob| 30|    Doctor|
|Charlie| 35|   Teacher|
+-------+---+----------+



                                                                                

+-------+---+-------+----------+
|   Name|Age|   Name|Occupation|
+-------+---+-------+----------+
|  Alice| 25|  Alice|  Engineer|
|    Bob| 30|    Bob|    Doctor|
|Charlie| 35|Charlie|   Teacher|
|   Tony| 50|   NULL|      NULL|
+-------+---+-------+----------+



                                                                                

+-------+---+-------+----------+
|   Name|Age|   Name|Occupation|
+-------+---+-------+----------+
|  Alice| 25|  Alice|  Engineer|
|    Bob| 30|    Bob|    Doctor|
|Charlie| 35|Charlie|   Teacher|
+-------+---+-------+----------+



                                                                                

+-------+---+-------+----------+
|   Name|Age|   Name|Occupation|
+-------+---+-------+----------+
|  Alice| 25|  Alice|  Engineer|
|    Bob| 30|    Bob|    Doctor|
|Charlie| 35|Charlie|   Teacher|
|   Tony| 50|   NULL|      NULL|
+-------+---+-------+----------+





+----+---+
|Name|Age|
+----+---+
|Tony| 50|
+----+---+



                                                                                

In [19]:
# Task : Handling Missing Data or remove the record that contain null value
data_with_null = [("Alice", 25), ("Bob", None), ("Charlie", 35)]
df_with_null = spark.createDataFrame(data_with_null, ["Name", "Age"])
df_with_null.na.drop()
# .show()
;

''

In [None]:
# Advanced Tasks:

In [20]:
# Task: User-Defined Functions (UDFs): 
# Convert & Create upperCase column for Name column

from pyspark.sql.functions import udf
upper_udf=udf(lambda name:name.upper())
df.withColumn("UpperName",upper_udf(df["Name"]))
# .show()
# df.show()

DataFrame[Name: string, Age: bigint, UpperName: string]

In [10]:
# Window Functions : generate rownumber based on Age class
from pyspark.sql.functions import row_number
from pyspark.sql import Window
orderBySpec=Window.orderBy("Age")
df.withColumn("Row_num",row_number().over(orderBySpec)) \
# .show()

DataFrame[Name: string, Age: bigint, Row_num: int]

In [11]:
# Groupby on multiple columns
simpleData = [("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100)
  ]
schema = ["employee_name", "department", "salary"]
df = spark.createDataFrame(data=simpleData, schema = schema)
# df.printSchema()
# df.show(truncate=False)
df.groupBy("employee_name", "department").sum("salary")\
# .show()

DataFrame[employee_name: string, department: string, sum(salary): bigint]

In [14]:
# Define nested structType
# Defining schema using nested StructType
from pyspark.sql.types import *

structureData = [
    (("James","","Smith"),"36636","M",3100),
    (("Michael","Rose",""),"40288","M",4300),
    (("Robert","","Williams"),"42114","M",1400),
    (("Maria","Anne","Jones"),"39192","F",5500),
    (("Jen","Mary","Brown"),None,"F",-1)
  ]
structSchema=StructType([
    StructField("fullname",
            StructType([
                StructField("firstname",StringType(),True),
                StructField("middlename",StringType(),True),
                StructField("lastname",StringType(),True)
    ])),
    StructField("regno",StringType(),True),
    StructField("sex",StringType(),True),
    StructField("salary",IntegerType(),True)
])

df = spark.createDataFrame(data=structureData,schema=structSchema)
df.select("fullname.firstname","fullname.middlename","fullname.lastname","regno","sex")\
# .show(truncate=False)
#df.printSchema()

DataFrame[firstname: string, middlename: string, lastname: string, regno: string, sex: string]

In [1]:
# Read and write into parquet file
data = [("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3000),
    ("Scott", "Finance", 3300),
    ("Jen", "Finance", 3900),
    ("Jeff", "Marketing", 3000),
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100)
  ]
schema = ["employee_name", "department", "salary"]
df = spark.createDataFrame(data=data, schema = schema)
'''
Write the data into different file formats into HDFS Location
'''
# df.write.parquet("output.parquet", compression="snappy")
# df.write.csv('Files/output.csv')
df.write.orc('output.orc')
df.write.json('output.json')

                                                                                

In [13]:
# add constant columns into dataframe using lit()
from pyspark.sql.functions import lit

data = [("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000)
  ]
schema = ["employee_name", "department", "salary"]
df=spark.createDataFrame(data,schema)
df.select("employee_name", "department", "salary",lit("Salary").alias("Test_Col"))\
# .show()

DataFrame[employee_name: string, department: string, salary: bigint, Test_Col: string]

In [21]:
"""
Learn about dataset
"""

from pyspark.sql import SparkSession
from pyspark.sql import Row

# Step 1: Create a Spark session
spark = SparkSession.builder.appName("DatasetExample").getOrCreate()

# Step 2: Define a case class (or schema) to represent the data
class Person:
    def __init__(self, name, age):
        self.name = name
        self.age = age

# Step 3: Create a sequence of data
data = [Person("Alice", 25), Person("Bob", 30), Person("Charlie", 22)]

# Step 4: Convert the data to RDD of Rows
rdd = spark.sparkContext.parallelize(data).map(lambda x: Row(name=x.name, age=x.age))

# Step 5: Create a DataFrame from the RDD
df = spark.createDataFrame(rdd)

# Step 6: Convert the DataFrame to a Dataset using as operator
ds = df.as[Person]

# Step 7: Show the content of the Dataset
ds.show()


SyntaxError: invalid syntax (2996426486.py, line 27)

In [26]:
"""
what will happen if you insert wrong data into defined schema ?
"""
# from pyspark.sql import St

data = [("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 1000)
  ]
schema =StructType([
    StructField("employee_name",StringType(),True),
    StructField("department",StringType(),True),
    StructField("salary",IntegerType(),False),

])
df = spark.createDataFrame(data=data, schema = schema)
# df.show()

In [31]:
"""
word count program
"""
# from pyspark.sql.functions import split
sentences=[("Remember to handle credentials securely and be cautious about handling sensitive information like usernames and passwords",)]
df=spark.createDataFrame(sentences,['sentence'])
words=(
df.selectExpr("split(sentence,' ') as words")\
  .selectExpr("explode(words) as word") \
  .groupBy("word").count()
)
# df=df.split(' ')
# words.show(truncate=False)
# print(df)

In [46]:
# word count using RDD
sentences=["Remember to handle credentials securely and be cautious about handling sensitive information like usernames and passwords"]
rdd=sc.parallelize(sentences)
rdd.flatMap(lambda line:line.split(" "))\
.map(lambda word:(word,1))\
.reduceByKey(lambda curr,nxt:curr+nxt)\
.sortBy(lambda word:word[0])\
.collect()
# print(rdd)

[('Remember', 1),
 ('about', 1),
 ('and', 2),
 ('be', 1),
 ('cautious', 1),
 ('credentials', 1),
 ('handle', 1),
 ('handling', 1),
 ('information', 1),
 ('like', 1),
 ('passwords', 1),
 ('securely', 1),
 ('sensitive', 1),
 ('to', 1),
 ('usernames', 1)]

In [54]:
sentences=["Remember to handle","credentials securely and be cautious","about handling sensitive information like usernames and passwords"]
rdd=sc.parallelize(sentences)
rdd.map(lambda word:word.split(" "))\
.flatMap(lambda w:w)\
# .collect()

PythonRDD[180] at RDD at PythonRDD.scala:53

In [43]:
# Learning about format()
# df=spark.read.format('csv').option('header',True).load('file:///home/tony/BigData/food_establishment_data.csv')
df=spark.read.option('header',True).csv('file:///home/tony/BigData/food_establishment_data.csv')
df.write.option('header',True).parquet('file:///home/tony/BigData/food_establishment_data.parquet')

In [13]:
# Read parquet files count- 266351
df=spark.read.option("startingFrom",10)\
.option('header',True)\
.parquet('file:///home/tony/BigData/food_establishment_data.parquet')
df.count()

                                                                                

266351

In [31]:
# Working with JSON Files
# df=spark.read.json('file:///home/tony/BigData/Files/data.json')
# rdd=sc.textFile('/home/tony/BigData/Files/data.json')

import pandas as pd
# df2=pd.DataFrame(eval('/home/tony/BigData/Files/data.json'))
data=pd.read_json('/home/tony/BigData/Files/data.json',orient='records')
df2=spark.createDataFrame(data)
df2.show()

[Stage 13:>                                                         (0 + 1) / 1]

+--------+-----+--------+--------+
|Duration|Pulse|Maxpulse|Calories|
+--------+-----+--------+--------+
|      60|  110|     130|   409.1|
|      60|  117|     145|   479.0|
|      60|  103|     135|   340.0|
|      45|  109|     175|   282.4|
|      45|  117|     148|   406.0|
|      60|  102|     127|   300.5|
|      60|  110|     136|   374.0|
|      45|  104|     134|   253.3|
|      30|  109|     133|   195.1|
|      60|   98|     124|   269.0|
|      60|  103|     147|   329.3|
|      60|  100|     120|   250.7|
|      60|  106|     128|   345.3|
|      60|  104|     132|   379.3|
|      60|   98|     123|   275.0|
|      60|   98|     120|   215.2|
|      60|  100|     120|   300.0|
|      45|   90|     112|     NaN|
|      60|  103|     123|   323.0|
|      45|   97|     125|   243.0|
+--------+-----+--------+--------+
only showing top 20 rows



                                                                                

In [23]:
df = spark.read.option("mode", "DROPMALFORMED").json("file:///home/tony/BigData/Files/data10.json")
df.printSchema()

root

