In [1]:
# DataFrame sederhana dan operasi dasar

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('HandsOnPertemuan3').getOrCreate()

data = [('James', 'Sales', 3000),
        ('Michael', 'Sales', 4600),
        ('Robert', 'Sales', 4100),
        ('Jamilah', 'Accounting', 3900),
        ('Michel', 'Accounting', 4600),
        ('Assep', 'Enggineer', 5000),
        ('Maria', 'Finance', 3000)]
columns = ['EmployeeName', 'Department', 'Salary']

df = spark.createDataFrame(data, schema=columns)
df.show()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/09/10 17:05:01 WARN Utils: Your hostname, arsa-IdeaPad-3-14ITL6, resolves to a loopback address: 127.0.1.1; using 192.168.1.206 instead (on interface wlp0s20f3)
25/09/10 17:05:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/10 17:05:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[Stage 0:>                                                          (0 + 1) / 1]

+------------+----------+------+
|EmployeeName|Department|Salary|
+------------+----------+------+
|       James|     Sales|  3000|
|     Michael|     Sales|  4600|
|      Robert|     Sales|  4100|
|     Jamilah|Accounting|  3900|
|      Michel|Accounting|  4600|
|       Assep| Enggineer|  5000|
|       Maria|   Finance|  3000|
+------------+----------+------+



                                                                                

In [2]:
# Operasi dasar DataFrame di PySpark

# 1. Filter data karyawan dengan gaji di atas 3000 lalu tampilkan kolom nama dan gajinya
df.filter(df['Salary'] > 3000).select('EmployeeName', 'Salary').show()

# 2. Menghitung rata-rata gaji per departemen
df.groupBy('Department').avg('Salary').withColumnRenamed("avg(Salary)", "RataRataGaji").show()

# 3. Menghitung gaji maksimum per departemen
df.groupBy('Department').max('Salary').withColumnRenamed("max(Salary)", "GajiTertinggi").show()

# 4. Menghitung gaji minimum per departemen
df.groupBy('Department').min('Salary').withColumnRenamed("min(Salary)", "GajiTerendah").show()

# 5. Menghitung total gaji per departemen
df.groupBy('Department').sum('Salary').withColumnRenamed("sum(Salary)", "TotalGaji").show()

# 6. Menghitung jumlah karyawan per departemen
df.groupBy('Department').count().withColumnRenamed("count", "JumlahKaryawan").show()


+------------+------+
|EmployeeName|Salary|
+------------+------+
|     Michael|  4600|
|      Robert|  4100|
|     Jamilah|  3900|
|      Michel|  4600|
|       Assep|  5000|
+------------+------+

+----------+------------+
|Department|RataRataGaji|
+----------+------------+
|     Sales|      3900.0|
|Accounting|      4250.0|
|   Finance|      3000.0|
| Enggineer|      5000.0|
+----------+------------+

+----------+-------------+
|Department|GajiTertinggi|
+----------+-------------+
|     Sales|         4600|
|Accounting|         4600|
|   Finance|         3000|
| Enggineer|         5000|
+----------+-------------+

+----------+------------+
|Department|GajiTerendah|
+----------+------------+
|     Sales|        3000|
|Accounting|        3900|
|   Finance|        3000|
| Enggineer|        5000|
+----------+------------+

+----------+---------+
|Department|TotalGaji|
+----------+---------+
|     Sales|    11700|
|Accounting|     8500|
|   Finance|     3000|
| Enggineer|     5000|
+----

In [3]:
from pyspark.sql import functions as F

# 1. Membuat kolom baru yang berisi gaji setelah dipotong pajak 5%
df.withColumn('SalaryAfterTax', df['Salary'] * 0.95).show()

# 2. Menambahkan kolom status berdasarkan kondisi gaji
df.withColumn('Status', F.when(df['Salary'] > 4000, 'High').otherwise('Normal')).show()

+------------+----------+------+--------------+
|EmployeeName|Department|Salary|SalaryAfterTax|
+------------+----------+------+--------------+
|       James|     Sales|  3000|        2850.0|
|     Michael|     Sales|  4600|        4370.0|
|      Robert|     Sales|  4100|        3895.0|
|     Jamilah|Accounting|  3900|        3705.0|
|      Michel|Accounting|  4600|        4370.0|
|       Assep| Enggineer|  5000|        4750.0|
|       Maria|   Finance|  3000|        2850.0|
+------------+----------+------+--------------+

+------------+----------+------+------+
|EmployeeName|Department|Salary|Status|
+------------+----------+------+------+
|       James|     Sales|  3000|Normal|
|     Michael|     Sales|  4600|  High|
|      Robert|     Sales|  4100|  High|
|     Jamilah|Accounting|  3900|Normal|
|      Michel|Accounting|  4600|  High|
|       Assep| Enggineer|  5000|  High|
|       Maria|   Finance|  3000|Normal|
+------------+----------+------+------+



In [4]:
# Penggunaan window functions

from pyspark.sql.window import Window
from pyspark.sql import functions as F

windowSpec = Window.partitionBy('Department').orderBy('Salary')
df.withColumn('Rank', F.rank().over(windowSpec)).show()

+------------+----------+------+----+
|EmployeeName|Department|Salary|Rank|
+------------+----------+------+----+
|     Jamilah|Accounting|  3900|   1|
|      Michel|Accounting|  4600|   2|
|       Assep| Enggineer|  5000|   1|
|       Maria|   Finance|  3000|   1|
|       James|     Sales|  3000|   1|
|      Robert|     Sales|  4100|   2|
|     Michael|     Sales|  4600|   3|
+------------+----------+------+----+



In [11]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# 1. Buat SparkSession
spark = SparkSession.builder.appName("StressLevelAnalysis").getOrCreate()

# 2. Load dataset
df = spark.read.csv("StressLevelDataset.csv", header=True, inferSchema=True)

# 3. Tampilkan hanya beberapa kolom penting
df.select("anxiety_level", "depression", "sleep_quality", "stress_level") \
  .show(10, truncate=False)

# 4. Filter: data dengan stress_level > 3 (tampilkan kolom penting saja)
df.filter(df["stress_level"] > 3) \
  .select("anxiety_level", "depression", "sleep_quality", "stress_level") \
  .show(10, truncate=False)

# 5. Agregasi: rata-rata stress_level berdasarkan sleep_quality
df.groupBy("sleep_quality") \
  .agg(F.avg("stress_level").alias("AvgStress")) \
  .orderBy("sleep_quality") \
  .show()

# 6. Window function: ranking stress_level dalam tiap sleep_quality
windowSpec = Window.partitionBy("sleep_quality").orderBy(df["stress_level"].desc())
df.withColumn("Rank", F.rank().over(windowSpec)) \
  .select("sleep_quality", "anxiety_level", "depression", "stress_level", "Rank") \
  .show(10, truncate=False)


+-------------+----------+-------------+------------+
|anxiety_level|depression|sleep_quality|stress_level|
+-------------+----------+-------------+------------+
|14           |11        |2            |1           |
|15           |15        |1            |2           |
|12           |14        |2            |1           |
|16           |15        |1            |2           |
|16           |7         |5            |1           |
|20           |21        |1            |2           |
|4            |6         |4            |0           |
|17           |22        |1            |2           |
|13           |12        |2            |1           |
|6            |27        |1            |1           |
+-------------+----------+-------------+------------+
only showing top 10 rows
+-------------+----------+-------------+------------+
|anxiety_level|depression|sleep_quality|stress_level|
+-------------+----------+-------------+------------+
+-------------+----------+-------------+------------+

+-