<a href="https://colab.research.google.com/github/asih1725/BIGDATA/blob/main/Bigdata3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import SparkSession
from pyspark.sql import SparkSession

# Membuat Spark Session
spark = SparkSession.builder.appName("HandsOnPertemuan3").getOrCreate()

# Data dengan nama baru
data = [
    ("anaya", "Sales", 3000),
    ("naya", "Sales", 4600),
    ("kanaya", "Sales", 4100),
    ("sakti", "Finance", 3000),
    ("meilia", "Finance", 3900),
    ("lia", "Marketing", 3000),
    ("aya", "Marketing", 2000)
]

# Kolom
columns = ["EmployeeName", "Department", "Salary"]

# Membuat DataFrame
df = spark.createDataFrame(data, schema=columns)

# Menampilkan isi DataFrame
print("=== DataFrame Awal ===")
df.show()

# Menampilkan struktur DataFrame
print("=== Struktur DataFrame ===")
df.printSchema()

# Menampilkan hanya kolom tertentu
print("=== Select EmployeeName dan Salary ===")
df.select("EmployeeName", "Salary").show()

# Filter data dengan kondisi (Salary > 3000)
print("=== Filter Salary > 3000 ===")
df.filter(df["Salary"] > 3000).show()

# Statistik deskriptif
print("=== Statistik DataFrame ===")
df.describe().show()


=== DataFrame Awal ===
+------------+----------+------+
|EmployeeName|Department|Salary|
+------------+----------+------+
|       anaya|     Sales|  3000|
|        naya|     Sales|  4600|
|      kanaya|     Sales|  4100|
|       sakti|   Finance|  3000|
|      meilia|   Finance|  3900|
|         lia| Marketing|  3000|
|         aya| Marketing|  2000|
+------------+----------+------+

=== Struktur DataFrame ===
root
 |-- EmployeeName: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: long (nullable = true)

=== Select EmployeeName dan Salary ===
+------------+------+
|EmployeeName|Salary|
+------------+------+
|       anaya|  3000|
|        naya|  4600|
|      kanaya|  4100|
|       sakti|  3000|
|      meilia|  3900|
|         lia|  3000|
|         aya|  2000|
+------------+------+

=== Filter Salary > 3000 ===
+------------+----------+------+
|EmployeeName|Department|Salary|
+------------+----------+------+
|        naya|     Sales|  4600|
|      kanaya| 

In [2]:
# Import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Membuat Spark Session
spark = SparkSession.builder.appName("HandsOnPertemuan3").getOrCreate()

# Data dengan nama baru
data = [
    ("anaya", "Sales", 3000),
    ("naya", "Sales", 4600),
    ("kanaya", "Sales", 4100),
    ("sakti", "Finance", 3000),
    ("meilia", "Finance", 3900),
    ("lia", "Marketing", 3000),
    ("aya", "Marketing", 2000)
]

# Kolom
columns = ["EmployeeName", "Department", "Salary"]

# Membuat DataFrame
df = spark.createDataFrame(data, schema=columns)

print("=== DataFrame Awal ===")
df.show()

# 1. SELECT → pilih kolom tertentu
print("=== Pilih EmployeeName dan Salary ===")
df.select("EmployeeName", "Salary").show()

# 2. FILTER → karyawan dengan gaji > 3000
print("=== Karyawan dengan Salary > 3000 ===")
df.filter(df["Salary"] > 3000).show()

# 3. GROUPBY + AGGREGATION → rata-rata gaji tiap departemen
print("=== Rata-rata Gaji per Department ===")
df.groupBy("Department").agg(F.mean("Salary").alias("AvgSalary")).show()

# 4. GROUPBY + MAX → gaji tertinggi tiap departemen
print("=== Gaji Tertinggi per Department ===")
df.groupBy("Department").agg(F.max("Salary").alias("MaxSalary")).show()

# 5. GROUPBY + SUM → total gaji tiap departemen
print("=== Total Gaji per Department ===")
df.groupBy("Department").agg(F.sum("Salary").alias("TotalSalary")).show()


=== DataFrame Awal ===
+------------+----------+------+
|EmployeeName|Department|Salary|
+------------+----------+------+
|       anaya|     Sales|  3000|
|        naya|     Sales|  4600|
|      kanaya|     Sales|  4100|
|       sakti|   Finance|  3000|
|      meilia|   Finance|  3900|
|         lia| Marketing|  3000|
|         aya| Marketing|  2000|
+------------+----------+------+

=== Pilih EmployeeName dan Salary ===
+------------+------+
|EmployeeName|Salary|
+------------+------+
|       anaya|  3000|
|        naya|  4600|
|      kanaya|  4100|
|       sakti|  3000|
|      meilia|  3900|
|         lia|  3000|
|         aya|  2000|
+------------+------+

=== Karyawan dengan Salary > 3000 ===
+------------+----------+------+
|EmployeeName|Department|Salary|
+------------+----------+------+
|        naya|     Sales|  4600|
|      kanaya|     Sales|  4100|
|      meilia|   Finance|  3900|
+------------+----------+------+

=== Rata-rata Gaji per Department ===
+----------+---------+
|

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Membuat Spark Session
spark = SparkSession.builder.appName("HandsOnPertemuan3").getOrCreate()

# Data dengan tipe data kompleks (Array & Map)
data = [
    ("anaya", "Sales", 3000, ["Python", "SQL"], {"Bonus": 500, "Allowance": 200}),
    ("naya", "Sales", 4600, ["Java", "Scala"], {"Bonus": 600, "Allowance": 300}),
    ("kanaya", "Sales", 4100, ["Python", "R"], {"Bonus": 550, "Allowance": 250}),
    ("sakti", "Finance", 3000, ["Excel", "SQL"], {"Bonus": 400, "Allowance": 150}),
    ("meilia", "Finance", 3900, ["Python", "PowerBI"], {"Bonus": 450, "Allowance": 200}),
    ("lia", "Marketing", 3000, ["SEO", "Content"], {"Bonus": 300, "Allowance": 100}),
    ("aya", "Marketing", 2000, ["Design", "Canva"], {"Bonus": 250, "Allowance": 100})
]

columns = ["EmployeeName", "Department", "Salary", "Skills", "Compensation"]

# Membuat DataFrame
df = spark.createDataFrame(data, schema=columns)

print("=== Data dengan Tipe Data Kompleks ===")
df.show(truncate=False)

# 1. Akses elemen array → ambil skill pertama
print("=== Ambil Skill Pertama ===")
df.withColumn("FirstSkill", F.col("Skills")[0]).show(truncate=False)

# 2. Explode array → pecah setiap skill menjadi baris terpisah
print("=== Explode Skills ===")
df.select("EmployeeName", F.explode("Skills").alias("Skill")).show(truncate=False)

# 3. Akses nilai dari Map → ambil Bonus
print("=== Ambil Bonus dari Compensation ===")
df.withColumn("Bonus", F.col("Compensation")["Bonus"]).show(truncate=False)

# 4. Hitung Total Compensation (Salary + Bonus + Allowance)
print("=== Total Compensation ===")
df.withColumn("TotalCompensation",
              F.col("Salary") + F.col("Compensation")["Bonus"] + F.col("Compensation")["Allowance"]
             ).show(truncate=False)


=== Data dengan Tipe Data Kompleks ===
+------------+----------+------+-----------------+--------------------------------+
|EmployeeName|Department|Salary|Skills           |Compensation                    |
+------------+----------+------+-----------------+--------------------------------+
|anaya       |Sales     |3000  |[Python, SQL]    |{Allowance -> 200, Bonus -> 500}|
|naya        |Sales     |4600  |[Java, Scala]    |{Allowance -> 300, Bonus -> 600}|
|kanaya      |Sales     |4100  |[Python, R]      |{Allowance -> 250, Bonus -> 550}|
|sakti       |Finance   |3000  |[Excel, SQL]     |{Allowance -> 150, Bonus -> 400}|
|meilia      |Finance   |3900  |[Python, PowerBI]|{Allowance -> 200, Bonus -> 450}|
|lia         |Marketing |3000  |[SEO, Content]   |{Allowance -> 100, Bonus -> 300}|
|aya         |Marketing |2000  |[Design, Canva]  |{Allowance -> 100, Bonus -> 250}|
+------------+----------+------+-----------------+--------------------------------+

=== Ambil Skill Pertama ===
+-------

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Membuat Spark Session
spark = SparkSession.builder.appName("HandsOnPertemuan3").getOrCreate()

# Data sederhana
data = [
    ("anaya", "Sales", 3000),
    ("naya", "Sales", 4600),
    ("kanaya", "Sales", 4100),
    ("sakti", "Finance", 3000),
    ("meilia", "Finance", 3900),
    ("lia", "Marketing", 3000),
    ("aya", "Marketing", 2000)
]

columns = ["EmployeeName", "Department", "Salary"]

df = spark.createDataFrame(data, schema=columns)

print("=== DataFrame Awal ===")
df.show()

# WindowSpec: berdasarkan Department, urut berdasarkan Salary
windowSpec = Window.partitionBy("Department").orderBy("Salary")

# 1. Ranking dalam tiap departemen
print("=== Ranking Salary dalam tiap Department ===")
df.withColumn("Rank", F.rank().over(windowSpec)).show()

# 2. Row Number dalam tiap departemen
print("=== Row Number Salary dalam tiap Department ===")
df.withColumn("RowNumber", F.row_number().over(windowSpec)).show()

# 3. Running Total Gaji dalam tiap departemen
print("=== Running Total Salary per Department ===")
df.withColumn("RunningTotal", F.sum("Salary").over(windowSpec.rowsBetween(Window.unboundedPreceding, Window.currentRow))).show()

# 4. Rata-rata Salary dalam tiap departemen (Window Function)
print("=== Rata-rata Salary (Window) ===")
df.withColumn("AvgSalary", F.avg("Salary").over(windowSpec)).show()


=== DataFrame Awal ===
+------------+----------+------+
|EmployeeName|Department|Salary|
+------------+----------+------+
|       anaya|     Sales|  3000|
|        naya|     Sales|  4600|
|      kanaya|     Sales|  4100|
|       sakti|   Finance|  3000|
|      meilia|   Finance|  3900|
|         lia| Marketing|  3000|
|         aya| Marketing|  2000|
+------------+----------+------+

=== Ranking Salary dalam tiap Department ===
+------------+----------+------+----+
|EmployeeName|Department|Salary|Rank|
+------------+----------+------+----+
|       sakti|   Finance|  3000|   1|
|      meilia|   Finance|  3900|   2|
|         aya| Marketing|  2000|   1|
|         lia| Marketing|  3000|   2|
|       anaya|     Sales|  3000|   1|
|      kanaya|     Sales|  4100|   2|
|        naya|     Sales|  4600|   3|
+------------+----------+------+----+

=== Row Number Salary dalam tiap Department ===
+------------+----------+------+---------+
|EmployeeName|Department|Salary|RowNumber|
+------------+-

In [10]:
from google.colab import files
import zipfile
import os

# Upload file zip
uploaded = files.upload()

# Ekstrak ke folder dataset
zip_path = "/content/archive (1).zip"   # sesuaikan kalau namanya berbeda
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall("/content/dataset")

# Cek isi folder dataset
os.listdir("/content/dataset")


Saving archive (1).zip to archive (1) (1).zip


['apple_quality.csv']

In [12]:
import os
os.listdir("/content/dataset")


['apple_quality.csv']

In [13]:
from pyspark.sql import SparkSession

# Inisialisasi Spark
spark = SparkSession.builder.appName("Tugas5_AppleQuality").getOrCreate()

# Load dataset
df = spark.read.csv("/content/dataset/apple_quality.csv", header=True, inferSchema=True)

# Lihat 5 data pertama
df.show(5)
df.printSchema()


+----+------------+------------+------------+------------+-----------+------------+------------+-------+
|A_id|        Size|      Weight|   Sweetness| Crunchiness|  Juiciness|    Ripeness|     Acidity|Quality|
+----+------------+------------+------------+------------+-----------+------------+------------+-------+
|   0|-3.970048523|-2.512336381| 5.346329613|-1.012008712|1.844900361| 0.329839797|-0.491590483|   good|
|   1|-1.195217191|-2.839256528| 3.664058758| 1.588232309|0.853285795| 0.867530082|-0.722809367|   good|
|   2|-0.292023862|-1.351281995|-1.738429162|-0.342615928|2.838635512|-0.038033328| 2.621636473|    bad|
|   3|-0.657195773|-2.271626609| 1.324873847|-0.097874716|3.637970491|-3.413761338| 0.790723217|   good|
|   4|  1.36421682|-1.296611877|-0.384658206| -0.55300577|3.030874354|-1.303849429| 0.501984036|   good|
+----+------------+------------+------------+------------+-----------+------------+------------+-------+
only showing top 5 rows

root
 |-- A_id: integer (nulla

In [14]:
df.select("A_id", "Size", "Weight", "Quality").show(5)


+----+------------+------------+-------+
|A_id|        Size|      Weight|Quality|
+----+------------+------------+-------+
|   0|-3.970048523|-2.512336381|   good|
|   1|-1.195217191|-2.839256528|   good|
|   2|-0.292023862|-1.351281995|    bad|
|   3|-0.657195773|-2.271626609|   good|
|   4|  1.36421682|-1.296611877|   good|
+----+------------+------------+-------+
only showing top 5 rows



In [15]:
# Contoh: ambil apel dengan Weight > 150
df.filter(df["Weight"] > 150).show(5)


+----+----+------+---------+-----------+---------+--------+-------+-------+
|A_id|Size|Weight|Sweetness|Crunchiness|Juiciness|Ripeness|Acidity|Quality|
+----+----+------+---------+-----------+---------+--------+-------+-------+
+----+----+------+---------+-----------+---------+--------+-------+-------+



In [16]:
from pyspark.sql import functions as F

# Hitung rata-rata ukuran dan berat per kualitas
df.groupBy("Quality").agg(
    F.mean("Size").alias("Rata2_Size"),
    F.mean("Weight").alias("Rata2_Weight"),
    F.count("*").alias("Jumlah_Data")
).show()


+-------+--------------------+-------------------+-----------+
|Quality|          Rata2_Size|       Rata2_Weight|Jumlah_Data|
+-------+--------------------+-------------------+-----------+
|   NULL|                NULL|               NULL|          1|
|    bad|  -0.974357725011523|-0.9918278738471935|       1996|
|   good|-0.03355314380439122|-0.9872743224446121|       2004|
+-------+--------------------+-------------------+-----------+



In [17]:
df.write.csv("/content/output_apple_quality", header=True, mode="overwrite")
