In [2]:
# ============================================================================
# PERSIAPAN ENVIRONMENT
# ============================================================================

print("=" * 70)
print("INSTALASI PYSPARK")
print("=" * 70)

# 2.1 Instalasi PySpark
!pip install pyspark -q

print("PySpark berhasil diinstal!")

# 2.2 Import Library dan Inisialisasi Spark Session
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create (or reuse) the Spark session for this notebook.
# The master URL 'local[*]' runs Spark locally.
spark = (
    SparkSession.builder
    .appName('Advanced MLlib Practice')
    .master('local[*]')
    .getOrCreate()
)

# Confirm the session is up and report which Spark version is in use
print("\nSpark Session berhasil dibuat")
print(f"Spark Version: {spark.version}")


INSTALASI PYSPARK
PySpark berhasil diinstal!

Spark Session berhasil dibuat
Spark Version: 3.5.1


In [3]:
# ============================================================================
# 3. PRACTICE 1: LINEAR REGRESSION
# ============================================================================

print("\n" + "=" * 70)
print("PRAKTIK 1: LINEAR REGRESSION")
print("=" * 70)

# 3.1 Overview
# Linear regression predicts a continuous target value from input features.

# 3.2 Implementation
# Tiny sample dataset; each row is (ID, Feature, Target)
data = [(1, 5.0, 20.0), (2, 10.0, 25.0),
        (3, 15.0, 30.0), (4, 20.0, 35.0)]
columns = ['ID', 'Feature', 'Target']
df = spark.createDataFrame(data, columns)

# Show the input data
print("\nData Sample:")
df.show()

# Pack the feature column into the single vector column that MLlib expects
assembler = VectorAssembler(inputCols=['Feature'], outputCol='Features')
df_transformed = assembler.transform(df)

# Fit the model
lr = LinearRegression(featuresCol='Features', labelCol='Target')
model = lr.fit(df_transformed)

# Report the fitted parameters; the RMSE comes from model.summary, i.e. it is
# measured on the same rows the model was fitted on.
print(f'\nCoefficients: {model.coefficients[0]:.4f}')
print(f'Intercept: {model.intercept:.4f}')
print(f'RMSE: {model.summary.rootMeanSquaredError:.4f}')

# Notes (kept as comments: a bare string as the last expression of a cell is
# echoed by Jupyter as the cell's output):
# - VectorAssembler converts feature columns into the vector format MLlib needs
# - The model is trained with fit()
# - The coefficient shows how strongly the feature influences the target



PRAKTIK 1: LINEAR REGRESSION

Data Sample:
+---+-------+------+
| ID|Feature|Target|
+---+-------+------+
|  1|    5.0|  20.0|
|  2|   10.0|  25.0|
|  3|   15.0|  30.0|
|  4|   20.0|  35.0|
+---+-------+------+


Coefficients: 1.0000
Intercept: 15.0000
RMSE: 0.0000


'\nKeterangan:\n- VectorAssembler mengubah kolom fitur menjadi format vector yang diperlukan MLlib\n- Model dilatih menggunakan metode fit()\n- Coefficients menunjukkan pengaruh fitur terhadap target\n'

In [6]:
# ============================================================================
# 4. PRACTICE 2: LOGISTIC REGRESSION
# ============================================================================

print("\n" + "=" * 70)
print("PRAKTIK 2: LOGISTIC REGRESSION")
print("=" * 70)

# 4.1 Overview
# Logistic regression is used for binary classification: it predicts the
# probability that an instance belongs to a given class.

# 4.2 Implementation
# Sample classification data; each row is (ID, Feature1, Feature2, Label).
# The two features are kept as separate numeric columns and assembled below.
data = [(1, 2.0, 3.0, 0), (2, 1.0, 5.0, 1),
        (3, 2.5, 4.5, 1), (4, 3.0, 6.0, 0)]
columns = ['ID', 'Feature1', 'Feature2', 'Label']
df_log = spark.createDataFrame(data, columns)

print("\nData Sample:")
df_log.show()

# Combine both feature columns into one vector column with VectorAssembler
assembler_log = VectorAssembler(inputCols=['Feature1', 'Feature2'], outputCol='Features')
df_log_transformed = assembler_log.transform(df_log)

# Fit the classifier
lr_classifier = LogisticRegression(featuresCol='Features', labelCol='Label')
log_model = lr_classifier.fit(df_log_transformed)

# Report the fitted parameters
print(f'\nCoefficients: {log_model.coefficients}')
print(f'Intercept: {log_model.intercept:.4f}')

# Score the training rows and show the inputs next to the predicted class and
# the per-class probabilities.
# Note: this sample is not linearly separable - the label-1 point (2.5, 4.5)
# lies on the segment joining the two label-0 points (2, 3) and (3, 6) - so a
# linear decision boundary cannot classify every row correctly.
predictions = log_model.transform(df_log_transformed)
print("\nHasil Prediksi:")
predictions.select('Feature1', 'Feature2', 'Label', 'prediction', 'probability').show()

# Notes (kept as comments: a bare string as the last expression of a cell is
# echoed by Jupyter as the cell's output):
# - The model produces a probability for each class
# - The prediction is the class with the highest probability
# - The coefficients show each feature's contribution to the classification


PRAKTIK 2: LOGISTIC REGRESSION

Data Sample:
+---+--------+--------+-----+
| ID|Feature1|Feature2|Label|
+---+--------+--------+-----+
|  1|     2.0|     3.0|    0|
|  2|     1.0|     5.0|    1|
|  3|     2.5|     4.5|    1|
|  4|     3.0|     6.0|    0|
+---+--------+--------+-----+


Coefficients: [-12.262057929180484,4.087352266486688]
Intercept: 11.5689

Hasil Prediksi:
+--------+--------+-----+----------+--------------------+
|Feature1|Feature2|Label|prediction|         probability|
+--------+--------+-----+----------+--------------------+
|     2.0|     3.0|    0|       0.0|[0.66666647815282...|
|     1.0|     5.0|    1|       1.0|[2.66352305802435...|
|     2.5|     4.5|    1|       0.0|[0.66666660367734...|
|     3.0|     6.0|    0|       0.0|[0.66666672920183...|
+--------+--------+-----+----------+--------------------+



'\nKeterangan:\n- Model menghasilkan probabilitas untuk setiap kelas\n- Prediction adalah kelas dengan probabilitas tertinggi\n- Coefficients menunjukkan kontribusi setiap fitur terhadap klasifikasi\n'

In [8]:
# ============================================================================
# 5. PRACTICE 3: KMEANS CLUSTERING
# ============================================================================

print("\n" + "=" * 70)
print("PRAKTIK 3: KMEANS CLUSTERING")
print("=" * 70)

# 5.1 Overview
# KMeans is an unsupervised clustering algorithm that groups data points by
# feature similarity.

# 5.2 Implementation
# Sample clustering data; each row is (ID, Feature1, Feature2).
# The two features are kept as separate numeric columns and assembled below.
data = [(1, 1.0, 1.0), (2, 5.0, 5.0),
        (3, 10.0, 10.0), (4, 15.0, 15.0)]
columns = ['ID', 'Feature1', 'Feature2']
df_cluster = spark.createDataFrame(data, columns)

print("\nData Sample:")
df_cluster.show()

# Combine both feature columns into one vector column with VectorAssembler
assembler_cluster = VectorAssembler(inputCols=['Feature1', 'Feature2'], outputCol='Features')
df_cluster_transformed = assembler_cluster.transform(df_cluster)

# Fit KMeans with two clusters; the seed is fixed for reproducible runs
kmeans = KMeans(featuresCol='Features', k=2, seed=42)
kmeans_model = kmeans.fit(df_cluster_transformed)

# Print the centre of each cluster
print("\nCluster Centers:")
centers = kmeans_model.clusterCenters()
for i, center in enumerate(centers):
    print(f'Cluster {i} Center: {center}')

# Assign every row to a cluster (adds a 'prediction' column)
predictions = kmeans_model.transform(df_cluster_transformed)
print("\nHasil Clustering:")
predictions.show()

# Notes (kept as comments: a bare string as the last expression of a cell is
# echoed by Jupyter as the cell's output):
# - The parameter k=2 sets the number of clusters to form
# - clusterCenters() returns the centre point of each cluster
# - Each data point is labelled with the cluster whose centre is nearest


PRAKTIK 3: KMEANS CLUSTERING

Data Sample:
+---+--------+--------+
| ID|Feature1|Feature2|
+---+--------+--------+
|  1|     1.0|     1.0|
|  2|     5.0|     5.0|
|  3|    10.0|    10.0|
|  4|    15.0|    15.0|
+---+--------+--------+


Cluster Centers:
Cluster 0 Center: [12.5 12.5]
Cluster 1 Center: [3. 3.]

Hasil Clustering:
+---+--------+--------+-----------+----------+
| ID|Feature1|Feature2|   Features|prediction|
+---+--------+--------+-----------+----------+
|  1|     1.0|     1.0|  [1.0,1.0]|         1|
|  2|     5.0|     5.0|  [5.0,5.0]|         1|
|  3|    10.0|    10.0|[10.0,10.0]|         0|
|  4|    15.0|    15.0|[15.0,15.0]|         0|
+---+--------+--------+-----------+----------+



'\nKeterangan:\n- Parameter k=2 menentukan jumlah cluster yang dibentuk\n- ClusterCenters menunjukkan titik pusat setiap cluster\n- Setiap data point diberi label cluster berdasarkan jarak terdekat ke center\n'

In [9]:
# ============================================================================
# 6. HOMEWORK: CLASSIFICATION WITH THE TITANIC DATASET
# ============================================================================

print("\n" + "=" * 70)
print("HOMEWORK: KLASIFIKASI DENGAN TITANIC DATASET")
print("=" * 70)

# 6.1 Obtain the dataset (Google Colab workflow)
print("\n[INFO] Silakan upload file Titanic-Dataset.csv menggunakan:")
print("1. Menu Files di sidebar kiri Google Colab, atau")
print("2. Gunakan kode di bawah untuk upload manual")

import os

# Path the rest of this cell reads the dataset from
file_path = 'Titanic-Dataset.csv'

# Prompt for a manual upload only when the file is not already present.
# This lets the "Files sidebar" route from the message above work, avoids a
# blocking upload dialog on every re-run, and keeps the google.colab import out
# of the way when the file is already on disk.
if not os.path.exists(file_path):
    from google.colab import files
    print("\n[INFO] Memulai upload file...")
    uploaded = files.upload()
    # Fail early with a clear message if the upload was cancelled or the file
    # was uploaded under a different name than the one read below.
    if file_path not in uploaded:
        raise FileNotFoundError(
            f"File '{file_path}' tidak ditemukan di antara file yang diupload: "
            f"{sorted(uploaded)}"
        )

# 6.1 Load and explore the data
print("\n[STEP 1] Load dan Eksplorasi Data")
print("-" * 70)

# header=True takes column names from the first row; inferSchema=True lets
# Spark derive column types from the data instead of reading all as strings.
df_titanic = spark.read.csv(file_path, header=True, inferSchema=True)

# Show the schema and a small sample
print("\nSchema Dataset:")
df_titanic.printSchema()

print("\nSample Data (5 baris pertama):")
df_titanic.show(5)

print("\nStatistik Deskriptif:")
df_titanic.describe().show()

print(f"\nJumlah total data: {df_titanic.count()}")

# Note: the Titanic dataset holds passenger information together with a
# survival label (1 = survived, 0 = died).

# 6.2 Data preprocessing
print("\n[STEP 2] Data Preprocessing")
print("-" * 70)

# Keep only the label and the columns used as features, then discard every
# row that has a missing value in any of them (the simplest way to handle
# missing data here).
df_clean = (
    df_titanic
    .select('Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare')
    .dropna()
)

print(f"\nJumlah data setelah cleaning: {df_clean.count()}")

# Encode the string column Sex as a numeric index column (SexIndex)
indexer = StringIndexer(inputCol='Sex', outputCol='SexIndex')
df_indexed = indexer.fit(df_clean).transform(df_clean)

# Assemble all numeric inputs into the single 'features' vector column
feature_cols = ['Pclass', 'SexIndex', 'Age', 'SibSp', 'Parch', 'Fare']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
df_final = assembler.transform(df_indexed)

# Random split with weights 0.8 / 0.2 (train / test); the seed is fixed
train_data, test_data = df_final.randomSplit([0.8, 0.2], seed=42)

print(f"Training data count: {train_data.count()}")
print(f"Test data count: {test_data.count()}")

# 6.3 Train the baseline model
print("\n[STEP 3] Training Model")
print("-" * 70)

# Baseline logistic regression, capped at 10 optimisation iterations
lr = LogisticRegression(
    featuresCol='features',
    labelCol='Survived',
    maxIter=10,
)
lr_model = lr.fit(train_data)

# Score the held-out test split and preview the first ten predictions
predictions = lr_model.transform(test_data)
print("\nSample Prediksi (10 baris pertama):")
predictions.select('Survived', 'prediction', 'probability').show(10)

# 6.4 Evaluate the baseline model
print("\n[STEP 4] Evaluasi Model")
print("-" * 70)

# Accuracy: share of test rows whose prediction matches the label.
# This evaluator is also reused later for cross-validation.
evaluator = MulticlassClassificationEvaluator(
    labelCol='Survived', predictionCol='prediction', metricName='accuracy')
accuracy = evaluator.evaluate(predictions)

# Precision and recall for the positive class (Survived = 1).
# The previous 'weightedPrecision' / 'weightedRecall' metrics are averages over
# both classes weighted by class frequency; weighted recall in particular is
# mathematically identical to accuracy, so it repeated the line above instead
# of reporting how many actual survivors were detected.
precision_evaluator = MulticlassClassificationEvaluator(
    labelCol='Survived', predictionCol='prediction',
    metricName='precisionByLabel', metricLabel=1.0)
recall_evaluator = MulticlassClassificationEvaluator(
    labelCol='Survived', predictionCol='prediction',
    metricName='recallByLabel', metricLabel=1.0)

precision = precision_evaluator.evaluate(predictions)
recall = recall_evaluator.evaluate(predictions)

print("\n=== HASIL EVALUASI MODEL BASELINE ===")
print(f'Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)')
print(f'Precision: {precision:.4f} ({precision*100:.2f}%)')
print(f'Recall:    {recall:.4f} ({recall*100:.2f}%)')

# Notes:
# - Accuracy: percentage of predictions that are correct
# - Precision: proportion of predicted positives (Survived = 1) that are correct
# - Recall: proportion of actual positives (Survived = 1) that were detected

# 6.5 Hyperparameter tuning with cross-validation
print("\n[STEP 5] Hyperparameter Tuning dengan Cross-Validation")
print("-" * 70)
print("\n[INFO] Proses ini membutuhkan waktu beberapa menit...")

# Parameter grid: 2 regParam x 2 elasticNetParam x 2 maxIter values
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.01, 0.1])
    .addGrid(lr.elasticNetParam, [0.0, 0.5])
    .addGrid(lr.maxIter, [10, 20])
    .build()
)

print(f"\nTotal kombinasi parameter yang akan diuji: {len(paramGrid)}")

# 3-fold cross-validation over the grid, scored with the accuracy evaluator
crossval = CrossValidator(estimator=lr,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)

# Run the search on the training split only
cv_model = crossval.fit(train_data)

# Score the best model found on the held-out test split; the accuracy printed
# below is therefore a test-set accuracy, comparable to the baseline's.
cv_predictions = cv_model.transform(test_data)
cv_accuracy = evaluator.evaluate(cv_predictions)

print("\n=== HASIL EVALUASI SETELAH HYPERPARAMETER TUNING ===")
print(f'Cross-Validation Accuracy: {cv_accuracy:.4f} ({cv_accuracy*100:.2f}%)')
# Label fixed from "Peringkatan" (ranking) to "Peningkatan" (improvement)
print(f'\nPeningkatan Akurasi: {(cv_accuracy - accuracy)*100:.2f}%')

# Read the winning hyperparameters through the model's public getters rather
# than reaching into the private _java_obj wrapper.
best_lr_model = cv_model.bestModel
print('\n=== BEST MODEL PARAMETERS ===')
print(f'RegParam: {best_lr_model.getRegParam()}')
print(f'ElasticNetParam: {best_lr_model.getElasticNetParam()}')
print(f'MaxIter: {best_lr_model.getMaxIter()}')

# Notes (kept as comments: a bare string as the last expression of a cell is
# echoed by Jupyter as the cell's output):
# - ParamGridBuilder builds the parameter combinations to try
# - CrossValidator evaluates every combination with 3-fold cross-validation
# - The best model is the one with the highest accuracy


HOMEWORK: KLASIFIKASI DENGAN TITANIC DATASET

[INFO] Silakan upload file Titanic-Dataset.csv menggunakan:
1. Menu Files di sidebar kiri Google Colab, atau
2. Gunakan kode di bawah untuk upload manual

[INFO] Memulai upload file...


Saving Titanic-Dataset.csv to Titanic-Dataset.csv

[STEP 1] Load dan Eksplorasi Data
----------------------------------------------------------------------

Schema Dataset:
root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)


Sample Data (5 baris pertama):
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+---

'\nKeterangan:\n- ParamGridBuilder membuat kombinasi parameter untuk diuji\n- CrossValidator mengevaluasi setiap kombinasi dengan 3-fold cross-validation\n- Model terbaik dipilih berdasarkan akurasi tertinggi\n'