# Mengimpor Spark SQL dan library machine learning, lalu membuat session

In [1]:
#mengimport modul yang dibutuhkan
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler

# Create (or reuse) the Spark session used by every cell below.
# SparkSession is defined in pyspark.sql and is not explicitly imported
# anywhere earlier in this file, so import it here instead of relying on an
# interactive shell having predefined it.
from pyspark.sql import SparkSession

appName = "Klasifikasi di Spark"
spark = (
    SparkSession.builder
    .appName(appName)
    # NOTE(review): this looks like a placeholder key/value — confirm whether
    # it is needed; kept as-is so the session configuration is unchanged.
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Memuat data dari file

In [2]:
# Declare the column names and types of the flights file explicitly, so the
# reader does not have to infer them. Every column is an integer except the
# carrier code, and every field is declared non-nullable.
flightSchema = (
    StructType()
    .add("DayofMonth", IntegerType(), False)
    .add("DayOfWeek", IntegerType(), False)
    .add("Carrier", StringType(), False)
    .add("OriginAirportID", IntegerType(), False)
    .add("DestAirportID", IntegerType(), False)
    .add("DepDelay", IntegerType(), False)
    .add("ArrDelay", IntegerType(), False)
)

# Load the CSV into a DataFrame with the schema above; header=True tells the
# reader that the first line of the file is a header row, not data.
csv = spark.read.schema(flightSchema).csv('dataset/flights.csv', header=True)
csv.show(n=3)

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          11433|        13303|      -3|       1|
|        19|        5|     DL|          14869|        12478|       0|      -8|
|        19|        5|     DL|          14057|        14869|      -4|     -15|
+----------+---------+-------+---------------+-------------+--------+--------+
only showing top 3 rows



# Menyiapkan data

In [3]:
# Keep the five numeric columns used as features and derive the binary
# target "Late": 1 when ArrDelay is greater than 15, otherwise 0.
# The Carrier column is not selected.
data = csv.select(
    *["DayofMonth", "DayOfWeek", "OriginAirportID", "DestAirportID", "DepDelay"],
    (col("ArrDelay") > 15).cast("Int").alias("Late"),
)
data.show(n=3)

+----------+---------+---------------+-------------+--------+----+
|DayofMonth|DayOfWeek|OriginAirportID|DestAirportID|DepDelay|Late|
+----------+---------+---------------+-------------+--------+----+
|        19|        5|          11433|        13303|      -3|   0|
|        19|        5|          14869|        12478|       0|   0|
|        19|        5|          14057|        14869|      -4|   0|
+----------+---------+---------------+-------------+--------+----+
only showing top 3 rows



# Membagi data training dan testing

In [4]:
# Split the prepared data into roughly 70% training and 30% testing rows.
# A fixed seed is passed so that re-running the notebook over the same input
# yields the same split, which keeps the row counts and the accuracies
# reported further down comparable between runs.
dataTerpisahkan = data.randomSplit([0.7, 0.3], seed=42)
train, test = dataTerpisahkan

# Each count() is a Spark action that scans the corresponding split.
train_rows = train.count()
test_rows = test.count()
print ("Jumlah baris data training:", 
       train_rows, " Jumlah baris data testing:", test_rows)

Jumlah baris data training: 1891341  Jumlah baris data testing: 810877


# Menyiapkan data training

In [5]:
# Peek at the raw training rows before they are vectorised.
train.show(n=2)

# The assembler packs the five numeric input columns into a single vector
# column named "features"; it is reused later for the testing split.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=[
        "DayofMonth",
        "DayOfWeek",
        "OriginAirportID",
        "DestAirportID",
        "DepDelay",
    ],
)

# Final training frame: the feature vector plus the target, with "Late"
# renamed to "label" (the label column name the classifiers below are given).
trainingDataFinal = assembler.transform(train).select(
    "features", col("Late").alias("label"))
trainingDataFinal.show(n=2, truncate=False)

+----------+---------+---------------+-------------+--------+----+
|DayofMonth|DayOfWeek|OriginAirportID|DestAirportID|DepDelay|Late|
+----------+---------+---------------+-------------+--------+----+
|         1|        1|          10140|        10397|      -4|   0|
|         1|        1|          10140|        10821|       8|   0|
+----------+---------+---------------+-------------+--------+----+
only showing top 2 rows

+------------------------------+-----+
|features                      |label|
+------------------------------+-----+
|[1.0,1.0,10140.0,10397.0,-4.0]|0    |
|[1.0,1.0,10140.0,10821.0,8.0] |0    |
+------------------------------+-----+
only showing top 2 rows



# Mentraining model klasifikasi

In [6]:
# Configure the classifier: logistic regression reading the "features" and
# "label" columns, limited to 10 iterations with regularisation 0.3.
algoritmaKlasifikasi = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    regParam=0.3,
    maxIter=10,
)

# Fit the classifier on the prepared training data.
model = algoritmaKlasifikasi.fit(trainingDataFinal)
print("Model selesai ditraining!")

Model selesai ditraining!


# Menyiapkan data testing

In [7]:
# Vectorise the testing split with the same assembler used for training.
# The target is exposed as "trueLabel" here rather than "label".
testingDataFinal = assembler.transform(test).select(
    "features", col("Late").alias("trueLabel"))
testingDataFinal.show(n=3)

+--------------------+---------+
|            features|trueLabel|
+--------------------+---------+
|[1.0,1.0,10140.0,...|        0|
|[1.0,1.0,10140.0,...|        0|
|[1.0,1.0,10140.0,...|        1|
+--------------------+---------+
only showing top 3 rows



# Menguji data testing dengan model yang telah kita training

In [8]:
# Run the fitted model over the testing data; transform() returns the input
# columns plus the model's output columns.
prediksiMentah = model.transform(testingDataFinal)

# Keep only the columns needed to inspect and score the predictions.
prediksiFinal = prediksiMentah.select(
    ["features", "prediction", "probability", "trueLabel"])
prediksiFinal.show(n=3, truncate=False)

+------------------------------+----------+----------------------------------------+---------+
|features                      |prediction|probability                             |trueLabel|
+------------------------------+----------+----------------------------------------+---------+
|[1.0,1.0,10140.0,10397.0,-2.0]|0.0       |[0.8295812710681634,0.17041872893183665]|0        |
|[1.0,1.0,10140.0,11259.0,0.0] |0.0       |[0.8256500234941407,0.17434997650585918]|0        |
|[1.0,1.0,10140.0,11259.0,21.0]|0.0       |[0.7793136048267442,0.2206863951732558] |1        |
+------------------------------+----------+----------------------------------------+---------+
only showing top 3 rows



# Mengevaluasi seberapa akurat model klasifikasi kita

In [9]:
# Score the predictions: accuracy = correct predictions / total rows.
# Both numbers are computed in ONE aggregation, so the prediction pipeline is
# evaluated once instead of once per count() action.
from pyspark.sql import functions as F

ringkasanEvaluasi = prediksiFinal.agg(
    F.count(F.lit(1)).alias("total"),
    # A row counts as correct when the predicted class equals the true label;
    # the boolean is cast to 0/1 so it can be summed.
    F.sum(
        (F.col("prediction") == F.col("trueLabel")).cast("int")
    ).alias("benar"),
).first()

totalData = ringkasanEvaluasi["total"]
# sum() yields NULL (None) when there are no rows to add up; treat it as 0.
prediksiBenar = ringkasanEvaluasi["benar"] or 0
# Guard against an empty test set instead of raising ZeroDivisionError.
akurasi = prediksiBenar / totalData if totalData else float("nan")
print("prediksi benar: ", prediksiBenar, ", total data: ", totalData, 
      ", akurasi: ", akurasi)

prediksi benar:  668630 , total data:  810877 , akurasi:  0.8245763537503222


# Mencoba menggunakan algoritma klasifikasi lain

In [10]:
from pyspark.ml.classification import RandomForestClassifier

# Swap in a different classifier: a random forest of 3 trees, each at most
# 5 levels deep, with a fixed seed. It reads the same "features" and "label"
# columns as before, and replaces the previous estimator and model.
algoritmaKlasifikasi = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    numTrees=3,
    maxDepth=5,
    seed=42,
)
model = algoritmaKlasifikasi.fit(trainingDataFinal)
print("Model selesai ditraining!")

Model selesai ditraining!


In [11]:
# Predict on the testing data with the current model and keep only the
# columns needed to inspect and score the predictions.
from pyspark.sql import functions as F

prediksiMentah = model.transform(testingDataFinal)
prediksiFinal = prediksiMentah.select(
    "features", "prediction", "probability", "trueLabel")
prediksiFinal.show(truncate=False, n=3)  # show 3 sample predictions

# Accuracy = correct predictions / total rows. Both numbers come from ONE
# aggregation, so the prediction pipeline is evaluated once instead of once
# per count() action.
ringkasanEvaluasi = prediksiFinal.agg(
    F.count(F.lit(1)).alias("total"),
    # A row counts as correct when the predicted class equals the true label;
    # the boolean is cast to 0/1 so it can be summed.
    F.sum(
        (F.col("prediction") == F.col("trueLabel")).cast("int")
    ).alias("benar"),
).first()

totalData = ringkasanEvaluasi["total"]
# sum() yields NULL (None) when there are no rows to add up; treat it as 0.
prediksiBenar = ringkasanEvaluasi["benar"] or 0
# Guard against an empty test set instead of raising ZeroDivisionError.
akurasi = prediksiBenar / totalData if totalData else float("nan")
print("prediksi benar: ", prediksiBenar, ", total data: ", 
      totalData, ", akurasi: ", akurasi)

+------------------------------+----------+----------------------------------------+---------+
|features                      |prediction|probability                             |trueLabel|
+------------------------------+----------+----------------------------------------+---------+
|[1.0,1.0,10140.0,10397.0,-2.0]|0.0       |[0.9499887351533095,0.05001126484669058]|0        |
|[1.0,1.0,10140.0,11259.0,0.0] |0.0       |[0.9499887351533095,0.05001126484669058]|0        |
|[1.0,1.0,10140.0,11259.0,21.0]|1.0       |[0.45300585244219765,0.5469941475578023]|1        |
+------------------------------+----------+----------------------------------------+---------+
only showing top 3 rows

prediksi benar:  751035 , total data:  810877 , akurasi:  0.9262008911339205
