<a href="https://colab.research.google.com/github/achmadzano/ClassificationModel_Pyspark/blob/main/classification_BDP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql.functions import when

In [None]:
# Reuse the active Spark session, or start a new one if none exists.
spark = SparkSession.builder.getOrCreate()

# Both CSV files go through the same steps, in order: read with a header row
# and inferred column types, drop every row that has a null in any column,
# then keep only the three feature columns plus the 'IsDelayed' label.
df_train, df_test = (
    spark.read.option('inferSchema', 'true')
    .csv(csv_path, header=True)
    .na.drop()
    .select('TimeDeparture', 'LengthOfFlight', 'Airline', 'IsDelayed')
    for csv_path in ('PlaneDelayTraining.csv', 'PlaneDelayTesting.csv')
)

# transform

def transform(df):
    """Return *df* with the 'Airline' column replaced by an integer code.

    The codes are 'DL' -> 0, 'FL' -> 1 and 'EV' -> 2. The when-chain has no
    ``otherwise`` clause, so any other airline value becomes null.
    """
    airline = df['Airline']
    # Start the chain with the first code, then append the remaining ones.
    encoded = when(airline == 'DL', 0)
    for code_name, code_value in (('FL', 1), ('EV', 2)):
        encoded = encoded.when(airline == code_name, code_value)
    return df.withColumn('Airline', encoded)

# Encode the airline codes as integers in both splits.
df_train = transform(df_train)
df_test = transform(df_test)

# normalize
# Every column except the label is a model input.
cols = df_train.columns
cols.remove('IsDelayed')

assembler = VectorAssembler(inputCols=cols, outputCol='Features')
scaler = StandardScaler(inputCol='Features', outputCol='ScaledFeatures')

df_train = assembler.transform(df_train)
df_test = assembler.transform(df_test)

# Fit the scaler on the training split only and apply that same fitted model
# to both splits. Fitting a second scaler on the test split would scale the
# test features with different statistics than the ones the classifier was
# trained on, and would leak test-set statistics into preprocessing.
scaler_model = scaler.fit(df_train)
df_train = scaler_model.transform(df_train)
df_test = scaler_model.transform(df_test)

# generate model
# Train logistic regression on the scaled training features, then score the
# test split; transform() appends the model's output columns to df_test.
model = LogisticRegression(featuresCol='ScaledFeatures', labelCol='IsDelayed').fit(df_train)
prediction = model.transform(df_test)

# evaluate
# BinaryClassificationEvaluator reports areaUnderROC by default, which is not
# accuracy. Use the multiclass evaluator with metricName='accuracy' so the
# printed number is the percentage of test rows whose predicted label matches
# 'IsDelayed'.
evaluator = MulticlassClassificationEvaluator(
    labelCol='IsDelayed',
    predictionCol='prediction',
    metricName='accuracy',
)
accuracy = round(evaluator.evaluate(prediction) * 100, 2)
print(accuracy)