In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as pfs

# Start a Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("CreditScoringAnalysis")
    .getOrCreate()
)

In [None]:
# Read the comma-separated file with a header row, letting Spark infer column
# types, then preview the first five rows.
csv_options = dict(sep=",", header=True, quote='"', inferSchema=True)
df = spark.read.csv('credit_scoring.csv', **csv_options)
df.show(5)

In [None]:
# Remove the 'kode_kontrak' column so it is not carried into later steps.
unused_column = 'kode_kontrak'
df = df.drop(unused_column)

In [None]:
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Fit a StringIndexer on 'kpr_aktif' and append the resulting numeric index
# as the new column 'kpr_encode'.
kpr_indexer = StringIndexer(inputCol='kpr_aktif', outputCol='kpr_encode')
kpr_encoder = kpr_indexer.fit(df)
df = kpr_encoder.transform(df)

In [None]:
# Fit a StringIndexer on 'rata_rata_overdue' and append the resulting numeric
# index as the new column 'rro'.
rro_indexer = StringIndexer(inputCol='rata_rata_overdue', outputCol='rro')
rro_encoder = rro_indexer.fit(df)
df = rro_encoder.transform(df)

In [None]:
# The raw string columns have been indexed above ('kpr_encode', 'rro'),
# so drop the originals.
indexed_source_columns = ('kpr_aktif', 'rata_rata_overdue')
df = df.drop(*indexed_source_columns)

In [None]:
# Input columns assembled into the model's feature vector.
# 'risk_rating' is intentionally NOT listed here: it is the label column the
# classifier is trained and evaluated on, so including it among the features
# would leak the target into the inputs and make the reported accuracy
# meaningless.
requires = ['pendapatan_setahun_juta', 'durasi_pinjaman_bulan', 'jumlah_tanggungan', 'kpr_encode', 'rro']

# Combine the feature columns into a single vector column named 'features'.
vec_asm = VectorAssembler(inputCols=requires, outputCol='features')
df = vec_asm.transform(df)

In [None]:
# Split the rows roughly 75% / 25% into training and testing sets. A fixed
# seed keeps the split (and therefore the reported score) reproducible
# across Restart & Run All.
training, testing = df.randomSplit([0.75, 0.25], seed=42)

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

# Configure a decision tree that reads the 'features' vector and predicts
# the 'risk_rating' label.
tree_estimator = DecisionTreeClassifier(labelCol='risk_rating', featuresCol='features')

# Fit on the training split; `dtc` holds the fitted model.
dtc = tree_estimator.fit(training)

# Score the held-out split; the model appends its prediction columns.
y_predict = dtc.transform(testing)

In [None]:
# Discard any scored rows that contain a null in any column before evaluation.
y_pred = y_predict.dropna()

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluate accuracy of the predictions against the 'risk_rating' label.
multi_eval = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='risk_rating',
)
dtc_accuracy = multi_eval.evaluate(y_pred)
print("DTC model score using pyspark:", dtc_accuracy)