In [1]:
# ปรับแต่งค่าการทำงานของ Spark
from pyspark.sql import SparkSession

spark = SparkSession.\
     builder.\
     appName("Kong Spark").\
     master("spark://spark-master:7077").\
     config("spark.executor.memory", "4000m").\
     config("spark.executor.cores", "2").\
     config("spark.cores.max", "8").\
     getOrCreate()


23/06/25 03:00:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
# กำหนดตัวแปร

HIGHDISPERSION_LIST= ['delinq_2yrs','pub_rec','annual_inc','revol_bal','dti'] #เก็บชื่อคอลัมน์
HIGHNULL_LIST = ['dti','revol_util'] #เก็บชื่อคอลัมน์
HIGHOUTLIER = ['loan_amnt','delinq_2yrs','annual_inc']
NULL_LIST = [] #เก็บชื่อคอลัมน์
NULL_PERC = 0.1 #สูงกว่าค่านี้ เป็น High Null ซึ่งอาจใช้วิธีลบทั้ง row ที่มี null ไม่ได้ เพราะอาจทำให้สูญเสียทั้ง row ไปเยอะเกินควร
DISPERSION_PERC = 100 #สูงกว่าค่านี้ (%) เป็น High Dispersion

In [3]:
from pyspark.sql import functions as sparkf
from pyspark.sql.types import *

In [4]:
#! pip install scipy

In [5]:
# เรียกใช้ Module/Library ของ Python

import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Pandas options
pd.set_option('display.max_colwidth', 1000, 'display.max_rows', None, 'display.max_columns', None)

# Plotting options
%matplotlib inline
mpl.style.use('ggplot')
sns.set(style='whitegrid')

import warnings
warnings.filterwarnings("ignore")

## 1. Business Understaing
    
    1.1 Problem Statement: ต้องการทราบว่า ลูกหนี้แต่ละรายจะมาจ่ายหนี้ครบตามสัญญาเงินกู้ (Fully-paid) หรือไม่มาจ่ายฯ (Charged-off)
    1.2 Project Objective: การจัดเก็บหนี้ดีขึ้นช่วยเพิ่มรายได้ให้กับกิจการ
    1.3 Task of Data Science: Binary Classification
    1.4 Cleansing Policy: ธุรกิจมี columns ที่แนะนำว่ามีความสัมพันธ์/ส่งผลต่อการชำระหนี้คืนตามสัญญา, ลบได้ทั้ง row หากมี missing ใน columns และแทนที่ได้ตามความเหมาะสม
    1.5 Success Criteria: มี Recall/Sensitivity ไม่น้อยกว่า 0.65 บน Testing set แบบ Hold-out

## 2. Data Understanding
    
    - มี Label เป็น column: loan_status
    - มีขนาดใหญ่ (volume)เกินกว่าเทคโนโลยีปัจจุบัน (Python) จะทำงานได้อย่างมีประสิทธิภาพ จึงต้องใช้ Spark ร่วมด้วย
    - CSV เป็น semi-structural data ที่มี header ซึ่งสามารถนำไปพัฒนาเป็น schema ของ structural data (Spark DataFrame) ได้
    - Data Dict.: https://docs.google.com/spreadsheets/d/1qtZBSJ-JS7S2tGC0W9Yxp992LmrDaAwGcJB419Htbbw/edit#gid=1163295822

In [6]:
#กำหนด columns ที่ธุรกิจให้คำแนะนำฯ ไว้

businessAttrs_df = ["loan_amnt","term","int_rate"\
                                ,"installment","grade","emp_length",\
                           "home_ownership","annual_inc"\
                                ,"verification_status","loan_status",\
                           "purpose","addr_state","dti","delinq_2yrs"\
                                ,"earliest_cr_line",\
                           "open_acc","pub_rec"\
                                ,"revol_bal","revol_util","total_acc","issue_d"]

In [7]:
#! apt-get update

In [8]:
#! apt-get install wget -y

In [9]:
#! wget https://storage.googleapis.com/ntclass/LoanStats_web.csv

In [10]:
# ! wc -l LoanStats_web.csv

In [11]:
# ! head -3 LoanStats_web.csv

In [12]:
# Spark อ่านข้อมูลจาก .csv แล้ว convert เป็น DataFrame

raw_df = spark.read.option('header',True)\
.option("quote", "\"")\
.option('mode','DROPMALFORMED')\
.option('inferSchema',True)\
.csv('LoanStats_web.csv')\
.select(businessAttrs_df)

                                                                                

In [13]:
# Spark นับจำนวน row ใน DataFrame

# raw_df.count()

In [14]:
# raw_df.printSchema()

In [15]:
allRows_count = raw_df.count()

23/06/25 03:01:06 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

### 2.1 Univariate Analysis

In [16]:
# Spark วิเคราะห์ column แล้วแสดงค่าทางสถิติ 5 ค่า ของแต่ละ column โดยมี Python Pandas ช่วยในการแสดงผล

# raw_df.describe().toPandas().transpose()

#### ผลจากการดู Range ขัอเกต คือ annual_inc มี 0 ด้วย

In [17]:
#### ขยายผลกลุ่ม annual_inc = 0 ต่อไป เพื่อดูว่า ปล่อยกู้ในกลุ่มนี้ ยังงัยบ้าง
# raw_df.filter(sparkf.col('annual_inc')==0).describe().toPandas().transpose()

In [18]:
#### code ก่อนหน้านี้เห็นว่า กลุ่ม annual_inc = 0 มีการปล่อยกู้ด้วยวงเงินสูงสุด 40,000
#### code ในส่วนนี้ เห็นว่า คนที่ได้รับเงินกู้ไป 40,000 มีจำนวน 126 คน ซึ่งมากที่สุดแล้วในกลุ่ม annual_inc = 0
# raw_df.filter(sparkf.col('annual_inc')==0).groupBy('loan_amnt').count()\
# .orderBy('count', ascending=False).show()

In [19]:
# Spark และ Pandas ร่วมกันคำนวณ Coefficient of Variation (CV)

basicStat_pd = raw_df.describe().toPandas().transpose()

header_series = basicStat_pd.iloc[0]

noColBasicStat_pd = basicStat_pd[1:]

noColBasicStat_pd.columns = header_series

basicStat_pd = noColBasicStat_pd

basicStat_pd = basicStat_pd[['count','mean','stddev']].astype('float64')

basicStat_pd = basicStat_pd.assign(CV=lambda x: round((x['stddev']/x['mean'])*100,2))\
.sort_values('CV',ascending=False)

# basicStat_pd

                                                                                

In [20]:
# Spark และ Pandas ร่วมกันกรองออกมาเฉพาะ CV ที่เกิน DISPERSION_PERC

basicStat_pd[basicStat_pd.assign(CV=lambda x: round((x['stddev']/x['mean'])*100,2))\
.sort_values('CV',ascending=False)['CV']>DISPERSION_PERC]

summary,count,mean,stddev,CV
delinq_2yrs,1432440.0,0.301472,0.864319,286.7
pub_rec,1432439.0,0.205392,0.586109,285.36
annual_inc,1432440.0,81034.582933,134183.356967,165.59
revol_bal,1432439.0,16846.678102,23301.014583,138.31


In [21]:
# Spark และ Pandas ได้รับรายชื่อ Col. ที่มี Dispersion เกินกว่าค่า DISPERSION_PERC
HIGHDISPERSION_LIST = basicStat_pd[basicStat_pd.assign(CV=lambda x: round((x['stddev']/x['mean'])*100,2))\
.sort_values('CV',ascending=False)['CV']>DISPERSION_PERC][:].axes[0].values.tolist()

In [22]:
HIGHDISPERSION_LIST

['delinq_2yrs', 'pub_rec', 'annual_inc', 'revol_bal']

In [23]:
# raw_df.count()

In [24]:
#วิเคราะห์ NULL

# basicStat_pd.assign(percentageNull=lambda x: ((allRows_count-x['count'])/allRows_count)*100)\
# .sort_values('percentageNull',ascending=True)

In [25]:
# Spark และ Pandas ได้รับรายชื่อ Col. ที่มี Null เกินกว่าค่า NULL_PERC

HIGHNULL_LIST = basicStat_pd[basicStat_pd.assign(percentageNull=lambda x: ((allRows_count-x['count'])/allRows_count)*100)\
.sort_values('percentageNull',ascending=False)['percentageNull']>NULL_PERC][:].axes[0].values.tolist()

In [26]:
HIGHNULL_LIST

['dti']

In [27]:
# Spark และ Pandas ได้รับรายชื่อ Col. ที่มี Null 
NULL_LIST = basicStat_pd[basicStat_pd.assign(countNull=lambda x: allRows_count-x['count'])\
.sort_values('countNull',ascending=False)['countNull']>0][:].axes[0].values.tolist()

In [28]:
# Spark วิเคราะห์ Imbalance Class

# raw_df.groupBy('loan_status').count().show()

In [29]:
# raw_df.select('loan_status').distinct().show()

In [30]:
# raw_df.printSchema()

In [31]:
# raw_df.withColumn('calculatedDTI',sparkf.col('loan_amnt')/sparkf.col('annual_inc'))\
# .select('loan_amnt','annual_inc','dti','calculatedDTI').groupBy('loan_amnt','annual_inc','dti','calculatedDTI').count()\
# .orderBy('dti', ascending=False).show()

### ค่า dti เชื่อถือได้หรือไม่

In [32]:
# raw_df.withColumn('calculatedDTI',sparkf.col('loan_amnt')/sparkf.col('annual_inc'))\
# .select('loan_amnt','annual_inc','dti','calculatedDTI').filter(sparkf.col('calculatedDTI').isNull()).show()

#### พบว่า มี annual_inc = 0 แต่ได้รับเงินกู้ ซึ่งไม่ควรให้ ML จดจำ pattern แบบนี้ไปใช้**

In [33]:
# selectedCol_raw_pd = raw_df.toPandas()
# import seaborn
# seaborn.boxplot(x = selectedCol_raw_pd['loan_amnt'])

In [34]:
# selectedCol_raw_pd.shape

In [35]:
# raw_df.groupBy('loan_status').count().show()

In [36]:
# raw_df.printSchema()

## Multivariative/Bivariate Ananalysis with Correlation

In [37]:

# ประกาศฟังก์ชั่นที่ใช้ทำ Histogram กับ Boxplot เพื่อวิเคราะห์ distribution (shape), outlier และความสัมพันธ์ (correlation)

def plot_var(arg_df, col_name, full_name, continuous):
    """
    Visualize a variable with and without faceting on the loan status.
    - col_name is the variable name in the dataframe
    - full_name is the full variable name
    - continuous is True if the variable is continuous, False otherwise
    """
#     f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12,3), dpi=90)
    f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20,3), dpi=50)
    
    # Plot without loan status
    if continuous:
        sns.distplot(arg_df.loc[arg_df[col_name].notnull(), col_name], kde=False, ax=ax1)
    else:
        sns.countplot(x=arg_df[col_name], order=sorted(arg_df[col_name].unique()), color='#5975A4', saturation=1, ax=ax1)
    ax1.set_xlabel(full_name)
    ax1.set_ylabel('Count')
    ax1.set_title(full_name)

    # Plot with loan status
    if continuous:
        sns.boxplot(x=col_name, y='loan_status', data=arg_df, ax=ax2)
        ax2.set_ylabel('')
        ax2.set_title(full_name + ' by Loan Status')
    else:
        charge_off_rates = arg_df.groupby(col_name)['loan_status'].value_counts(normalize=True).loc[:,'Charged Off']
        sns.barplot(x=charge_off_rates.index, y=charge_off_rates.values, color='#5975A4', saturation=1, ax=ax2)
        ax2.set_ylabel('Fraction of arg_df Charged-off')
        ax2.set_title('Charge-off Rate by ' + full_name)
    ax2.set_xlabel(full_name)
    
    plt.tight_layout()

In [38]:
# delinq_2yrs (The number of 30+ days past-due incidences of delinquency in the borrower's credit file for the past 2 years)
# numerical discrete

In [39]:
#delinq_2yrs: string (nullable = true)
# change type withColumn

correctedType_raw_df = raw_df.withColumn('delinq_2yrs',sparkf.col('delinq_2yrs').cast(IntegerType()))

In [40]:
#correctedType_raw_pd = correctedType_raw_df.toPandas()

In [41]:
#plot_var(raw_df.filter(sparkf.col('annual_inc') == 0).toPandas(),'loan_amnt', 'Loan Amount', continuous=True)

In [42]:
#plot_var(raw_df.filter(sparkf.col('annual_inc') != 0).toPandas(),'loan_amnt', 'Loan Amount', continuous=True)

In [43]:
#plot_var(correctedType_raw_pd,'loan_amnt', 'loan_amnt', continuous=True)

In [44]:
#plot_var(correctedType_raw_pd,'delinq_2yrs', 'loan_status', continuous=False)

In [45]:
#plot_var(correctedType_raw_pd,'annual_inc', 'annual_inc', continuous=True)

In [46]:
#correctedType_raw_df.select('delinq_2yrs').describe().show()

In [47]:
#correctedType_raw_df.groupBy('delinq_2yrs').count().orderBy('delinq_2yrs',ascending=False).show(100)

In [48]:
#correctedType_raw_df.describe().toPandas().transpose()

In [49]:
#correctedType_raw_pd.info()

In [50]:
# กำหนดตัวแปร

HIGHDISPERSION_LIST= ['delinq_2yrs','pub_rec','annual_inc','revol_bal','dti'] #เก็บชื่อคอลัมน์
HIGHNULL_LIST = ['dti','revol_util'] #เก็บชื่อคอลัมน์
HIGHOUTLIER = ['loan_amnt','delinq_2yrs','annual_inc']
NULL_LIST = [] #เก็บชื่อคอลัมน์
NULL_PERC = 0.1 #สูงกว่าค่านี้ เป็น High Null ซึ่งอาจใช้วิธีลบทั้ง row ที่มี null ไม่ได้ เพราะอาจทำให้สูญเสียทั้ง row ไปเยอะเกินควร
DISPERSION_PERC = 100 #สูงกว่าค่านี้ (%) เป็น High Dispersion

# 3. Data Preparation

In [51]:
noHighNull_df = correctedType_raw_df.dropna(subset=HIGHNULL_LIST)

In [52]:
#noHighNull_df.describe().toPandas().transpose()

## 1.Dropna

In [53]:
noNull_df = noHighNull_df.dropna()

In [54]:
#noNull_df.describe().toPandas().transpose()

## 2.จัดการ term ลบ months ออก

In [55]:
#noNull_df.groupBy('term').count().show()

In [56]:
#noNull_df.select('term').printSchema()

In [57]:
# noNull_df.withColumn('term',sparkf.when(sparkf.col('term').isNotNull(),sparkf.regexp_replace(sparkf.col('term'),' months','')).otherwise(sparkf.col('term')))\
# .groupBy('term').count().show()

In [58]:
#noNull_df.select('term').printSchema()

### cast type to Interger

In [59]:
# noNull_df.withColumn('term',sparkf.when(sparkf.col('term').isNotNull(),sparkf.regexp_replace(sparkf.col('term'),' months','')).otherwise(sparkf.col('term')))\
# .withColumn('term',sparkf.col('term').cast(IntegerType()))\
# .select('term').printSchema()

## 3.จัดการ int_rate remote '%'

In [60]:
# noNull_df\
# .select('int_rate')\
# .describe().toPandas().transpose()

In [61]:
# noNull_df.withColumn('int_rate',sparkf.regexp_replace(sparkf.col('int_rate'),'%',''))\
# .withColumn('int_rate',sparkf.col('int_rate').cast(FloatType()))\
# .select('int_rate').describe().toPandas().transpose()

## 4.จัดการ emp_length

In [62]:
# noNull_df.withColumn('new_emp_length',sparkf.regexp_replace(sparkf.col('emp_length'),'< 1 year','0'))\
# .withColumn('new_emp_length',sparkf.regexp_replace(sparkf.col('new_emp_length'),'[^0-9]',''))\
# .withColumn('new_emp_length',sparkf.col('new_emp_length').cast(IntegerType()))\
# .select('emp_length','new_emp_length')\
# .groupBy('emp_length','new_emp_length').count().show()

In [63]:
# noNull_df.withColumn('new_emp_length',sparkf.regexp_replace(sparkf.col('emp_length'),'< 1 year','0'))\
# .withColumn('new_emp_length',sparkf.regexp_replace(sparkf.col('new_emp_length'),'[^0-9]',''))\
# .withColumn('new_emp_length',sparkf.col('new_emp_length').cast(IntegerType()))\
# .dropna()\
# .select('emp_length','new_emp_length')\
# .groupBy('emp_length','new_emp_length').count().show()

## 5.loan_amount

In [64]:
# noNull_df\
# .withColumn('loan_amnt',sparkf.col('loan_amnt').cast(FloatType()))\
# .filter(sparkf.col('loan_amnt') < 37000.0)\
# .describe().toPandas().transpose()

## 6.annual_inc

In [65]:
noNull_df\
.withColumn('annual_inc',sparkf.col('annual_inc').cast(FloatType()))\
.filter(sparkf.col('annual_inc') != 0.0)\
.select('annual_inc').describe().toPandas().transpose()

                                                                                

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
annual_inc,1429660,81111.82013504994,134221.72384257164,20.0,6.1E7


In [66]:
#plot_var(noNull_df.toPandas(), 'annual_inc', 'annual_inc', continuous=False)

In [67]:
#plot_var(noNull_df.toPandas(), 'annual_inc', 'annual_inc', continuous=True)

## รวม 2-6

In [68]:
# crunched_df = noNull_df\
# # .withColumn('term',sparkf.when(sparkf.col('term').isNotNull(),sparkf.regexp_replace(sparkf.col('term'),' months','')).otherwise(sparkf.col('term')))\
# # .withColumn('term',sparkf.col('term').cast(IntegerType()))\
# .withColumn('int_rate',sparkf.regexp_replace(sparkf.col('int_rate'),'%',''))\
# .withColumn('int_rate',sparkf.col('int_rate').cast(FloatType()))\
# # .withColumn('new_emp_length',sparkf.regexp_replace(sparkf.col('emp_length'),'< 1 year','0'))\
# # .withColumn('new_emp_length',sparkf.regexp_replace(sparkf.col('new_emp_length'),'[^0-9]',''))\
# # .withColumn('emp_length',sparkf.col('new_emp_length').cast(IntegerType()))\
# .withColumn('loan_amnt',sparkf.col('loan_amnt').cast(FloatType()))\
# .filter(sparkf.col('loan_amnt') < 40000.0)\
# # .withColumn('annual_inc',sparkf.col('annual_inc').cast(FloatType()))\
# # .filter(sparkf.col('annual_inc') != 0.0)\
# .dropna()

In [69]:
# crunched_df = noNull_df\
# .withColumn('int_rate',sparkf.regexp_replace(sparkf.col('int_rate'),'%',''))\
# .withColumn('int_rate',sparkf.col('int_rate').cast(FloatType()))\
# .withColumn('loan_amnt',sparkf.col('loan_amnt').cast(FloatType()))\
# .filter(sparkf.col('loan_amnt') < 40000.0)\
# .dropna()

In [70]:
crunched_df = noNull_df\
.withColumn('term',sparkf.when(sparkf.col('term').isNotNull(),sparkf.regexp_replace(sparkf.col('term'),' months','')).otherwise(sparkf.col('term')))\
.withColumn('term',sparkf.col('term').cast(IntegerType()))\
.withColumn('int_rate',sparkf.regexp_replace(sparkf.col('int_rate'),'%',''))\
.withColumn('int_rate',sparkf.col('int_rate').cast(FloatType()))\
.withColumn('loan_amnt',sparkf.col('loan_amnt').cast(FloatType()))\
.filter(sparkf.col('loan_amnt') < 40000.0)\
.withColumn('annual_inc',sparkf.col('annual_inc').cast(FloatType()))\
.filter(sparkf.col('annual_inc') < 150000.0)\
.dropna()

In [71]:
# crunched_df.select('term','int_rate','emp_length','loan_amnt').describe().toPandas().transpose()

In [72]:
# crunched_df.select('loan_status').describe().show()

In [73]:
# crunched_df.groupBy('loan_status').count().show()

In [74]:
# crunched_df.select('term','int_rate','emp_length','loan_amnt')\
# .filter((sparkf.col('loan_status')=='Fully Paid')|(sparkf.col('loan_status')=='Charged Off'))\
# .describe().show()

In [75]:
# crunched_df.select('term','int_rate','emp_length','loan_amnt','loan_status')\
# .filter((sparkf.col('loan_status')=='Fully Paid')|(sparkf.col('loan_status')=='Charged Off'))\
# .groupBy('loan_status').count().show()

In [76]:
# crunched_df.select('term','int_rate','emp_length','loan_amnt')\
# .filter((sparkf.col('loan_status')=='Fully Paid')|(sparkf.col('loan_status')=='Charged Off'))\
# .count()

In [77]:
abt_df = crunched_df.select('term','int_rate','loan_amnt','annual_inc','loan_status','issue_d')\
.filter((sparkf.col('loan_status')=='Fully Paid')|(sparkf.col('loan_status')=='Charged Off'))\
.withColumn('issue_d',sparkf.regexp_extract(sparkf.col('issue_d'),'\d+',0))\
.withColumn('issue_d',sparkf.col('issue_d').cast(IntegerType()))

In [78]:
abt_df.count()

                                                                                

641102

In [79]:
abt_df.describe().toPandas().transpose()

                                                                                

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
term,641102,40.965075760175445,9.72162477973465,36,60
int_rate,641102,13.354483016292917,5.179147887996389,5.31,30.99
loan_amnt,641102,13546.667223000397,8394.89122500868,1000.0,39975.0
annual_inc,641102,68034.25419335006,29107.528004284766,0.0,149999.0
loan_status,641102,,,Charged Off,Fully Paid
issue_d,641102,2016.6581448817817,0.771854234797894,2016,2019


In [80]:
# Spark ทำ Hold-out Tesing set

candidate_training_df = abt_df.filter(sparkf.col('issue_d') < 2019)
test_df = abt_df.filter(sparkf.col('issue_d') >= 2019)

In [81]:
candidate_training_df.groupBy('loan_status').count().show()



+-----------+------+
|loan_status| count|
+-----------+------+
| Fully Paid|496863|
|Charged Off|136941|
+-----------+------+



                                                                                

In [82]:
# Spark ทำการสุ่มเลือกข้อมูลเฉพาะจาก training set เพื่อลดจำนวน Fully Paid

FullyPaid_training_df = candidate_training_df.filter(sparkf.col('loan_status') == 'Fully Paid').sample(0.30)

ChargedOff_training_df = candidate_training_df.filter(sparkf.col('loan_status') == 'Charged Off')

FullyPaid_training_num = FullyPaid_training_df.count()

ChargedOff_training_num = ChargedOff_training_df.count()

train_df = FullyPaid_training_df.union(ChargedOff_training_df)

                                                                                

In [83]:
# Spark ทำสรุปจำนวน row ของแต่ละ class ที่มีความ Balance Class มากขึ้น
train_df.groupBy('loan_status').count().show()

                                                                                

+-----------+------+
|loan_status| count|
+-----------+------+
| Fully Paid|148707|
|Charged Off|136941|
+-----------+------+



In [84]:
train_df.count()

                                                                                

285648

In [85]:
test_df.count()

                                                                                

7298

In [86]:
#train_df.groupBy('term').count().show()

In [87]:
#plot_var(train_df.toPandas(), 'term', 'term', continuous=False)

In [88]:
#train_df.select('emp_length').describe().show()

In [89]:
#plot_var(train_df.toPandas(), 'emp_length', 'emp_length', continuous=True)

In [90]:
#plot_var(train_df.toPandas(), 'emp_length', 'emp_length', continuous=False)

In [91]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

In [92]:
# feture encoding
labelIndexer_func = StringIndexer(inputCol='loan_status',outputCol='label',stringOrderType='alphabetDesc')

In [93]:
#new
# labelIndexer_func.fit(train_df).transform(train_df)\
# .groupBy('loan_status','label').count().show()

In [94]:
# verify only !
# labelIndexer_func.fit(train_df).transform(train_df)\
# .groupBy('loan_status','label').count().show()

In [95]:
#. รวมค่าของแต่ละ. feature ในการ train
featureVec_func = VectorAssembler(inputCols=['term','int_rate','loan_amnt','annual_inc'], outputCol='featureVec')

In [96]:
## verify

#featureVec_func.transform(train_df).show(truncate=False)

In [97]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier

In [98]:
algo_DT = DecisionTreeClassifier(featuresCol='featureVec',labelCol='label')

In [99]:
from pyspark.ml.pipeline import Pipeline

In [100]:
pipeline_DT = Pipeline().setStages([labelIndexer_func,featureVec_func,algo_DT])

In [101]:
model_DT = pipeline_DT.fit(train_df)

                                                                                

### Model Evaluation การวัดและประเมินการทำงานของโมเดล ทดสอบ

In [102]:
# view detail tune model
# must be change loan_status to numerical data

#test_df.describe().show()

In [103]:
# model_DT

In [104]:
# model_DT.stages

In [105]:
# ทำงานและวัดผล
result_DT = model_DT.transform(test_df)

In [106]:
# result_DT.select('prediction').show()

In [107]:
# result_DT.describe().show()

## Feture Engineering

In [108]:
def ExtractFeatureImp(featureImp, dataset, featuresCol):
    list_extract = []
    for i in dataset.schema[featuresCol].metadata["ml_attr"]["attrs"]:
        list_extract = list_extract + dataset.schema[featuresCol].metadata["ml_attr"]["attrs"][i]
    varlist = pd.DataFrame(list_extract)
    varlist['score'] = varlist['idx'].apply(lambda x: featureImp[x])
    return(varlist.sort_values('score', ascending = False))

In [109]:
featureImportances = model_DT.stages[2].featureImportances.toArray()

In [110]:
ExtractFeatureImp(featureImportances, result_DT, "featureVec").head(30)

Unnamed: 0,idx,name,score
1,1,int_rate,0.922277
0,0,term,0.049977
2,2,loan_amnt,0.018756
3,3,annual_inc,0.008991


### Easy Evaluation

In [111]:
#### label == prediction
result_DT.select('loan_status','label','prediction').filter(sparkf.col('label')==sparkf.col('prediction'))\
.groupBy('loan_status','label','prediction').count().show()

                                                                                

+-----------+-----+----------+-----+
|loan_status|label|prediction|count|
+-----------+-----+----------+-----+
|Charged Off|  1.0|       1.0|  317|
| Fully Paid|  0.0|       0.0| 3597|
+-----------+-----+----------+-----+



In [112]:
#### label != prediction
result_DT.select('loan_status','label','prediction').filter(sparkf.col('label')!=sparkf.col('prediction'))\
.groupBy('loan_status','label','prediction').count().show()



+-----------+-----+----------+-----+
|loan_status|label|prediction|count|
+-----------+-----+----------+-----+
|Charged Off|  1.0|       0.0|  162|
| Fully Paid|  0.0|       1.0| 3222|
+-----------+-----+----------+-----+



                                                                                

In [113]:
# test_df.groupBy('loan_status').count().show()

In [114]:
# manual 
# TP = 19241; FN = 7817;
# FP = 11709; TN = 15621;

In [115]:
FN = result_DT.select('loan_status','label','prediction')\
.filter((sparkf.col('label') == 1) & (sparkf.col('prediction') == 0))\
.groupBy('loan_status','label','prediction').count().take(1)[0]['count']
FN

                                                                                

162

In [116]:
FP = result_DT.select('loan_status','label','prediction')\
.filter((sparkf.col('label') == 0) & (sparkf.col('prediction') == 1))\
.groupBy('loan_status','label','prediction').count().take(1)[0]['count']
FP

                                                                                

3222

In [117]:
TP = result_DT.select('loan_status','label','prediction')\
.filter((sparkf.col('label') == 1) & (sparkf.col('prediction') == 1))\
.groupBy('loan_status','label','prediction').count().take(1)[0]['count']
TP

                                                                                

317

In [118]:
TN = result_DT.select('loan_status','label','prediction')\
.filter((sparkf.col('label') == 0) & (sparkf.col('prediction') == 0))\
.groupBy('loan_status','label','prediction').count().take(1)[0]['count']
TN

                                                                                

3597

In [119]:
TPR = TP/(TP+FN)
TPR

0.6617954070981211

In [120]:
TNR = TN/(TN+FP)
TNR

0.5274967003959525

In [121]:
FPR = FP/(TN+FP)
FPR

0.4725032996040475

In [122]:
FNR = FN/(TP+FN)
FNR

0.33820459290187893