In [1]:
! rm -rf LoanStats_web*

In [2]:
! wget https://storage.googleapis.com/grizzy-lab/LoanStats_web.csv

--2022-11-03 16:02:36--  https://storage.googleapis.com/grizzy-lab/LoanStats_web.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.126.128, 74.125.70.128, 74.125.132.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.126.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1160243241 (1.1G) [text/csv]
Saving to: ‘LoanStats_web.csv’


2022-11-03 16:03:00 (46.4 MB/s) - ‘LoanStats_web.csv’ saved [1160243241/1160243241]



In [3]:
! hdfs dfs -put LoanStats_web.csv /

put: `/LoanStats_web.csv': File exists


In [4]:
import time as t

In [5]:
! pip install pandas

[0m

In [6]:
from pyspark.sql import functions as F

In [7]:
spark

In [8]:
raw_LendingClubWeb_df = spark.read.format('csv').\
option('header','true').option('mode','DROPMALFORMED')\
.load('/LoanStats_web.csv')

                                                                                

# 2. Data Understanding

In [9]:
## Get row count
raw_LendingClubWeb_df.count()

                                                                                

1432466

In [10]:
## Get Data Type
raw_LendingClubWeb_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amnt: string (nullable = true)
 |-- funded_amnt: string (nullable = true)
 |-- funded_amnt_inv: string (nullable = true)
 |-- term: string (nullable = true)
 |-- int_rate: string (nullable = true)
 |-- installment: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- issue_d: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- pymnt_plan: string (nullable = true)
 |-- url: string (nullable = true)
 |-- desc: string (nullable = true)
 |-- purpose: string (nullable = true)
 |-- title: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- dti: string 

In [11]:
## Get Freq. Distribution of "purpose" (อาจเจอ outlier)
raw_LendingClubWeb_df.groupBy('grade').count().orderBy('count').show(100)



+-----+------+
|grade| count|
+-----+------+
| null|    26|
|    G|  5271|
|    F| 19480|
|    E| 68353|
|    D|191137|
|    A|302988|
|    C|421101|
|    B|424110|
+-----+------+



                                                                                

In [12]:
ALL = raw_LendingClubWeb_df.count()

                                                                                

In [13]:
ALL

1432466

In [14]:
## Get Freq. Distribution of "purpose" (อาจเจอ outlier)
raw_LendingClubWeb_df.groupBy('grade').count()\
.withColumnRenamed('count','gradeCount')\
.withColumn('gradePercentage',(F.col('gradeCount')/ALL)*100)\
.orderBy('gradePercentage').show(100)



+-----+----------+--------------------+
|grade|gradeCount|     gradePercentage|
+-----+----------+--------------------+
| null|        26|0.001815051805767...|
|    G|      5271|  0.3679668487768645|
|    F|     19480|  1.3598926606285944|
|    E|     68353|   4.771701387676916|
|    D|    191137|   13.34321373072729|
|    A|    302988|  21.151496789452594|
|    C|    421101|   29.39692809462842|
|    B|    424110|   29.60698543630355|
+-----+----------+--------------------+



                                                                                

In [15]:
## Check for Imbalance Class
raw_LendingClubWeb_df.select(['loan_status']).groupBy('loan_status').count().orderBy('count').show(100)



+------------------+------+
|       loan_status| count|
+------------------+------+
|              null|    26|
|           Default|  1419|
| Late (16-30 days)|  4986|
|   In Grace Period|  6136|
|Late (31-120 days)| 19455|
|       Charged Off|152999|
|        Fully Paid|583755|
|           Current|663690|
+------------------+------+



                                                                                

In [16]:
## Get data that contain ONLY Fully Paid and Charged Off
loanPayment_df = raw_LendingClubWeb_df\
.filter((F.col('loan_status') == 'Fully Paid') | ((F.col('loan_status') =='Charged Off')))

In [17]:
loanPayment_df.groupBy('loan_status').count().show()



+-----------+------+
|loan_status| count|
+-----------+------+
| Fully Paid|583755|
|Charged Off|152999|
+-----------+------+



                                                                                

In [18]:
## Get data that contain ONLY related attributes with Business Prespective.
business_df = loanPayment_df.select('annual_inc'\
                                           ,'bc_util'\
                                           ,'inq_fi'\
                                           ,'inq_last_12m'\
                                           ,'home_ownership'\
                                           ,'purpose'\
                                           ,'emp_length'\
                                           ,'revol_bal'\
                                           ,'dti'\
                                           ,'delinq_2yrs'\
                                           ,'pub_rec_bankruptcies'\
                                           ,'pub_rec'\
                                           ,'open_rv_24m'\
                                           ,'mort_acc'\
                                           ,'num_actv_bc_tl'\
                                           ,'num_actv_rev_tl'\
                                           ,'num_il_tl'\
                                           ,'num_tl_90g_dpd_24m'\
                                           ,'int_rate'\
                                           ,'inq_last_6mths'\
                                           ,'term'\
                                           ,'installment'\
                                           ,'total_rev_hi_lim'\
                                           ,'total_bal_il'\
                                           ,'total_bal_ex_mort'\
                                           ,'total_acc'\
                                           ,'tot_cur_bal'\
                                           ,'loan_amnt'\
                                           ,'loan_status'\
                                           ,'verification_status'\
                                           ,'collections_12_mths_ex_med'\
                                           ,'chargeoff_within_12_mths'\
                                           ,'il_util'\
                                           ,'last_pymnt_amnt'\
                                    ,'addr_state'\
                                           #,'last_pymnt_d'\
                                           ,'out_prncp_inv'\
                                           ,'out_prncp'\
                                           ,'total_pymnt_inv'\
                                           ,'total_pymnt'\
                                           ,'grade')
                                           #,'sec_app_collections_12_mths_ex_med'\
                                           #,'sec_app_chargeoff_within_12_mths'\
                                           #,'settlement_term'\
                                           #,'settlement_amount'\
                                           #,'settlement_status'

In [19]:
len(business_df.columns)

40

In [20]:
## Get Missing, Dispersion and Range (Validity)
import pandas as pd
business_df.describe().toPandas().transpose()

22/11/03 16:03:45 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
annual_inc,736754,79498.49264390829,78094.7518701319,0,99999.84
bc_util,727864,55.109908993987574,28.90309532109856,0,99.9
inq_fi,736698,1.0650497218670336,5.890278008272024,0,9
inq_last_12m,736697,2.2510679424512383,21.116503762814474,0,9
home_ownership,736754,,,ANY,RENT
purpose,736754,,,and also pay off some credit card debt. I wi...,wedding
emp_length,736754,,,1 year,
revol_bal,736753,16101.816257280256,22819.366258659797,0,99991
dti,736282,18.711086881231356,13.79426027607045,-1,Debt consolidation


### สรุปการดำเนินงานของ Data Understanding: เลือกเฉพาะบาง Attributes และ Row ที่เป็น Charge Off/Fully Charge

# 3. Data Preparation

In [21]:
## Get Data with No Null
no_null_df = business_df.dropna(how='any')

In [22]:
### from pyspark.sql.functions import *

In [23]:
## Get Data with No "wedding"
no_wedding_df = no_null_df.filter(F.col('purpose') != 'wedding')

In [24]:
fitmem_no_null_df = no_wedding_df.repartition(60)

In [25]:
cached_no_null_df = fitmem_no_null_df.cache()

In [26]:
## นับจำนวนข้อมูลหลัง Cleansing ค่า 'Null'
cached_no_null_df.count()

                                                                                

627709

### Remove '%' และ Extract Month

In [27]:
from pyspark.sql.functions import udf
from pyspark.sql.types import *

### Remove '%' 

In [28]:
def f_removepercent(origin):
    return origin.rstrip('%')

In [29]:
removepercent = udf(lambda x: f_removepercent(x),StringType())

### Extract Month

In [30]:
def f_exrtractmonth(origin):
    return origin.split('-')[0]

In [31]:
exrtractmonth = udf(lambda x: f_exrtractmonth(x),StringType())

### Replace'n/a' ใน Field 'emp_length' ด้วย 'Notemployed'

In [32]:
def python_treatNA(origin):
    if origin == 'n/a':
        new = 'NotEmployed'
    else:
        new = origin
    return new

In [33]:
treatNA = udf(lambda x: python_treatNA(x),StringType())

### เปลี่ยนแปลง Type ของข้อมูล

In [34]:
from pyspark.sql.functions import col

In [35]:
crunched_df = cached_no_null_df.\
withColumn('emp_length',treatNA(cached_no_null_df['emp_length'])).\
withColumn('int_rate',removepercent(cached_no_null_df['int_rate']).cast(DoubleType())).\
withColumn('dti',cached_no_null_df['dti'].cast(DoubleType())).\
withColumn('revol_bal',cached_no_null_df['revol_bal'].cast(DoubleType())).\
withColumn('pub_rec',cached_no_null_df['pub_rec'].cast(DoubleType())).\
withColumn('total_bal_il',cached_no_null_df['total_bal_il'].cast(DoubleType())).\
withColumn('tot_cur_bal',cached_no_null_df['tot_cur_bal'].cast(DoubleType())).\
withColumn('total_acc',cached_no_null_df['total_acc'].cast(DoubleType())).\
withColumn('total_bal_ex_mort',cached_no_null_df['total_bal_ex_mort'].cast(DoubleType())).\
withColumn('total_rev_hi_lim',cached_no_null_df['total_rev_hi_lim'].cast(DoubleType())).\
withColumn('num_actv_rev_tl',cached_no_null_df['num_actv_rev_tl'].cast(DoubleType())).\
withColumn('num_actv_bc_tl',cached_no_null_df['num_actv_bc_tl'].cast(DoubleType())).\
withColumn('num_il_tl',cached_no_null_df['num_il_tl'].cast(DoubleType())).\
withColumn('pub_rec_bankruptcies',cached_no_null_df['pub_rec_bankruptcies'].cast(DoubleType())).\
withColumn('delinq_2yrs',cached_no_null_df['delinq_2yrs'].cast(DoubleType())).\
withColumn('open_rv_24m',cached_no_null_df['open_rv_24m'].cast(DoubleType())).\
withColumn('num_tl_90g_dpd_24m',cached_no_null_df['num_tl_90g_dpd_24m'].cast(DoubleType())).\
withColumn('inq_last_6mths',cached_no_null_df['inq_last_6mths'].cast(DoubleType())).\
withColumn('bc_util',cached_no_null_df['bc_util'].cast(DoubleType())).\
withColumn('mort_acc',cached_no_null_df['mort_acc'].cast(DoubleType())).\
withColumn('inq_fi',cached_no_null_df['inq_fi'].cast(DoubleType())).\
withColumn('last_pymnt_amnt',cached_no_null_df['last_pymnt_amnt'].cast(DoubleType())).\
withColumn('out_prncp_inv',cached_no_null_df['out_prncp_inv'].cast(DoubleType())).\
withColumn('out_prncp',cached_no_null_df['out_prncp'].cast(DoubleType())).\
withColumn('total_pymnt_inv',cached_no_null_df['total_pymnt_inv'].cast(DoubleType())).\
withColumn('total_pymnt',cached_no_null_df['total_pymnt'].cast(DoubleType())).\
withColumn('il_util',cached_no_null_df['il_util'].cast(DoubleType())).\
withColumn('chargeoff_within_12_mths',cached_no_null_df['chargeoff_within_12_mths'].cast(DoubleType())).\
withColumn('collections_12_mths_ex_med',cached_no_null_df['collections_12_mths_ex_med'].cast(DoubleType())).\
withColumn('loan_amnt',cached_no_null_df['loan_amnt'].cast(DoubleType())).\
withColumn('inq_last_12m',cached_no_null_df['inq_last_12m'].cast(DoubleType())).\
withColumn('installment',cached_no_null_df['installment'].cast(DoubleType())).\
withColumn('annual_inc',cached_no_null_df['annual_inc'].cast(DoubleType()))

In [36]:
crunched_df.printSchema()

root
 |-- annual_inc: double (nullable = true)
 |-- bc_util: double (nullable = true)
 |-- inq_fi: double (nullable = true)
 |-- inq_last_12m: double (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- purpose: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- revol_bal: double (nullable = true)
 |-- dti: double (nullable = true)
 |-- delinq_2yrs: double (nullable = true)
 |-- pub_rec_bankruptcies: double (nullable = true)
 |-- pub_rec: double (nullable = true)
 |-- open_rv_24m: double (nullable = true)
 |-- mort_acc: double (nullable = true)
 |-- num_actv_bc_tl: double (nullable = true)
 |-- num_actv_rev_tl: double (nullable = true)
 |-- num_il_tl: double (nullable = true)
 |-- num_tl_90g_dpd_24m: double (nullable = true)
 |-- int_rate: double (nullable = true)
 |-- inq_last_6mths: double (nullable = true)
 |-- term: string (nullable = true)
 |-- installment: double (nullable = true)
 |-- total_rev_hi_lim: double (nullable = true)
 |-- total_bal_

In [37]:
## ทดลองความถูกต้องของ 'emp_length' หลังจาก Replacement แล้ว
crunched_df.select(crunched_df['emp_length']).distinct().show()

                                                                                

+-----------+
| emp_length|
+-----------+
|    5 years|
|    9 years|
|     1 year|
|NotEmployed|
|    2 years|
|    7 years|
|    8 years|
|    4 years|
|    6 years|
|    3 years|
|  10+ years|
|   < 1 year|
+-----------+



In [38]:
crunched_df.select(crunched_df['purpose']).distinct().show()

+------------------+
|           purpose|
+------------------+
|             other|
|    small_business|
|debt_consolidation|
|       credit_card|
|            moving|
|          vacation|
|  renewable_energy|
|             house|
|               car|
|    major_purchase|
|           medical|
|  home_improvement|
+------------------+



### Normalization 'annual_inc'

In [39]:
from pyspark.sql.functions import *
max_annual_inc = crunched_df.select(max('annual_inc')).collect()[0][0]
min_annual_inc = crunched_df.select(min('annual_inc')).collect()[0][0]

In [40]:
def t_annual_inc(origin):
    return ((origin-min_annual_inc)/(max_annual_inc-min_annual_inc))

In [41]:
n_annual_inc = udf(lambda x: t_annual_inc(x),DoubleType())

### Normalization 'revol_bal'

In [42]:
max_revol_bal = crunched_df.select(max('revol_bal')).collect()[0][0]
min_revol_bal = crunched_df.select(min('revol_bal')).collect()[0][0]

In [43]:
def t_revol_bal(origin):
    return ((origin-min_revol_bal)/(max_revol_bal-min_revol_bal))

In [44]:
n_revol_bal = udf(lambda x: t_revol_bal(x),DoubleType())

### Normalization 'tot_cur_bal'

In [45]:
max_tot_cur_bal = crunched_df.select(max('tot_cur_bal')).collect()[0][0]
min_tot_cur_bal = crunched_df.select(min('tot_cur_bal')).collect()[0][0]

In [46]:
def t_tot_cur_bal(origin):
    return ((origin-min_tot_cur_bal)/(max_tot_cur_bal-min_tot_cur_bal))

In [47]:
n_tot_cur_bal = udf(lambda x: t_tot_cur_bal(x),DoubleType())

### Normalization 'total_rev_hi_lim'

In [48]:
max_total_rev_hi_lim = crunched_df.select(max('total_rev_hi_lim')).collect()[0][0]
min_total_rev_hi_lim = crunched_df.select(min('total_rev_hi_lim')).collect()[0][0]

In [49]:
def t_total_rev_hi_lim(origin):
    return ((origin-min_total_rev_hi_lim)/(max_total_rev_hi_lim-min_total_rev_hi_lim))

In [50]:
n_total_rev_hi_lim = udf(lambda x: t_total_rev_hi_lim(x),DoubleType())

### Normalization 'total_bal_ex_mort'

In [51]:
max_total_bal_ex_mort = crunched_df.select(max('total_bal_ex_mort')).collect()[0][0]
min_total_bal_ex_mort = crunched_df.select(min('total_bal_ex_mort')).collect()[0][0]

In [52]:
def t_total_bal_ex_mort(origin):
    return ((origin-min_total_bal_ex_mort)/(max_total_bal_ex_mort-min_total_bal_ex_mort))

In [53]:
n_total_bal_ex_mort = udf(lambda x: t_total_bal_ex_mort(x),DoubleType())

### Normalization 'total_bal_il'

In [54]:
max_total_bal_il = crunched_df.select(max('total_bal_il')).collect()[0][0]
min_total_bal_il = crunched_df.select(min('total_bal_il')).collect()[0][0]

In [55]:
def t_total_bal_il(origin):
    return ((origin-min_total_bal_il)/(max_total_bal_il-min_total_bal_il))

In [56]:
n_total_bal_il = udf(lambda x: t_total_bal_il(x),DoubleType())

In [57]:
## Call functions of data normalization
normalized_df = crunched_df.\
withColumn('annual_inc',n_annual_inc(crunched_df['annual_inc'])).\
withColumn('revol_bal',n_revol_bal(crunched_df['revol_bal'])).\
withColumn('tot_cur_bal',n_tot_cur_bal(crunched_df['tot_cur_bal'])).\
withColumn('total_rev_hi_lim',n_total_rev_hi_lim(crunched_df['total_rev_hi_lim'])).\
withColumn('total_bal_il',n_total_bal_il(crunched_df['total_bal_il'])).\
withColumn('total_bal_ex_mort',n_total_bal_ex_mort(crunched_df['total_bal_ex_mort']))

#withColumn('loan_amnt',n_loan_amt(crunched_df['loan_amnt'])).\

In [58]:
normalized_df.count()

627709

In [59]:
normalized_filtered_df = normalized_df

In [60]:
## ตรวจดู Null, Dispersion และ Range (Validity)
normalized_df.describe().toPandas().transpose()

                                                                                

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
annual_inc,627709,0.008462566145329772,0.00827420862199892,0.0,1.0
bc_util,627709,55.70084688924322,28.83168566962161,0.0,252.3
inq_fi,627709,1.1189500230202212,1.5859378645745752,0.0,48.0
inq_last_12m,627709,2.3041584555900902,2.5282249368970446,0.0,67.0
home_ownership,627709,,,ANY,RENT
purpose,627709,,,car,vacation
emp_length,627709,,,1 year,NotEmployed
revol_bal,627709,0.014265257680731206,0.01983763318111703,0.0,1.0
dti,627709,19.848822623221892,13.332388424437125,-1.0,999.0


### กำจัดค่า 'Null' จากการทำ Data Prep

In [61]:
data_no_missing_df = normalized_filtered_df.dropna(how='any')

In [62]:
data_no_missing_df.count()

                                                                                

627709

In [63]:
data_no_missing_df.printSchema()

root
 |-- annual_inc: double (nullable = true)
 |-- bc_util: double (nullable = true)
 |-- inq_fi: double (nullable = true)
 |-- inq_last_12m: double (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- purpose: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- revol_bal: double (nullable = true)
 |-- dti: double (nullable = true)
 |-- delinq_2yrs: double (nullable = true)
 |-- pub_rec_bankruptcies: double (nullable = true)
 |-- pub_rec: double (nullable = true)
 |-- open_rv_24m: double (nullable = true)
 |-- mort_acc: double (nullable = true)
 |-- num_actv_bc_tl: double (nullable = true)
 |-- num_actv_rev_tl: double (nullable = true)
 |-- num_il_tl: double (nullable = true)
 |-- num_tl_90g_dpd_24m: double (nullable = true)
 |-- int_rate: double (nullable = true)
 |-- inq_last_6mths: double (nullable = true)
 |-- term: string (nullable = true)
 |-- installment: double (nullable = true)
 |-- total_rev_hi_lim: double (nullable = true)
 |-- total_bal_

# 4. Modeling

### แบ่งข้อมูลเป็น Training และ Testing

In [66]:
data_no_missing_df_fully_paid = normalized_df.filter(col('loan_status') == 'Fully Paid').sample(True, 0.3, 42)

In [67]:
data_no_missing_df_charge_off = normalized_df.filter(col('loan_status') == 'Charged Off')

In [68]:
final_data_no_missing_df = data_no_missing_df_fully_paid.union(data_no_missing_df_charge_off)

In [69]:
training_dt , test_dt = final_data_no_missing_df.filter(col('loan_amnt') > 0).randomSplit([0.8,0.2])

In [70]:
import pyspark
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.clustering import KMeans, KMeansModel
from pyspark.ml.feature import StringIndexer, VectorAssembler, \
OneHotEncoder, VectorIndexer
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, \
BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression, GBTClassifier, \
NaiveBayes, RandomForestClassifier, DecisionTreeClassifier
from pyspark.ml import Pipeline
from pyspark.ml.clustering import *

### Feature Transformation - Convert from Categorical Vars to Numerical ones

In [71]:
labelIndexer = StringIndexer(inputCol='loan_status',outputCol='indexedLabel')

In [72]:
gradeIndexer = StringIndexer(inputCol='grade',outputCol='gradeIndexed')
gradeOneHotEncoder = OneHotEncoder(dropLast=False,inputCol='gradeIndexed',\
                                  outputCol='gradeVec')

In [73]:
homeIndexer = StringIndexer(inputCol='home_ownership',outputCol='homeIndexed')
homeOneHotEncoder = OneHotEncoder(dropLast=False,inputCol='homeIndexed',\
                                  outputCol='homeVec')

In [74]:
purposeIndexer = StringIndexer(inputCol='purpose',outputCol='purposeIndexed')
purposeOneHotEncoder = OneHotEncoder(dropLast=False,inputCol='purposeIndexed',\
                                  outputCol='purposeVec')

In [75]:
emp_lengthIndexer = StringIndexer(inputCol='emp_length',outputCol='emp_lengthIndexed')
emp_lengthOneHotEncoder = OneHotEncoder(dropLast=False,inputCol='emp_lengthIndexed',\
                                  outputCol='emp_lengthVec')

In [76]:
verification_statusIndexer = StringIndexer(inputCol='verification_status',outputCol='verification_statusIndexed')
verification_statusOneHotEncoder = OneHotEncoder(dropLast=False,inputCol='verification_statusIndexed',\
                                  outputCol='verification_statusVec')

In [77]:
addr_stateIndexer = StringIndexer(inputCol='addr_state',outputCol='addr_stateIndexed')
addr_stateOneHotEncoder = OneHotEncoder(dropLast=False,inputCol='addr_stateIndexed',\
                                  outputCol='addr_stateVec')

### [FINAL] Feature Selection 

In [78]:
featureAssembler = VectorAssembler(inputCols=['int_rate'\
                                              ,'total_acc'\
                                              #,'inq_fi'\
                                              #,'inq_last_12m'\
                                              #,'home_ownership'\
                                              #,'purpose'\
                                              #,'emp_length'\
                                              ,'installment'\
                                              #,'total_rev_hi_lim'\
                                              #,'loan_amnt'\
                                              #,'loan_status'\
                                              #,'verification_status'\
                                              #,'total_pymnt'\
                                              ,'gradeVec'\
                                              ,'homeVec'\
                                              ,'addr_stateVec'
                                              #,'emp_lengthVec'\
                                              #,'purposeVec'\
                                              ,'verification_statusVec']\
                                   ,outputCol='***features')

### Training with DecisionTree

In [79]:
dt = DecisionTreeClassifier(featuresCol='***features',labelCol='indexedLabel')

In [80]:
pipeline_dt = Pipeline().setStages([gradeIndexer,gradeOneHotEncoder,\
                                    homeIndexer,homeOneHotEncoder,\
                                    emp_lengthIndexer,emp_lengthOneHotEncoder,\
                                    purposeIndexer,purposeOneHotEncoder,\
                                    verification_statusIndexer,verification_statusOneHotEncoder,\
                                    addr_stateIndexer, addr_stateOneHotEncoder,\
                                    labelIndexer,\
                                    featureAssembler,\
                                    dt])

In [81]:
## Check for Imbalance Class
training_dt.groupBy('loan_status').count().orderBy(['loan_status']).show()



+-----------+------+
|loan_status| count|
+-----------+------+
|Charged Off|105050|
| Fully Paid|119139|
+-----------+------+



                                                                                

In [82]:
training_dt.count()

                                                                                

224189

In [83]:
start_time_dt = t.time()

In [84]:
model_dt = pipeline_dt.fit(training_dt)

                                                                                

In [85]:
duration_dt = t.time() - start_time_dt

In [86]:
print('ใช้เวลา Distributed Training {0} วินาที'.format(duration_dt))

ใช้เวลา Distributed Training 32.095402240753174 วินาที


In [87]:
spark_fullLabel_func = udf(lambda x: "Fully Paid" if x == 0.0 else "Charged Off")

In [88]:
result_dt = model_dt.transform(test_dt.dropna())\
.withColumn('full_prediction',spark_fullLabel_func(F.col('prediction')))

In [89]:
## Check for Imbalance Class
result_dt.groupBy(['loan_status','indexedLabel']).count().show()

                                                                                

+-----------+------------+-----+
|loan_status|indexedLabel|count|
+-----------+------------+-----+
| Fully Paid|         0.0|29799|
|Charged Off|         1.0|26206|
+-----------+------------+-----+



In [90]:
result_dt.count()

                                                                                

56005

In [91]:
NUM_ALL_TESTING = result_dt.count()

                                                                                

### การประเมิน Charge-Off: TPR, FNR (Recall, Miss rate) 

In [92]:
result_dt.select('loan_status','indexedLabel','prediction','full_prediction').\
filter(F.col('loan_status') == 'Charged Off').show()



+-----------+------------+----------+---------------+
|loan_status|indexedLabel|prediction|full_prediction|
+-----------+------------+----------+---------------+
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|  

                                                                                

In [93]:
NUM_ACTUAL_POSITIVE = result_dt.select('loan_status','indexedLabel','prediction','full_prediction').\
filter(F.col('loan_status') == 'Charged Off').count()

                                                                                

In [94]:
NUM_ACTUAL_POSITIVE

26206

In [95]:
#ตรวจสอบความถูกต้องจากการ Predict ตรงกับข้อมูลที่เป็น IndexLabel หรือไม่ (เนื่องจากมีการทำ indexer ค่าที่ได้จึงเป็น Numerical)

### Positive = Charged Off

In [96]:
result_dt.select('loan_status','indexedLabel','prediction','full_prediction').\
filter((F.col('indexedLabel') != col('prediction'))).filter(F.col('loan_status') == 'Charged Off').show()



+-----------+------------+----------+---------------+
|loan_status|indexedLabel|prediction|full_prediction|
+-----------+------------+----------+---------------+
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|  

                                                                                

In [97]:
FN = result_dt.select('loan_status','indexedLabel','prediction','full_prediction').\
filter((F.col('indexedLabel') != col('prediction'))).filter(F.col('loan_status') == 'Charged Off').count()

                                                                                

In [98]:
(FN/NUM_ACTUAL_POSITIVE)*100

38.66290162558193

In [99]:
result_dt.select('loan_status','indexedLabel','prediction','full_prediction').\
filter((F.col('indexedLabel') == col('prediction'))).filter(F.col('loan_status') == 'Charged Off').show()



+-----------+------------+----------+---------------+
|loan_status|indexedLabel|prediction|full_prediction|
+-----------+------------+----------+---------------+
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|  

                                                                                

In [100]:
TP = result_dt.select('loan_status','indexedLabel','prediction','full_prediction').\
filter((F.col('indexedLabel') == col('prediction'))).filter(F.col('loan_status') == 'Charged Off').count()

                                                                                

In [101]:
(TP/NUM_ACTUAL_POSITIVE)*100

61.337098374418076

### การประเมิน Fully Paid: TNR, FPR (Specificity, Fall-out) 

In [102]:
result_dt.select('loan_status','indexedLabel','prediction','full_prediction').\
filter(F.col('loan_status') == 'Fully Paid').show()

+-----------+------------+----------+---------------+
|loan_status|indexedLabel|prediction|full_prediction|
+-----------+------------+----------+---------------+
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|  

In [103]:
NUM_ACTUAL_NEGATIVE = result_dt.select('loan_status','indexedLabel','prediction','full_prediction').\
filter(F.col('loan_status') == 'Fully Paid').count()

                                                                                

In [104]:
NUM_ACTUAL_NEGATIVE

29799

In [105]:
NUM_ACTUAL_POSITIVE

26206

### Negative = Fully Paid

In [106]:
result_dt.select('loan_status','indexedLabel','prediction','full_prediction').\
filter((F.col('indexedLabel') != col('prediction'))).filter(F.col('loan_status') == 'Fully Paid').show()

+-----------+------------+----------+---------------+
|loan_status|indexedLabel|prediction|full_prediction|
+-----------+------------+----------+---------------+
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|  

In [107]:
FP = result_dt.select('loan_status','indexedLabel','prediction','full_prediction').\
filter((F.col('indexedLabel') != col('prediction'))).filter(F.col('loan_status') == 'Fully Paid').count()

                                                                                

In [108]:
(FP/NUM_ACTUAL_NEGATIVE)*100

33.28635189100305

In [109]:
result_dt.select('loan_status','indexedLabel','prediction','full_prediction').\
filter((F.col('indexedLabel') == col('prediction'))).filter(F.col('loan_status') == 'Fully Paid').show()

+-----------+------------+----------+---------------+
|loan_status|indexedLabel|prediction|full_prediction|
+-----------+------------+----------+---------------+
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|  

In [110]:
TN = result_dt.select('loan_status','indexedLabel','prediction','full_prediction').\
filter((F.col('indexedLabel') == col('prediction'))).filter(F.col('loan_status') == 'Fully Paid').count()

                                                                                

In [111]:
(TN/NUM_ACTUAL_NEGATIVE)*100

66.71364810899695

### การประเมิน Accuracy

In [112]:
NUM_ACTUAL_NEGATIVE + NUM_ACTUAL_POSITIVE

56005

In [113]:
(TN+TP)/(NUM_ACTUAL_NEGATIVE + NUM_ACTUAL_POSITIVE)

0.6419783947861798

In [114]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [115]:
evaluator_DT = MulticlassClassificationEvaluator(predictionCol="prediction",\
                                              labelCol='indexedLabel', metricName='accuracy')
evaluator_DT.evaluate(result_dt)*100

                                                                                

64.19783947861798

In [116]:
#model_dt.write().overwrite().save('gs://grizzy-lab/loanpayment_dtModel')

### ------------- สิ้นสุด Decision Tree ---------------------------

### Training with RandomForest

In [117]:
RF = RandomForestClassifier(featuresCol='***features',labelCol='indexedLabel')

In [118]:
pipeline_RF = Pipeline().setStages([gradeIndexer,gradeOneHotEncoder,\
                                    homeIndexer,homeOneHotEncoder,\
                                    emp_lengthIndexer,emp_lengthOneHotEncoder,\
                                    purposeIndexer,purposeOneHotEncoder,\
                                    verification_statusIndexer,verification_statusOneHotEncoder,\
                                    addr_stateIndexer, addr_stateOneHotEncoder,\
                                    labelIndexer,\
                                    featureAssembler,\
                                    RF])

In [119]:
#training_RF, test_RF = normalized_df.randomSplit([0.6,0.4])

In [120]:
training_RF = training_dt

In [121]:
test_RF = test_dt

In [122]:
training_RF.groupBy('loan_status').count().orderBy(['loan_status']).show()



+-----------+------+
|loan_status| count|
+-----------+------+
|Charged Off|105050|
| Fully Paid|119139|
+-----------+------+



                                                                                

In [123]:
training_RF.count()

                                                                                

224189

In [124]:
start_time_rf = t.time()

In [125]:
model_RF = pipeline_RF.fit(training_RF)

                                                                                

In [126]:
duration_rf = t.time() - start_time_rf

In [127]:
print('ใช้เวลา Distributed Training {0} วินาที'.format(duration_rf))

ใช้เวลา Distributed Training 24.415870666503906 วินาที


In [128]:
result_RF = model_RF.transform(test_RF.dropna()).withColumn('full_prediction',spark_fullLabel_func(F.col('prediction')))

In [129]:
result_RF.groupBy('loan_status').count().show()



+-----------+-----+
|loan_status|count|
+-----------+-----+
| Fully Paid|29799|
|Charged Off|26206|
+-----------+-----+



                                                                                

In [130]:
result_RF.count()

                                                                                

56005

In [131]:
result_RF

DataFrame[annual_inc: double, bc_util: double, inq_fi: double, inq_last_12m: double, home_ownership: string, purpose: string, emp_length: string, revol_bal: double, dti: double, delinq_2yrs: double, pub_rec_bankruptcies: double, pub_rec: double, open_rv_24m: double, mort_acc: double, num_actv_bc_tl: double, num_actv_rev_tl: double, num_il_tl: double, num_tl_90g_dpd_24m: double, int_rate: double, inq_last_6mths: double, term: string, installment: double, total_rev_hi_lim: double, total_bal_il: double, total_bal_ex_mort: double, total_acc: double, tot_cur_bal: double, loan_amnt: double, loan_status: string, verification_status: string, collections_12_mths_ex_med: double, chargeoff_within_12_mths: double, il_util: double, last_pymnt_amnt: double, addr_state: string, out_prncp_inv: double, out_prncp: double, total_pymnt_inv: double, total_pymnt: double, grade: string, gradeIndexed: double, gradeVec: vector, homeIndexed: double, homeVec: vector, emp_lengthIndexed: double, emp_lengthVec: vec

### การประเมิน Charge-Off: TPR, FNR (Recall, Miss rate) 

In [132]:
result_RF.select('loan_status','indexedLabel','prediction','full_prediction').\
filter(F.col('loan_status') == 'Charged Off').show()



+-----------+------------+----------+---------------+
|loan_status|indexedLabel|prediction|full_prediction|
+-----------+------------+----------+---------------+
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|  

                                                                                

In [133]:
NUM_ACTUAL_POSITIVE = result_RF.select('loan_status','indexedLabel','prediction','full_prediction').\
filter(F.col('loan_status') == 'Charged Off').count()

                                                                                

In [134]:
NUM_ACTUAL_POSITIVE

26206

In [135]:
#ตรวจสอบความถูกต้องจากการ Predict ตรงกับข้อมูลที่เป็น IndexLabel หรือไม่ (เนื่องจากมีการทำ indexer ค่าที่ได้จึงเป็น Numerical)

### Positive = Charged Off

In [136]:
result_RF.select('loan_status','indexedLabel','prediction','full_prediction').\
filter((F.col('indexedLabel') != col('prediction'))).filter(F.col('loan_status') == 'Charged Off').show()



+-----------+------------+----------+---------------+
|loan_status|indexedLabel|prediction|full_prediction|
+-----------+------------+----------+---------------+
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|  

                                                                                

In [137]:
FN = result_RF.select('loan_status','indexedLabel','prediction','full_prediction').\
filter((F.col('indexedLabel') != col('prediction'))).filter(F.col('loan_status') == 'Charged Off').count()

                                                                                

In [138]:
FN

8668

In [139]:
(FN/NUM_ACTUAL_POSITIVE)*100

33.07639471876669

In [140]:
result_RF.select('loan_status','indexedLabel','prediction','full_prediction').\
filter((F.col('indexedLabel') == col('prediction'))).filter(F.col('loan_status') == 'Charged Off').show()



+-----------+------------+----------+---------------+
|loan_status|indexedLabel|prediction|full_prediction|
+-----------+------------+----------+---------------+
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|  

                                                                                

In [141]:
TP = result_RF.select('loan_status','indexedLabel','prediction','full_prediction').\
filter((F.col('indexedLabel') == col('prediction'))).filter(F.col('loan_status') == 'Charged Off').count()

                                                                                

In [142]:
(TP/NUM_ACTUAL_POSITIVE)*100

66.9236052812333

### การประเมิน Fully Paid: TNR, FPR (Specificity, Fall-out) 

In [143]:
result_RF.select('loan_status','indexedLabel','prediction','full_prediction').\
filter(F.col('loan_status') == 'Fully Paid').show()

+-----------+------------+----------+---------------+
|loan_status|indexedLabel|prediction|full_prediction|
+-----------+------------+----------+---------------+
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|  

In [144]:
NUM_ACTUAL_NEGATIVE = result_RF.select('loan_status','indexedLabel','prediction','full_prediction').\
filter(F.col('loan_status') == 'Fully Paid').count()

                                                                                

In [145]:
NUM_ACTUAL_NEGATIVE

29799

In [146]:
#ตรวจสอบความถูกต้องจากการ Predict ตรงกับข้อมูลที่เป็น IndexLabel หรือไม่ (เนื่องจากมีการทำ indexer ค่าที่ได้จึงเป็น Numerical)

### Negative = Fully Paid

In [147]:
result_RF.select('loan_status','indexedLabel','prediction','full_prediction').\
filter((F.col('indexedLabel') != col('prediction'))).filter(F.col('loan_status') == 'Fully Paid').show()

+-----------+------------+----------+---------------+
|loan_status|indexedLabel|prediction|full_prediction|
+-----------+------------+----------+---------------+
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|  

In [148]:
FP = result_RF.select('loan_status','indexedLabel','prediction','full_prediction').\
filter((F.col('indexedLabel') != col('prediction'))).filter(F.col('loan_status') == 'Fully Paid').count()

                                                                                

In [149]:
(FP/NUM_ACTUAL_NEGATIVE)*100

38.48786872042686

In [150]:
result_RF.select('loan_status','indexedLabel','prediction','full_prediction').\
filter((F.col('indexedLabel') == col('prediction'))).filter(F.col('loan_status') == 'Fully Paid').show()

+-----------+------------+----------+---------------+
|loan_status|indexedLabel|prediction|full_prediction|
+-----------+------------+----------+---------------+
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|  

In [151]:
TN = result_RF.select('loan_status','indexedLabel','prediction','full_prediction').\
filter((F.col('indexedLabel') == col('prediction'))).filter(F.col('loan_status') == 'Fully Paid').count()

                                                                                

In [152]:
(TN/NUM_ACTUAL_NEGATIVE)*100

61.51213127957313

### การประเมิน Accuracy

In [153]:
NUM_ACTUAL_NEGATIVE + NUM_ACTUAL_POSITIVE

56005

In [154]:
(TN+TP)/(NUM_ACTUAL_NEGATIVE + NUM_ACTUAL_POSITIVE)

0.640442817605571

In [155]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [156]:
evaluator_RF = MulticlassClassificationEvaluator(predictionCol="prediction",\
                                              labelCol='indexedLabel', metricName='accuracy')
evaluator_RF.evaluate(result_RF)*100

                                                                                

64.0442817605571

### ------------- สิ้นสุด Random Forest ---------------------------

### Training with Multi layer Perceptron (ANN)

In [157]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [158]:
pipeline_mlp = Pipeline(stages=[gradeIndexer,gradeOneHotEncoder,\
                                    homeIndexer,homeOneHotEncoder,\
                                    emp_lengthIndexer,emp_lengthOneHotEncoder,\
                                    purposeIndexer,purposeOneHotEncoder,\
                                    verification_statusIndexer,verification_statusOneHotEncoder,\
                                    addr_stateIndexer, addr_stateOneHotEncoder,\
                                    labelIndexer,\
                                    featureAssembler])

In [159]:
#training_mlp, test_mlp = normalized_df.randomSplit([0.6,0.4])

In [160]:
training_mlp = training_dt

In [161]:
test_mlp = test_dt

In [162]:
train_df_features = pipeline_mlp.fit(training_mlp).transform(training_mlp)

                                                                                

In [163]:
layers = [train_df_features.schema["***features"].metadata["ml_attr"]["num_attrs"],20 ,10, 2]

In [164]:
layers

[68, 20, 10, 2]

### Implementation of MLP

In [165]:
clf = MultilayerPerceptronClassifier(labelCol='indexedLabel'\
                                    ,featuresCol='***features'\
                                    ,layers = layers)

In [166]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [167]:
pipeline_mlp_test = Pipeline(stages=[gradeIndexer,gradeOneHotEncoder,\
                                    homeIndexer,homeOneHotEncoder,\
                                    emp_lengthIndexer,emp_lengthOneHotEncoder,\
                                    purposeIndexer,purposeOneHotEncoder,\
                                    verification_statusIndexer,verification_statusOneHotEncoder,\
                                    addr_stateIndexer, addr_stateOneHotEncoder,\
                                    labelIndexer,\
                                    featureAssembler,\
                                    clf])

In [168]:
#training_mlp_test, test_mlp_test = normalized_df.randomSplit([0.6,0.4])

In [169]:
training_mlp.groupBy('loan_status').count().orderBy(['loan_status']).show()



+-----------+------+
|loan_status| count|
+-----------+------+
|Charged Off|105050|
| Fully Paid|119139|
+-----------+------+



                                                                                

In [170]:
training_mlp.count()

                                                                                

224189

In [171]:
start_time_mlp = t.time()

In [172]:
model_mlp = pipeline_mlp_test.fit(training_mlp)

22/11/03 16:06:55 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/11/03 16:06:55 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

In [173]:
duration_mlp = t.time() - start_time_mlp

In [174]:
print('ใช้เวลา Distributed Training {0} วินาที'.format(duration_mlp))

ใช้เวลา Distributed Training 53.62253165245056 วินาที


In [175]:
result_mlp = model_mlp.transform(test_mlp.dropna())\
.withColumn('full_prediction',spark_fullLabel_func(F.col('prediction')))

In [176]:
result_mlp.groupBy('loan_status').count().show()



+-----------+-----+
|loan_status|count|
+-----------+-----+
| Fully Paid|29799|
|Charged Off|26206|
+-----------+-----+



                                                                                

In [177]:
result_mlp.count()

                                                                                

56005

In [178]:
result_mlp

DataFrame[annual_inc: double, bc_util: double, inq_fi: double, inq_last_12m: double, home_ownership: string, purpose: string, emp_length: string, revol_bal: double, dti: double, delinq_2yrs: double, pub_rec_bankruptcies: double, pub_rec: double, open_rv_24m: double, mort_acc: double, num_actv_bc_tl: double, num_actv_rev_tl: double, num_il_tl: double, num_tl_90g_dpd_24m: double, int_rate: double, inq_last_6mths: double, term: string, installment: double, total_rev_hi_lim: double, total_bal_il: double, total_bal_ex_mort: double, total_acc: double, tot_cur_bal: double, loan_amnt: double, loan_status: string, verification_status: string, collections_12_mths_ex_med: double, chargeoff_within_12_mths: double, il_util: double, last_pymnt_amnt: double, addr_state: string, out_prncp_inv: double, out_prncp: double, total_pymnt_inv: double, total_pymnt: double, grade: string, gradeIndexed: double, gradeVec: vector, homeIndexed: double, homeVec: vector, emp_lengthIndexed: double, emp_lengthVec: vec

### การประเมิน Charge-Off: TPR, FNR (Recall, Miss rate) 

In [179]:
result_mlp.select('loan_status','indexedLabel','prediction','full_prediction').\
filter(F.col('loan_status') == 'Charged Off').show()



+-----------+------------+----------+---------------+
|loan_status|indexedLabel|prediction|full_prediction|
+-----------+------------+----------+---------------+
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|  

                                                                                

In [180]:
NUM_ACTUAL_POSITIVE = result_mlp.select('loan_status','indexedLabel','prediction','full_prediction').\
filter(F.col('loan_status') == 'Charged Off').count()

                                                                                

In [181]:
NUM_ACTUAL_POSITIVE

26206

In [182]:
#ตรวจสอบความถูกต้องจากการ Predict ตรงกับข้อมูลที่เป็น IndexLabel หรือไม่ (เนื่องจากมีการทำ indexer ค่าที่ได้จึงเป็น Numerical)

### Positive = Charged Off

In [183]:
result_mlp.select('loan_status','indexedLabel','prediction','full_prediction').\
filter((F.col('indexedLabel') != col('prediction'))).filter(F.col('loan_status') == 'Charged Off').show()



+-----------+------------+----------+---------------+
|loan_status|indexedLabel|prediction|full_prediction|
+-----------+------------+----------+---------------+
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|       0.0|     Fully Paid|
|Charged Off|         1.0|  

                                                                                

In [184]:
FN = result_mlp.select('loan_status','indexedLabel','prediction','full_prediction').\
filter((F.col('indexedLabel') != col('prediction'))).filter(F.col('loan_status') == 'Charged Off').count()

                                                                                

In [185]:
FN

9679

In [186]:
(FN/NUM_ACTUAL_POSITIVE)*100

36.93428985728459

In [187]:
result_mlp.select('loan_status','indexedLabel','prediction','full_prediction').\
filter((F.col('indexedLabel') == col('prediction'))).filter(F.col('loan_status') == 'Charged Off').show()



+-----------+------------+----------+---------------+
|loan_status|indexedLabel|prediction|full_prediction|
+-----------+------------+----------+---------------+
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|       1.0|    Charged Off|
|Charged Off|         1.0|  

                                                                                

In [188]:
TP = result_mlp.select('loan_status','indexedLabel','prediction','full_prediction').\
filter((F.col('indexedLabel') == col('prediction'))).filter(F.col('loan_status') == 'Charged Off').count()

                                                                                

In [189]:
(TP/NUM_ACTUAL_POSITIVE)*100

63.0657101427154

### การประเมิน Fully Paid: TNR, FPR (Specificity, Fall-out) 

In [190]:
result_mlp.select('loan_status','indexedLabel','prediction','full_prediction').\
filter(F.col('loan_status') == 'Fully Paid').show()

+-----------+------------+----------+---------------+
|loan_status|indexedLabel|prediction|full_prediction|
+-----------+------------+----------+---------------+
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|  

In [191]:
NUM_ACTUAL_NEGATIVE = result_mlp.select('loan_status','indexedLabel','prediction','full_prediction').\
filter(F.col('loan_status') == 'Fully Paid').count()

                                                                                

In [192]:
NUM_ACTUAL_NEGATIVE

29799

In [193]:
#ตรวจสอบความถูกต้องจากการ Predict ตรงกับข้อมูลที่เป็น IndexLabel หรือไม่ (เนื่องจากมีการทำ indexer ค่าที่ได้จึงเป็น Numerical)

### Negative = Fully Paid

In [194]:
result_mlp.select('loan_status','indexedLabel','prediction','full_prediction').\
filter((F.col('indexedLabel') != col('prediction'))).filter(F.col('loan_status') == 'Fully Paid').show()

+-----------+------------+----------+---------------+
|loan_status|indexedLabel|prediction|full_prediction|
+-----------+------------+----------+---------------+
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|       1.0|    Charged Off|
| Fully Paid|         0.0|  

In [195]:
FP = result_mlp.select('loan_status','indexedLabel','prediction','full_prediction').\
filter((F.col('indexedLabel') != col('prediction'))).filter(F.col('loan_status') == 'Fully Paid').count()

                                                                                

In [196]:
(FP/NUM_ACTUAL_NEGATIVE)*100

35.06829088224437

In [197]:
result_mlp.select('loan_status','indexedLabel','prediction','full_prediction').\
filter((F.col('indexedLabel') == col('prediction'))).filter(F.col('loan_status') == 'Fully Paid').show()

+-----------+------------+----------+---------------+
|loan_status|indexedLabel|prediction|full_prediction|
+-----------+------------+----------+---------------+
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|       0.0|     Fully Paid|
| Fully Paid|         0.0|  

In [198]:
TN = result_mlp.select('loan_status','indexedLabel','prediction','full_prediction').\
filter((F.col('indexedLabel') == col('prediction'))).filter(F.col('loan_status') == 'Fully Paid').count()

                                                                                

In [199]:
(TN/NUM_ACTUAL_NEGATIVE)*100

64.93170911775563

### การประเมิน Accuracy

In [200]:
NUM_ACTUAL_NEGATIVE + NUM_ACTUAL_POSITIVE

56005

In [201]:
(TN+TP)/(NUM_ACTUAL_NEGATIVE + NUM_ACTUAL_POSITIVE)

0.6405856619944648

In [202]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [203]:
evaluator_RF = MulticlassClassificationEvaluator(predictionCol="prediction",\
                                              labelCol='indexedLabel', metricName='accuracy')
evaluator_RF.evaluate(result_mlp)*100

                                                                                

64.05856619944647

### ------------- สิ้นสุด MLP ---------------------------

In [204]:
#model_mlp.save('./modelMLP_bestloanpayment')