# Logistic Regression โดยใช้ ChatGPT ช่วยเขียนโค้ด https://chat.openai.com/share/9848fbef-20c1-4743-be8d-04b442446d54

In [1]:
# ปรับแต่งค่าการทำงานของ Spark
from pyspark.sql import SparkSession

# Build (or reuse) the SparkSession on the standalone master.
# Each executor gets 1000m of memory and 2 cores; the application as a
# whole is capped at 6 cores.
# NOTE(review): the application name mentions "ChiSqSelector:Gender vs. Route",
# which does not look related to this loan notebook — confirm it is intended.
spark = (
    SparkSession.builder
    .appName("ChiSqSelector:Gender vs. Route")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "1000m")
    .config("spark.executor.cores", "2")
    .config("spark.cores.max", "6")
    .getOrCreate()
)

23/06/12 14:26:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
# กำหนดตัวแปร

# Thresholds.
# Above NULL_PERC a column counts as "High Null"; for such columns, dropping
# every row that has a null may not be usable because too many rows would be lost.
NULL_PERC = 0.1
# Above DISPERSION_PERC (%) a column counts as "High Dispersion".
DISPERSION_PERC = 100

# Collectors for column names (each one is its own, initially empty, list).
NULL_LIST = list()
HIGHNULL_LIST = list()
HIGHDISPERSION_LIST = list()

In [3]:
from pyspark.sql import functions as sparkf
from pyspark.sql.types import *

In [4]:
#! pip install scipy

In [5]:
# เรียกใช้ Module/Library ของ Python

import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Pandas options
pd.set_option('display.max_colwidth', 1000, 'display.max_rows', None, 'display.max_columns', None)

# Plotting options
%matplotlib inline
mpl.style.use('ggplot')
sns.set(style='whitegrid')

import warnings
warnings.filterwarnings("ignore")

In [6]:
# ประกาศฟังก์ชั่นที่ใช้ทำ Histogram กับ Boxplot เพื่อวิเคราะห์ distribution (shape), outlier และความสัมพันธ์ (correlation)

def plot_var(arg_df, col_name, full_name, continuous):
    """
    Visualize a variable with and without faceting on the loan status.

    Draws one figure with two side-by-side axes: the left axis shows the
    variable on its own, the right axis relates it to the 'loan_status'
    column of arg_df.

    - arg_df is the pandas DataFrame holding col_name and 'loan_status'
    - col_name is the variable name in the dataframe
    - full_name is the full variable name (used for axis labels and titles)
    - continuous is True if the variable is continuous, False otherwise

    Returns None. In the categorical case the 'Charged Off' level of
    'loan_status' is selected, so arg_df is expected to contain that status.
    """
    _, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12,3), dpi=90)

    # Plot without loan status
    if continuous:
        # NOTE: sns.distplot is deprecated in recent seaborn releases
        # (sns.histplot is the documented replacement); kept here so the
        # notebook keeps working on the seaborn version it was written for.
        sns.distplot(arg_df.loc[arg_df[col_name].notnull(), col_name], kde=False, ax=ax1)
    else:
        # Drop missing values before sorting: a NaN mixed into a string
        # column makes sorted() raise TypeError (float vs. str comparison).
        categories = sorted(arg_df[col_name].dropna().unique())
        sns.countplot(x=arg_df[col_name], order=categories, color='#5975A4', saturation=1, ax=ax1)
    ax1.set_xlabel(full_name)
    ax1.set_ylabel('Count')
    ax1.set_title(full_name)

    # Plot with loan status
    if continuous:
        sns.boxplot(x=col_name, y='loan_status', data=arg_df, ax=ax2)
        ax2.set_ylabel('')
        ax2.set_title(full_name + ' by Loan Status')
    else:
        # Per category of col_name, the share of rows whose status is 'Charged Off'
        charge_off_rates = arg_df.groupby(col_name)['loan_status'].value_counts(normalize=True).loc[:,'Charged Off']
        sns.barplot(x=charge_off_rates.index, y=charge_off_rates.values, color='#5975A4', saturation=1, ax=ax2)
        # The label previously read 'Fraction of arg_df Charged-off': a
        # variable name had leaked into the user-facing text.
        ax2.set_ylabel('Fraction of Loans Charged-off')
        ax2.set_title('Charge-off Rate by ' + full_name)
    ax2.set_xlabel(full_name)

    plt.tight_layout()



## 1. Business Understanding
    
    1.1 Problem Statement: ต้องการทราบว่า ลูกหนี้แต่ละรายจะมาจ่ายหนี้ครบตามสัญญาเงินกู้ (Fully-paid) หรือไม่มาจ่ายฯ (Charged-off)
    1.2 Project Objective: การจัดเก็บหนี้ดีขึ้นช่วยเพิ่มรายได้ให้กับกิจการ
    1.3 Task of Data Science: Binary Classification
    1.4 Cleansing Policy: ธุรกิจมี columns ที่แนะนำว่ามีความสัมพันธ์/ส่งผลต่อการชำระหนี้คืนตามสัญญา, ลบได้ทั้ง row หากมี missing ใน columns และแทนที่ได้ตามความเหมาะสม
    1.5 Success Criteria: มี Recall/Sensitivity ไม่น้อยกว่า 0.65 บน Testing set แบบ Hold-out

## 2. Data Understanding
    
    - มี Label เป็น column: loan_status
    - มีขนาดใหญ่ (volume)เกินกว่าเทคโนโลยีปัจจุบัน (Python) จะทำงานได้อย่างมีประสิทธิภาพ จึงต้องใช้ Spark ร่วมด้วย
    - CSV เป็น semi-structured data ที่มี header ซึ่งสามารถนำไปพัฒนาเป็น schema ของ structured data (Spark DataFrame) ได้
    - Data Dict.: https://docs.google.com/spreadsheets/d/1qtZBSJ-JS7S2tGC0W9Yxp992LmrDaAwGcJB419Htbbw/edit#gid=1163295822

In [7]:
#กำหนด columns ที่ธุรกิจให้คำแนะนำฯ ไว้

# Columns recommended by the business. Despite the "_df" suffix this is a
# plain list of column-name strings.
businessAttrs_df = [
    "loan_amnt",
    "term",
    "int_rate",
    "installment",
    "grade",
    "emp_length",
    "home_ownership",
    "annual_inc",
    "verification_status",
    "loan_status",
    "purpose",
    "addr_state",
    "dti",
    "delinq_2yrs",
    "earliest_cr_line",
    "open_acc",
    "pub_rec",
    "revol_bal",
    "revol_util",
    "total_acc",
    "issue_d",
]

In [8]:
! wc -l LoanStats_web.csv

1432493 LoanStats_web.csv


In [9]:
! head -3 LoanStats_web.csv

"id","member_id","loan_amnt","funded_amnt","funded_amnt_inv","term","int_rate","installment","grade","sub_grade","emp_title","emp_length","home_ownership","annual_inc","verification_status","issue_d","loan_status","pymnt_plan","url","desc","purpose","title","zip_code","addr_state","dti","delinq_2yrs","earliest_cr_line","inq_last_6mths","mths_since_last_delinq","mths_since_last_record","open_acc","pub_rec","revol_bal","revol_util","total_acc","initial_list_status","out_prncp","out_prncp_inv","total_pymnt","total_pymnt_inv","total_rec_prncp","total_rec_int","total_rec_late_fee","recoveries","collection_recovery_fee","last_pymnt_d","last_pymnt_amnt","next_pymnt_d","last_credit_pull_d","collections_12_mths_ex_med","mths_since_last_major_derog","policy_code","application_type","annual_inc_joint","dti_joint","verification_status_joint","acc_now_delinq","tot_coll_amt","tot_cur_bal","open_acc_6m","open_act_il","open_il_12m","open_il_24m","mths_since_rcnt_il","total_bal_il","il_util","open_rv_1

In [10]:
# Spark อ่านข้อมูลจาก .csv แล้ว convert เป็น DataFrame

# Load LoanStats_web.csv into a Spark DataFrame and keep only the columns
# listed in businessAttrs_df.
#   header      -> take column names from the first line
#   quote       -> fields are quoted with a double quote
#   mode        -> DROPMALFORMED: discard records that cannot be parsed
#   inferSchema -> let Spark infer each column's type from the data
raw_df = (
    spark.read
    .option('header', True)
    .option("quote", "\"")
    .option('mode', 'DROPMALFORMED')
    .option('inferSchema', True)
    .csv('LoanStats_web.csv')
    .select(businessAttrs_df)
)

                                                                                

In [11]:
# Spark นับจำนวน row ใน DataFrame

# Count the rows of raw_df; the value is shown as the cell output.
raw_df.count()

23/06/12 14:27:24 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

1432466

In [12]:
# Print the column names and types of raw_df (types come from inferSchema).
raw_df.printSchema()

root
 |-- loan_amnt: integer (nullable = true)
 |-- term: string (nullable = true)
 |-- int_rate: string (nullable = true)
 |-- installment: double (nullable = true)
 |-- grade: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: double (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- purpose: string (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- dti: string (nullable = true)
 |-- delinq_2yrs: string (nullable = true)
 |-- earliest_cr_line: string (nullable = true)
 |-- open_acc: integer (nullable = true)
 |-- pub_rec: integer (nullable = true)
 |-- revol_bal: integer (nullable = true)
 |-- revol_util: string (nullable = true)
 |-- total_acc: integer (nullable = true)
 |-- issue_d: string (nullable = true)



In [13]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

# Training frame: the single feature plus the label, with any row that has
# a null in either column removed.
df = raw_df.select('loan_amnt','loan_status').dropna()

# Names of the feature columns and of the (indexed) label column
feature_cols = ["loan_amnt"]
label_col = 'loan_status_index'

# Pipeline stages:
#   1. StringIndexer      -> maps the loan_status strings to numeric indices
#   2. VectorAssembler    -> packs the feature columns into one 'features' vector
#   3. LogisticRegression -> fitted on 'features' against the indexed label
# NOTE(review): loan_status is indexed as-is, so the fitted model has one
# coefficient row per distinct status (multiclass), while section 1.3 of this
# notebook states a binary task — confirm whether the statuses should be
# filtered first.
indexer = StringIndexer(inputCol="loan_status", outputCol=label_col)
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
lr = LogisticRegression(featuresCol='features', labelCol=label_col)
pipeline = Pipeline(stages=[indexer, assembler, lr])

# Fit all stages on the training frame
model = pipeline.fit(df)

# Fitted stages: the first holds the label ordering, the last the LR parameters
indexerModel = model.stages[0]
lr_model = model.stages[-1]
coef_matrix = lr_model.coefficientMatrix
intercepts = lr_model.interceptVector

# Show the coefficients and intercepts
print("Coefficient Matrix:\n", coef_matrix)
print("Intercept:\n", intercepts)

# Interpret the result: row i of the matrix / entry i of the intercepts
# belongs to the i-th label of the fitted StringIndexer.
labels = indexerModel.labels
for i, label in enumerate(labels):
    print(f"For '{label}', increasing the loan_amnt by 1 unit changes the log-odds by {coef_matrix[i,0]} and the baseline log-odds is {intercepts[i]}")


23/06/12 14:27:51 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
23/06/12 14:27:51 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

Coefficient Matrix:
 DenseMatrix([[-8.39450260e-08],
             [-2.23305203e-05],
             [-3.58586062e-06],
             [ 8.03578216e-06],
             [ 1.21715836e-05],
             [ 5.36224449e-06],
             [ 4.30715689e-07]])
Intercept:
 [3.600480184986742,3.6962485055588017,2.095058836467167,-0.1568018748944109,-1.0896159039396072,-1.6075993173585088,-6.537770430820183]
For 'Current', increasing the loan_amnt by 1 unit changes the log-odds by -8.394502603476724e-08 and the baseline log-odds is 3.600480184986742
For 'Fully Paid', increasing the loan_amnt by 1 unit changes the log-odds by -2.233052025496855e-05 and the baseline log-odds is 3.6962485055588017
For 'Charged Off', increasing the loan_amnt by 1 unit changes the log-odds by -3.585860621990128e-06 and the baseline log-odds is 2.095058836467167
For 'Late (31-120 days)', increasing the loan_amnt by 1 unit changes the log-odds by 8.035782163707356e-06 and the baseline log-odds is -0.1568018748944109
For 'In Gr