# การทำ t-test - ตัวอย่างโค้ดและคำอธิบายจาก ChatGPT: https://chat.openai.com/share/2c4ac2a4-86f1-414a-8fb1-2bc1d88294e2

In [1]:
# Configure and start the Spark session used by the rest of this notebook.
from pyspark.sql import SparkSession

# The application name is the label shown for this job in the Spark UI, so it
# describes what this notebook does (t-tests of numeric columns against
# loan_status) rather than carrying a name copied from another notebook.
spark = (
    SparkSession.builder
    .appName("t-test: loan_status vs. numeric features")
    .master("spark://spark-master:7077")        # standalone cluster master
    .config("spark.executor.memory", "1000m")   # memory per executor
    .config("spark.executor.cores", "2")        # cores per executor
    .config("spark.cores.max", "6")             # total cores for this application
    .getOrCreate()
)

23/06/12 13:35:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
# Notebook-wide settings and accumulators.

# Accumulators that store column names.
HIGHDISPERSION_LIST = []
HIGHNULL_LIST = []
NULL_LIST = []

# Above this value a column counts as "High Null". For such columns, deleting
# every row that contains a null may not be usable, because too many rows
# could be lost.
NULL_PERC = 0.1

# Above this value (%) a column counts as "High Dispersion".
DISPERSION_PERC = 100

In [3]:
from pyspark.sql import functions as sparkf
from pyspark.sql.types import *

In [4]:
#! pip install scipy

In [5]:
# เรียกใช้ Module/Library ของ Python

import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Pandas options
pd.set_option('display.max_colwidth', 1000, 'display.max_rows', None, 'display.max_columns', None)

# Plotting options
%matplotlib inline
mpl.style.use('ggplot')
sns.set(style='whitegrid')

import warnings
warnings.filterwarnings("ignore")

In [6]:
# ประกาศฟังก์ชั่นที่ใช้ทำ Histogram กับ Boxplot เพื่อวิเคราะห์ distribution (shape), outlier และความสัมพันธ์ (correlation)

def plot_var(arg_df, col_name, full_name, continuous):
    """
    Draw a two-panel figure for one column of a pandas DataFrame.

    The left panel shows the column on its own; the right panel shows it
    against the 'loan_status' column of the same DataFrame.
    - arg_df is the DataFrame holding col_name and a 'loan_status' column
    - col_name is the name of the column to plot
    - full_name is the label used for the axis labels and titles
    - continuous is True to treat the column as continuous, False otherwise
    """
    _, (overview_ax, status_ax) = plt.subplots(nrows=1, ncols=2, figsize=(12,3), dpi=90)

    # Left panel: the variable by itself.
    if not continuous:
        # One bar per distinct value, in sorted order.
        categories = sorted(arg_df[col_name].unique())
        sns.countplot(x=arg_df[col_name], order=categories, color='#5975A4', saturation=1, ax=overview_ax)
    else:
        # Histogram (no KDE curve) of the non-null values only.
        # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
        # confirm the installed version before upgrading.
        has_value = arg_df[col_name].notnull()
        sns.distplot(arg_df.loc[has_value, col_name], kde=False, ax=overview_ax)
    overview_ax.set_xlabel(full_name)
    overview_ax.set_ylabel('Count')
    overview_ax.set_title(full_name)

    # Right panel: the variable against loan_status.
    if not continuous:
        # Share of each loan_status within every category, from which only the
        # 'Charged Off' share is kept and drawn as a bar per category.
        status_share = arg_df.groupby(col_name)['loan_status'].value_counts(normalize=True)
        charged_off_share = status_share.loc[:,'Charged Off']
        sns.barplot(x=charged_off_share.index, y=charged_off_share.values, color='#5975A4', saturation=1, ax=status_ax)
        status_ax.set_ylabel('Fraction of arg_df Charged-off')
        status_ax.set_title('Charge-off Rate by ' + full_name)
    else:
        # One box per loan_status value, with the variable on the x axis.
        sns.boxplot(x=col_name, y='loan_status', data=arg_df, ax=status_ax)
        status_ax.set_ylabel('')
        status_ax.set_title(full_name + ' by Loan Status')
    status_ax.set_xlabel(full_name)

    plt.tight_layout()



## 1. Business Understanding
    
    1.1 Problem Statement: ต้องการทราบว่า ลูกหนี้แต่ละรายจะมาจ่ายหนี้ครบตามสัญญาเงินกู้ (Fully-paid) หรือไม่มาจ่ายฯ (Charged-off)
    1.2 Project Objective: การจัดเก็บหนี้ดีขึ้นช่วยเพิ่มรายได้ให้กับกิจการ
    1.3 Task of Data Science: Binary Classification
    1.4 Cleansing Policy: ธุรกิจมี columns ที่แนะนำว่ามีความสัมพันธ์/ส่งผลต่อการชำระหนี้คืนตามสัญญา, ลบได้ทั้ง row หากมี missing ใน columns และแทนที่ได้ตามความเหมาะสม
    1.5 Success Criteria: มี Recall/Sensitivity ไม่น้อยกว่า 0.65 บน Testing set แบบ Hold-out

## 2. Data Understanding
    
    - มี Label เป็น column: loan_status
    - มีขนาดใหญ่ (volume)เกินกว่าเทคโนโลยีปัจจุบัน (Python) จะทำงานได้อย่างมีประสิทธิภาพ จึงต้องใช้ Spark ร่วมด้วย
    - CSV เป็น semi-structured data ที่มี header ซึ่งสามารถนำไปพัฒนาเป็น schema ของ structured data (Spark DataFrame) ได้
    - Data Dict.: https://docs.google.com/spreadsheets/d/1qtZBSJ-JS7S2tGC0W9Yxp992LmrDaAwGcJB419Htbbw/edit#gid=1163295822

In [7]:
# Columns that the business recommended for this analysis.
# Despite the "_df" suffix this is a plain list of column names.
businessAttrs_df = [
    "loan_amnt",
    "term",
    "int_rate",
    "installment",
    "grade",
    "emp_length",
    "home_ownership",
    "annual_inc",
    "verification_status",
    "loan_status",
    "purpose",
    "addr_state",
    "dti",
    "delinq_2yrs",
    "earliest_cr_line",
    "open_acc",
    "pub_rec",
    "revol_bal",
    "revol_util",
    "total_acc",
    "issue_d",
]

In [8]:
! wc -l LoanStats_web.csv

1432493 LoanStats_web.csv


In [9]:
! head -3 LoanStats_web.csv

"id","member_id","loan_amnt","funded_amnt","funded_amnt_inv","term","int_rate","installment","grade","sub_grade","emp_title","emp_length","home_ownership","annual_inc","verification_status","issue_d","loan_status","pymnt_plan","url","desc","purpose","title","zip_code","addr_state","dti","delinq_2yrs","earliest_cr_line","inq_last_6mths","mths_since_last_delinq","mths_since_last_record","open_acc","pub_rec","revol_bal","revol_util","total_acc","initial_list_status","out_prncp","out_prncp_inv","total_pymnt","total_pymnt_inv","total_rec_prncp","total_rec_int","total_rec_late_fee","recoveries","collection_recovery_fee","last_pymnt_d","last_pymnt_amnt","next_pymnt_d","last_credit_pull_d","collections_12_mths_ex_med","mths_since_last_major_derog","policy_code","application_type","annual_inc_joint","dti_joint","verification_status_joint","acc_now_delinq","tot_coll_amt","tot_cur_bal","open_acc_6m","open_act_il","open_il_12m","open_il_24m","mths_since_rcnt_il","total_bal_il","il_util","open_rv_1

In [10]:
# Have Spark read the .csv file and convert it into a DataFrame, keeping only
# the columns listed in businessAttrs_df.
# - header: the first line supplies the column names
# - mode=DROPMALFORMED: rows that cannot be parsed are dropped, not reported
# - inferSchema: column types are inferred from the data
raw_df = (
    spark.read
    .option('header', True)
    .option("quote", "\"")
    .option('mode', 'DROPMALFORMED')
    .option('inferSchema', True)
    .csv('LoanStats_web.csv')
    .select(businessAttrs_df)
)

                                                                                

In [11]:
# Spark นับจำนวน row ใน DataFrame

raw_df.count()

23/06/12 13:35:52 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

1432466

In [12]:
raw_df.printSchema()

root
 |-- loan_amnt: integer (nullable = true)
 |-- term: string (nullable = true)
 |-- int_rate: string (nullable = true)
 |-- installment: double (nullable = true)
 |-- grade: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: double (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- purpose: string (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- dti: string (nullable = true)
 |-- delinq_2yrs: string (nullable = true)
 |-- earliest_cr_line: string (nullable = true)
 |-- open_acc: integer (nullable = true)
 |-- pub_rec: integer (nullable = true)
 |-- revol_bal: integer (nullable = true)
 |-- revol_util: string (nullable = true)
 |-- total_acc: integer (nullable = true)
 |-- issue_d: string (nullable = true)



In [13]:
raw_df.select('loan_amnt','loan_status').describe().show()



+-------+------------------+------------------+
|summary|         loan_amnt|       loan_status|
+-------+------------------+------------------+
|  count|           1432440|           1432440|
|   mean|15370.388358325654|              null|
| stddev| 9646.026272413836|              null|
|    min|              1000|       Charged Off|
|    max|             40000|Late (31-120 days)|
+-------+------------------+------------------+



                                                                                

In [14]:
# Keep only rows where both columns are non-null, then collect the result to
# the driver as a pandas DataFrame.
pandas_df = (
    raw_df
    .select('loan_amnt', 'loan_status')
    .dropna()
    .toPandas()
)

                                                                                

In [15]:
pandas_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1432440 entries, 0 to 1432439
Data columns (total 2 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   loan_amnt    1432440 non-null  int32 
 1   loan_status  1432440 non-null  object
dtypes: int32(1), object(1)
memory usage: 16.4+ MB


In [16]:
from scipy import stats

# Compute the p-value of an independent two-sample t-test comparing the mean
# loan_amnt of "Fully Paid" loans with that of "Charged Off" loans.
# NOTE(review): ttest_ind is called with its default equal_var setting;
# confirm that the equal-variance assumption is intended for these groups.
group1 = pandas_df.loc[pandas_df["loan_status"] == "Fully Paid", "loan_amnt"]
group2 = pandas_df.loc[pandas_df["loan_status"] == "Charged Off", "loan_amnt"]

t_stat, p_value = stats.ttest_ind(group1, group2)

print(f"t-statistic: {t_stat}", f"p-value: {p_value}", sep="\n")


t-statistic: -61.9248612596206
p-value: 0.0


In [44]:
raw_df.select('annual_inc','loan_status').describe().show()



+-------+------------------+------------------+
|summary|        annual_inc|       loan_status|
+-------+------------------+------------------+
|  count|           1432440|           1432440|
|   mean| 81034.58293296752|              null|
| stddev|134183.35696714383|              null|
|    min|               0.0|       Charged Off|
|    max|             6.1E7|Late (31-120 days)|
+-------+------------------+------------------+



                                                                                

In [45]:
# Keep only rows where both columns are non-null, then collect the result to
# the driver as a pandas DataFrame.
pandas_df = (
    raw_df
    .select('annual_inc', 'loan_status')
    .dropna()
    .toPandas()
)

                                                                                

In [46]:
pandas_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1432440 entries, 0 to 1432439
Data columns (total 2 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   annual_inc   1432440 non-null  float64
 1   loan_status  1432440 non-null  object 
dtypes: float64(1), object(1)
memory usage: 21.9+ MB


In [47]:
from scipy import stats

# Compute the p-value of an independent two-sample t-test comparing the mean
# annual_inc of "Fully Paid" loans with that of "Charged Off" loans.
# NOTE(review): ttest_ind is called with its default equal_var setting;
# confirm that the equal-variance assumption is intended for these groups.
group1 = pandas_df.loc[pandas_df["loan_status"] == "Fully Paid", "annual_inc"]
group2 = pandas_df.loc[pandas_df["loan_status"] == "Charged Off", "annual_inc"]

t_stat, p_value = stats.ttest_ind(group1, group2)

print(f"t-statistic: {t_stat}", f"p-value: {p_value}", sep="\n")


t-statistic: 31.88305276676603
p-value: 6.638497756538794e-223


In [17]:
raw_df.select('pub_rec','loan_status').describe().show()



+-------+-------------------+------------------+
|summary|            pub_rec|       loan_status|
+-------+-------------------+------------------+
|  count|            1432439|           1432440|
|   mean|0.20539234131435963|              null|
| stddev| 0.5861093697871824|              null|
|    min|                  0|       Charged Off|
|    max|                 61|Late (31-120 days)|
+-------+-------------------+------------------+



                                                                                

In [18]:
# Keep only rows where both columns are non-null, then collect the result to
# the driver as a pandas DataFrame.
pandas_df = (
    raw_df
    .select('pub_rec', 'loan_status')
    .dropna()
    .toPandas()
)

                                                                                

In [19]:
pandas_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1432439 entries, 0 to 1432438
Data columns (total 2 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   pub_rec      1432439 non-null  int32 
 1   loan_status  1432439 non-null  object
dtypes: int32(1), object(1)
memory usage: 16.4+ MB


In [20]:
from scipy import stats

# Compute the p-value of an independent two-sample t-test comparing the mean
# pub_rec of "Fully Paid" loans with that of "Charged Off" loans.
# NOTE(review): ttest_ind is called with its default equal_var setting;
# confirm that the equal-variance assumption is intended for these groups.
group1 = pandas_df.loc[pandas_df["loan_status"] == "Fully Paid", "pub_rec"]
group2 = pandas_df.loc[pandas_df["loan_status"] == "Charged Off", "pub_rec"]

t_stat, p_value = stats.ttest_ind(group1, group2)

print(f"t-statistic: {t_stat}", f"p-value: {p_value}", sep="\n")


t-statistic: -22.649620378468168
p-value: 1.5465007725031826e-113


In [25]:
raw_df.select('open_acc','loan_status').describe().show()



+-------+------------------+------------------+
|summary|          open_acc|       loan_status|
+-------+------------------+------------------+
|  count|           1432440|           1432440|
|   mean|11.710318756806569|              null|
| stddev| 5.841056149724124|              null|
|    min|                 0|       Charged Off|
|    max|               101|Late (31-120 days)|
+-------+------------------+------------------+



                                                                                

In [26]:
# Keep only rows where both columns are non-null, then collect the result to
# the driver as a pandas DataFrame.
pandas_df = (
    raw_df
    .select('open_acc', 'loan_status')
    .dropna()
    .toPandas()
)

                                                                                

In [27]:
pandas_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1432440 entries, 0 to 1432439
Data columns (total 2 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   open_acc     1432440 non-null  int32 
 1   loan_status  1432440 non-null  object
dtypes: int32(1), object(1)
memory usage: 16.4+ MB


In [28]:
from scipy import stats

# Compute the p-value of an independent two-sample t-test comparing the mean
# open_acc of "Fully Paid" loans with that of "Charged Off" loans.
# NOTE(review): ttest_ind is called with its default equal_var setting;
# confirm that the equal-variance assumption is intended for these groups.
group1 = pandas_df.loc[pandas_df["loan_status"] == "Fully Paid", "open_acc"]
group2 = pandas_df.loc[pandas_df["loan_status"] == "Charged Off", "open_acc"]

t_stat, p_value = stats.ttest_ind(group1, group2)

print(f"t-statistic: {t_stat}", f"p-value: {p_value}", sep="\n")


t-statistic: -12.916414225912762
p-value: 3.6737984709132366e-38


In [29]:
raw_df.select('term').describe().show()



+-------+----------+
|summary|      term|
+-------+----------+
|  count|   1432440|
|   mean|      null|
| stddev|      null|
|    min| 36 months|
|    max| 60 months|
+-------+----------+



                                                                                

In [30]:
raw_df.groupBy('term').count().show()

                                                                                

+----------+-------+
|      term|  count|
+----------+-------+
| 36 months|1036235|
|      null|     26|
| 60 months| 396205|
+----------+-------+



In [31]:
from pyspark.sql import functions as sparkf

In [32]:
# Preview: remove the ' months' text from term and show the row count for
# each resulting value.
(
    raw_df
    .select('term', 'loan_status')
    .withColumn('term', sparkf.regexp_replace(sparkf.col('term'), ' months', ''))
    .groupBy('term')
    .count()
    .show()
)

                                                                                

+----+-------+
|term|  count|
+----+-------+
|  60| 396205|
|null|     26|
|  36|1036235|
+----+-------+



In [39]:
from pyspark.sql.types import *

In [41]:
# Remove the ' months' text from term and cast what remains to an integer,
# drop rows with a null in either column, then collect the result to the
# driver as a pandas DataFrame.
pandas_df = (
    raw_df
    .select('term', 'loan_status')
    .withColumn(
        'term',
        sparkf.regexp_replace(sparkf.col('term'), ' months', '').cast(IntegerType()),
    )
    .dropna()
    .toPandas()
)

                                                                                

In [42]:
pandas_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1432440 entries, 0 to 1432439
Data columns (total 2 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   term         1432440 non-null  int32 
 1   loan_status  1432440 non-null  object
dtypes: int32(1), object(1)
memory usage: 16.4+ MB


In [43]:
from scipy import stats

# Compute the p-value of an independent two-sample t-test comparing the mean
# term of "Fully Paid" loans with that of "Charged Off" loans.
# NOTE(review): ttest_ind is called with its default equal_var setting;
# confirm that the equal-variance assumption is intended for these groups.
# NOTE(review): term appears to take only two values (36 and 60) in this
# data; confirm that a t-test is the intended test for it.
group1 = pandas_df.loc[pandas_df["loan_status"] == "Fully Paid", "term"]
group2 = pandas_df.loc[pandas_df["loan_status"] == "Charged Off", "term"]

t_stat, p_value = stats.ttest_ind(group1, group2)

print(f"t-statistic: {t_stat}", f"p-value: {p_value}", sep="\n")


t-statistic: -129.98767251616562
p-value: 0.0
