# 1. Profiling

In [1]:
import pandas as pd
# pd.set_option('display.float_format', lambda x: '%.4f' % x)
# pd.set_option('display.max_rows', None)  # None means no limit on the number of rows displayed


# 2. Data Rules and Their Use for Measurement

In [2]:
# Data-quality rules, keyed by column name.
#   * str value      -> regular expression tested against str(value) with re.match
#   * callable value -> called once per value; returns "TRUE", "FALSE" or "NULL"
def _loan_amnt_in_range(x, lower=1000, upper=40000):
    """Classify a loan amount as "NULL", "TRUE" or "FALSE".

    Args:
        x: Raw cell value (number, numeric string, or missing).
        lower: Smallest accepted amount, inclusive.
        upper: Largest accepted amount, inclusive.

    Returns:
        "NULL" when ``x`` is missing, "TRUE" when ``lower <= float(x) <= upper``,
        otherwise "FALSE". Values that cannot be converted to float are
        reported as "FALSE" rather than raising, so a single malformed cell
        cannot abort validation of the whole column.
    """
    if pd.isna(x):
        return "NULL"
    try:
        amount = float(x)
    except (TypeError, ValueError):
        return "FALSE"
    return "TRUE" if lower <= amount <= upper else "FALSE"


data_rules = {
    # NOTE(review): in the recorded run this pattern matched 0 non-null rows and
    # the most frequent raw value is '11.49%'; the trailing '%' is rejected here.
    # Confirm whether the rule or the data is the intended standard.
    'int_rate': r'^\d+\.\d{2}$',
    # NOTE(review): describe() reports 11 distinct raw values and 11 alternatives
    # are listed here, yet 8.94% of non-null rows failed in the recorded run, so
    # at least one raw spelling differs from this list - verify against the data.
    'emp_length': r'^(less than 1 year|1 year|2 years|3 years|4 years|5 years|6 years|7 years|8 years|9 years|10\+ years)$',
    'loan_amnt': _loan_amnt_in_range,
    'loan_status': r'^(Charged Off|Current|Default|Fully Paid|In Grace Period|Late \(16-30 days\)|Late \(31-120 days\))$'
}

In [3]:
import re
import pandas as pd
from sqlalchemy import create_engine
import urllib

def load_file(filepath, columns):
    """Read a CSV and keep only the requested columns.

    Args:
        filepath: Location of the CSV, passed straight to ``pd.read_csv``.
        columns: Column names to load (forwarded as ``usecols``).

    Returns:
        A DataFrame holding just the selected columns.
    """
    selected = pd.read_csv(filepath, usecols=columns)
    return selected

def validate_data(df, data_rules):
    """Flag every value of the rule-covered columns as "TRUE", "FALSE" or "NULL".

    For each ``column -> rule`` pair a new column ``corrected_format_<column>``
    is written into ``df`` (the frame is modified in place and also returned).

    Args:
        df: DataFrame containing every column named in ``data_rules``.
        data_rules: Mapping of column name to a rule. A rule is either
            * a regex given as ``str`` or as a compiled ``re.Pattern``; it is
              tested with ``match`` against ``str(value)``, or
            * a callable taking one value and returning the flag string.
            Rules of any other type are skipped, and no result column is
            created for them.

    Returns:
        The same ``df`` object, with the result columns added.

    Raises:
        KeyError: If a column named in ``data_rules`` is absent from ``df``.
    """
    for column, rule in data_rules.items():
        result_column = f'corrected_format_{column}'
        if isinstance(rule, (str, re.Pattern)):  # Regex pattern (raw or pre-compiled)
            # Compile once per column instead of resolving the pattern per row;
            # re.compile returns an already-compiled pattern unchanged.
            match = re.compile(rule).match
            df[result_column] = df[column].apply(
                lambda x: "NULL" if pd.isna(x) else ("TRUE" if match(str(x)) else "FALSE")
            )
        elif callable(rule):  # Function to handle complex validations
            # Missing values never reach the rule; they are flagged here.
            df[result_column] = df[column].apply(
                lambda x: "NULL" if pd.isna(x) else rule(x)
            )
    return df

def compute_summary(df, column):
    """Print the validation tallies for one column.

    Reads ``corrected_format_<column>`` from ``df`` and reports how many rows
    are flagged "NULL", "TRUE" and "FALSE". Percentages are relative to the
    rows that are not flagged "NULL" and are only printed when at least one
    such row exists.

    Args:
        df: DataFrame that already holds the ``corrected_format_<column>`` flags.
        column: Name of the validated source column.

    Returns:
        None. The report is written to standard output.
    """
    flags = df[f'corrected_format_{column}']
    total_rows = len(df.index)
    num_na, num_correct, num_incorrect = (
        (flags == label).sum() for label in ("NULL", "TRUE", "FALSE")
    )
    num_not_na = total_rows - num_na
    has_values = num_not_na > 0  # guards both percentage divisions

    report = [
        f'Total rows: {total_rows}',
        f'Number of rows with {column} is NULL: {num_na}',
        f'Number of rows with {column} is not NULL: {num_not_na}',
        f'Number of rows with correct {column} format and not null: {num_correct}',
    ]
    if has_values:
        report.append(
            f'Percentage of rows with correct {column} format and not null: {num_correct / num_not_na * 100:.2f}%'
        )
    else:
        report.append("No non-null data available to calculate percentage of correct format.")
    report.append(f'Number of rows with incorrect {column} format and not null: {num_incorrect}')
    if has_values:
        report.append(
            f'Percentage of rows with incorrect {column} format and not null: {num_incorrect / num_not_na * 100:.2f}%'
        )
    report.append("-----------------------------------------------------------------------------------")

    for line in report:
        print(line)

def save_to_sql(df, engine, table_name):
    """Write ``df`` to the SQL table ``table_name``, replacing an existing table.

    Args:
        df: DataFrame to persist.
        engine: Connection or engine object handed to ``DataFrame.to_sql`` as ``con``.
        table_name: Name of the destination table.

    The DataFrame index is not written as a column.
    """
    write_options = {'con': engine, 'if_exists': 'replace', 'index': False}
    df.to_sql(table_name, **write_options)

In [4]:
# Driver cell: load only the rule-covered columns, flag them, then report per column.
if __name__ == "__main__":
    filepath = 'LoanStats_web.csv'
    columns = list(data_rules)  # one entry per rule, in rule order
    df = validate_data(load_file(filepath, columns), data_rules)
    for column in columns:
        compute_summary(df, column)


Total rows: 1432466
Number of rows with int_rate is NULL: 26
Number of rows with int_rate is not NULL: 1432440
Number of rows with correct int_rate format and not null: 0
Percentage of rows with correct int_rate format and not null: 0.00%
Number of rows with incorrect int_rate format and not null: 1432440
Percentage of rows with incorrect int_rate format and not null: 100.00%
-----------------------------------------------------------------------------------
Total rows: 1432466
Number of rows with emp_length is NULL: 108496
Number of rows with emp_length is not NULL: 1323970
Number of rows with correct emp_length format and not null: 1205577
Percentage of rows with correct emp_length format and not null: 91.06%
Number of rows with incorrect emp_length format and not null: 118393
Percentage of rows with incorrect emp_length format and not null: 8.94%
-----------------------------------------------------------------------------------
Total rows: 1432466
Number of rows with loan_amnt is N

In [5]:
# Show dtypes and non-null counts for the loaded columns and the derived
# corrected_format_* flag columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1432466 entries, 0 to 1432465
Data columns (total 8 columns):
 #   Column                        Non-Null Count    Dtype  
---  ------                        --------------    -----  
 0   loan_amnt                     1432440 non-null  float64
 1   int_rate                      1432440 non-null  object 
 2   emp_length                    1323970 non-null  object 
 3   loan_status                   1432440 non-null  object 
 4   corrected_format_int_rate     1432466 non-null  object 
 5   corrected_format_emp_length   1432466 non-null  object 
 6   corrected_format_loan_amnt    1432466 non-null  object 
 7   corrected_format_loan_status  1432466 non-null  object 
dtypes: float64(1), object(7)
memory usage: 87.4+ MB


In [6]:
# Display the validated column names (a fresh list, same order as `columns`).
list(columns)

['int_rate', 'emp_length', 'loan_amnt', 'loan_status']

In [7]:
# Summary statistics of the raw (unflagged) columns, numeric and non-numeric alike.
raw_columns = list(columns)
df[raw_columns].describe(include='all')

Unnamed: 0,int_rate,emp_length,loan_amnt,loan_status
count,1432440,1323970,1432440.0,1432440
unique,258,11,,7
top,11.49%,10+ years,,Current
freq,49032,478304,,702223
mean,,,15370.39,
std,,,9646.026,
min,,,1000.0,
25%,,,8000.0,
50%,,,13000.0,
75%,,,20000.0,


In [8]:
# Row count per int_rate validation flag.
df.groupby('corrected_format_int_rate').size()

corrected_format_int_rate
FALSE    1432440
NULL          26
dtype: int64

In [9]:
# Frequency of each int_rate validation flag, most frequent first.
df[['corrected_format_int_rate']].value_counts()

corrected_format_int_rate
FALSE                        1432440
NULL                              26
Name: count, dtype: int64