# 1. Profiling

In [1]:
import pandas as pd
# pd.set_option('display.float_format', lambda x: '%.4f' % x)
# pd.set_option('display.max_rows', None)  # None means no limit on the number of rows displayed


In [2]:
# Columns to read from the CSV for profiling; only the interest-rate column is profiled here.
selectedCol_list = ['int_rate']

In [3]:
# Load only the selected columns so the full CSV is not held in memory.
raw_df = pd.read_csv('LoanStats_web.csv', usecols=selectedCol_list)

In [4]:
# Row count, non-null count and dtype of each loaded column.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1432466 entries, 0 to 1432465
Data columns (total 1 columns):
 #   Column    Non-Null Count    Dtype 
---  ------    --------------    ----- 
 0   int_rate  1432440 non-null  object
dtypes: object(1)
memory usage: 10.9+ MB


In [5]:
# Summary of the raw (string) column: count, number of distinct values, most frequent value.
raw_df.describe(include='all')

Unnamed: 0,int_rate
count,1432440
unique,258
top,11.49%
freq,49032


In [6]:
# Strip the trailing '%' and cast to float to get numeric summary statistics.
raw_df['int_rate'].str.rstrip('%').astype('float').describe()

count    1.432440e+06
mean     1.299533e+01
std      5.046147e+00
min      5.310000e+00
25%      9.160000e+00
50%      1.213000e+01
75%      1.577000e+01
max      3.099000e+01
Name: int_rate, dtype: float64

In [7]:
# raw_df['int_rate'].str.rstrip('%').astype('float').sort_values(ascending=False)

In [8]:
# Number of rows per distinct int_rate string, ordered by the string value.
raw_df.groupby('int_rate').size()

int_rate
  5.31%     8613
  5.32%    36953
  6.00%      493
  6.07%     5019
  6.08%     2968
           ...  
 30.79%     1223
 30.84%      676
 30.89%      484
 30.94%      324
 30.99%      464
Length: 258, dtype: int64

In [9]:
# Number of rows per distinct int_rate string, most frequent first.
raw_df['int_rate'].value_counts()

int_rate
 11.49%    49032
 12.74%    41977
 13.49%    39202
 13.99%    39032
  5.32%    36953
           ...  
 25.99%        2
 21.99%        2
 15.29%        2
 13.19%        1
 11.16%        1
Name: count, Length: 258, dtype: int64

# 2. Data Rules and Their Use for Measurement

In [10]:
# Data rules: one regular expression per column, describing the expected format.
# int_rate: one or more digits, a dot, then exactly two digits (e.g. "11.49").
# A value carrying a "%" suffix or leading whitespace does not match this pattern.
data_rules = dict(
    int_rate=r'^\d+\.\d{2}$',
)

In [11]:
import getpass
import os
import re
import urllib
import urllib.parse

import pandas as pd
from sqlalchemy import create_engine

def load_file(filepath, columns):
    """Read a CSV file, keeping only the requested columns.

    Args:
        filepath: Path (or any source accepted by ``pd.read_csv``) of the CSV.
        columns: Column names to load; passed to ``read_csv`` as ``usecols``.

    Returns:
        A DataFrame containing only the requested columns.
    """
    frame = pd.read_csv(filepath, usecols=columns)
    return frame

def validate_data(df, data_rules):
    """Flag each value of the rule-covered columns as "TRUE", "FALSE" or "NULL".

    For every ``column -> rule`` pair a column named
    ``corrected_format_<column>`` is written to ``df``:

    * ``"NULL"``  - the value is missing (``pd.isna``),
    * ``"TRUE"``  - the value satisfies the rule,
    * ``"FALSE"`` - the value does not satisfy the rule.

    Args:
        df: DataFrame holding the columns named in ``data_rules``. It is
            modified in place: the flag columns are added to it.
        data_rules: Mapping of column name to rule. A rule is either a regular
            expression (pattern string or compiled pattern), applied with
            ``match`` semantics to ``str(value)``, or a callable that receives
            the raw non-null value and whose truthiness decides the flag.

    Returns:
        The same ``df`` object, with the flag columns added.

    Raises:
        KeyError: If a column named in ``data_rules`` is not present in ``df``.
    """
    for column, rule in data_rules.items():
        if callable(rule):
            # Custom validation function: called with the raw value.
            check = rule
        else:
            # Compile once per column instead of resolving the pattern per row.
            pattern_match = re.compile(rule).match
            check = lambda value, _match=pattern_match: _match(str(value))

        # Missing values are reported separately and never reach the rule.
        df[f'corrected_format_{column}'] = df[column].apply(
            lambda x: "NULL" if pd.isna(x) else ("TRUE" if check(x) else "FALSE")
        )
    return df

def compute_summary(df, column):
    """Print the validation summary of one column.

    Reads the ``corrected_format_<column>`` flags ("NULL" / "TRUE" / "FALSE")
    from ``df`` and prints the row counts per flag, plus the share of correct
    and incorrect values among the non-null rows.

    Args:
        df: DataFrame containing the ``corrected_format_<column>`` column.
        column: Name of the validated column to report on.

    Returns:
        None. The report is written to standard output.
    """
    flags = df[f'corrected_format_{column}']
    total_rows = df.shape[0]
    num_na = (flags == "NULL").sum()
    num_correct = (flags == "TRUE").sum()
    num_incorrect = (flags == "FALSE").sum()
    num_not_na = total_rows - num_na
    # Percentages are relative to the non-null rows, so they need at least one.
    has_values = num_not_na > 0

    def share(count):
        """Express ``count`` as a percentage of the non-null rows."""
        return count / num_not_na * 100

    print(f'Total rows: {total_rows}')
    print(f'Number of rows with {column} is NULL: {num_na}')
    print(f'Number of rows with {column} is not NULL: {num_not_na}')

    print(f'Number of rows with correct {column} format and not null: {num_correct}')
    if not has_values:
        print("No non-null data available to calculate percentage of correct format.")
    else:
        print(f'Percentage of rows with correct {column} format and not null: {share(num_correct):.2f}%')

    print(f'Number of rows with incorrect {column} format and not null: {num_incorrect}')
    if has_values:
        print(f'Percentage of rows with incorrect {column} format and not null: {share(num_incorrect):.2f}%')

    print("-----------------------------------------------------------------------------------")

def save_to_sql(df, engine, table_name):
    """Write ``df`` to the SQL table ``table_name``.

    An existing table of the same name is replaced, and the DataFrame index
    is not written as a column.

    Args:
        df: DataFrame to persist.
        engine: Database connection/engine handed to ``DataFrame.to_sql``.
        table_name: Name of the destination table.
    """
    write_options = {'con': engine, 'if_exists': 'replace', 'index': False}
    df.to_sql(table_name, **write_options)

In [12]:
# Main script: load the rule-covered columns, measure how well they conform to
# the data rules, print a summary per column, and store the flagged rows in SQL.
if __name__ == "__main__":
    filepath = 'LoanStats_web.csv'
    columns = list(data_rules.keys())
    df = load_file(filepath, columns)
    df = validate_data(df, data_rules)
    for column in data_rules.keys():
        compute_summary(df, column)

    # Save to SQL.
    # Configure the MSSQL connection with values appropriate for your
    # environment. Settings are read from environment variables so that
    # credentials are not stored in the notebook; the password has no default
    # and is prompted for when MSSQL_PASSWORD is not set.
    server = os.environ.get('MSSQL_SERVER', '34.125.58.101')
    database = os.environ.get('MSSQL_DATABASE', 'TestDB')
    username = os.environ.get('MSSQL_USERNAME', 'SA')
    password = os.environ.get('MSSQL_PASSWORD') or getpass.getpass('MSSQL password: ')
    table_name = "loan_data_assessment"

    # Using pymssql. Credentials are percent-encoded so that characters such as
    # '@', ':' or '/' in them cannot corrupt the connection URL.
    engine = create_engine(
        f'mssql+pymssql://{urllib.parse.quote_plus(username)}:'
        f'{urllib.parse.quote_plus(password)}@{server}/{database}'
    )

    save_to_sql(df, engine, table_name)

Total rows: 1432466
Number of rows with int_rate is NULL: 26
Number of rows with int_rate is not NULL: 1432440
Number of rows with correct int_rate format and not null: 0
Percentage of rows with correct int_rate format and not null: 0.00%
Number of rows with incorrect int_rate format and not null: 1432440
Percentage of rows with incorrect int_rate format and not null: 100.00%
-----------------------------------------------------------------------------------
