In [1]:
# ! wget https://storage.googleapis.com/tpqi3/LoanStats_web.csv

In [2]:
#!/usr/bin/env python
# coding: utf-8

import pandas as pd
from sqlalchemy import create_engine
import urllib


In [3]:

def load_file(filepath, columns):
    return pd.read_csv(filepath, usecols=columns)

def check_positive_int(df, column, max_length):
    df[f'incorrected_format_{column}'] = df[column].apply(
        lambda x: 'N/A' if pd.isna(x) else ('TRUE' if not str(x).isdigit() or not 1 <= int(x) <= max_length else 'FALSE')
    )
    return df

def check_digit_and_alphabet(df, column):
    pattern = r"^[a-zA-Z]{1}[0-9]{5}$"
    df[f'incorrected_format_{column}'] = df[column].apply(
        lambda x: 'N/A' if pd.isna(x) else ('TRUE' if not pd.Series(x).str.match(pattern).any() else 'FALSE')
    )
    return df

def check_exactly_fix_digits(df, column, fix_length):
    pattern = fr'^\d{{{fix_length}}}$'
    df[f'incorrected_format_{column}'] = df[column].apply(
        lambda x: 'N/A' if pd.isna(x) else ('TRUE' if not pd.Series(x).str.match(pattern).any() else 'FALSE')
    )
    return df

def check_specific_string(df, column, valid_strings):
    df[f'incorrected_format_{column}'] = df[column].apply(
        lambda x: 'N/A' if pd.isna(x) else ('TRUE' if x.strip() not in valid_strings else 'FALSE')
    )
    return df

def compute_summary(df, column):
    num_invalid = df[df[f'incorrected_format_{column}'] == 'TRUE'].shape[0]
    num_na = df[df[f'incorrected_format_{column}'] == 'N/A'].shape[0]
    total_rows = df.shape[0]
    print(f'Percentage of rows with incorrect {column} format: {(num_invalid/total_rows)*100}%')
    print(f'Number of rows with incorrect {column} format: {num_invalid}')
    print(f'Number of rows with {column} is NULL: {num_na}')

def save_to_sql(df, engine, table_name):
    # engine = create_engine(url)
    df.to_sql(table_name, con=engine, if_exists='replace', index=False)


In [4]:
if __name__ == "__main__":
    filepath = 'LoanStats_web.csv'
    columns = ['loan_amnt', 'term']
    df = load_file(filepath, columns)

    df = check_positive_int(df, 'loan_amnt', 3000)
    compute_summary(df, 'loan_amnt')

    df = check_specific_string(df, 'term', ['36 months', '60 months'])
    compute_summary(df, 'term')

    # Save to sql

    server = '34.125.58.101'
    database = 'TestDB'
    username = 'SA'
    password = 'Passw0rd123456'
    table_name = "loan_accuracy"
    
    # ตั้งค่าการเชื่อมต่อกับ MSSQL โดยใช้ข้อมูลที่เหมาะสมกับสภาพแวดล้อมของคุณ
    params = urllib.parse.quote_plus("DRIVER={ODBC Driver 17 for SQL Server};SERVER=server;DATABASE=database;UID=username;PWD=password")
    ##### engine = create_engine(f"mssql+pyodbc:///?odbc_connect={params}")
    # Using pymssql
    engine = create_engine(f'mssql+pymssql://{username}:{password}@{server}/{database}')
    
    save_to_sql(df, engine, table_name)


Percentage of rows with incorrect loan_amnt format: 99.99818494819424%
Number of rows with incorrect loan_amnt format: 1432440
Number of rows with loan_amnt is NULL: 26
Percentage of rows with incorrect term format: 0.0%
Number of rows with incorrect term format: 0
Number of rows with term is NULL: 26
