In [2]:
import pandas as pd
from sqlalchemy import create_engine

def load_file(filepath, columns):
    """Read a CSV file, keeping only the requested columns.

    Args:
        filepath: Path (or file-like object) handed to ``pandas.read_csv``.
        columns: Column selection forwarded as ``usecols``.

    Returns:
        A DataFrame containing just the selected columns.
    """
    selected = pd.read_csv(filepath, usecols=columns)
    return selected

def validate_data(df, validation_rules):
    """Flag values in dataframe columns that do not match a regex pattern.

    For every ``column -> pattern`` pair in ``validation_rules`` a new column
    named ``incorrected_format_<column>`` is added to ``df``. Each cell of the
    new column holds one of three strings:

    * ``'N/A'``   - the source value is missing (NaN / None / NA, or the
      literal text ``'nan'``),
    * ``'FALSE'`` - the string form of the value matches the pattern,
    * ``'TRUE'``  - the string form of the value does not match the pattern.

    Matching uses ``Series.str.match``, i.e. the pattern is applied at the
    start of the string; add ``$`` to the pattern to require a full match.

    Args:
        df: DataFrame to validate. It is modified in place.
        validation_rules: Mapping of column name to regex pattern.

    Returns:
        The same ``df`` object, with one flag column added per rule.

    Raises:
        KeyError: If a column named in ``validation_rules`` is not in ``df``.
    """
    for column, pattern in validation_rules.items():
        values = df[column]
        text = values.astype(str)

        # Missing values must be detected on the ORIGINAL values: once they
        # are converted to text, None becomes 'None' and would be reported as
        # a format error instead of 'N/A'. The literal 'nan' text is still
        # treated as missing to stay compatible with earlier behaviour.
        is_missing = values.isna() | (text == 'nan')

        # NOTE: values are matched on their str() form, so a float column
        # renders 5000 as '5000.0' - patterns must account for that.
        matches = text.str.match(pattern)

        # One vectorized pass over the column instead of building a new
        # Series for every single cell.
        flags = matches.map({True: 'FALSE', False: 'TRUE'})
        df[f'incorrected_format_{column}'] = flags.mask(is_missing, 'N/A')
    return df


def compute_summary(df, column):
    """Print summary of validation results for a specific column.

    Reads the ``incorrected_format_<column>`` flag column of ``df`` and prints
    three lines: the percentage of rows flagged ``'TRUE'`` (relative to all
    rows, missing ones included), the count of rows flagged ``'TRUE'``, and
    the count of rows flagged ``'N/A'``.

    Args:
        df: DataFrame that already contains the flag column for ``column``.
        column: Name of the validated source column.

    Raises:
        KeyError: If the flag column for ``column`` is not present in ``df``.
    """
    flags = df[f'incorrected_format_{column}']
    num_invalid = int((flags == 'TRUE').sum())
    num_na = int((flags == 'N/A').sum())
    total_rows = df.shape[0]

    # An empty frame has nothing invalid; report 0% rather than dividing by zero.
    pct_invalid = (num_invalid / total_rows) * 100 if total_rows else 0.0

    print(f'Percentage of rows with incorrect {column} format: {pct_invalid:.2f}%')
    print(f'Number of rows with incorrect {column} format: {num_invalid}')
    print(f'Number of rows with {column} is NULL: {num_na}')

def save_to_sql(df, engine, table_name):
    """Write a DataFrame into a SQL table, replacing any existing table.

    Args:
        df: DataFrame to persist; its index is not written.
        engine: Connection/engine object passed to ``DataFrame.to_sql`` as ``con``.
        table_name: Name of the destination table.
    """
    write_options = {
        'con': engine,
        'if_exists': 'replace',  # drop and recreate the table if it already exists
        'index': False,
    }
    df.to_sql(table_name, **write_options)

if __name__ == "__main__":
    # Local imports: only the script entry point needs these.
    import os
    from urllib.parse import quote_plus

    filepath = 'LoanStats_web.csv'
    columns = ['loan_amnt', 'term']
    df = load_file(filepath, columns)

    # Column name -> regex the column's string form is expected to match.
    # NOTE(review): the recorded run of this cell flagged 100% of the non-null
    # rows for both columns, which suggests the patterns do not fit the data
    # (possibly loan_amnt being parsed as float and rendered like '5000.0',
    # or whitespace around the term values) - confirm against the CSV.
    validation_rules = {
        'loan_amnt': r'^[0-9]{1,4}$',  # Adjust the regex according to your needs
        'term': r'^(36 months|60 months)$'
    }

    df = validate_data(df, validation_rules)
    for column in validation_rules:
        compute_summary(df, column)

    # Connection settings are taken from the environment so that credentials
    # do not live in source control. Non-secret values keep their previous
    # defaults; the password deliberately has no in-source default.
    server = os.environ.get('LOAN_DB_SERVER', '34.125.58.101')
    database = os.environ.get('LOAN_DB_NAME', 'TestDB')
    username = os.environ.get('LOAN_DB_USER', 'SA')
    password = os.environ.get('LOAN_DB_PASSWORD', '')
    table_name = "loan_accuracy"

    # The password is URL-escaped so characters such as '@', '/' or ':' cannot
    # corrupt the connection URL.
    engine = create_engine(
        f'mssql+pymssql://{username}:{quote_plus(password)}@{server}/{database}'
    )
    # Writing the results to the database is disabled; uncomment to persist.
    #save_to_sql(df, engine, table_name)


Percentage of rows with incorrect loan_amnt format: 100.00%
Number of rows with incorrect loan_amnt format: 1432440
Number of rows with loan_amnt is NULL: 26
Percentage of rows with incorrect term format: 100.00%
Number of rows with incorrect term format: 1432440
Number of rows with term is NULL: 26
