In [None]:
# Cell 1: Imports and MySQL Initialization with Temporary DF Creation and Cleaning
import os, sys
sys.path.append(os.path.abspath('..'))  # allow importing project root modules

import polars as pl
import pandas as pd
from sqlalchemy import create_engine, text
import logging

# Root logger setup: emit INFO and above, formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

from quality_reports import DataQualityReport
from value_reports import ValueDistributionReport

# Connection and base table (Trino is used to query the MySQL catalog).
# The connection URL can be overridden with the TRINO_CONN_STR environment variable so
# that hosts and credentials never have to be typed into (or committed with) the notebook.
# When the variable is unset, the previous default URL (user "root", no password) is used.
conn_str = os.environ.get("TRINO_CONN_STR", "trino://root@3.108.199.0:32092/mysql/ap")
base_table_name = "people_data"  # original table

# Create engine: plain-HTTP scheme, no authentication object passed to the driver
engine = create_engine(conn_str, connect_args={'http_scheme': 'http', 'auth': None})

# Pull the base table (capped at 1,000,000 rows by the LIMIT clause) into a temporary Pandas DF.
query = f"SELECT * FROM {base_table_name} LIMIT 1000000"
try:
    df_pandas = pd.read_sql(text(query), engine)
except Exception as e:
    # Log the failure, then re-raise so the cell stops instead of continuing without data.
    logging.error(f"Failed to fetch data: {str(e)}")
    raise
else:
    logging.info(f"Fetched {len(df_pandas)} rows from {base_table_name}")

# Hand the fetched rows over to Polars; all later cleaning works on this temporary DF.
df = pl.from_pandas(df_pandas)

# Perform cleaning and data handling on the temporary Polars DF.
# Every string column except 'RecordID' is cleaned in a single with_columns pass.
# Order matters: newlines/tabs/carriage returns/'%' are turned into spaces and runs of
# whitespace are collapsed BEFORE trimming, so spaces introduced by those replacements
# at either end of a value are removed as well (e.g. '%abc' -> 'abc', '%' -> '' -> NULL).
# Trimming first would leave ' abc' / ' ' behind and the latter would never become NULL.
null_tokens = {'': None, 'nan': None, 'None': None, r'\N': None}
string_cols = [col for col, dtype in df.schema.items() if dtype == pl.Utf8 and col != 'RecordID']
if string_cols:  # nothing to clean when the frame has no string columns
    df = df.with_columns(
        pl.col(string_cols)
        .str.replace_all(r'[\n\r\t%]', ' ')
        .str.replace_all(r'\s+', ' ')
        .str.strip_chars()
        .replace(null_tokens)  # placeholder strings become real NULLs
    )

# Add RecordID (row index starting at 1) if not present
if 'RecordID' not in df.columns:
    df = df.with_row_index(name='RecordID', offset=1)

logging.info(f"Sanitized temporary Polars DF, columns: {df.columns}")

# Build both report objects against the same connection string and base table.
dq_report = DataQualityReport(conn_str, base_table_name)
vd_report = ValueDistributionReport(conn_str, base_table_name, use_altair=True)

# Point each report at the cleaned temporary DF (data-quality report first, then distribution).
for report in (dq_report, vd_report):
    report.df = df

# Summary of what this cell did, printed one line at a time.
summary_lines = (
    f"📊 Reports initialized for table: {base_table_name}",
    f"🧹 Created and sanitized temporary Polars DF from {base_table_name} (no views)",
    f"   • Removed newlines (\\n), tabs (\\t), carriage returns (\\r), and % signs (replaced with space)",
    f"   • Normalized multiple spaces to single space",
    f"   • Treated empty strings, 'nan', 'None', and '\\N' as NULL",
    f"📊 Reports will use this cleaned temporary DF for all analysis",
)
for line in summary_lines:
    print(line)

2025-09-09 12:41:22,648 - INFO - Fetched 50578 rows from people_data
2025-09-09 12:41:23,018 - INFO - Sanitized temporary Polars DF, columns: ['RecordID', 'full_name', 'first_and_surname', 'first_name', 'surname', 'dob', 'birth_place', 'postcode_fake', 'gender', 'occupation', 'email', 'phone', 'address', 'city', 'country', 'postal_code']
2025-09-09 12:41:23,020 - INFO - Initialized DataQualityReport for people_data_view
2025-09-09 12:41:23,021 - INFO - Initialized ValueDistributionReport for people_data_view


📊 Reports initialized for table: people_data
🧹 Created and sanitized temporary Polars DF from people_data (no views)
   • Removed newlines (\n), tabs (\t), carriage returns (\r), and % signs (replaced with space)
   • Normalized multiple spaces to single space
   • Treated empty strings, 'nan', 'None' as NULL
   • DOB conversion deferred to report classes
📊 Reports will use this cleaned temporary DF for all analysis


In [2]:
# Cell 2: MySQL Completeness Report (uses the temporary cleaned DF)
# Calls get_completeness() on the DataQualityReport instance created in Cell 1 and keeps
# the returned value in completeness_df so later cells can inspect it.
# NOTE(review): presumably the result is computed from dq_report.df (the cleaned frame
# assigned in Cell 1) rather than from a fresh query — confirm in quality_reports.
completeness_df = dq_report.get_completeness()

In [3]:
from quality_reports import DataQualityReport

# Build the animated dashboard from the data-quality report with height=800, then confirm.
dashboard_height = 800
dq_report.create_animated_dashboard(height=dashboard_height)
print("🎯 Animated dashboard created successfully!")

🎯 Animated dashboard created successfully!


In [4]:

from value_reports import ValueDistributionReport

try:
    # Run the distribution analysis over all columns with top_n=10 and per-column plots on.
    analysis_options = {"top_n": 10, "show_individual_plots": True}
    results = vd_report.analyze_all_columns(**analysis_options)
    print("📊 Value Distribution Analysis Complete!")
    print(f"✅ Analyzed {len(results)} columns with clean, sanitized data")
except Exception as exc:
    # Best-effort cell: report the failure and its stack trace instead of aborting the run.
    import traceback
    print(f"Error during analysis: {str(exc)}")
    traceback.print_exc()  # Print full stack trace for debugging

2025-09-09 12:41:23,710 - INFO - Creating visualization for full_name, type: categorical
2025-09-09 12:41:23,712 - INFO - Preparing categorical data for full_name, dtype: String
2025-09-09 12:41:23,768 - INFO - Creating categorical chart for full_name, data columns: ['full_name', 'Count']


Processing column: full_name, type: categorical
vc columns after rename: ['value', 'Count']


2025-09-09 12:41:23,865 - INFO - Creating visualization for first_and_surname, type: categorical
2025-09-09 12:41:23,870 - INFO - Preparing categorical data for first_and_surname, dtype: String


Processing column: first_and_surname, type: categorical
vc columns after rename: ['value', 'Count']


2025-09-09 12:41:23,921 - INFO - Creating categorical chart for first_and_surname, data columns: ['first_and_surname', 'Count']


2025-09-09 12:41:24,019 - INFO - Creating visualization for first_name, type: categorical
2025-09-09 12:41:24,020 - INFO - Preparing categorical data for first_name, dtype: String


Processing column: first_name, type: categorical
vc columns after rename: ['value', 'Count']


2025-09-09 12:41:24,058 - INFO - Creating categorical chart for first_name, data columns: ['first_name', 'Count']


2025-09-09 12:41:24,225 - INFO - Creating visualization for surname, type: categorical
2025-09-09 12:41:24,226 - INFO - Preparing categorical data for surname, dtype: String
2025-09-09 12:41:24,244 - INFO - Creating categorical chart for surname, data columns: ['surname', 'Count']


Processing column: surname, type: categorical
vc columns after rename: ['value', 'Count']


2025-09-09 12:41:24,422 - INFO - Creating visualization for dob, type: datetime
2025-09-09 12:41:24,424 - INFO - Creating histogram for dob, x_label: dob, dtype: Date
2025-09-09 12:41:24,427 - INFO - Creating percentile chart for dob


Processing column: dob, type: datetime
vc columns after rename: ['value', 'Count']


2025-09-09 12:41:24,656 - INFO - Creating visualization for birth_place, type: categorical
2025-09-09 12:41:24,657 - INFO - Preparing categorical data for birth_place, dtype: String
2025-09-09 12:41:24,678 - INFO - Creating categorical chart for birth_place, data columns: ['birth_place', 'Count']


Processing column: birth_place, type: categorical
vc columns after rename: ['value', 'Count']


Processing column: postcode_fake, type: categorical
vc columns after rename: ['value', 'Count']


2025-09-09 12:41:24,838 - INFO - Creating visualization for postcode_fake, type: categorical
2025-09-09 12:41:24,839 - INFO - Preparing categorical data for postcode_fake, dtype: String
2025-09-09 12:41:24,879 - INFO - Creating categorical chart for postcode_fake, data columns: ['postcode_fake', 'Count']


2025-09-09 12:41:25,014 - INFO - Creating visualization for gender, type: categorical
2025-09-09 12:41:25,016 - INFO - Preparing categorical data for gender, dtype: String
2025-09-09 12:41:25,036 - INFO - Creating categorical chart for gender, data columns: ['gender', 'Count']


Processing column: gender, type: categorical
vc columns after rename: ['value', 'Count']


Processing column: occupation, type: categorical
vc columns after rename: ['value', 'Count']


2025-09-09 12:41:25,183 - INFO - Creating visualization for occupation, type: categorical
2025-09-09 12:41:25,185 - INFO - Preparing categorical data for occupation, dtype: String
2025-09-09 12:41:25,200 - INFO - Creating categorical chart for occupation, data columns: ['occupation', 'Count']


2025-09-09 12:41:25,270 - INFO - Creating visualization for email, type: categorical
2025-09-09 12:41:25,271 - INFO - Preparing categorical data for email, dtype: String
2025-09-09 12:41:25,309 - INFO - Creating categorical chart for email, data columns: ['email', 'Count']


Processing column: email, type: categorical
vc columns after rename: ['value', 'Count']


2025-09-09 12:41:25,376 - INFO - Creating visualization for phone, type: categorical
2025-09-09 12:41:25,377 - INFO - Preparing categorical data for phone, dtype: String
2025-09-09 12:41:25,420 - INFO - Creating categorical chart for phone, data columns: ['phone', 'Count']


Processing column: phone, type: categorical
vc columns after rename: ['value', 'Count']


Processing column: address, type: categorical
vc columns after rename: ['value', 'Count']


2025-09-09 12:41:25,568 - INFO - Creating visualization for address, type: categorical
2025-09-09 12:41:25,571 - INFO - Preparing categorical data for address, dtype: String
2025-09-09 12:41:25,654 - INFO - Creating categorical chart for address, data columns: ['address', 'Count']


2025-09-09 12:41:25,776 - INFO - Creating visualization for city, type: categorical
2025-09-09 12:41:25,777 - INFO - Preparing categorical data for city, dtype: String
2025-09-09 12:41:25,804 - INFO - Creating categorical chart for city, data columns: ['city', 'Count']


Processing column: city, type: categorical
vc columns after rename: ['value', 'Count']


2025-09-09 12:41:26,066 - INFO - Creating visualization for country, type: categorical
2025-09-09 12:41:26,068 - INFO - Preparing categorical data for country, dtype: String
2025-09-09 12:41:26,120 - INFO - Creating categorical chart for country, data columns: ['country', 'Count']


Processing column: country, type: categorical
vc columns after rename: ['value', 'Count']


2025-09-09 12:41:26,282 - INFO - Creating visualization for postal_code, type: categorical
2025-09-09 12:41:26,286 - INFO - Preparing categorical data for postal_code, dtype: String
2025-09-09 12:41:26,312 - INFO - Creating categorical chart for postal_code, data columns: ['postal_code', 'Count']


Processing column: postal_code, type: categorical
vc columns after rename: ['value', 'Count']


2025-09-09 12:41:26,391 - INFO - Preparing categorical data for full_name, dtype: String
2025-09-09 12:41:26,427 - INFO - Creating categorical chart for full_name, data columns: ['full_name', 'Count']
2025-09-09 12:41:26,500 - INFO - Preparing categorical data for first_and_surname, dtype: String
2025-09-09 12:41:26,540 - INFO - Creating categorical chart for first_and_surname, data columns: ['first_and_surname', 'Count']
2025-09-09 12:41:26,593 - INFO - Preparing categorical data for first_name, dtype: String
2025-09-09 12:41:26,612 - INFO - Creating categorical chart for first_name, data columns: ['first_name', 'Count']
2025-09-09 12:41:26,673 - INFO - Preparing categorical data for surname, dtype: String
2025-09-09 12:41:26,689 - INFO - Creating categorical chart for surname, data columns: ['surname', 'Count']
2025-09-09 12:41:26,741 - INFO - Creating histogram for dob, x_label: dob, dtype: Date
2025-09-09 12:41:26,744 - INFO - Preparing categorical data for birth_place, dtype: Stri

📊 Value Distribution Analysis Complete!
✅ Analyzed 15 columns with clean, sanitized data
