In [1]:
"""
Example using Pandas with the Titanic dataset.
"""

import pandas as pd

import pysuricata

In [2]:
# Import required modules
import numpy as np
from pysuricata.config import EngineConfig

# Create a large dataset with many missing columns to demonstrate the new functionality
print("🔍 Creating a large dataset with varied missing patterns...")

np.random.seed(42)  # For reproducible results

# Create a dataset with 100 columns and 10,000 rows
data = {}
column_types = []

for i in range(100):
    col_name = f"feature_{i:03d}"
    
    # Vary missing percentages: 0% to 50% missing data
    missing_pct = (i % 11) * 5  # 0%, 5%, 10%, 15%, 20%, 25%, 30%, 35%, 40%, 45%, 50%
    
    # Create different column types with missing data
    if i < 30:  # Numeric columns
        values = np.random.normal(100, 20, 10000)
        column_types.append("numeric")
    elif i < 60:  # Categorical columns
        categories = ['A', 'B', 'C', 'D', 'E']
        values = np.random.choice(categories, 10000)
        column_types.append("categorical")
    elif i < 80:  # Boolean columns
        values = np.random.choice([True, False], 10000)
        column_types.append("boolean")
    else:  # Datetime columns
        values = pd.date_range('2020-01-01', periods=10000, freq='1h')
        column_types.append("datetime")
    
    # Introduce missing values based on the missing percentage
    if missing_pct > 0:
        missing_mask = np.random.random(10000) < (missing_pct / 100)
        values[missing_mask] = None
    
    data[col_name] = values

# Create DataFrame
df = pd.DataFrame(data)

print(f"✅ Dataset created: {df.shape[0]:,} rows × {df.shape[1]:,} columns")
print(f"📊 Column types: {len([t for t in column_types if t == 'numeric'])} numeric, "
      f"{len([t for t in column_types if t == 'categorical'])} categorical, "
      f"{len([t for t in column_types if t == 'boolean'])} boolean, "
      f"{len([t for t in column_types if t == 'datetime'])} datetime")

# Show missing data statistics
missing_stats = df.isnull().sum()
missing_with_data = missing_stats[missing_stats > 0]
print(f"🔍 Columns with missing data: {len(missing_with_data)} out of {len(df.columns)}")
print(f"📈 Missing data percentages: {missing_with_data.min():.1f}% - {missing_with_data.max():.1f}%")

🔍 Creating a large dataset with varied missing patterns...


TypeError: Index does not support mutable operations

In [None]:
# Generate reports with different configurations to demonstrate the intelligent missing columns

print("\n🎯 Generating reports with different configurations...")

# 1. Default configuration (shows the intelligent behavior)
print("\n1️⃣ Default Configuration:")
default_config = EngineConfig()
default_report = pysuricata.profile(df, config=default_config)
default_report.save_html("pandas_missing_columns_default.html")
print("✅ Saved: pandas_missing_columns_default.html")

# 2. Strict configuration (only shows columns with >5% missing)
print("\n2️⃣ Strict Configuration (threshold: 5%):")
strict_config = EngineConfig(
    missing_columns_threshold_pct=5.0,
    missing_columns_max_initial=5,
    missing_columns_max_expanded=15
)
strict_report = pysuricata.profile(df, config=strict_config)
strict_report.save_html("pandas_missing_columns_strict.html")
print("✅ Saved: pandas_missing_columns_strict.html")

# 3. Lenient configuration (shows columns with >0.1% missing)
print("\n3️⃣ Lenient Configuration (threshold: 0.1%):")
lenient_config = EngineConfig(
    missing_columns_threshold_pct=0.1,
    missing_columns_max_initial=15,
    missing_columns_max_expanded=50
)
lenient_report = pysuricata.profile(df, config=lenient_config)
lenient_report.save_html("pandas_missing_columns_lenient.html")
print("✅ Saved: pandas_missing_columns_lenient.html")

print("\n🎉 All reports generated successfully!")
print("\n📚 What to look for in the reports:")
print("• Summary section → Top missing columns")
print("• Dynamic display limits based on dataset size")
print("• 'Show X more...' button for expandable UI")
print("• Smart filtering of insignificant missing data")
print("• Color-coded severity indicators (low/medium/high)")
lenient_report = pysuricata.profile(df, config=lenient_config)
lenient_report.save_html("pandas_missing_columns_lenient.html")
print("✅ Saved: pandas_missing_columns_lenient.html")

print("\n🎉 All reports generated successfully!")
print("\n📚 What to look for in the reports:")
print("• Summary section → Top missing columns")
print("• Dynamic display limits based on dataset size")
print("• 'Show X more...' button for expandable UI")
print("• Smart filtering of insignificant missing data")
print("• Color-coded severity indicators (low/medium/high)")

In [None]:
# Display the default report in the Jupyter Notebook
print("📊 Displaying the default report with intelligent missing columns:")
default_report

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
495,0,3,male,,0,0,14.4583,C,Third,man,True,,Cherbourg,no,True
648,0,3,male,,0,0,7.55,S,Third,man,True,,Southampton,no,True
278,0,3,male,7.0,4,1,29.125,Q,Third,child,False,,Queenstown,no,False
31,1,1,female,,1,0,146.5208,C,First,woman,False,B,Cherbourg,yes,False
255,1,3,female,29.0,0,2,15.2458,C,Third,woman,False,,Cherbourg,yes,False
298,1,1,male,,0,0,30.5,S,First,man,True,C,Southampton,yes,True
609,1,1,female,40.0,0,0,153.4625,S,First,woman,False,C,Southampton,yes,True
318,1,1,female,31.0,0,2,164.8667,S,First,woman,False,C,Southampton,yes,False
484,1,1,male,25.0,1,0,91.0792,C,First,man,True,B,Cherbourg,yes,False
367,1,3,female,,0,0,7.2292,C,Third,woman,False,,Cherbourg,yes,True

0,1
Count,891
Unique,2 (≈)
Missing,0 (0.0%)
Mode,0
Mode %,61.6%
Empty strings,549

0,1
Entropy,0.9607
Rare levels,0 (0.0%)
Top 5 coverage,100.0%
Label length (avg),
Length p90,—
Processed bytes,0.0 B (≈)

Rank,Value,Count,Frequency,Distribution
1ˢᵗ,0,549,61.6%,
2ⁿᵈ,1,342,38.4%,

Original,lower(),strip()
—,—,—

0,1
Count,891
Unique,3 (≈)
Missing,0 (0.0%)
Mode,3
Mode %,55.1%
Empty strings,0

0,1
Entropy,1.439
Rare levels,0 (0.0%)
Top 5 coverage,100.0%
Label length (avg),
Length p90,—
Processed bytes,0.0 B (≈)

Rank,Value,Count,Frequency,Distribution
1ˢᵗ,3,491,55.1%,
2ⁿᵈ,1,216,24.2%,
3ʳᵈ,2,184,20.7%,

Original,lower(),strip()
—,—,—

0,1
Count,891
Unique,2 (≈)
Missing,0 (0.0%)
Mode,male
Mode %,64.8%
Empty strings,0

0,1
Entropy,0.9362
Rare levels,0 (0.0%)
Top 5 coverage,100.0%
Label length (avg),
Length p90,—
Processed bytes,0.0 B (≈)

Rank,Value,Count,Frequency,Distribution
1ˢᵗ,male,577,64.8%,
2ⁿᵈ,female,314,35.2%,

Original,lower(),strip()
—,—,—

0,1
Count,714
Unique,88
Missing,177 (19.9%)
Outliers,11 (1.5%)
Zeros,0 (0.0%)
Infinites,0 (0.0%)
Negatives,0 (0.0%)

0,1
Min,0.42
Q1 (P25),20.12
Median,28
Mean,29.7
Q3 (P75),38
Max,80
Processed bytes,7.1 KB (≈)

0,1
Mean,29.7
Std Dev,14.53
Variance,211
Std Error,0.5436
Coeff. of Var,0.4891
Geometric mean,24.43
IQR,17.88
MAD,9
Skew,0.3875
Kurtosis,0.1598

0,1
Min,0.42
P1,1 (≈)
P5,4 (≈)
P10,14 (≈)
Q1 (P25),20.12
Median (P50),28
Q3 (P75),38
P90,50 (≈)
P95,56 (≈)
P99,65.87 (≈)

Rank,Value,Count,Frequency,Distribution
1ˢᵗ,18,4,0.6%,
2ⁿᵈ,19,2,0.3%,
3ʳᵈ,25,2,0.3%,
4ᵗʰ,27,2,0.3%,
5ᵗʰ,39,2,0.3%,
6ᵗʰ,22,1,0.1%,
7ᵗʰ,24,1,0.1%,
8ᵗʰ,26,1,0.1%,
9ᵗʰ,28,1,0.1%,
10ᵗʰ,31,1,0.1%,

Index,Value
803,0.42
755,0.67
644,0.75
469,0.75
78,0.83

Index,Value
630,80.0
851,74.0
493,71.0
96,71.0
116,70.5

Rank,Index,Value,Method,Severity,Extremity
1ˢᵗ,630,80.0,IQR Method,High (2.3× IQR),
2ⁿᵈ,630,80.0,MAD Method,Extreme (5.8× MAD),
3ʳᵈ,851,74.0,IQR Method,High (2.0× IQR),
4ᵗʰ,493,71.0,IQR Method,Moderate (1.8× IQR),
5ᵗʰ,116,70.5,IQR Method,Moderate (1.8× IQR),
6ᵗʰ,—,70.0,IQR Method,Moderate (1.8× IQR),
7ᵗʰ,—,66.0,IQR Method,Moderate (1.6× IQR),
8ᵗʰ,—,65.0,IQR Method,Moderate (1.5× IQR),

0,1
Count,891
Unique,7 (≈)
Missing,0 (0.0%)
Mode,0
Mode %,68.2%
Empty strings,608

0,1
Entropy,1.339
Rare levels,2 (1.3%)
Top 5 coverage,98.7%
Label length (avg),
Length p90,—
Processed bytes,0.0 B (≈)

Rank,Value,Count,Frequency,Distribution
1ˢᵗ,0,608,68.2%,
2ⁿᵈ,1,209,23.5%,
3ʳᵈ,2,28,3.1%,
4ᵗʰ,4,18,2.0%,
5ᵗʰ,3,16,1.8%,
6ᵗʰ,8,7,0.8%,
7ᵗʰ,5,5,0.6%,

Original,lower(),strip()
—,—,—

0,1
Count,891
Unique,7 (≈)
Missing,0 (0.0%)
Mode,0
Mode %,76.1%
Empty strings,678

0,1
Entropy,1.128
Rare levels,4 (1.7%)
Top 5 coverage,99.4%
Label length (avg),
Length p90,—
Processed bytes,0.0 B (≈)

Rank,Value,Count,Frequency,Distribution
1ˢᵗ,0,678,76.1%,
2ⁿᵈ,1,118,13.2%,
3ʳᵈ,2,80,9.0%,
4ᵗʰ,3,5,0.6%,
5ᵗʰ,5,5,0.6%,
6ᵗʰ,4,4,0.4%,
7ᵗʰ,6,1,0.1%,

Original,lower(),strip()
—,—,—

0,1
Count,891
Unique,721
Missing,0 (0.0%)
Outliers,116 (13.0%)
Zeros,15 (1.7%)
Infinites,0 (0.0%)
Negatives,0 (0.0%)

0,1
Min,0
Q1 (P25),7.91
Median,14.45
Mean,32.2
Q3 (P75),31
Max,512.3
Processed bytes,7.1 KB (≈)

0,1
Mean,32.2
Std Dev,49.69
Variance,2469
Std Error,1.665
Coeff. of Var,1.543
Geometric mean,18.98
IQR,23.09
MAD,6.904
Skew,4.771
Kurtosis,33.12

0,1
Min,0
P1,0 (≈)
P5,7.225 (≈)
P10,7.55 (≈)
Q1 (P25),7.91
Median (P50),14.45
Q3 (P75),31
P90,77.96 (≈)
P95,112.1 (≈)
P99,249 (≈)

Rank,Value,Count,Frequency,Distribution
1ˢᵗ,13.0,3,0.3%,
2ⁿᵈ,7.896,3,0.3%,
3ʳᵈ,30.0,1,0.1%,
4ᵗʰ,7.75,1,0.1%,
5ᵗʰ,8.05,1,0.1%,

Index,Value
271,0
263,0
732,0
466,0
302,0

Index,Value
258,512.3
737,512.3
679,512.3
27,263.0
438,263.0

Rank,Index,Value,Method,Severity,Extremity
1ˢᵗ,258,512.3,IQR Method,Extreme (20.8× IQR),
2ⁿᵈ,258,512.3,MAD Method,Extreme (72.1× MAD),
3ʳᵈ,27,263.0,IQR Method,Extreme (10.0× IQR),
4ᵗʰ,27,263.0,MAD Method,Extreme (36.0× MAD),
5ᵗʰ,—,262.4,IQR Method,Extreme (10.0× IQR),
6ᵗʰ,—,262.4,MAD Method,Extreme (35.9× MAD),
7ᵗʰ,—,247.5,IQR Method,Extreme (9.4× IQR),
8ᵗʰ,—,247.5,MAD Method,Extreme (33.8× MAD),
9ᵗʰ,—,227.5,IQR Method,Extreme (8.5× IQR),
10ᵗʰ,—,227.5,MAD Method,Extreme (30.9× MAD),

0,1
Count,889
Unique,3 (≈)
Missing,2 (0.2%)
Mode,S
Mode %,72.4%
Empty strings,0

0,1
Entropy,1.097
Rare levels,0 (0.0%)
Top 5 coverage,100.0%
Label length (avg),
Length p90,—
Processed bytes,0.0 B (≈)

Rank,Value,Count,Frequency,Distribution
1ˢᵗ,S,644,72.4%,
2ⁿᵈ,C,168,18.9%,
3ʳᵈ,Q,77,8.7%,

Original,lower(),strip()
S,s,S
C,c,C
Q,q,Q

0,1
Count,891
Unique,3 (≈)
Missing,0 (0.0%)
Mode,Third
Mode %,55.1%
Empty strings,0

0,1
Entropy,1.439
Rare levels,0 (0.0%)
Top 5 coverage,100.0%
Label length (avg),
Length p90,—
Processed bytes,0.0 B (≈)

Rank,Value,Count,Frequency,Distribution
1ˢᵗ,Third,491,55.1%,
2ⁿᵈ,First,216,24.2%,
3ʳᵈ,Second,184,20.7%,

Original,lower(),strip()
Third,third,Third
First,first,First
Second,second,Second

0,1
Count,891
Unique,3 (≈)
Missing,0 (0.0%)
Mode,man
Mode %,60.3%
Empty strings,0

0,1
Entropy,1.282
Rare levels,0 (0.0%)
Top 5 coverage,100.0%
Label length (avg),
Length p90,—
Processed bytes,0.0 B (≈)

Rank,Value,Count,Frequency,Distribution
1ˢᵗ,man,537,60.3%,
2ⁿᵈ,woman,271,30.4%,
3ʳᵈ,child,83,9.3%,

Original,lower(),strip()
—,—,—

0,1
Count,891
Unique,2 (≈)
Missing,0 (0.0%)
Mode,True
Mode %,60.3%
Empty strings,0

0,1
Entropy,0.9694
Rare levels,0 (0.0%)
Top 5 coverage,100.0%
Label length (avg),
Length p90,—
Processed bytes,0.0 B (≈)

Rank,Value,Count,Frequency,Distribution
1ˢᵗ,True,537,60.3%,
2ⁿᵈ,False,354,39.7%,

Original,lower(),strip()
True,True,True
False,False,False

0,1
Count,203
Unique,7 (≈)
Missing,688 (77.2%)
Mode,C
Mode %,29.1%
Empty strings,0

0,1
Entropy,2.496
Rare levels,0 (0.0%)
Top 5 coverage,91.6%
Label length (avg),
Length p90,—
Processed bytes,0.0 B (≈)

Rank,Value,Count,Frequency,Distribution
1ˢᵗ,C,59,29.1%,
2ⁿᵈ,B,47,23.2%,
3ʳᵈ,D,33,16.3%,
4ᵗʰ,E,32,15.8%,
5ᵗʰ,A,15,7.4%,
6ᵗʰ,F,13,6.4%,
7ᵗʰ,G,4,2.0%,

Original,lower(),strip()
C,c,C
B,b,B
D,d,D
E,e,E
A,a,A
F,f,F

0,1
Count,889
Unique,3 (≈)
Missing,2 (0.2%)
Mode,Southampton
Mode %,72.4%
Empty strings,0

0,1
Entropy,1.097
Rare levels,0 (0.0%)
Top 5 coverage,100.0%
Label length (avg),
Length p90,—
Processed bytes,0.0 B (≈)

Rank,Value,Count,Frequency,Distribution
1ˢᵗ,Southampton,644,72.4%,
2ⁿᵈ,Cherbourg,168,18.9%,
3ʳᵈ,Queenstown,77,8.7%,

Original,lower(),strip()
Southampton,southampton,Southampton
Cherbourg,cherbourg,Cherbourg
Queenstown,queenstown,Queenstown

0,1
Count,891
Unique,2 (≈)
Missing,0 (0.0%)
Mode,no
Mode %,61.6%
Empty strings,0

0,1
Entropy,0.9607
Rare levels,0 (0.0%)
Top 5 coverage,100.0%
Label length (avg),
Length p90,—
Processed bytes,0.0 B (≈)

Rank,Value,Count,Frequency,Distribution
1ˢᵗ,no,549,61.6%,
2ⁿᵈ,yes,342,38.4%,

Original,lower(),strip()
—,—,—

0,1
Count,891
Missing,0 (0.0%)
Unique,2

0,1
True,537 (60.3%)
False,354 (39.7%)
Processed bytes,"1,023.0 B (≈)"

Value,Count,Frequency,Distribution
True,537,60.3%,
False,354,39.7%,


In [None]:
# Analyze the missing columns behavior and compare configurations
print("🔍 ANALYZING MISSING COLUMNS BEHAVIOR")
print("=" * 50)

# Show detailed missing data analysis
missing_counts = df.isnull().sum()
missing_pct = (missing_counts / len(df)) * 100
missing_with_data = missing_pct[missing_pct > 0].sort_values(ascending=False)

print(f"\n📊 Missing Data Analysis:")
print(f"• Total columns: {len(df.columns)}")
print(f"• Columns with missing data: {len(missing_with_data)}")
print(f"• Columns with ≥0.5% missing: {len(missing_with_data[missing_with_data >= 0.5])}")
print(f"• Columns with ≥5% missing: {len(missing_with_data[missing_with_data >= 5])}")
print(f"• Columns with ≥20% missing: {len(missing_with_data[missing_with_data >= 20])}")

print(f"\n🎯 Top 15 Missing Columns:")
for i, (col, pct) in enumerate(missing_with_data.head(15).items(), 1):
    severity = "🔴 HIGH" if pct >= 20 else "🟡 MEDIUM" if pct >= 5 else "🟢 LOW"
    print(f"{i:2d}. {col}: {pct:5.1f}% {severity}")

print(f"\n⚙️ Configuration Comparison:")
configs = [
    ("Default", default_config, "Shows top columns with ≥0.5% missing"),
    ("Strict", strict_config, "Shows only columns with ≥5% missing"),
    ("Lenient", lenient_config, "Shows columns with ≥0.1% missing")
]

for name, config, description in configs:
    print(f"• {name}: {description}")
    print(f"  - Threshold: {config.missing_columns_threshold_pct}%")
    print(f"  - Max initial: {config.missing_columns_max_initial}")
    print(f"  - Max expanded: {config.missing_columns_max_expanded}")

print(f"\n✅ Reports generated successfully!")
print(f"• Default: {len(default_report.html):,} characters")
print(f"• Strict: {len(strict_report.html):,} characters") 
print(f"• Lenient: {len(lenient_report.html):,} characters")

print(f"\n🎉 Key Features Demonstrated:")
print(f"✅ Dynamic display limits based on dataset size (100 columns)")
print(f"✅ Smart filtering of insignificant missing data")
print(f"✅ Expandable UI with 'Show X more...' functionality")
print(f"✅ Configurable thresholds and limits")
print(f"✅ Visual severity indicators (low/medium/high)")
print(f"✅ Responsive design and accessibility")

Report type: <class 'pysuricata.api.Report'>
HTML length: 1032058 characters
Stats keys: ['dataset', 'columns']
✅ Report generated successfully!


In [None]:
# Bonus: Compare with the original Titanic dataset to show the difference
print("🚢 BONUS: Comparing with Titanic dataset (small dataset behavior)")
print("=" * 60)

# Load the Titanic dataset
titanic_url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv"
titanic_df = pd.read_csv(titanic_url)

print(f"📊 Titanic Dataset: {titanic_df.shape[0]:,} rows × {titanic_df.shape[1]:,} columns")

# Show missing data in Titanic
titanic_missing = titanic_df.isnull().sum()
titanic_missing_pct = (titanic_missing / len(titanic_df)) * 100
titanic_with_missing = titanic_missing_pct[titanic_missing_pct > 0].sort_values(ascending=False)

print(f"🔍 Columns with missing data: {len(titanic_with_missing)} out of {len(titanic_df.columns)}")
print(f"📈 Missing percentages: {titanic_with_missing.min():.1f}% - {titanic_with_missing.max():.1f}%")

print(f"\n🎯 Missing Columns in Titanic:")
for col, pct in titanic_with_missing.items():
    severity = "🔴 HIGH" if pct >= 20 else "🟡 MEDIUM" if pct >= 5 else "🟢 LOW"
    print(f"• {col}: {pct:5.1f}% {severity}")

# Generate Titanic report to show small dataset behavior
print(f"\n📊 Generating Titanic report...")
titanic_report = pysuricata.profile(titanic_df)
titanic_report.save_html("pandas_titanic_small_dataset.html")
print("✅ Saved: pandas_titanic_small_dataset.html")

print(f"\n🔄 COMPARISON SUMMARY:")
print(f"• Large Dataset (100 cols): Shows dynamic limits with expandable UI")
print(f"• Small Dataset (Titanic, {len(titanic_df.columns)} cols): Shows all missing columns")
print(f"• Both demonstrate intelligent adaptation to dataset size!")
print(f"• Check the HTML files to see the different behaviors!")
