In [1]:
import polars as pl
import pandas as pd
from typing import Dict, List, Any, Tuple
import logging
from pathlib import Path
from ydata_profiling import ProfileReport
import json

In [5]:
# Module-level logger named after this module; no handlers or levels are
# configured here.
logger = logging.getLogger(__name__)


class DataProfiler:
    """
    Analyzes datasets to identify quality issues and patterns.

    The CSV is loaded once into a Polars DataFrame (``self.df``). The
    lightweight checks run directly on that frame; it is converted to
    pandas only for the ydata-profiling HTML report.
    """

    # How many non-null values per string column the mixed-case check inspects.
    _CASE_SAMPLE_SIZE = 100

    def __init__(self, file_path: Path):
        """
        Initialize the profiler with a dataset.

        We use Polars for efficient data loading and initial analysis,
        then convert to pandas only when necessary for compatibility.

        Args:
            file_path: Location of the CSV file to profile.
        """
        self.file_path = file_path
        # ignore_errors=True asks Polars to keep reading when some lines
        # yield parse errors instead of aborting the load.
        # NOTE(review): values that fail to parse presumably surface as
        # nulls, which would inflate the missing-value figures -- confirm.
        self.df = pl.read_csv(file_path, ignore_errors=True)
        logger.info("Loaded dataset with shape: %s", self.df.shape)

    @staticmethod
    def _percentage(part: int, total: int) -> float:
        """Return ``part`` as a percentage of ``total`` (0.0 when ``total`` is 0)."""
        if not total:
            return 0.0
        return part / total * 100

    def generate_basic_profile(self) -> Dict[str, Any]:
        """
        Generate basic statistics about the dataset.
        This helps us understand the scale and structure of our data.

        Returns:
            A dict with the keys ``shape``, ``columns``, ``dtypes``,
            ``memory_usage_mb`` and ``column_profiles``. Each entry of
            ``column_profiles`` holds ``null_count``, ``null_percentage``,
            ``unique_count`` and ``dtype``; numeric columns additionally
            get ``mean``, ``std``, ``min`` and ``max``.
        """
        profile: Dict[str, Any] = {
            "shape": self.df.shape,
            "columns": self.df.columns,
            "dtypes": {
                col: str(dtype)
                for col, dtype in zip(self.df.columns, self.df.dtypes)
            },
            "memory_usage_mb": self.df.estimated_size() / (1024 * 1024),
        }

        # Analyze each column
        column_profiles: Dict[str, Dict[str, Any]] = {}
        for col in self.df.columns:
            col_data = self.df[col]
            null_count = col_data.null_count()
            column_profiles[col] = {
                "null_count": null_count,
                # Guarded so a header-only file does not divide by zero.
                "null_percentage": self._percentage(null_count, len(col_data)),
                "unique_count": col_data.n_unique(),
                "dtype": str(col_data.dtype),
            }

            # Add statistics for every numeric dtype, not only Int64/Float64,
            # so narrower integer, unsigned and Float32 columns are covered.
            if col_data.dtype.is_numeric():
                column_profiles[col].update(
                    {
                        "mean": col_data.mean(),
                        "std": col_data.std(),
                        "min": col_data.min(),
                        "max": col_data.max(),
                    }
                )

        profile["column_profiles"] = column_profiles
        return profile

    def identify_data_quality_issues(self) -> Dict[str, List[str]]:
        """
        Identify potential data quality issues.
        This is where we start to understand what rules we might need.

        Returns:
            A dict mapping each issue category to a list of human-readable
            findings. ``outliers`` and ``data_type_issues`` are part of the
            result but no check populates them yet, so they are always
            empty lists.
        """
        issues: Dict[str, List[str]] = {
            "missing_values": [],
            "potential_duplicates": [],
            "inconsistent_formats": [],
            "outliers": [],
            "data_type_issues": [],
        }
        total_rows = len(self.df)

        # Check for missing values
        for col in self.df.columns:
            null_pct = self._percentage(self.df[col].null_count(), total_rows)
            if null_pct > 0:
                issues["missing_values"].append(
                    f"{col}: {null_pct:.2f}% missing"
                )

        # Check for potential duplicates (simplified check): rows that are
        # identical across every column.
        duplicate_count = total_rows - len(self.df.unique())
        if duplicate_count > 0:
            issues["potential_duplicates"].append(
                f"Found {duplicate_count} potential duplicate rows"
            )

        # Check for inconsistent string formats
        for col in self.df.columns:
            if self.df[col].dtype != pl.Utf8:
                continue
            # Only the first few non-null values are inspected. A value
            # counts as mixed case when it is neither fully lower-case nor
            # fully upper-case (so "Title Case" values are flagged too).
            sample = self.df[col].drop_nulls().head(self._CASE_SAMPLE_SIZE)
            has_mixed_case = any(
                s != s.lower() and s != s.upper() for s in sample
            )
            if has_mixed_case:
                issues["inconsistent_formats"].append(
                    f"{col}: Mixed case detected"
                )

        return issues

    def generate_detailed_report(self, output_path: str = "../output/data_profile.html"):
        """
        Generate a comprehensive HTML report using ydata-profiling.
        This gives us a visual understanding of our data quality issues.

        Args:
            output_path: Destination of the HTML report. Missing parent
                directories are created.

        Returns:
            The ``ProfileReport`` object that was written to disk.
        """
        # Convert to pandas for ydata-profiling compatibility
        df_pandas = self.df.to_pandas()

        profile = ProfileReport(
            df_pandas,
            title="Retail Sales Data Quality Report",
            explorative=True,
        )

        # Make sure the target directory exists so the export does not fail
        # on a fresh checkout where "../output" has not been created yet.
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

        profile.to_file(output_path)
        logger.info("Detailed report saved to %s", output_path)

        return profile


if __name__ == "__main__":
    # Smoke-run the profiler against the sample retail dataset.
    profiler = DataProfiler(Path("../data/retail_store_sales.csv"))

    # Summary statistics, dumped as JSON; values json cannot encode
    # natively are passed through str().
    basic_profile = profiler.generate_basic_profile()
    print(json.dumps(basic_profile, indent=2, default=str))

    # Quality findings, one section per category; empty categories are skipped.
    issues = profiler.identify_data_quality_issues()
    print("\nData Quality Issues:")
    for issue_type, issue_list in issues.items():
        if not issue_list:
            continue
        print(f"\n{issue_type}:")
        print("\n".join(f"  - {issue}" for issue in issue_list))

    # Full HTML report, written to the method's default output path.
    profiler.generate_detailed_report()

{
  "shape": [
    12575,
    11
  ],
  "columns": [
    "Transaction ID",
    "Customer ID",
    "Category",
    "Item",
    "Price Per Unit",
    "Quantity",
    "Total Spent",
    "Payment Method",
    "Location",
    "Transaction Date",
    "Discount Applied"
  ],
  "dtypes": {
    "Transaction ID": "String",
    "Customer ID": "String",
    "Category": "String",
    "Item": "String",
    "Price Per Unit": "Float64",
    "Quantity": "Float64",
    "Total Spent": "Float64",
    "Payment Method": "String",
    "Location": "String",
    "Transaction Date": "String",
    "Discount Applied": "Boolean"
  },
  "memory_usage_mb": 1.1220464706420898,
  "column_profiles": {
    "Transaction ID": {
      "null_count": 0,
      "null_percentage": 0.0,
      "unique_count": 12575,
      "dtype": "String"
    },
    "Customer ID": {
      "null_count": 0,
      "null_percentage": 0.0,
      "unique_count": 25,
      "dtype": "String"
    },
    "Category": {
      "null_count": 0,
      "null_pe

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|                                                                                           | 0/11 [00:00<?, ?it/s][A
100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 24.34it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]