12 changes: 5 additions & 7 deletions CONTRIBUTING.md
@@ -13,22 +13,20 @@ For new contributors, we suggest starting with Easy tasks.
### Fork and clone the repository
```bash
git clone https://github.com/cachevector/hashprep.git
cd hashprep
```

### CLI
### Create a virtual environment

```bash
cd /hashprep/cli/
python -m venv .venv
uv venv
source .venv/bin/activate

pip install -r requirements.txt
```

### Run the CLI
### Install in development mode

```bash
python hashprep/cli/main.py --help
uv sync --extra dev
```

<!-- To keep your fork updated:
34 changes: 18 additions & 16 deletions README.md
@@ -1,29 +1,28 @@
<div align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="docs/assets/hashprep-wobg.svg" width="100">
<img alt="Shows an illustrated sun in light color mode and a moon with stars in dark color mode." src="docs/hashprep-dark.svg" width="100">
<img alt="HashPrep Logo" src="docs/assets/hashprep-dark.svg" width="100">
</picture>

<h1>HashPrep</h1>
<p>
<b> Dataset Debugging Playground </b>
<b> Dataset Profiler & Debugger for Machine Learning </b>
</p>

<p align="center">
<!-- Deployment -->
<img src="https://img.shields.io/badge/Web%20Version-Self%20Hosted-0A66C2" />
<img src="https://img.shields.io/badge/CLI-Supported-orange" />
<!-- Stack -->
<img src="https://img.shields.io/badge/UI-Svelte-ff3e00?logo=svelte" />
<img src="https://img.shields.io/badge/Backend-FastAPI-009688?logo=fastapi" />
<img src="https://img.shields.io/badge/DB-Postgres-336791?logo=postgresql" />
<!-- Distribution -->
<!-- <img src="https://img.shields.io/pypi/v/hashprep?color=blue&label=PyPI" /> -->
<img src="https://img.shields.io/badge/PyPI-Coming%20Soon-blue" />
<!-- License -->
<img src="https://img.shields.io/badge/License-MIT-green" />
<img src="https://img.shields.io/badge/CLI-Supported-orange" />
</p>
<p>
<!-- Features -->
<img src="https://img.shields.io/badge/Feature-Dataset%20Quality%20Assurance-critical" />
<img src="https://img.shields.io/badge/Feature-Preprocessing%20%2B%20Profiling-blueviolet" />
<img src="https://img.shields.io/badge/Feature-Report%20Generation-3f4f75" />
<img src="https://img.shields.io/badge/Feature-AutoML%20Integration-success" />
<img src="https://img.shields.io/badge/Feature-Quick%20Fixes-success" />
</p>
</div>

@@ -32,9 +31,12 @@

## Overview

**HashPrep** is an intelligent dataset debugging and preparation platform that acts as a comprehensive pre-training quality assurance tool for machine learning projects. Think of it as **"Pandas Profiling + ESLint + AutoML"** specifically designed for ML datasets.
**HashPrep** is a Python library for intelligent dataset profiling and debugging that acts as a comprehensive pre-training quality assurance tool for machine learning projects.
Think of it as **"Pandas Profiling + PyLint for datasets"**, designed specifically for machine learning workflows.

It catches critical dataset issues before they derail your ML pipeline, explains the problems, and suggests context-aware fixes.
If you want, HashPrep can even apply those fixes for you automatically.

The platform catches critical dataset issues before they derail your ML pipeline, automatically suggests fixes, and generates production-ready cleaning code - saving hours of manual data debugging and preparation work.
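The catch-explain-suggest workflow described in both versions of this paragraph can be illustrated with a small dependency-free sketch. The function names (`find_missing`, `suggest_fix`) and thresholds are hypothetical stand-ins, not HashPrep's actual API:

```python
# Illustrative sketch of a detect -> explain -> suggest loop.
# Names and thresholds here are hypothetical, not HashPrep's API.

def find_missing(rows, columns):
    """Return {column: count of None values} across the rows."""
    counts = {c: 0 for c in columns}
    for row in rows:
        for c in columns:
            if row.get(c) is None:
                counts[c] += 1
    return counts

def suggest_fix(column, missing, total):
    """Explain the problem and suggest a context-aware fix."""
    ratio = missing / total
    if ratio == 0:
        return None
    if ratio > 0.5:
        return f"'{column}': {ratio:.0%} missing; consider dropping the column"
    return f"'{column}': {ratio:.0%} missing; consider imputation"

rows = [
    {"age": 34, "city": "Pune"},
    {"age": None, "city": "Delhi"},
    {"age": 41, "city": None},
]
for col, n in find_missing(rows, ["age", "city"]).items():
    hint = suggest_fix(col, n, len(rows))
    if hint:
        print(hint)
```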

---

@@ -44,11 +46,11 @@ Key features include:

- **Intelligent Profiling**: Detect missing values, skewed distributions, outliers, and data type inconsistencies.
- **ML-Specific Checks**: Identify data leakage, dataset drift, class imbalance, and high-cardinality features.
- **Automated Preparation**: Get context-aware suggestions for encoding, imputation, scaling, and transformations.
- **Rich Reporting**: Generate interactive dashboards, statistical summaries, and exportable reports for collaboration.
- **Production-Ready Pipelines**: Automatically create reproducible cleaning and preprocessing code that integrates seamlessly with ML workflows.
- **Automated Preparation**: Get suggestions for encoding, imputation, scaling, and transformations, and optionally apply them automatically.
- **Rich Reporting**: Generate statistical summaries and exportable reports for collaboration.
- **Production-Ready Pipelines**: Output reproducible cleaning and preprocessing code that integrates seamlessly with ML workflows.

HashPrep turns data debugging into a guided, automated process - saving time, improving model reliability, and standardizing best practices across teams.
HashPrep turns dataset debugging into a guided, automated process - saving time, improving model reliability, and standardizing best practices across teams.
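Two of the ML-specific checks named above (class imbalance, high cardinality) can be sketched in a few lines of plain Python; the helper names and the 0.9 cardinality threshold are illustrative assumptions, not HashPrep's implementation:

```python
# Minimal sketches of two ML-specific checks; the names and the
# cardinality threshold are illustrative, not HashPrep's code.
from collections import Counter

def class_imbalance_ratio(labels):
    """Majority/minority class ratio; 1.0 means perfectly balanced."""
    counts = Counter(labels)
    return max(counts.values()) / min(counts.values())

def is_high_cardinality(values, threshold=0.9):
    """Flag a column whose distinct-value ratio exceeds the threshold."""
    return len(set(values)) / len(values) > threshold

labels = ["spam"] * 95 + ["ham"] * 5
print(class_imbalance_ratio(labels))  # 19.0: heavily imbalanced

ids = [f"user_{i}" for i in range(100)]
print(is_high_cardinality(ids))  # True: every value is unique
```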

---

File renamed without changes.
28 changes: 10 additions & 18 deletions hashprep/analyzer.py → analyzer.py
@@ -1,5 +1,5 @@
from dataclasses import dataclass
from typing import Dict, Optional
from typing import Dict, List, Optional
import pandas as pd
import hashlib
from scipy.stats import chi2_contingency
@@ -22,9 +22,10 @@ class DatasetAnalyzer:
Detects critical issues and warnings, generates report
"""

def __init__(self, df: pd.DataFrame):
def __init__(self, df: pd.DataFrame, target_col: Optional[str] = None):
self.df = df
self.issues: Optional[str] = []
self.target_col = target_col
self.issues: List[Issues] = []
self.summaries: Dict = {}

def analyze(self) -> Dict:
@@ -42,12 +43,12 @@ def analyze(self) -> Dict:
self._summarize_missing_values()

# ---- Warnings and Critical Issues ----
self._check_data_leakage()
self._check_data_leakage(self.target_col)
self._check_high_missing_values()
self._check_empty_columns()
self._check_single_value_columns()
self._check_target_leakage_patterns()
self._check_class_imbalance()
self._check_target_leakage_patterns(self.target_col)
self._check_class_imbalance(self.target_col)
self._check_high_cardinality()
self._check_duplicates()
self._check_mixed_data_types()
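The hunk above threads the new `target_col` into the target-aware checks. As one example of what a name-based heuristic like `_check_target_leakage_patterns` might do, here is a self-contained sketch; the pattern list and function name are illustrative assumptions, not the analyzer's actual code:

```python
# Illustrative name-based target-leakage heuristic; the patterns
# and function name are assumptions, not HashPrep's implementation.

SUSPICIOUS_PATTERNS = ("outcome", "result", "label", "target")

def leakage_suspects(columns, target_col=None):
    """Columns whose names hint they encode the target after the fact."""
    suspects = []
    for col in columns:
        if col == target_col:
            continue  # the target itself is not leakage
        if any(p in col.lower() for p in SUSPICIOUS_PATTERNS):
            suspects.append(col)
    return suspects

print(leakage_suspects(["age", "final_result", "churn"], target_col="churn"))
# -> ['final_result']
```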
@@ -217,8 +218,8 @@ def _summarize_datetime_column(self, col: str):
stats = {
"count": int(series.count()),
"missing": int(self.df[col].isna().sum()),
"min": str(series.min()) if not series.empty() else None,
"max": str(series.max()) if not series.empty() else None,
"min": str(series.min()) if not series.empty else None,
"max": str(series.max()) if not series.empty else None,
"year_counts": (
series.dt.year.value_counts().to_dict() if not series.empty else None
),
@@ -503,7 +504,6 @@ def _check_mixed_data_types(self):

def _check_outliers(self, z_threshold: float = 4.0):
"""Flag numeric columns with extreme outliers based on Z-score"""
from scipy.stats import zscore

numeric_df = self.df.select_dtypes(include="number").dropna()
if numeric_df.empty:
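The Z-score rule this check applies (flag values more than `z_threshold` standard deviations from the mean) can be sketched without scipy or pandas; `outlier_indexes` is an illustrative stand-in, not the analyzer's method:

```python
# Pure-Python sketch of the Z-score outlier rule: flag values whose
# distance from the mean exceeds z_threshold standard deviations.
from statistics import mean, pstdev

def outlier_indexes(values, z_threshold=4.0):
    mu, sigma = mean(values), pstdev(values)
    if sigma == 0:
        return []  # constant column: no outliers to flag
    return [i for i, v in enumerate(values) if abs(v - mu) / sigma > z_threshold]

data = [10.0] * 20 + [500.0]
print(outlier_indexes(data))  # -> [20]: only the extreme value is flagged
```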
@@ -536,7 +536,7 @@ def _check_feature_correlation(self, threshold: float = 0.95):
(col, row, val)
for row in upper.index
for col, val in upper[row].dropna().items()
if val > threshold
if val > threshold and col != row
]

for col1, col2, corr in correlated_pairs:
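The comprehension above walks only the upper triangle of the correlation matrix, so each pair is visited once and `(a, b)` and `(b, a)` are never both reported. A self-contained sketch of the same idea, with an illustrative hand-rolled Pearson helper in place of pandas' `.corr()`:

```python
# Sketch of an upper-triangle correlation scan; helper names are
# illustrative. combinations() yields each unordered pair once,
# which is what skipping the lower triangle (and col == row) buys.
from itertools import combinations
from statistics import mean

def pearson(xs, ys):
    mx, my = mean(xs), mean(ys)
    cov = sum((x - mx) * (y - my) for x, y in zip(xs, ys))
    vx = sum((x - mx) ** 2 for x in xs) ** 0.5
    vy = sum((y - my) ** 2 for y in ys) ** 0.5
    return cov / (vx * vy)

def correlated_pairs(columns, threshold=0.95):
    """columns: {name: list of floats}; pairs correlated above threshold."""
    pairs = []
    for a, b in combinations(columns, 2):  # upper triangle: a != b
        r = pearson(columns[a], columns[b])
        if abs(r) > threshold:
            pairs.append((a, b, r))
    return pairs

cols = {"x": [1.0, 2.0, 3.0], "x2": [2.0, 4.0, 6.0], "noise": [5.0, -1.0, 2.0]}
print(correlated_pairs(cols))  # only ("x", "x2", ~1.0) is reported
```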
@@ -552,14 +552,6 @@ def _check_feature_correlation(self, threshold: float = 0.95):
)


# Simple missingness heatmap structure (list of missing row indexes)
self.summaries["missing_patterns"] = {
col: self.df[self.df[col].isna()].index.tolist()
for col in self.df.columns
if self.df[col].isna().any()
}


# =========================================================================
# Generate Summary
# =========================================================================