In [2]:
# ======================================================
# 🧠 Autonomous Data Quality Report Generator (Free AI)
# ======================================================
# Model: distilgpt2 (Lightweight CPU-friendly)
# Author: Antra Tiwari
# ======================================================

import html
import os
from datetime import datetime

import numpy as np
import pandas as pd
from transformers import pipeline

# -------------------------------
# Step 1: Load cleaned dataset
# -------------------------------
# Hard-coded absolute Windows path to the cleaned training CSV.
clean_path = r"C:\Users\Antra Tiwari\OneDrive\Desktop\Autonomous data cleaning\data\processed\train_clean.csv"

# Read the CSV only when the path is present; otherwise stop immediately with
# an error message that names the missing path.
if os.path.exists(clean_path):
    clean_df = pd.read_csv(clean_path)
else:
    raise FileNotFoundError(f"‚ùå File not found at: {clean_path}")

print(f"‚úÖ Cleaned dataset shape: {clean_df.shape}")

# -------------------------------
# Step 2: Data quality metrics
# -------------------------------
# Completeness: share of non-null cells per column, averaged over all
# columns and expressed as a percentage.
completeness = clean_df.notnull().mean().mean() * 100
# Rows that repeat an earlier row exactly. Cast to a built-in int so the
# value is a plain Python number wherever it is reused.
duplicates = int(clean_df.duplicated().sum())

# Detect outliers: for every numeric column, count the values that lie
# strictly below the 1st percentile or strictly above the 99th percentile.
numeric_cols = clean_df.select_dtypes(include=np.number).columns
outlier_summary = {}
for col in numeric_cols:
    column = clean_df[col]
    # One quantile call yields both cut-offs.
    q_low, q_high = column.quantile([0.01, 0.99])
    # The counts are cast to int because this dict is interpolated verbatim
    # into the prompt and the report; NumPy scalars would otherwise show up
    # there as "np.int64(9)" instead of "9".
    below = int((column < q_low).sum())
    above = int((column > q_high).sum())
    outlier_summary[col] = {"below_1pct": below, "above_99pct": above}

print(f"üéØ Completeness: {completeness:.2f}%")
print(f"üéØ Duplicate rows: {duplicates}")
print(f"üéØ Outlier summary: {outlier_summary}")

# -------------------------------
# Step 3: Load AI model (local)
# -------------------------------
# Announce the load before starting it, then build a text-generation
# pipeline backed by the distilgpt2 checkpoint.
print("üß† Loading lightweight CPU model: distilgpt2 (fast & small)...")
generator = pipeline(task="text-generation", model="distilgpt2")

# -------------------------------
# Step 4: Generate AI report
# -------------------------------
# Prompt = fixed instructions followed by the dataset shape, completeness,
# duplicate count and per-column outlier counts.
prompt = f"""
You are a professional data quality analyst. 
Based on the dataset metrics below, generate a clear and structured 3-paragraph report 
with insights and actionable recommendations.

Dataset shape: {clean_df.shape}
Completeness: {completeness:.2f}%
Duplicate rows: {duplicates}
Numeric Columns Summary: {outlier_summary}
"""

# return_full_text=False makes the pipeline return only the newly generated
# continuation. With the default (True) the whole prompt is echoed at the
# start of "generated_text" and would end up inside the report's
# "AI Insights" section.
# The pad token id is taken from the tokenizer's end-of-sequence id rather
# than a hard-coded number, so it always matches the loaded model.
ai_output = generator(
    prompt,
    max_new_tokens=350,
    do_sample=True,
    temperature=0.7,
    pad_token_id=generator.tokenizer.eos_token_id,
    return_full_text=False,
)[0]["generated_text"].strip()

# -------------------------------
# Step 5: Save report files
# -------------------------------
# The timestamp makes every run write a fresh set of files.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
report_dir = r"C:\Users\Antra Tiwari\OneDrive\Desktop\Autonomous data cleaning\reports"
os.makedirs(report_dir, exist_ok=True)

report_path_txt = os.path.join(report_dir, f"ai_data_quality_report_{timestamp}.txt")
report_path_md = os.path.join(report_dir, f"ai_data_quality_report_{timestamp}.md")
report_path_html = os.path.join(report_dir, f"ai_data_quality_report_{timestamp}.html")

# Markdown body shared by the .txt and .md outputs. The two trailing spaces
# on the metadata lines are Markdown hard line breaks.
report_content = f"""
# üßæ AI-Generated Data Quality Report

**Generated on:** {timestamp}  
**Dataset Shape:** {clean_df.shape}  
**Completeness:** {completeness:.2f}%  
**Duplicate Rows:** {duplicates}  

## üìä Outlier Summary
{outlier_summary}

---

## ü§ñ AI Insights
{ai_output}

---

‚úÖ **End of Report**
"""

# The .html file must contain real HTML: raw Markdown opened in a browser is
# collapsed into one run-on paragraph with literal "#" and "**". Wrap the
# report in a minimal document, escape it so that "<", ">" and "&" in the
# generated text cannot break the markup, and use <pre> to keep line breaks.
html_document = (
    "<!DOCTYPE html>\n"
    '<html lang="en">\n'
    "<head>\n"
    '<meta charset="utf-8">\n'
    "<title>AI-Generated Data Quality Report</title>\n"
    "</head>\n"
    "<body>\n"
    f"<pre>{html.escape(report_content)}</pre>\n"
    "</body>\n"
    "</html>\n"
)

# Save all versions safely (UTF-8 encoding fix): each path is paired with
# the content that suits its format.
report_outputs = {
    report_path_txt: report_content,
    report_path_md: report_content,
    report_path_html: html_document,
}
for path, content in report_outputs.items():
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)

print(f"üíæ Report saved to:\n - {report_path_txt}\n - {report_path_md}\n - {report_path_html}")
print("\nüéâ All files written successfully in UTF-8 encoding!")



‚úÖ Cleaned dataset shape: (891, 15)
üéØ Completeness: 100.00%
üéØ Duplicate rows: 0
üéØ Outlier summary: {'PassengerId': {'below_1pct': np.int64(9), 'above_99pct': np.int64(9)}, 'Survived': {'below_1pct': np.int64(0), 'above_99pct': np.int64(0)}, 'Pclass': {'below_1pct': np.int64(0), 'above_99pct': np.int64(0)}, 'Age': {'below_1pct': np.int64(0), 'above_99pct': np.int64(0)}, 'SibSp': {'below_1pct': np.int64(0), 'above_99pct': np.int64(0)}, 'Parch': {'below_1pct': np.int64(0), 'above_99pct': np.int64(0)}, 'Fare': {'below_1pct': np.int64(0), 'above_99pct': np.int64(9)}, 'outlier_zscore': {'below_1pct': np.int64(0), 'above_99pct': np.int64(0)}, 'outlier_iforest': {'below_1pct': np.int64(0), 'above_99pct': np.int64(0)}, 'outlier_lstm': {'below_1pct': np.int64(0), 'above_99pct': np.int64(0)}}
üß† Loading lightweight CPU model: distilgpt2 (fast & small)...





üíæ Report saved to:
 - C:\Users\Antra Tiwari\OneDrive\Desktop\Autonomous data cleaning\reports\ai_data_quality_report_20251007_193351.txt
 - C:\Users\Antra Tiwari\OneDrive\Desktop\Autonomous data cleaning\reports\ai_data_quality_report_20251007_193351.md
 - C:\Users\Antra Tiwari\OneDrive\Desktop\Autonomous data cleaning\reports\ai_data_quality_report_20251007_193351.html

üéâ All files written successfully in UTF-8 encoding!
