In [4]:
!pip install transformers torch accelerate


Collecting transformers
  Downloading transformers-4.57.0-py3-none-any.whl.metadata (41 kB)
     ---------------------------------------- 0.0/41.4 kB ? eta -:--:--
     --------- ------------------------------ 10.2/41.4 kB ? eta -:--:--
     ------------------ ------------------- 20.5/41.4 kB 131.3 kB/s eta 0:00:01
     ---------------------------- --------- 30.7/41.4 kB 187.9 kB/s eta 0:00:01
     -------------------------------------- 41.4/41.4 kB 200.0 kB/s eta 0:00:00
Collecting torch
  Downloading torch-2.8.0-cp312-cp312-win_amd64.whl.metadata (30 kB)
Collecting accelerate
  Downloading accelerate-1.10.1-py3-none-any.whl.metadata (19 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.19.1-py3-none-any.whl.metadata (2.1 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.35.3-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.9.18-cp312-cp312-win_amd64.whl.metadata

ERROR: THESE PACKAGES DO NOT MATCH THE HASHES FROM THE REQUIREMENTS FILE. If you have updated the package versions, please update the hashes. Otherwise, examine the package contents carefully; someone may have tampered with them.
    unknown package:
        Expected sha256 2f4ac52f0130275d7517b03a33d2493bab3693c83dcfadf4f81688ea82147d2e
             Got        c4b999cf154cbfe01c11b39f415042ed607e3e3f31f0004fe250178b4d522d0e


[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
# ==========================================
# Day 5: AI Data Quality Report (Visual & AI)
# ==========================================

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from io import BytesIO
import base64
from transformers import pipeline

# -------------------------------
# Step 1: Load cleaned dataset
# -------------------------------
clean_path = r"C:\Users\Antra Tiwari\OneDrive\Desktop\Autonomous data cleaning\data\processed\train_clean.csv"
clean_df = pd.read_csv(clean_path)
print(f"âœ… Cleaned dataset shape: {clean_df.shape}")

# -------------------------------
# Step 2: Basic Data Quality Metrics
# -------------------------------
completeness = 100 * clean_df.notna().all(axis=1).sum() / len(clean_df)
duplicates = clean_df.duplicated().sum()

# Outlier detection using percentiles
outlier_summary = {}
for col in clean_df.select_dtypes(include=np.number).columns:
    low = clean_df[col].quantile(0.01)
    high = clean_df[col].quantile(0.99)
    outlier_summary[col] = {
        "below_1pct": (clean_df[col] < low).sum(),
        "above_99pct": (clean_df[col] > high).sum()
    }

print(f"ðŸŽ¯ Completeness: {completeness:.2f}%")
print(f"ðŸŽ¯ Duplicate rows: {duplicates}")
print(f"ðŸŽ¯ Outlier summary: {outlier_summary}")

# -------------------------------
# Step 3: Generate Visualizations (embedded)
# -------------------------------
visuals_html = ""
visuals_md = ""

for col in clean_df.select_dtypes(include=np.number).columns:
    plt.figure(figsize=(5, 3))
    clean_df[col].hist(bins=30, color='skyblue', edgecolor='black')
    plt.title(f"{col} Histogram")
    plt.xlabel(col)
    plt.ylabel("Count")
    
    # Save figure to memory
    buf = BytesIO()
    plt.savefig(buf, format="png")
    plt.close()
    buf.seek(0)
    
    # Convert to base64 for embedding
    img_b64 = base64.b64encode(buf.read()).decode('utf-8')
    
    # HTML embedding
    visuals_html += f'<h3>{col} Histogram</h3><img src="data:image/png;base64,{img_b64}" width="500"><br>'
    
    # Markdown embedding
    visuals_md += f"### {col} Histogram\n\n![{col} Histogram](data:image/png;base64,{img_b64})\n\n"

print("ðŸ“Š Visualizations generated and embedded!")

# -------------------------------
# Step 4: Load lightweight local AI
# -------------------------------
print("ðŸ§  Loading lightweight CPU model: distilgpt2...")
generator = pipeline("text-generation", model="distilgpt2")

# -------------------------------
# Step 5: Generate AI Data Quality Report
# -------------------------------
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
prompt = f"""
You are a data quality analyst. Based on the dataset metrics below, 
generate a professional, structured data-quality report in 3 concise paragraphs.

Dataset shape: {clean_df.shape}
Completeness: {completeness:.2f}%
Duplicate rows: {duplicates}
Numeric Columns Summary: {outlier_summary}

Provide insights and actionable recommendations.
"""

ai_report = generator(prompt, max_new_tokens=300, do_sample=True, temperature=0.7)[0]['generated_text']
print("âœ… AI Report Generated!")

# -------------------------------
# Step 6: Save report (TXT, Markdown, HTML)
# -------------------------------
report_dir = "reports"
os.makedirs(report_dir, exist_ok=True)

txt_path = os.path.join(report_dir, f"ai_data_quality_report_{timestamp}.txt")
md_path  = os.path.join(report_dir, f"ai_data_quality_report_{timestamp}.md")
html_path = os.path.join(report_dir, f"ai_data_quality_report_{timestamp}.html")

# TXT
with open(txt_path, "w", encoding="utf-8") as f:
    f.write(ai_report)

# Markdown with embedded visuals
with open(md_path, "w", encoding="utf-8") as f:
    f.write(f"# AI Data Quality Report\n\n{ai_report}\n\n{visuals_md}")

# HTML with embedded visuals
with open(html_path, "w", encoding="utf-8") as f:
    f.write(f"<html><body><h1>AI Data Quality Report</h1><p>{ai_report}</p>{visuals_html}</body></html>")

print(f"ðŸ’¾ Reports saved to:\n- {txt_path}\n- {md_path}\n- {html_path}")


âœ… Cleaned dataset shape: (891, 15)
ðŸŽ¯ Completeness: 100.00%
ðŸŽ¯ Duplicate rows: 0
ðŸŽ¯ Outlier summary: {'PassengerId': {'below_1pct': np.int64(9), 'above_99pct': np.int64(9)}, 'Survived': {'below_1pct': np.int64(0), 'above_99pct': np.int64(0)}, 'Pclass': {'below_1pct': np.int64(0), 'above_99pct': np.int64(0)}, 'Age': {'below_1pct': np.int64(0), 'above_99pct': np.int64(0)}, 'SibSp': {'below_1pct': np.int64(0), 'above_99pct': np.int64(0)}, 'Parch': {'below_1pct': np.int64(0), 'above_99pct': np.int64(0)}, 'Fare': {'below_1pct': np.int64(0), 'above_99pct': np.int64(9)}, 'outlier_zscore': {'below_1pct': np.int64(0), 'above_99pct': np.int64(0)}, 'outlier_iforest': {'below_1pct': np.int64(0), 'above_99pct': np.int64(0)}, 'outlier_lstm': {'below_1pct': np.int64(0), 'above_99pct': np.int64(0)}}
ðŸ“Š Visualizations generated and embedded!
ðŸ§  Loading lightweight CPU model: distilgpt2...


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


âœ… AI Report Generated!
ðŸ’¾ Reports saved to:
- reports\ai_data_quality_report_20251006_234028.txt
- reports\ai_data_quality_report_20251006_234028.md
- reports\ai_data_quality_report_20251006_234028.html
