In [4]:
# =============================================
# CPU-Friendly AI Embeddings + Data Quality Notebook
# =============================================

# -------------------------------
# Step 0: Install required packages (run in terminal/colab if needed)
# -------------------------------
# pip install pandas numpy scikit-learn matplotlib sentence-transformers transformers

# -------------------------------
# Step 1: Import libraries
# -------------------------------
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import matplotlib.pyplot as plt
from datetime import datetime
import os

# -------------------------------
# Step 2: Load Cleaned Dataset
# -------------------------------
# The default below is an absolute path on the original author's machine.
# Setting the TRAIN_CLEAN_CSV environment variable overrides it, so the
# notebook can run elsewhere without editing this cell.
DATA_PATH = os.environ.get(
    "TRAIN_CLEAN_CSV",
    r"C:\Users\Antra Tiwari\OneDrive\Desktop\Autonomous data cleaning\data\processed\train_clean.csv",
)
df = pd.read_csv(DATA_PATH)
print(f"✅ Cleaned dataset loaded. Shape: {df.shape}")

# -------------------------------
# Step 3: Generate AI Embeddings
# -------------------------------
# Use lightweight model for CPU
model = SentenceTransformer('all-MiniLM-L6-v2')

# Combine columns into a single text per row: every cell is cast to str and
# the cells of each row are joined with ' | '.
text_data = df.astype(str).agg(' | '.join, axis=1)

# Generate embeddings
print("🧠 Generating embeddings (CPU-friendly)...")
# Hand encode() a plain list rather than the Series: encode() is documented
# for str / list[str] input, and integer lookups on a Series with a
# non-default index are resolved by label instead of by position.
embeddings = model.encode(text_data.tolist(), show_progress_bar=True)

print(f"✅ Embeddings generated. Shape: {embeddings.shape}")

# -------------------------------
# Step 4: Compute similarity matrix (optional)
# -------------------------------
# Example: Compute cosine similarity between all rows.
# The result has one row and one column per embedded record, so its size
# grows quadratically with the number of rows in the dataset.
similarity_matrix = cosine_similarity(embeddings)
print(f"✅ Cosine similarity matrix computed. Shape: {similarity_matrix.shape}")

# -------------------------------
# Step 5: Basic Data Quality Metrics
# -------------------------------
# Completeness: per-column share of non-null cells, averaged over all
# columns and expressed as a percentage.
completeness = df.notnull().mean().mean() * 100
# Number of rows that exactly repeat an earlier row.
duplicate_rows = df.duplicated().sum()
# Names of the numeric columns; only reported in the metrics summary.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

print(f"🎯 Completeness: {completeness:.2f}%")
print(f"🎯 Duplicate rows: {duplicate_rows}")

# -------------------------------
# Step 6: Generate AI Data Quality Report
# -------------------------------
# Small GPT-2 variant so that generation stays feasible on CPU.
generator = pipeline("text-generation", model="distilgpt2")

# Plain-text block of the metrics computed in Step 5; embedded in the prompt.
metrics_summary = f"""
Dataset shape: {df.shape}
Completeness: {completeness:.2f}%
Duplicate rows: {duplicate_rows}
Numeric Columns: {numeric_cols}
"""

report_prompt = f"""
You are a data-quality analyst.
Based on the dataset metrics below, generate a concise 3-paragraph report:
Highlight potential issues, provide insights, and give actionable recommendations.
Metrics:
{metrics_summary}
"""

# NOTE(review): with the pipeline defaults the returned 'generated_text' is
# expected to start with the prompt itself - confirm if only the
# continuation should be saved.
ai_report = generator(
    report_prompt,
    max_new_tokens=250,
    # temperature only influences the output when sampling is enabled, so
    # request sampling explicitly instead of relying on model defaults.
    do_sample=True,
    temperature=0.7,
    # GPT-2 style models define no pad token; reuse the tokenizer's EOS id
    # rather than hard-coding the numeric id.
    pad_token_id=generator.tokenizer.eos_token_id,
)[0]['generated_text']

# -------------------------------
# Step 7: Save AI Report and Embeddings
# -------------------------------
# Everything is written below ./outputs (created on first use).
output_dir = "outputs"
os.makedirs(output_dir, exist_ok=True)

# A per-run timestamp in the file names keeps earlier runs from being
# overwritten.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
report_path_txt = os.path.join(output_dir, f"ai_data_quality_report_{timestamp}.txt")
embeddings_path = os.path.join(output_dir, f"embeddings_{timestamp}.npy")

with open(report_path_txt, "w", encoding="utf-8") as f:
    f.write(ai_report)

np.save(embeddings_path, embeddings)

print(f"💾 AI report saved at: {report_path_txt}")
print(f"💾 Embeddings saved at: {embeddings_path}")
print("🎉 Notebook completed successfully!")


✅ Cleaned dataset loaded. Shape: (891, 15)




🧠 Generating embeddings (CPU-friendly)...


Batches:   0%|          | 0/28 [00:00<?, ?it/s]

✅ Embeddings generated. Shape: (891, 384)
✅ Cosine similarity matrix computed. Shape: (891, 891)
🎯 Completeness: 100.00%
🎯 Duplicate rows: 0




💾 AI report saved at: outputs\ai_data_quality_report_20251007_233822.txt
💾 Embeddings saved at: outputs\embeddings_20251007_233822.npy
🎉 Notebook completed successfully!
