# In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

class AutoEDA:
    """Load a tabular dataset and apply common cleaning steps to it.

    The working data lives in ``self.df``. The ``handle_*``, ``encode_*`` and
    ``scale_*`` methods replace or update that frame, so the order in which
    they are called matters (e.g. columns encoded by
    ``encode_categoricals`` become numeric and are then picked up by
    ``scale_numeric``).
    """

    def __init__(self, filepath=None, dataframe=None):
        """Initialize from a CSV file path or an existing DataFrame.

        Args:
            filepath: Path passed to ``pd.read_csv``. Takes precedence when
                both arguments are supplied.
            dataframe: DataFrame to work on. It is copied, so the caller's
                object is not modified by later cleaning steps.

        Raises:
            ValueError: If neither ``filepath`` nor ``dataframe`` is given.
        """
        if filepath:
            self.df = pd.read_csv(filepath)
        elif dataframe is not None:
            self.df = dataframe.copy()
        else:
            raise ValueError("Provide a filepath or a dataframe")
        print("Data loaded successfully. Shape:", self.df.shape)

    def profile(self) -> None:
        """Print dtype info, descriptive statistics and a missing-value summary."""
        print("\nData Info:")
        # DataFrame.info() writes its report itself and returns None, so it
        # must not be wrapped in print() (that would emit a stray "None").
        self.df.info()
        print("\nData Describe:")
        print(self.df.describe(include="all"))
        print("\nMissing Values:")
        missing = self.df.isnull().sum()
        missing = missing[missing > 0].sort_values(ascending=False)
        print(missing)

    def handle_missing(self, drop_thresh: float = 0.5) -> None:
        """Drop columns with too many missing values and impute the rest.

        Columns whose fraction of missing values is strictly greater than
        ``drop_thresh`` are dropped. In the remaining columns, numeric gaps
        are filled with the column median and ``object`` gaps with the most
        frequent value.

        Args:
            drop_thresh: Maximum tolerated fraction (0-1) of missing values
                per column.
        """
        print("\nHandling missing values...")
        missing_ratio = self.df.isnull().mean()
        # "not greater than" (rather than "<=") also keeps columns whose
        # ratio is NaN, which is what an empty frame produces.
        keep = ~(missing_ratio > drop_thresh)
        # Positional boolean mask + copy(): safe with duplicate column names
        # and gives an independent frame, so the assignments below never
        # write into a view of the previous frame.
        self.df = self.df.loc[:, keep.to_numpy()].copy()
        print("Dropped columns with more than", drop_thresh*100, "% missing values.")

        # Fill numeric. Assign the result back instead of using
        # fillna(inplace=True) on self.df[col]: the chained in-place form is
        # not guaranteed to update self.df (it does nothing under
        # Copy-on-Write).
        for col in self.df.select_dtypes(include="number").columns:
            if self.df[col].isnull().any():
                self.df[col] = self.df[col].fillna(self.df[col].median())

        # Fill categorical
        for col in self.df.select_dtypes(include="object").columns:
            if self.df[col].isnull().any():
                modes = self.df[col].mode()
                if modes.empty:
                    # Column is entirely missing: there is no mode to use.
                    continue
                self.df[col] = self.df[col].fillna(modes.iloc[0])
        print("Missing values handled.")

    def _iqr_bounds(self, col, factor=1.5):
        """Return the ``(lower, upper)`` IQR fences for column ``col``."""
        q1 = self.df[col].quantile(0.25)
        q3 = self.df[col].quantile(0.75)
        spread = q3 - q1
        return q1 - factor * spread, q3 + factor * spread

    def detect_outliers(self, factor: float = 1.5) -> None:
        """Print the number of outliers detected per numerical column.

        A value is an outlier when it lies more than ``factor`` * IQR below
        the first quartile or above the third quartile.

        Args:
            factor: Multiplier applied to the IQR to build the fences.
        """
        print("\nDetecting outliers...")
        for col in self.df.select_dtypes(include="number").columns:
            lower, upper = self._iqr_bounds(col, factor)
            outlier_mask = (self.df[col] < lower) | (self.df[col] > upper)
            n_outliers = outlier_mask.sum()
            print(f"Column '{col}' has {n_outliers} outliers")

    def handle_outliers(self, factor: float = 1.5) -> None:
        """Cap numerical outliers at ``factor`` * IQR beyond Q1 and Q3.

        Args:
            factor: Multiplier applied to the IQR to build the fences
                (1.5 by default).
        """
        print("\nHandling outliers by capping...")
        for col in self.df.select_dtypes(include="number").columns:
            lower, upper = self._iqr_bounds(col, factor)
            # clip() leaves missing values untouched and does not clip at
            # all when a fence is NaN (e.g. an all-missing column).
            self.df[col] = self.df[col].clip(lower=lower, upper=upper)
        print("Outliers handled.")

    def encode_categoricals(self) -> None:
        """Label-encode every ``object`` column in place.

        A fresh ``LabelEncoder`` is fitted per column and is not retained
        afterwards.
        NOTE(review): columns that still mix strings with missing values are
        likely to fail inside the encoder; presumably ``handle_missing`` is
        expected to run first -- confirm.
        """
        print("\nEncoding categorical columns...")
        cat_cols = self.df.select_dtypes(include="object").columns
        for col in cat_cols:
            le = LabelEncoder()
            self.df[col] = le.fit_transform(self.df[col])
        print("Categorical columns encoded.")

    def scale_numeric(self) -> None:
        """Standardize all numerical columns with ``StandardScaler``.

        Nothing is scaled when the frame has no numeric columns or no rows,
        because the scaler cannot be fitted on empty input.
        """
        print("\nScaling numerical columns...")
        num_cols = self.df.select_dtypes(include="number").columns
        if num_cols.empty or self.df.empty:
            print("No numerical columns to scale.")
            return
        scaler = StandardScaler()
        self.df[num_cols] = scaler.fit_transform(self.df[num_cols])
        print("Numerical columns scaled.")

    def save_cleaned(self, output_path: str = "cleaned_data.csv") -> None:
        """Save the current dataframe to CSV (without the index).

        Args:
            output_path: Destination file path.
        """
        self.df.to_csv(output_path, index=False)
        print(f"Cleaned data saved to {output_path}.")
