## Class Implementation

In [None]:
import keras
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import (
    LabelEncoder,
    OneHotEncoder,
    StandardScaler,
    MinMaxScaler,
)

from sklearn.model_selection import train_test_split

In [2]:
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image as RLImage
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.units import inch
import tempfile
import os

In [None]:
class Data:
    def __init__(self, data: pd.DataFrame):
        self.data = data

    def loda_data_with_csv(self, path):
        self.data = pd.read_csv(path)

    def null_preprocessing(self):
        print("ðŸŸ¡ Missing Values Before Cleaning:")
        print(self.data.isna().sum())
        self.data.dropna(inplace=True)
        print("\nâœ… After Dropping Nulls:")
        print(self.data.isna().sum())

    def categorical_encoding(self, data: pd.DataFrame) -> pd.DataFrame:
        """Label-encode every ``object``-dtype column of a frame.

        The work is done on a copy of ``data``; ``self.data`` is not read.

        Args:
            data (pd.DataFrame): Frame whose ``object`` columns should be encoded.

        Returns:
            pd.DataFrame: Copy of ``data`` in which each ``object`` column has
            been replaced by the output of ``LabelEncoder.fit_transform``.
        """
        encoded = data.copy()
        object_columns = encoded.select_dtypes(include=["object"]).columns

        print("ðŸŸ¢ CATEGORICAL_COLUMNS:")
        print(object_columns)

        # fit_transform refits on every call, so one encoder serves all columns.
        encoder = LabelEncoder()
        for name in object_columns:
            encoded[name] = encoder.fit_transform(encoded[name])

        return encoded

    def bool_encoding(self, data: pd.DataFrame) -> pd.DataFrame:
        """_summary_

        Args:
            data (pd.DataFrame): _description_

        Returns:
            pd.DataFrame: _description_
        """        
        df = data.copy()
        label_encoder = LabelEncoder()

        boolean_cols = df.select_dtypes(include=["bool"]).columns

        print("ðŸŸ¢ BOOLEAN_COLUMNS:")
        print(boolean_cols)

        for col in boolean_cols:
            df[col] = label_encoder.fit_transform(df[col])

        return df

    def numberical_scaling(self, data: pd.DataFrame) -> pd.DataFrame:
        """Standardize every column of a frame with ``StandardScaler``.

        ``self.data`` is not read; only the ``data`` argument is used.

        Args:
            data (pd.DataFrame): Frame to scale. Every column is passed to the
                scaler, so all columns are expected to be numeric.

        Returns:
            pd.DataFrame: New frame holding the scaled values, with the same
            column labels and the same index as ``data``.
        """
        scaler = StandardScaler()
        scaled_array = scaler.fit_transform(data)

        # Keep the original index: after rows have been dropped it is no
        # longer 0..n-1, and a fresh RangeIndex would misalign the scaled
        # features with any target series taken from the same frame.
        return pd.DataFrame(scaled_array, columns=data.columns, index=data.index)

    def get_corr_heatmap(self, save_path: str | None = None):
        """Draw a correlation heatmap of the numeric columns in ``self.data``.

        Args:
            save_path (str | None, optional): When truthy, the figure is saved
                to this path and closed; otherwise it is displayed with
                ``plt.show()``. Defaults to None.
        """
        plt.figure(figsize=(10, 8))

        numeric_names = self.data.select_dtypes(include=["number"]).columns
        correlations = self.data[numeric_names].corr()
        sns.heatmap(correlations, annot=True, cmap="coolwarm", fmt=".2f")
        plt.title("Correlation Heatmap", fontsize=14)

        if not save_path:
            plt.show()
            return

        plt.savefig(save_path, bbox_inches="tight")
        plt.close()

    def get_train_test_split(self, target_label, test_size=0.2):
        """_summary_

        Args:
            target_label (_type_): _description_
            test_size (float, optional): _description_. Defaults to 0.2.
        """        
        if target_label == None:
            print("ðŸ”´ Error, target_label is none")
        X_total = self.data.drop(target_label, axis=1)
        y_total = self.data[target_label]

        X_train, X_test, y_train, y_test = train_test_split(
            X_total,
            y_total,
            test_size=test_size,
            random_state=42,
            stratify=target_label,
        )

    def EDA_terminal(self):
        """_summary_
        """        
        print("=" * 60)
        print("ðŸ“Š BASIC INFORMATION")
        print("=" * 60)
        print(self.data.info())
        print("\n")

        print("=" * 60)
        print("ðŸ“ˆ DESCRIPTIVE STATISTICS")
        print("=" * 60)
        print(self.data.describe().T)
        print("\n")

        print("=" * 60)
        print("ðŸŸ¡ MISSING VALUES")
        print("=" * 60)
        print(self.data.isna().sum())
        print("\n")

        # Categorical Analysis
        categorical_cols = self.data.select_dtypes(
            include=["object", "category"]
        ).columns
        if len(categorical_cols) > 0:
            print("=" * 60)
            print("ðŸ”  CATEGORICAL COLUMN DISTRIBUTIONS")
            print("=" * 60)
            for col in categorical_cols:
                print(f"\nâ–¶ {col}")
                print(self.data[col].value_counts())
                plt.figure(figsize=(6, 4))
                sns.countplot(x=col, data=self.data, palette="Set2")
                plt.title(f"Distribution of {col}")
                plt.xticks(rotation=45)
                plt.show()

        # Numerical Analysis
        numerical_cols = self.data.select_dtypes(include=["int64", "float64"]).columns
        if len(numerical_cols) > 0:
            print("=" * 60)
            print("ðŸ”¢ NUMERICAL COLUMN DISTRIBUTIONS")
            print("=" * 60)
            self.data[numerical_cols].hist(
                bins=20, figsize=(12, 8), color="skyblue", edgecolor="black"
            )
            plt.suptitle("Numerical Feature Distributions")
            plt.show()

            print("=" * 60)
            print("ðŸ”¥ CORRELATION HEATMAP")
            print("=" * 60)
            self.get_corr_heatmap()

    def EDA(self, output_pdf="EDA_Report.pdf"):
        """Build a PDF exploratory-data-analysis report for ``self.data``.

        The report contains the frame's shape and column list, a table of
        missing-value counts, a table of descriptive statistics, one
        histogram per numeric column, one count plot per ``object``/
        ``category`` column and, when numeric columns exist, a correlation
        heatmap.

        Args:
            output_pdf (str, optional): Destination path of the PDF.
                Defaults to "EDA_Report.pdf".
        """
        # Local imports: Table is not among the names the module-level
        # reportlab import brings in, and escape is only needed here.
        from xml.sax.saxutils import escape

        from reportlab.platypus import Table

        def frame_to_table(frame):
            """Render a DataFrame, index included, as a reportlab Table."""
            header = [""] + [str(label) for label in frame.columns]
            cells = frame.astype(str).to_numpy().tolist()
            rows = [[str(label), *row] for label, row in zip(frame.index, cells)]
            return Table(
                [header] + rows,
                hAlign="LEFT",
                style=[("FONTSIZE", (0, 0), (-1, -1), 8)],
            )

        print("ðŸ§  Generating EDA Report...")

        styles = getSampleStyleSheet()
        report = SimpleDocTemplate(output_pdf, pagesize=A4)
        elements = []

        # Title
        elements.append(
            Paragraph(
                "<b><font size=16>Exploratory Data Analysis Report</font></b>",
                styles["Title"],
            )
        )
        elements.append(Spacer(1, 12))

        # Basic Info
        elements.append(Paragraph("<b>ðŸ“Š Basic Information</b>", styles["Heading2"]))
        elements.append(Paragraph(f"Shape: {self.data.shape}", styles["Normal"]))
        # Column labels are escaped so characters such as "<" or "&" are not
        # read as Paragraph markup.
        elements.append(
            Paragraph(
                escape(f"Columns: {list(self.data.columns)}"), styles["Normal"]
            )
        )
        elements.append(Spacer(1, 12))

        # Missing Values
        # Paragraph only understands inline markup, not the <table> HTML that
        # DataFrame.to_html() produces, so tabular data goes through Table.
        elements.append(Paragraph("<b>ðŸŸ¡ Missing Values</b>", styles["Heading2"]))
        nulls = self.data.isna().sum()
        elements.append(frame_to_table(nulls.to_frame("Missing Values")))
        elements.append(Spacer(1, 12))

        # Descriptive Stats
        elements.append(
            Paragraph("<b>ðŸ“ˆ Descriptive Statistics</b>", styles["Heading2"])
        )
        # Transposed (one row per column) and rounded to keep the table
        # narrow enough for the page.
        elements.append(frame_to_table(self.data.describe().T.round(2)))
        elements.append(Spacer(1, 12))

        # Chart images are only needed until the PDF is built, so they live in
        # a temporary directory that is removed when the block exits.
        with tempfile.TemporaryDirectory() as tmp_dir:
            # Numerical Distributions
            # "number" is the same selector get_corr_heatmap uses.
            num_cols = self.data.select_dtypes(include=["number"]).columns
            if len(num_cols) > 0:
                elements.append(
                    Paragraph(
                        "<b>ðŸ”¢ Numerical Feature Distributions</b>",
                        styles["Heading2"],
                    )
                )
                for position, col in enumerate(num_cols):
                    plt.figure(figsize=(5, 3))
                    # Plot this column only, not the whole frame.
                    sns.histplot(self.data[col], kde=True, color="skyblue")
                    plt.title(f"Distribution of {col}")
                    # File names use the position, not the column label, which
                    # may contain characters that are invalid in a path.
                    chart_path = os.path.join(tmp_dir, f"hist_{position}.png")
                    plt.savefig(chart_path, bbox_inches="tight")
                    plt.close()
                    elements.append(
                        RLImage(chart_path, width=5 * inch, height=3 * inch)
                    )
                    elements.append(Spacer(1, 12))

            # Categorical Distributions
            cat_cols = self.data.select_dtypes(include=["object", "category"]).columns
            if len(cat_cols) > 0:
                elements.append(
                    Paragraph(
                        "<b>ðŸ”  Categorical Feature Distributions</b>",
                        styles["Heading2"],
                    )
                )
                for position, col in enumerate(cat_cols):
                    plt.figure(figsize=(5, 3))
                    sns.countplot(x=col, data=self.data, palette="Set2")
                    plt.title(f"Distribution of {col}")
                    plt.xticks(rotation=45)
                    chart_path = os.path.join(tmp_dir, f"count_{position}.png")
                    plt.savefig(chart_path, bbox_inches="tight")
                    plt.close()
                    elements.append(
                        RLImage(chart_path, width=5 * inch, height=3 * inch)
                    )
                    elements.append(Spacer(1, 12))

            # Correlation Heatmap (needs at least one numeric column)
            if len(num_cols) > 0:
                heatmap_path = os.path.join(tmp_dir, "corr_heatmap.png")
                self.get_corr_heatmap(save_path=heatmap_path)
                elements.append(
                    Paragraph("<b>ðŸ”¥ Correlation Heatmap</b>", styles["Heading2"])
                )
                elements.append(
                    RLImage(heatmap_path, width=5.5 * inch, height=4 * inch)
                )

            # Build PDF while the chart files still exist.
            report.build(elements)

        print(f"âœ… EDA Report successfully saved as '{output_pdf}'")