In [None]:
# Library settings
import re
import numpy as np
import pandas as pd
import shutil
import matplotlib.pyplot as plt
import seaborn as sns
from rich import print
from rich.table import Table
from rich.console import Console

# Lift pandas' display truncation so wide/long frames are shown in full.
for unlimited_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(unlimited_option, None)

# Width (in character columns) of the terminal the kernel is attached to.
terminal_width, _ = shutil.get_terminal_size()

In [None]:
# Data processing
# Step 1 (editing): load the raw CSV and take a first look at it.
df = pd.read_csv("Electronic-Sales.csv")

# Show the first rows, then the (rows, columns) shape of the frame.
first_rows = df.head()
dataset_shape = df.shape
print("----------Isi dataset---------")
print(first_rows)
print(f"\n----------Dimensi dataset adalah: {dataset_shape}----------")

In [None]:
# Inspect the structure of the loaded frame.
print("\n --------Cek Isi Data-------")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
df.info()

# Split the columns by dtype: numeric ones vs. string/object ones.
numeric_columns = df.select_dtypes(include=[np.number]).columns
categorical_columns = df.select_dtypes(include=["string", "object"]).columns
print(f"Data numerik mencakup: {numeric_columns.to_list()}")
print(f"Data kategorikal mencakup: {categorical_columns.to_list()}")

# List the distinct values of each categorical column, except the two
# columns that are explicitly excluded from this listing.
print("Unique value pada data kategorikal:")
for column in categorical_columns:
    if column not in ["Purchase Date", "Add-ons Purchased"]:
        print(f"{column}: {df[column].unique()}")

In [None]:
# Data-quality check: missing values per column, then fully duplicated rows.
missing_per_column = df.isnull().sum()
duplicate_row_count = df.duplicated().sum()

print("\n ----------Cek Null----------")
print(missing_per_column)
print("\n")
print("Jumlah duplikasi: ", duplicate_row_count)

In [None]:
# Banner line announcing the outlier-check part of the notebook output.
print("---CEK OUTLIER---")


def detect_outliers(df, column):
    """Return the rows of ``df`` whose ``column`` value is an IQR outlier.

    A value is an outlier when it lies strictly below ``Q1 - 1.5 * IQR`` or
    strictly above ``Q3 + 1.5 * IQR``, where ``Q1``/``Q3`` are the 25th/75th
    percentiles of the column and ``IQR = Q3 - Q1``.

    Args:
        df: DataFrame to inspect; it is not modified.
        column: Name of the numeric column to test.

    Returns:
        The subset of ``df`` (original index and all columns kept) holding
        the outlier rows. Rows with a missing value in ``column`` satisfy
        neither comparison and are therefore never returned.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    whisker = 1.5 * (q3 - q1)

    too_low = values.lt(q1 - whisker)
    too_high = values.gt(q3 + whisker)
    return df[too_low | too_high]


# Draw one horizontal boxplot per numeric column so outliers can be spotted
# visually. Looping over an empty column index simply does nothing, so no
# separate emptiness check is required.
for column in numeric_columns:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(x=df[column], ax=ax)
    ax.set_title(f"Boxplot of {column}", fontsize=16)
    ax.set_xlabel(column, fontsize=12)
    ax.grid(True, linestyle="--", alpha=0.6)
    plt.show()

In [None]:
# Coding - Pengelompokkan data
# 1. Pengelompokkan Umur
def categorized_age(df):
    """Add an ``Age_Group`` column that buckets ``Age`` into four age groups.

    Each group is closed on the left and open on the right, i.e. it runs from
    its own lower bound up to (but excluding) the next group's lower bound;
    the last group is unbounded above. Ages below 18 and missing ages end up
    as NaN.

    Args:
        df: DataFrame containing an ``Age`` column. It is modified in place.

    Returns:
        The same DataFrame object, now carrying ``Age_Group``.
    """
    # (lower bound, label) for every group, in ascending order.
    groups = (
        (18, "18-30 (Young Adult)"),
        (31, "31-44 (Middle Age)"),
        (45, "45-59 (Pre Senior)"),
        (60, "60+ (Senior)"),
    )
    lower_bounds, group_names = zip(*groups)

    df["Age_Group"] = pd.cut(
        df["Age"],
        bins=[*lower_bounds, np.inf],
        labels=list(group_names),
        right=False,
    )
    return df


# 2. Pengelompokkan Pengeluaran
def categorized_spending(df):
    """Add a ``Spending_Category`` column that buckets ``Total Price``.

    Buckets are closed on the left and open on the right:
    ``[0, 500)``, ``[500, 5000)``, ``[5000, 10000)`` and ``[10000, inf)``.
    Negative or missing totals end up as NaN.

    The second label previously read "(100-$5k)", which contradicted both
    its own bin (starting at 500) and the first label "(0-$500)"; it now
    states the range that is actually applied.

    Args:
        df: DataFrame containing a ``Total Price`` column. It is modified
            in place.

    Returns:
        The same DataFrame object, now carrying ``Spending_Category``.
    """
    bins = [0, 500, 5000, 10000, np.inf]
    labels = [
        "Low Spender (0-$500)",
        "Medium Spender ($500-$5k)",  # label matches the 500..5000 bin
        "High Spender ($5k-$10k)",
        "Luxury Spender ($10k+)",
    ]
    df["Spending_Category"] = pd.cut(
        df["Total Price"], bins=bins, labels=labels, right=False
    )
    return df


# 3. Pengelompokkan Harga Barang
def categorized_item_price(df):
    """Add an ``Item_Price_Category`` column that buckets ``Unit Price``.

    Tiers are closed on the left and open on the right:
    ``[0, 100)``, ``[100, 500)``, ``[500, 1000)`` and ``[1000, inf)``.
    Negative or missing prices end up as NaN.

    Args:
        df: DataFrame containing a ``Unit Price`` column. It is modified
            in place.

    Returns:
        The same DataFrame object, now carrying ``Item_Price_Category``.
    """
    tier_names = [
        "Budget Item (0-$100)",
        "Standard Item ($100-$500)",
        "Premium Item ($500-$1k)",
        "Luxury Item ($1k+)",
    ]
    tier_edges = [0, 100, 500, 1000, np.inf]

    unit_prices = df["Unit Price"]
    price_tiers = pd.cut(unit_prices, tier_edges, right=False, labels=tier_names)
    df["Item_Price_Category"] = price_tiers
    return df


# 4. Pengelompokkan Spending Add-ons
def categorized_addons_spending(df):
    """Add an ``Addons_Spending_Category`` column bucketing ``Add-on Total``.

    The labels describe right-inclusive ranges ("0-$50", "$51-$100",
    "$101-$200", "$200+"), so the bins are closed on the right:
    ``[0, 50]``, ``(50, 100]``, ``(100, 200]`` and ``(200, inf)``.
    With the former left-closed bins a total of exactly 50 was labelled
    "$51-$100" and exactly 100 was labelled "$101-$200", contradicting the
    label text. Negative or missing totals end up as NaN.

    Args:
        df: DataFrame containing an ``Add-on Total`` column. It is modified
            in place.

    Returns:
        The same DataFrame object, now carrying ``Addons_Spending_Category``.
    """
    bins = [0, 50, 100, 200, np.inf]
    labels = [
        "Low Add-ons Spender (0-$50)",
        "Medium Add-ons Spender ($51-$100)",
        "High Add-ons Spender ($101-$200)",
        "Luxury Add-ons Spender ($200+)",
    ]
    df["Addons_Spending_Category"] = pd.cut(
        df["Add-on Total"],
        bins=bins,
        labels=labels,
        right=True,
        # Right-closed bins would otherwise drop the lower edge; keep a
        # total of exactly 0 (no add-ons bought) in the lowest bucket.
        include_lowest=True,
    )
    return df


def print_table(df_subset, title):
    """Print ``df_subset`` as a rich ``Table`` with the given title.

    One table column is created per DataFrame column (centred, bold white,
    no wrapping) and one table row per DataFrame row, with every cell
    converted to its string form. The DataFrame index is not shown.

    Args:
        df_subset: DataFrame to display; keep it small, every row is printed.
        title: Caption rendered above the table in bold yellow.
    """
    table = Table(title=title, title_style="bold yellow", show_lines=True)

    for header in df_subset.columns:
        table.add_column(header, style="white bold", justify="center", no_wrap=True)

    # The index label yielded by iterrows() is deliberately ignored.
    for _, record in df_subset.iterrows():
        table.add_row(*map(str, record))

    Console().print(table)


# Tables showing the result of the coding/labelling step.
# Run every labelling helper in order; each one adds a single category column.
for labeler in (
    categorized_age,
    categorized_spending,
    categorized_item_price,
    categorized_addons_spending,
):
    df = labeler(df)

# (source column, derived label column, table title) for each preview table.
label_previews = (
    ("Age", "Age_Group", "Age Coding"),
    ("Total Price", "Spending_Category", "Total Price Coding"),
    ("Unit Price", "Item_Price_Category", "Unit Price Coding"),
    ("Add-on Total", "Addons_Spending_Category", "Add-on Total Coding"),
)
for source_column, label_column, table_title in label_previews:
    print_table(df[[source_column, label_column]].head(7), table_title)

# Persist the labelled frame (without the index) for later steps.
df.to_csv("labeled_customer_data.csv", index=False)