# Import Library

In [13]:
import os
import sys

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Fungsi Pembantu: Mengelompokkan Kategori Langka

In [14]:
def group_rare_categories(df, column_name, top_n=50, other_category_name=None):
    """
    Collapse the infrequent categories of one column into a single bucket.

    The ``top_n`` most frequent values of ``df[column_name]`` are kept as they
    are; every other value is replaced by ``other_category_name``. Missing
    values are not counted by ``value_counts`` and therefore also end up in
    the bucket.

    The input frame is not modified: a copy carrying the regrouped column is
    returned. The bucket label itself never competes for one of the ``top_n``
    slots, so calling the function again on its own output (for example when
    a notebook cell is re-run) leaves the data unchanged.

    Args:
        df (pd.DataFrame): Input frame.
        column_name (str): Name of the categorical column to regroup.
        top_n (int): Number of most frequent categories to keep. All other
            categories are merged.
        other_category_name (str, optional): Label given to the merged
            values. Defaults to ``f'Other_{column_name}'``.

    Returns:
        pd.DataFrame: A copy of ``df`` with the regrouped column, or ``df``
        itself (unchanged) when ``column_name`` is not one of its columns.
    """
    if column_name not in df.columns:
        print(f"Peringatan: Kolom '{column_name}' tidak ditemukan untuk pengelompokan kategori langka.")
        return df

    if other_category_name is None:
        other_category_name = f'Other_{column_name}'

    column = df[column_name]
    # Categorical columns can reject a label that is not one of their
    # categories, so do the relabelling on plain objects instead.
    if isinstance(column.dtype, pd.CategoricalDtype):
        column = column.astype(object)

    # Rank the real categories only: a bucket left over from a previous call
    # must not take a slot away from a genuine category.
    value_counts = column.value_counts().drop(labels=other_category_name, errors='ignore')
    top_categories = value_counts.nlargest(top_n).index

    # Vectorised replacement; avoids one Python-level call per row.
    grouped = column.where(column.isin(top_categories), other_category_name)

    result = df.copy()
    result[column_name] = grouped

    print(f"  - Kolom '{column_name}': Kardinalitas dikurangi menjadi {result[column_name].nunique()} setelah mengelompokkan kategori langka (Top {top_n}).")
    return result

# DEFINISI JALUR FILE INPUT DAN OUTPUT

In [15]:
# File locations used by the rest of the notebook: the base dataset that is
# read, and the two artefacts that are written (feature CSV, preprocessor).
input_csv_base = '../data/base_processed_data.csv'
output_csv_unsupervised = '../data/unsupervised_ready_data.csv'
preprocessor_output_path_unsupervised = '../models/preprocessor_unsupervised.pkl'

# Echo the configuration so it is captured in the cell output.
print(
    f"Jalur Input CSV: {input_csv_base}",
    f"Jalur Output CSV: {output_csv_unsupervised}",
    f"Jalur Output Preprocessor: {preprocessor_output_path_unsupervised}",
    sep="\n",
)

Jalur Input CSV: ../data/base_processed_data.csv
Jalur Output CSV: ../data/unsupervised_ready_data.csv
Jalur Output Preprocessor: ../models/preprocessor_unsupervised.pkl


# Muat Data Dasar dan Analisis Kardinalitas Awal

In [16]:
# Record the runtime environment next to the results of this run.
print("--- MEMULAI PROSES PREPROCESSING UNSUPERVISED ---")
print(f"Python Executable: {sys.executable}")
print(f"Versi Pandas: {pd.__version__}")
print(f"Versi NumPy: {np.__version__}")

# Load the base dataset produced by the earlier processing step.
print(f"\nMemuat data dasar dari: {input_csv_base}")
df = pd.read_csv(input_csv_base)
print(f"Data dasar dimuat. Ukuran: {df.shape}")

# Report how many distinct values each string (object) column holds before
# any rare-category grouping is applied.
print("\nKardinalitas kolom kategorikal sebelum pengelompokan:")
for col, unique_count in df.select_dtypes(include='object').nunique().items():
    print(f"Kolom '{col}': {unique_count} nilai unik.")

--- MEMULAI PROSES PREPROCESSING UNSUPERVISED ---
Python Executable: c:\Users\alyar\AppData\Local\Programs\Python\Python313\python.exe
Versi Pandas: 2.2.3
Versi NumPy: 2.2.6

Memuat data dasar dari: ../data/base_processed_data.csv
Data dasar dimuat. Ukuran: (558837, 14)

Kardinalitas kolom kategorikal sebelum pengelompokan:
Kolom 'make': 96 nilai unik.
Kolom 'model': 973 nilai unik.
Kolom 'trim': 1963 nilai unik.
Kolom 'body': 87 nilai unik.
Kolom 'transmission': 4 nilai unik.
Kolom 'state': 64 nilai unik.
Kolom 'color': 46 nilai unik.
Kolom 'interior': 17 nilai unik.
Kolom 'seller': 14263 nilai unik.


# Identifikasi Kolom

In [None]:
# Split all columns of the loaded frame by dtype: numeric vs. object (string).
numerical_cols_raw = list(df.select_dtypes(include=np.number).columns)
categorical_cols_raw = list(df.select_dtypes(include='object').columns)

# Features intended for clustering; a name that is not a column of the frame
# is dropped without any message.
selected_features_for_clustering = [
    feature
    for feature in ('mmr', 'sellingprice', 'year', 'odometer', 'condition', 'body', 'make')
    if feature in df.columns
]

# NOTE(review): this copy is taken before the rare-category grouping cell runs
# and `df_unsupervised` is assigned again in the later feature-selection cell,
# so this snapshot only feeds the debug output below.
df_unsupervised = df.loc[:, selected_features_for_clustering].copy()

# Keep the selection order while tagging each feature as numeric or categorical.
numerical_cols_unsupervised = [
    feature for feature in selected_features_for_clustering if feature in numerical_cols_raw
]
categorical_cols_unsupervised = [
    feature for feature in selected_features_for_clustering if feature in categorical_cols_raw
]

print(f"\nFitur yang dipilih untuk Clustering: {selected_features_for_clustering}")
print(f"Kolom Numerik untuk Clustering: {numerical_cols_unsupervised}")
print(f"Kolom Kategorikal untuk Clustering: {categorical_cols_unsupervised}")

print(f"\nDEBUG_INFO: Shape of df_unsupervised sebelum transformasi: {df_unsupervised.shape}")
print(f"DEBUG_INFO: Kolom di df_unsupervised: {list(df_unsupervised.columns)}")
print(f"DEBUG_INFO: Dtypes of df_unsupervised: \n{df_unsupervised.dtypes}")


Fitur yang dipilih untuk Clustering: ['mmr', 'sellingprice', 'year', 'odometer', 'condition', 'body', 'make']
Kolom Numerik untuk Clustering: ['mmr', 'sellingprice', 'year', 'odometer', 'condition']
Kolom Kategorikal untuk Clustering: ['body', 'make']

DEBUG_INFO: Shape of df_unsupervised sebelum transformasi: (558837, 7)
DEBUG_INFO: Kolom di df_unsupervised: ['mmr', 'sellingprice', 'year', 'odometer', 'condition', 'body', 'make']
DEBUG_INFO: Dtypes of df_unsupervised: 
mmr             float64
sellingprice    float64
year              int64
odometer        float64
condition       float64
body             object
make             object
dtype: object


# Pengelompokan Kategori Langka

In [None]:
print("\nMelakukan pengelompokan kategori langka secara LEBIH AGRESIF untuk unsupervised...")

# Only the categorical features used for the unsupervised model are grouped.
# Tune the number of categories kept per column here:
#   'make': 96 distinct values -> about 25 + 1 bucket
#   'body': 87 distinct values -> about 10 + 1 bucket
for grouped_column, categories_to_keep in (('make', 25), ('body', 10)):
    df = group_rare_categories(df, grouped_column, top_n=categories_to_keep)

# Report the distinct-value count of every string column after grouping.
print("\nKardinalitas kolom kategorikal setelah pengelompokan:")
for col, unique_count in df.select_dtypes(include='object').nunique().items():
    print(f"Kolom '{col}': {unique_count} nilai unik.")


Melakukan pengelompokan kategori langka secara LEBIH AGRESIF untuk unsupervised...
  - Kolom 'make': Kardinalitas dikurangi menjadi 26 setelah mengelompokkan kategori langka (Top 25).
  - Kolom 'body': Kardinalitas dikurangi menjadi 11 setelah mengelompokkan kategori langka (Top 10).

Kardinalitas kolom kategorikal setelah pengelompokan:
Kolom 'make': 26 nilai unik.
Kolom 'model': 973 nilai unik.
Kolom 'trim': 1963 nilai unik.
Kolom 'body': 11 nilai unik.
Kolom 'transmission': 4 nilai unik.
Kolom 'state': 64 nilai unik.
Kolom 'color': 46 nilai unik.
Kolom 'interior': 17 nilai unik.
Kolom 'seller': 14263 nilai unik.


# Seleksi Fitur untuk Unsupervised dan Identifikasi Kolom

In [None]:
# --- Numeric and categorical features used for unsupervised learning ---
# 'sellingprice' is an ordinary feature here, not a target.
unsupervised_numerical_cols = ['mmr', 'sellingprice', 'year', 'odometer', 'condition']
unsupervised_categorical_cols = ['body', 'make']

# --- Full feature selection: numeric columns first, then categorical ones ---
selected_features_for_unsupervised = [
    *unsupervised_numerical_cols,
    *unsupervised_categorical_cols,
]

# Work on an independent copy restricted to the selected features.
df_unsupervised = df.loc[:, selected_features_for_unsupervised].copy()

print(f"\nDataFrame untuk unsupervised learning setelah seleksi fitur: {df_unsupervised.shape}")
print(f"Kolom yang dipilih: {list(df_unsupervised.columns)}")

print(f"\nKolom Numerik untuk Unsupervised: {unsupervised_numerical_cols}")
print(f"Kolom Kategorikal untuk Unsupervised: {unsupervised_categorical_cols}")

print(f"\nDEBUG_INFO: Dtypes of df_unsupervised: \n{df_unsupervised.dtypes}")


DataFrame untuk unsupervised learning setelah seleksi fitur: (558837, 7)
Kolom yang dipilih: ['mmr', 'sellingprice', 'year', 'odometer', 'condition', 'body', 'make']

Kolom Numerik untuk Unsupervised: ['mmr', 'sellingprice', 'year', 'odometer', 'condition']
Kolom Kategorikal untuk Unsupervised: ['body', 'make']

DEBUG_INFO: Dtypes of df_unsupervised: 
mmr             float64
sellingprice    float64
year              int64
odometer        float64
condition       float64
body             object
make             object
dtype: object


# Buat dan Terapkan ColumnTransformer untuk Unsupervised

In [None]:
# --- Build the preprocessing pipeline (ColumnTransformer) for unsupervised use ---
# Numeric features are standardised.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical features are one-hot encoded with the first level of each
# feature dropped.
# NOTE(review): per the scikit-learn documentation, with
# handle_unknown='ignore' an unseen category is encoded as all zeros, which
# with drop='first' is the same encoding as the dropped level - confirm this
# is acceptable for data transformed later with the saved preprocessor.
categorical_transformer = Pipeline(
    [('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))]
)

# Columns not listed in either group would be passed through untouched.
column_groups = [
    ('num', numeric_transformer, unsupervised_numerical_cols),
    ('cat', categorical_transformer, unsupervised_categorical_cols),
]
preprocessor_unsupervised = ColumnTransformer(transformers=column_groups, remainder='passthrough')

# --- Fit the transformer on the selected features and transform them ---
print("\nMenerapkan transformasi fitur dengan ColumnTransformer untuk unsupervised...")
X_unsupervised_processed_array = preprocessor_unsupervised.fit_transform(df_unsupervised)

print(f"DEBUG_INFO: Shape of X_unsupervised_processed_array setelah preprocessor: {X_unsupervised_processed_array.shape}")


Menerapkan transformasi fitur dengan ColumnTransformer untuk unsupervised...
DEBUG_INFO: Shape of X_unsupervised_processed_array setelah preprocessor: (558837, 40)


# Konversi ke Dense Array dan Dapatkan Nama Kolom

In [21]:
# --- Convert a sparse result to a dense array ---
# Only objects exposing `toarray` are converted; anything else is used as is.
print("\nMengonversi sparse matrix ke dense array (jika diperlukan)...")
X_unsupervised_processed_dense = (
    X_unsupervised_processed_array.toarray()
    if hasattr(X_unsupervised_processed_array, "toarray")
    else X_unsupervised_processed_array
)
print(f"DEBUG_INFO: Shape of X_unsupervised_processed_dense setelah konversi: {X_unsupervised_processed_dense.shape}")

# --- Rebuild the column names in the order the ColumnTransformer lists its groups ---
# Scaled numeric columns keep their original names.
scaled_feature_names = unsupervised_numerical_cols
# One-hot columns get their names from the fitted encoder.
fitted_onehot_encoder = preprocessor_unsupervised.named_transformers_['cat'].named_steps['onehot']
ohe_feature_names = fitted_onehot_encoder.get_feature_names_out(unsupervised_categorical_cols)

# With remainder='passthrough', any column outside both groups is appended
# untransformed; collect those names so the total count stays consistent.
all_transformed_by_name = set(unsupervised_numerical_cols + unsupervised_categorical_cols)
remainder_feature_names = [
    name for name in df_unsupervised.columns if name not in all_transformed_by_name
]

processed_unsupervised_feature_names = [
    *scaled_feature_names,
    *ohe_feature_names,
    *remainder_feature_names,
]

print(f"\nDEBUG_INFO: Jumlah scaled_feature_names: {len(scaled_feature_names)}")
print(f"DEBUG_INFO: Jumlah ohe_feature_names: {len(ohe_feature_names)}")
print(f"DEBUG_INFO: Jumlah remainder_feature_names: {len(remainder_feature_names)}")
print(f"DEBUG_INFO: Total processed_unsupervised_feature_names: {len(processed_unsupervised_feature_names)}")

# --- Final check before the DataFrame is built ---
# The number of names must match the number of array columns.
if len(processed_unsupervised_feature_names) != X_unsupervised_processed_dense.shape[1]:
    print("\n!!! PERINGATAN: Jumlah kolom tidak sesuai !!!")
    raise ValueError("Jumlah kolom dalam array yang diproses tidak sesuai dengan jumlah nama kolom yang diberikan.")


Mengonversi sparse matrix ke dense array (jika diperlukan)...
DEBUG_INFO: Shape of X_unsupervised_processed_dense setelah konversi: (558837, 40)

DEBUG_INFO: Jumlah scaled_feature_names: 5
DEBUG_INFO: Jumlah ohe_feature_names: 35
DEBUG_INFO: Jumlah remainder_feature_names: 0
DEBUG_INFO: Total processed_unsupervised_feature_names: 40


# Buat DataFrame dan Optimasi Tipe Data

In [22]:
def _frame_size_mb(frame):
    """Return the deep memory footprint of ``frame`` in mebibytes."""
    return frame.memory_usage(deep=True).sum() / (1024**2)


# Wrap the processed feature array in a DataFrame with the rebuilt column names.
df_unsupervised_processed = pd.DataFrame(X_unsupervised_processed_dense, columns=processed_unsupervised_feature_names)

# --- Downcast dtypes to shrink the in-memory size ---
print("\nMelakukan optimasi tipe data untuk mengurangi ukuran memori dan file...")
initial_memory_usage = _frame_size_mb(df_unsupervised_processed)  # MB
print(f"Ukuran memori awal DataFrame: {initial_memory_usage:.2f} MB")

for col in df_unsupervised_processed.columns:
    column_dtype = df_unsupervised_processed[col].dtype

    # 64-bit floats are always stored as 32-bit floats.
    if column_dtype == 'float64':
        df_unsupervised_processed[col] = df_unsupervised_processed[col].astype('float32')
        continue

    # Every other dtype except 64-bit integers is left alone.
    if column_dtype != 'int64':
        continue

    # 64-bit integers take the narrowest of int8/int16/int32 that holds the
    # column's value range; if none fits, the column stays int64.
    min_val = df_unsupervised_processed[col].min()
    max_val = df_unsupervised_processed[col].max()
    for narrower_type in (np.int8, np.int16, np.int32):
        limits = np.iinfo(narrower_type)
        if min_val >= limits.min and max_val <= limits.max:
            df_unsupervised_processed[col] = df_unsupervised_processed[col].astype(narrower_type)
            break

final_memory_usage = _frame_size_mb(df_unsupervised_processed)  # MB
print(f"Ukuran memori akhir DataFrame: {final_memory_usage:.2f} MB")
print(f"Penghematan memori: {((initial_memory_usage - final_memory_usage) / initial_memory_usage) * 100:.2f}%")


Melakukan optimasi tipe data untuk mengurangi ukuran memori dan file...
Ukuran memori awal DataFrame: 170.54 MB
Ukuran memori akhir DataFrame: 85.27 MB
Penghematan memori: 50.00%


# Simpan Data

In [23]:
# --- Write the processed feature table to disk as CSV (index not written) ---
print(f"Menyimpan data yang telah diproses ke: {output_csv_unsupervised} (CSV)")
df_unsupervised_processed.to_csv(path_or_buf=output_csv_unsupervised, index=False)
print(f"Ukuran data yang disimpan (CSV): {df_unsupervised_processed.shape}")

# Display the first five rows as a visual confirmation (rendered as the cell output).
print("\nContoh data yang telah diproses (5 baris pertama):")
df_unsupervised_processed.head(5)

Menyimpan data yang telah diproses ke: ../data/unsupervised_ready_data.csv (CSV)


Ukuran data yang disimpan (CSV): (558837, 40)

Contoh data yang telah diproses (5 baris pertama):


Unnamed: 0,mmr,sellingprice,year,odometer,condition,body_Coupe,body_Crew Cab,body_Hatchback,body_Minivan,body_Other_body,...,make_Lincoln,make_Mazda,make_Mercedes-Benz,make_Nissan,make_Other_make,make_Pontiac,make_Ram,make_Subaru,make_Toyota,make_Volkswagen
0,0.695349,0.809145,1.25063,-0.96786,-1.940793,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.726342,0.809145,1.25063,-1.103567,-1.940793,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.873079,1.680995,0.998541,-1.254557,1.072405,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.418517,1.450211,1.25063,-1.012003,0.771085,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,5.395938,5.476104,0.998541,-1.230022,0.921745,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Simpan Preprocessor dan Nama Fitur (PKL)

In [None]:
# Bundle the fitted ColumnTransformer with the ordered output column names so
# both can be restored together from a single file.
unsupervised_preprocessing_objects = {
    'preprocessor': preprocessor_unsupervised,
    'processed_feature_names': processed_unsupervised_feature_names
}

# Make sure the target folder exists, so the dump does not fail on a checkout
# where it has not been created yet. A bare filename has no folder part.
preprocessor_output_dir = os.path.dirname(preprocessor_output_path_unsupervised)
if preprocessor_output_dir:
    os.makedirs(preprocessor_output_dir, exist_ok=True)

print(f"Menyimpan objek preprocessor unsupervised ke: {preprocessor_output_path_unsupervised}")
joblib.dump(unsupervised_preprocessing_objects, preprocessor_output_path_unsupervised)
print("Preprocessor Unsupervised berhasil disimpan.")

Menyimpan objek preprocessor unsupervised ke: ../models/preprocessor_unsupervised.pkl
Preprocessor Unsupervised berhasil disimpan.
