In [1]:
# Import library yang dibutuhkan
import pandas as pd
import numpy as np 
import os

# --- Konfigurasi Nama Kolom dan Path File ---
NAMA_FILE_CSV = 'histori_maintenance.csv' # Pastikan sudah ada kolom aircraft_type
NAMA_FILE_PROCESSED_CSV = 'processed_data_v3.csv' # Versi baru untuk file yang diproses

# Definisikan nama-nama kolom aktual
COLUMN_AIRCRAFT_TYPE = 'aircraft_type' # Kolom baru
COLUMN_FINDING_DESC = 'finding_description'
COLUMN_ACTION_TAKEN = 'action_taken'
COLUMN_WORK_CENTRE = 'work_centre'
COLUMN_PLANT = 'Plant'
COLUMN_ORDER = 'Order'
COLUMN_MATERIALS = 'materials_required'
COLUMN_MAN_HOURS = 'man_hours'


# Path ke file data mentah dan folder data yang diproses
PATH_RAW_DATA = os.path.join('..', 'data', 'raw', NAMA_FILE_CSV)
PATH_PROCESSED_DATA_DIR = os.path.join('..', 'data', 'processed')
PATH_PROCESSED_FILE = os.path.join(PATH_PROCESSED_DATA_DIR, NAMA_FILE_PROCESSED_CSV)

# --- 1. Memuat Data Mentah ---
print(f"Mencoba memuat data dari: {PATH_RAW_DATA}")
try:
    df = pd.read_csv(PATH_RAW_DATA)
    print("Data CSV berhasil dimuat!")
    df.columns = df.columns.str.strip() # Membersihkan spasi nama kolom
    print(f"Nama kolom terdeteksi: {df.columns.tolist()}")

    # Validasi keberadaan kolom baru
    if COLUMN_AIRCRAFT_TYPE not in df.columns:
        print(f"PERINGATAN: Kolom krusial '{COLUMN_AIRCRAFT_TYPE}' tidak ditemukan di CSV. Aplikasi tidak akan berfungsi dengan benar.")
    else:
        print(f"Kolom '{COLUMN_AIRCRAFT_TYPE}' berhasil ditemukan.")

except FileNotFoundError:
    print(f"ERROR: File tidak ditemukan di {PATH_RAW_DATA}")
    df = pd.DataFrame()
except Exception as e:
    print(f"Terjadi error saat memuat CSV: {e}")
    df = pd.DataFrame()

if not df.empty:
    print("\n--- 2. Pembersihan Data Awal ---")
    
    # a. Menangani Nilai Kosong (Missing Values)
    critical_cols_for_dropna = [COLUMN_AIRCRAFT_TYPE, COLUMN_FINDING_DESC, COLUMN_ACTION_TAKEN]
    print(f"Jumlah baris sebelum menangani NaN di kolom krusial: {len(df)}")
    df.dropna(subset=critical_cols_for_dropna, inplace=True)
    print(f"Jumlah baris setelah menghapus NaN di {critical_cols_for_dropna}: {len(df)}")

    # b. Memastikan Tipe Data yang Benar
    try:
        df[COLUMN_MAN_HOURS] = pd.to_numeric(df[COLUMN_MAN_HOURS], errors='coerce')
        df.dropna(subset=[COLUMN_MAN_HOURS], inplace=True)

        string_columns_to_clean = [
            COLUMN_AIRCRAFT_TYPE, COLUMN_FINDING_DESC, COLUMN_ACTION_TAKEN, 
            COLUMN_WORK_CENTRE, COLUMN_MATERIALS, COLUMN_PLANT, COLUMN_ORDER
        ]

        for col in string_columns_to_clean:
            if col in df.columns: 
                df[col] = df[col].astype(str).str.lower().str.strip()
                df[col].replace('nan', np.nan, inplace=True) 
        
        print("Tipe data setelah konversi dan pembersihan teks dasar berhasil.")
        print("\nContoh data setelah pembersihan:")
        print(df.head())

    except KeyError as e:
        print(f"ERROR: Kolom tidak ditemukan saat konversi tipe data atau pembersihan teks: {e}")
        df = pd.DataFrame()
    except Exception as e:
        print(f"Terjadi error saat pembersihan data: {e}")
        df = pd.DataFrame()

if not df.empty:
    print("\n--- 3. Agregasi Data ---")
    
    def aggregate_to_list(series):
        cleaned_list = series.dropna().tolist()
        return cleaned_list if cleaned_list else []

    def aggregate_first_valid(series):
        first_valid = series.dropna().iloc[0] if not series.dropna().empty else np.nan
        return first_valid

    # Konfigurasi agregasi
    agg_config = {
        'aircraft_type': (COLUMN_AIRCRAFT_TYPE, aggregate_first_valid), # MENAMBAHKAN AIRCRAFT_TYPE
        'order_info': (COLUMN_ORDER, aggregate_first_valid),
        'materials_info': (COLUMN_MATERIALS, aggregate_first_valid),
        'rectification_steps': (COLUMN_ACTION_TAKEN, aggregate_to_list),
        'man_hours_per_step': (COLUMN_MAN_HOURS, aggregate_to_list),
        'work_centres_per_step': (COLUMN_WORK_CENTRE, aggregate_to_list),
        'plants_per_step': (COLUMN_PLANT, aggregate_to_list)
    }
    
    # Filter konfigurasi agar hanya berisi kolom yang ada di DataFrame
    valid_agg_config = {k: v for k, v in agg_config.items() if v[0] in df.columns}
    
    try:
        processed_df = df.groupby(COLUMN_FINDING_DESC).agg(**valid_agg_config).reset_index()

        print("Data berhasil diagregasi.")
        print("Contoh data setelah agregasi:")
        print(processed_df.head())
        print("\nKolom pada processed_df:")
        print(processed_df.columns.tolist())

        print("\n--- 4. Menyimpan Data yang Sudah Diproses ---")
        if not os.path.exists(PATH_PROCESSED_DATA_DIR):
            os.makedirs(PATH_PROCESSED_DATA_DIR)
        
        processed_df.to_csv(PATH_PROCESSED_FILE, index=False)
        print(f"Data yang sudah diproses berhasil disimpan di: {PATH_PROCESSED_FILE}")

    except Exception as e:
        print(f"Terjadi error saat agregasi atau penyimpanan data: {e}")
else:
    print("\nTidak ada data untuk diproses lebih lanjut karena DataFrame kosong.")

Mencoba memuat data dari: ..\data\raw\histori_maintenance.csv
Data CSV berhasil dimuat!
Nama kolom terdeteksi: ['finding_description', 'action_taken', 'work_centre', 'materials_required', 'man_hours', 'Plant', 'Order', 'aircraft_type']
Kolom 'aircraft_type' berhasil ditemukan.

--- 2. Pembersihan Data Awal ---
Jumlah baris sebelum menangani NaN di kolom krusial: 22892
Jumlah baris setelah menghapus NaN di ['aircraft_type', 'finding_description', 'action_taken']: 22712


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].replace('nan', np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].replace('nan', np.nan, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alw

Tipe data setelah konversi dan pembersihan teks dasar berhasil.

Contoh data setelah pembersihan:
                                 finding_description  \
0           a330 9m-xxr eng #1 phase array inspectio   
1  during preliminary: panel 454jl 1 ea screw and...   
2  during preliminary: panel 454jl 1 ea screw and...   
3  during preliminary: rh eng scan light not ill ...   
4  during preliminary: bleeding cap valve brake #...   

                                        action_taken work_centre  \
0           a330 9m-xxr eng #1 phase array inspectio        w808   
1  complete screw and washer at panel 454jl ref i...    gah210a1   
2  install screw and washer the pylon-to-wing fil...    gah210a1   
3  pse replace rh engine scan light ref amm task ...    gah210e2   
4  complete plug at brake #8 ref ipc fig. 32-42-2...    gah210a1   

           materials_required  man_hours Plant      Order aircraft_type  
0                         NaN        9.0  wsnc  804422680          a330  
1  nas11