In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings

# Suppress every warning category for the rest of the session. Note that this
# also hides warnings emitted by the libraries imported above.
warnings.filterwarnings('ignore')
# Confirmation message so the reader can see that the import cell has run.
print("✅ Libraries berhasil diimport.")

✅ Libraries berhasil diimport.


PREPROCESSING

In [None]:
# Location of the wide-format source table. Kept in one place so that the
# read call and the error message can never disagree about the file name.
DATA_PATH = '../dataset/data_isolation_forest.csv'

try:
    df_raw = pd.read_csv(DATA_PATH, sep=';')
except FileNotFoundError:
    # Report the file that was actually requested, then re-raise: this stops
    # the run at this cell with a traceback instead of calling exit(), so the
    # session and any state it already holds stay available.
    print(f"❌ Error: File '{DATA_PATH}' tidak ditemukan.")
    raise

# 'text' and 'jenis' identify a row; every other column is treated as one
# week of sales and is unpivoted into (week_range, sales) pairs.
id_vars = ['text', 'jenis']
week_cols = [col for col in df_raw.columns if col not in id_vars]
df_long = pd.melt(df_raw, id_vars=id_vars, value_vars=week_cols, var_name='week_range', value_name='sales')

# The column header is a range "<start> - <end>"; only the start date
# (day/month/year) is parsed and used as the week key.
df_long['week_start'] = pd.to_datetime(df_long['week_range'].str.split(' - ').str[0], format='%d/%m/%Y')
# df_long.to_csv('transformasi_data_if.csv', index=False, sep=';')

print(f"✅ Data berhasil dimuat dan diproses.")
print(f"   Shape data setelah diproses: {df_long.shape}")
print(f"   Rentang waktu data: {df_long['week_start'].min().date()} hingga {df_long['week_start'].max().date()}")


✅ Data berhasil dimuat dan diproses.
   Shape data setelah diproses: (1029912, 5)
   Rentang waktu data: 2020-01-06 hingga 2022-12-26


FEATURE ENGINEERING

In [3]:
print("Memulai proses feature engineering...")

# One group per week start date; the grouped object is reused further down
# for the per-week Gini coefficient.
weekly_data = df_long.groupby('week_start')

# Output column name -> (source column, aggregation). The order of the
# entries is the column order of the resulting frame.
weekly_aggregations = {
    'total_sales': pd.NamedAgg(column='sales', aggfunc='sum'),
    # Number of rows in the week whose sales value is strictly positive.
    'num_products_sold': pd.NamedAgg(column='sales', aggfunc=lambda week_sales: (week_sales > 0).sum()),
    'max_single_product': pd.NamedAgg(column='sales', aggfunc='max'),
    'mean_sales': pd.NamedAgg(column='sales', aggfunc='mean'),
    'std_sales': pd.NamedAgg(column='sales', aggfunc='std'),
}

# Missing aggregates (e.g. the std of a single-row week) become 0.
weekly_features = weekly_data.agg(**weekly_aggregations).fillna(0)

# Gini coefficient feature
def gini(arr):
    """Return the Gini coefficient of the values in ``arr``.

    The values are sorted ascending and combined with the rank-weighted
    formula ``sum((2*i - n - 1) * x_i) / (n * sum(x))`` for ranks
    ``i = 1..n``.

    Args:
        arr: One-dimensional collection of numbers (anything ``np.sort``
            accepts).

    Returns:
        ``0`` when the input is empty or its values sum to zero, otherwise
        the coefficient computed by the formula above.
    """
    ordered = np.sort(arr)
    count = ordered.shape[0]
    total = np.sum(ordered)
    # Guard against an empty week and against dividing by a zero total.
    if count == 0 or total == 0:
        return 0
    ranks = np.arange(1, count + 1)
    rank_weights = 2 * ranks - count - 1
    return np.sum(rank_weights * ordered) / (count * total)

# Concentration of sales within each week.
gini_per_week = weekly_data['sales'].apply(gini)
weekly_features['gini_coefficient'] = gini_per_week

# Values that a division by zero can produce and that are reset to 0 below.
infinite_values = [np.inf, -np.inf]

# Week-on-week change feature: percentage change of total sales versus the
# previous row. The first row has no predecessor and is set to 0.
relative_change = weekly_features['total_sales'].pct_change()
weekly_features['week_on_week_change'] = relative_change.fillna(0).mul(100)
weekly_features.replace(infinite_values, 0, inplace=True)

# Spike-from-rolling-mean feature: relative distance of the week's total from
# the mean of a window of up to 4 rows that ends at (and includes) the week.
rolling_mean = weekly_features['total_sales'].rolling(window=4, min_periods=1).mean()
distance_from_mean = weekly_features['total_sales'].sub(rolling_mean)
weekly_features['spike_from_mean'] = distance_from_mean.div(rolling_mean)
weekly_features.replace(infinite_values, 0, inplace=True)
weekly_features['spike_from_mean'] = weekly_features['spike_from_mean'].fillna(0).mul(100)

print("Feature engineering selesai.")
feature_count = len(weekly_features.columns)
print(f"Jumlah fitur yang dibuat: {feature_count}")

# weekly_features.to_csv('feature_engineering_noduplikat.csv', sep=';')


Memulai proses feature engineering...
Feature engineering selesai.
Jumlah fitur yang dibuat: 8


In [4]:
# Standarisasi fitur
scaler = StandardScaler()
features_scaled = scaler.fit_transform(weekly_features)

features_scaled_df = pd.DataFrame(features_scaled, columns=weekly_features.columns, index=weekly_features.index)
features_scaled_df.to_csv('../dataset/preprocessed_if_data.csv', sep=';')

ANOMALY DETECTION

In [5]:
print("Memulai deteksi anomali untuk seluruh data...")

# Fit the forest on the standardised weekly features; the fixed random_state
# keeps repeated runs identical.
model = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)
model.fit(features_scaled)

# predict() labels each week 1 or -1 (the filter below keeps -1);
# decision_function() stores the model's score for the same rows.
weekly_features['anomaly_flag'] = model.predict(features_scaled)
weekly_features['anomaly_score'] = model.decision_function(features_scaled)

# Keep only flagged weeks that are upward spikes: above their rolling mean
# and higher than the previous week.
flagged_by_model = weekly_features['anomaly_flag'].eq(-1)
above_rolling_mean = weekly_features['spike_from_mean'].gt(0)
grew_week_on_week = weekly_features['week_on_week_change'].gt(0)
all_anomalies = weekly_features.loc[flagged_by_model & above_rolling_mean & grew_week_on_week].copy()

anomaly_count = len(all_anomalies)
print(f"Analisis selesai: Ditemukan {anomaly_count} anomali lonjakan penjualan di seluruh data.")

Memulai deteksi anomali untuk seluruh data...
Analisis selesai: Ditemukan 7 anomali lonjakan penjualan di seluruh data.


In [7]:
# Names of the engineered features, in the column order of the scaled vector
# that explain_instance() receives.
feature_cols = [
    'total_sales', 'num_products_sold', 'max_single_product', 'mean_sales',
    'std_sales', 'gini_coefficient', 'week_on_week_change', 'spike_from_mean'
]
# Number of features listed in the textual explanation.
TOP_K_FEATURES = 3

def explain_instance(model, x_vec: np.ndarray, scaled_x_vec: np.ndarray):
    """Summarise which features the forest split on while isolating one sample.

    The sample is walked from the root to a leaf in every tree of ``model``.
    For each split on the path, the absolute scaled value of the split feature
    is accumulated and the feature's usage counter is incremented. Features
    are then ranked by their average accumulated value. Because the same
    value is added every time a feature is met, that average equals the
    absolute scaled value of every feature used at least once.

    Args:
        model: Fitted ensemble exposing ``estimators_`` (each with a ``tree_``
            holding ``feature``, ``threshold``, ``children_left`` and
            ``children_right``). If it also exposes ``estimators_features_``,
            the per-tree feature indices are mapped through it, so forests
            trained on feature subsets are reported with the right names.
        x_vec: Unscaled feature vector. Not used by the computation; kept so
            existing callers do not have to change.
        scaled_x_vec: Scaled feature values in ``feature_cols`` order, either
            with shape ``(1, n_features)`` or one-dimensional.

    Returns:
        A single string with ``TOP_K_FEATURES`` entries joined by ``"; "``,
        each of the form
        ``"<feature> (kontribusi Z-score: <avg>, digunakan <count>x)"``.
    """
    nfeat = len(feature_cols)
    contrib = np.zeros(nfeat, dtype=float)
    freq = np.zeros(nfeat, dtype=float)

    # Accept both a (1, n_features) row and a flat vector.
    scaled_row = np.atleast_2d(np.asarray(scaled_x_vec, dtype=float))[0]

    estimators = model.estimators_
    # Column indices each tree was trained on; absent on objects that do not
    # provide the attribute, in which case tree indices are used directly.
    feature_maps = getattr(model, 'estimators_features_', None)
    if feature_maps is None:
        feature_maps = [None] * len(estimators)

    for est, feat_map in zip(estimators, feature_maps):
        tree = est.tree_
        node = 0  # start at the root

        # -2 in tree.feature means the node has no split, i.e. the walk ends.
        while tree.feature[node] != -2:
            local_idx = tree.feature[node]
            # Translate the tree-local feature index to a global column index.
            f_idx = local_idx if feat_map is None else feat_map[local_idx]
            thr = tree.threshold[node]

            scaled_xval = scaled_row[f_idx]

            contrib[f_idx] += abs(scaled_xval)
            freq[f_idx] += 1.0

            if scaled_xval <= thr:
                node = tree.children_left[node]
            else:
                node = tree.children_right[node]

    # Average per use; features never used keep an average of 0.
    avg_margin = np.divide(contrib, np.maximum(freq, 1.0))
    order = np.argsort(-avg_margin)

    top_features = [
        f"{feature_cols[i]} (kontribusi Z-score: {avg_margin[i]:.2f}, digunakan {int(freq[i])}x)"
        for i in order[:TOP_K_FEATURES]
    ]

    return "; ".join(top_features)

print("✅ Fungsi explain_instance (versi Z-score) siap digunakan.")

✅ Fungsi explain_instance (versi Z-score) siap digunakan.


In [8]:
# For every detected spike week, list the five drug categories ('jenis')
# with the largest summed sales in that week.
if 'all_anomalies' in locals() and not all_anomalies.empty:
    print("Analisis Pemicu Anomali (Top 5 Jenis Obat)")
    print("="*50)

    for anomaly_week in all_anomalies.index:
        # Long-format rows belonging to this week, summed per category.
        rows_for_week = df_long.loc[df_long['week_start'].eq(anomaly_week)]
        top_categories = rows_for_week.groupby('jenis')['sales'].sum().nlargest(5)

        print(f"\nMinggu Anomali: {anomaly_week.strftime('%d %B %Y')}")
        print("-" * 35)

        nothing_sold = top_categories.empty or top_categories.sum() == 0
        if nothing_sold:
            print("   Tidak ada penjualan yang signifikan tercatat.")
            continue

        print("   Kontributor Utama Anomali:")
        # Categories with no sales at all are left out of the listing.
        for category, units in top_categories[top_categories > 0].items():
            print(f"   - {category:<20}: {int(units)} penjualan")

    print("\n" + "="*50)
    print("Analisis pemicu anomali selesai.")

else:
    print("Tidak ditemukan anomali, jadi tidak ada analisis pemicu yang dilakukan.")


Analisis Pemicu Anomali (Top 5 Jenis Obat)

Minggu Anomali: 13 January 2020
-----------------------------------
   Kontributor Utama Anomali:
   - Herbal              : 143 penjualan
   - Multivitamin        : 51 penjualan
   - Herbal, Multivitamin: 24 penjualan
   - Analgesik, Antipiretik: 5 penjualan
   - Dekongestan         : 5 penjualan

Minggu Anomali: 30 March 2020
-----------------------------------
   Kontributor Utama Anomali:
   - Herbal              : 331 penjualan
   - Multivitamin        : 158 penjualan
   - Herbal, Multivitamin: 91 penjualan
   - Analgesik, Antipiretik: 28 penjualan
   - Antiseptik          : 28 penjualan

Minggu Anomali: 21 June 2021
-----------------------------------
   Kontributor Utama Anomali:
   - Multivitamin        : 1419 penjualan
   - Herbal              : 624 penjualan
   - Herbal, Multivitamin: 143 penjualan
   - Analgesik, Antipiretik: 73 penjualan
   - Antasida, Multivitamin: 35 penjualan

Minggu Anomali: 28 June 2021
----------------------

In [9]:
# Mean and standard deviation of the 'sales' column per drug category,
# computed over all long-format rows of that category.
jenis_stats = (
    df_long.groupby('jenis')['sales']
    .agg(mean_sales_jenis='mean', std_sales_jenis='std')
    .reset_index()
)
# An undefined standard deviation is replaced by 1 so later divisions by it
# stay defined.
jenis_stats = jenis_stats.fillna({'std_sales_jenis': 1})

print("✅ Statistik baseline per jenis obat berhasil dihitung.")

if 'df_long' in locals():
    # Distinct categories in order of first appearance.
    list_jenis_unik = df_long['jenis'].unique()
    print("Daftar semua jenis obat yang ada di dataset:")
    print(list(list_jenis_unik))

✅ Statistik baseline per jenis obat berhasil dihitung.
Daftar semua jenis obat yang ada di dataset:
['Ekspektoran, Herbal', 'Antiseptik', 'Antiseptik, Dekongestan', 'Antasida', 'Analgesik, Antipiretik', 'Analgesik, Antipiretik, Dekongestan', 'Herbal', 'Analgesik, Antasida, Antipiretik', 'Antidiare', 'Dekongestan, Ekspektoran', 'Herbal, Multivitamin', 'Ekspektoran', 'Dekongestan', 'Antidiare, Herbal', 'Multivitamin', 'Antiseptik, Herbal', 'Antiseptik, Multivitamin', 'Antidiare, Multivitamin', 'Antidiare, Herbal, Multivitamin', 'Antasida, Herbal', 'Antasida, Multivitamin', 'Analgesik', 'Analgesik, Antipiretik, Dekongestan, Ekspektoran', 'Dekongestan, Ekspektoran, Herbal', 'Analgesik, Antipiretik, Dekongestan, Ekspektoran, Herbal', 'Analgesik, Antipiretik, Multivitamin', 'Analgesik, Antipiretik, Ekspektoran', 'Analgesik, Dekongestan', 'Analgesik, Antipiretik, Dekongestan, Ekspektoran, Multivitamin', 'Multivitamin, Herbal']


In [10]:
# Detailed report for every detected spike week:
#   A. deviation of each feature from the weeks the model labelled normal,
#   B. drug categories whose weekly sales deviate most from their own norm,
#   C. features the forest split on while isolating the week,
#   D. plain-language context for each feature.
print("\n\n==========================================================")
print("HASIL INVESTIGASI ANOMALI SECARA DETAIL")
print("==========================================================")

if all_anomalies.empty:
    print("Tidak ditemukan anomali signifikan yang merupakan lonjakan penjualan.")
else:
    all_anomalies.sort_index(inplace=True)
    # Descriptive statistics of the weeks flagged as normal (flag == 1).
    normal_stats = weekly_features[weekly_features['anomaly_flag'] == 1].describe()

    # Baseline per drug category, built from WEEKLY TOTALS per category.
    # Section B compares a category's total for the anomalous week against
    # this baseline, so both sides must be weekly totals; a baseline taken
    # over individual product rows would not be comparable to a weekly sum.
    weekly_sales_per_jenis = df_long.groupby(['jenis', 'week_start'])['sales'].sum()
    jenis_stats = weekly_sales_per_jenis.groupby(level='jenis').agg(['mean', 'std']).reset_index()
    jenis_stats.columns = ['jenis', 'mean_sales_jenis', 'std_sales_jenis']
    # An undefined standard deviation falls back to 1.
    jenis_stats['std_sales_jenis'] = jenis_stats['std_sales_jenis'].fillna(1)

    for date, row in all_anomalies.iterrows():
        sales_on_anomaly_week = df_long[df_long['week_start'] == date]

        print(f"\n\nANOMALI TERDETEKSI PADA MINGGU: {date.strftime('%d %B %Y')}")
        print("----------------------------------------------------------")
        print("A. ANALISIS KUANTITATIF (Penyimpangan dari Norma)")
        print("----------------------------------------------------------")
        for feature in feature_cols:
            anomaly_val = row[feature]
            mean_val = normal_stats.loc['mean', feature]
            std_val = normal_stats.loc['std', feature]
            if std_val > 0:
                deviation = (anomaly_val - mean_val) / std_val
                # Say "above" or "below" according to the sign, instead of
                # printing a negative number labelled as "above the mean".
                arah = 'atas' if deviation >= 0 else 'bawah'
                keterangan = f"({abs(deviation):.2f} std dev di {arah} rata-rata)"
            else:
                keterangan = ""
            print(f"   - {feature:<22}: {anomaly_val:10.2f} {keterangan}")

        print("\n----------------------------------------------------------")
        print("B. ANALISIS KONTRIBUTOR (Jenis Obat dengan Lonjakan Paling Ekstrem)")
        print("----------------------------------------------------------")
        sales_per_jenis_anomaly = sales_on_anomaly_week.groupby('jenis')['sales'].sum().reset_index()
        sales_per_jenis_anomaly.columns = ['jenis', 'sales_anomaly']
        comparison_df = pd.merge(sales_per_jenis_anomaly, jenis_stats, on='jenis', how='left').fillna(0)
        # A zero standard deviation is replaced by 1 to avoid dividing by zero.
        comparison_df['z_score'] = (comparison_df['sales_anomaly'] - comparison_df['mean_sales_jenis']) / (comparison_df['std_sales_jenis'].replace(0, 1))
        top_5_lonjakan = comparison_df.sort_values(by='z_score', ascending=False).head(5)
        print("   Jenis obat dengan lonjakan paling signifikan (relatif terhadap normalnya):")
        for _, sub_row in top_5_lonjakan.iterrows():
            print(f"   - {sub_row['jenis']:<20}: Z-score = {sub_row['z_score']:.2f} (Terjual {int(sub_row['sales_anomaly'])}, Rata-rata {sub_row['mean_sales_jenis']:.2f})")

        # Section C is skipped when the model or the explainer is not defined.
        if 'model' in locals() and 'explain_instance' in locals():
            print("\n----------------------------------------------------------")
            print("C. ANALISIS MEKANISME MODEL (Tree-Path Analysis)")
            print("----------------------------------------------------------")
            x_vector_original = weekly_features.loc[date, feature_cols].values
            x_vector_scaled = scaler.transform(x_vector_original.reshape(1, -1))
            top_features_explanation = explain_instance(model, x_vector_original, x_vector_scaled)
            print(f"   Fitur utama yang mengisolasi anomali ini adalah:")
            for explanation in top_features_explanation.split('; '):
                print(f"     - {explanation}")

        print("\n----------------------------------------------------------")
        print("D. RINCIAN KONTEKSTUAL PER FITUR")
        print("----------------------------------------------------------")

        # Features of the week exactly seven days earlier, if present.
        previous_week_date = date - pd.Timedelta(weeks=1)
        previous_week_features = weekly_features.loc[previous_week_date] if previous_week_date in weekly_features.index else None

        # 1. Total Sales
        if previous_week_features is not None:
            print(f"1. Total Sales: Naik menjadi {row['total_sales']:.0f} dari {previous_week_features['total_sales']:.0f} pada minggu sebelumnya.")
        else:
            print(f"1. Total Sales: {row['total_sales']:.0f} (tidak ada data minggu sebelumnya).")

        # 2. Num Products Sold
        print(f"2. Num Products Sold: Terdapat {int(row['num_products_sold'])} jenis produk berbeda yang terjual.")

        # 3. Max Single Product: the row with the highest sales in this week.
        top_product = sales_on_anomaly_week.loc[sales_on_anomaly_week['sales'].idxmax()]

        # Product names can be long; show at most `max_words` words.
        product_name = top_product['text']
        max_words = 20
        words = product_name.split()
        if len(words) > max_words:
            shortened_name = ' '.join(words[:max_words]) + '...'
        else:
            shortened_name = product_name

        print(f"3. Max Single Product: Produk paling laku adalah '{shortened_name}' dengan penjualan {int(top_product['sales'])} unit.")

        # 4. Week on Week Change
        print(f"4. Week on Week Change: Perubahannya adalah {row['week_on_week_change']:.2f}%.")
        if previous_week_features is not None:
            print(f"   (Dihitung dari {previous_week_features['total_sales']:.0f} -> {row['total_sales']:.0f})")

        # 5. Spike from Mean: rebuild the window of up to four weeks ending at
        #    (and including) this week, oldest first, skipping missing weeks.
        window_dates = [date - pd.Timedelta(weeks=i) for i in range(3, -1, -1) if date - pd.Timedelta(weeks=i) in weekly_features.index]
        window_sales = weekly_features.loc[window_dates, 'total_sales']
        rolling_mean_val = window_sales.mean()
        print(f"5. Spike from Mean: Lonjakannya {row['spike_from_mean']:.2f}% di atas rata-rata 4 minggu terakhir ({rolling_mean_val:.2f}).")
        print(f"   (Penjualan 4 minggu terakhir: {list(window_sales.tolist())})")

        # 6. Std Sales
        print(f"6. Std Sales: Nilai standar deviasi penjualan adalah {row['std_sales']:.2f}.")

        # 7. Mean Sales
        if previous_week_features is not None:
            print(f"7. Mean Sales: Rata-rata penjualan per produk adalah {row['mean_sales']:.2f}, dibandingkan {previous_week_features['mean_sales']:.2f} pada minggu sebelumnya.")
        else:
            print(f"7. Mean Sales: Rata-rata penjualan per produk adalah {row['mean_sales']:.2f}.")

        # 8. Gini Coefficient
        print(f"8. Gini Coefficient: Nilai koefisien Gini adalah {row['gini_coefficient']:.2f}.")



HASIL INVESTIGASI ANOMALI SECARA DETAIL


ANOMALI TERDETEKSI PADA MINGGU: 13 January 2020
----------------------------------------------------------
A. ANALISIS KUANTITATIF (Penyimpangan dari Norma)
----------------------------------------------------------
   - total_sales           :     247.00 (-2.32 std dev di atas rata-rata)
   - num_products_sold     :     175.00 (-2.09 std dev di atas rata-rata)
   - max_single_product    :       9.00 (-1.40 std dev di atas rata-rata)
   - mean_sales            :       0.04 (-2.32 std dev di atas rata-rata)
   - std_sales             :       0.28 (-2.74 std dev di atas rata-rata)
   - gini_coefficient      :       0.98 (1.78 std dev di atas rata-rata)
   - week_on_week_change   :     168.48 (9.75 std dev di atas rata-rata)
   - spike_from_mean       :      45.72 (3.99 std dev di atas rata-rata)

----------------------------------------------------------
B. ANALISIS KONTRIBUTOR (Jenis Obat dengan Lonjakan Paling Ekstrem)
-----------------------

In [None]:
print("⏳ Memulai visualisasi...")
plt.style.use('seaborn-v0_8-whitegrid')

# The three lists below are parallel: entry k of each describes figure k
# (top-panel value first, bottom-panel value second).
feature_groups = [
    ('total_sales', 'num_products_sold'),
    ('max_single_product', 'gini_coefficient'),
    ('week_on_week_change', 'spike_from_mean'),
    ('mean_sales', 'std_sales')
]
colors = [
    ('royalblue', 'green'),
    ('purple', 'orange'),
    ('darkred', 'darkcyan'),
    ('brown', 'magenta')
]
ylabels = [
    ('Total Sales (Log Scale)', 'Jumlah Produk Terjual'),
    ('Max Penjualan 1 Produk', 'Koefisien Gini'),
    ('Perubahan WoW (%)', 'Lonjakan dari Rata-rata (%)'),
    ('Rata-rata Penjualan', 'Std Dev Penjualan')
]

for (feature1, feature2), (color1, color2), (ylabel1, ylabel2) in zip(feature_groups, colors, ylabels):
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(20, 10), sharex=True)
    fig.suptitle(f'Analisis Fitur {feature1} & {feature2}', fontsize=20, weight='bold')

    # Only the top panel labels its anomaly markers for the legend.
    panels = (
        (ax1, feature1, color1, ylabel1, {'label': 'Anomali'}),
        (ax2, feature2, color2, ylabel2, {}),
    )
    for axis, feature, color, ylabel, marker_kwargs in panels:
        axis.plot(weekly_features.index, weekly_features[feature], color=color, label=feature)
        if not all_anomalies.empty:
            # Mark the detected spike weeks on top of the series.
            axis.scatter(all_anomalies.index, all_anomalies[feature], color='red', s=100, marker='X', zorder=5, **marker_kwargs)
        axis.set_ylabel(ylabel, fontsize=12)
        if axis is ax2:
            # The x axis is shared, so only the bottom panel is labelled.
            axis.set_xlabel('Tanggal', fontsize=12)
        axis.legend(loc='upper left')
        axis.grid(True, which='both', linestyle='--', linewidth=0.5)
        if axis is ax1 and feature == 'total_sales':
            axis.set_yscale('log')

    file_name = f'visualisasi_{feature1}_vs_{feature2}.png'
    plt.tight_layout(rect=[0, 0.03, 1, 0.96])
    plt.savefig(file_name, dpi=300)
    # Release the figure so the loop does not accumulate open figures.
    plt.close(fig)
    print(f"✅ Visualisasi berhasil disimpan sebagai '{file_name}'")

⏳ Memulai pembuatan beberapa file visualisasi...
✅ Visualisasi berhasil disimpan sebagai 'visualisasi_total_sales_vs_num_products_sold.png'
✅ Visualisasi berhasil disimpan sebagai 'visualisasi_max_single_product_vs_gini_coefficient.png'
✅ Visualisasi berhasil disimpan sebagai 'visualisasi_week_on_week_change_vs_spike_from_mean.png'
✅ Visualisasi berhasil disimpan sebagai 'visualisasi_mean_sales_vs_std_sales.png'


In [13]:
# Write the detected anomalies (only when there are any) and the full weekly
# feature table to CSV. Any failure is reported instead of raised, and the
# first failure stops the remaining exports.
try:
    exports = []
    if not all_anomalies.empty:
        exports.append((
            all_anomalies,
            '../dataset/hasil_deteksi_anomali.csv',
            "Hasil deteksi anomali berhasil disimpan sebagai 'hasil_deteksi_anomali.csv'",
        ))
    exports.append((
        weekly_features,
        '../dataset/hasil_feature_engineering.csv',
        "Hasil feature engineering berhasil disimpan sebagai 'hasil_feature_engineering.csv'",
    ))

    for frame, target_path, success_message in exports:
        frame.to_csv(target_path)
        print(success_message)

except Exception as save_error:
    print(f"Gagal menyimpan file CSV: {save_error}")

print("\nProses Selesai.")

Hasil deteksi anomali berhasil disimpan sebagai 'hasil_deteksi_anomali.csv'
Hasil feature engineering berhasil disimpan sebagai 'hasil_feature_engineering.csv'

Proses Selesai.


In [11]:
# joblib is used here to serialise the fitted estimator to disk.
import joblib

# Persist the IsolationForest held in `model` to a .joblib file under
# ../models; the call's return value is echoed as the cell output.
joblib.dump(model, '../models/isolation_forest_model.joblib')

['../models/isolation_forest_model.joblib']