In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings

# Install a catch-all "ignore" filter so no warning of any category is shown
# in the cells below, then confirm that the import cell ran.
warnings.simplefilter('ignore')
print("✅ Libraries berhasil diimport.")

✅ Libraries berhasil diimport.


PREPROCESSING

In [None]:
# Load the wide table (one column per week range) and reshape it to long
# format: one row per (product, week) with the review count in `reviews`.
try:
    df_raw = pd.read_csv('../dataset/data_isolation_forest.csv', sep=';')
except FileNotFoundError:
    print("❌ Error: File '../dataset/data_isolation_forest.csv' tidak ditemukan.")
    # Re-raise instead of calling exit(): inside a Jupyter kernel exit() ends
    # the session, whereas the exception stops "Run All" with a traceback and
    # leaves the kernel usable.
    raise

# Every column that is not an identifier is treated as a weekly review count.
id_vars = ['text', 'jenis']
week_cols = [col for col in df_raw.columns if col not in id_vars]
df_long = pd.melt(df_raw, id_vars=id_vars, value_vars=week_cols, var_name='week_range', value_name='reviews')

# Week headers are parsed as "dd/mm/YYYY - dd/mm/YYYY"; the first date is the week start.
df_long['week_start'] = pd.to_datetime(df_long['week_range'].str.split(' - ').str[0], format='%d/%m/%Y')
# Optional export of the long-format table (disabled). The Z-score contributor
# analysis later in this notebook may read this path, so enable it if that
# file does not exist yet.
# df_long.to_csv('../dataset/transformasi_data_if.csv', index=False, sep=';')

print(f"✅ Data berhasil dimuat dan diproses.")
print(f"   Shape data setelah diproses: {df_long.shape}")
print(f"   Rentang waktu data: {df_long['week_start'].min().date()} hingga {df_long['week_start'].max().date()}")

✅ Data berhasil dimuat dan diproses.
   Shape data setelah diproses: (1029912, 5)
   Rentang waktu data: 2020-01-06 hingga 2022-12-26


FEATURE ENGINEERING

In [None]:
print("Memulai proses feature engineering...")

# Group the long-format rows by week start; this grouper is also what the
# Gini feature is computed from further down.
weekly_data = df_long.groupby('week_start')

# Named aggregations over the per-product review counts of each week.
weekly_aggregations = {
    'total_reviews': ('reviews', 'sum'),
    'num_products_reviewed': ('reviews', lambda counts: counts.gt(0).sum()),
    'max_single_product_reviews': ('reviews', 'max'),
    'mean_reviews': ('reviews', 'mean'),
    'std_reviews': ('reviews', 'std'),
}

# Missing aggregates (for example an undefined standard deviation) become 0.
weekly_features = weekly_data.agg(**weekly_aggregations).fillna(0)

# Gini coefficient feature
def gini(arr):
    """Return the Gini coefficient of a 1-D collection of non-negative values.

    Parameters
    ----------
    arr : array-like
        Values to measure (for example one week's per-product review counts).
        NaN entries are ignored, in line with the NaN-skipping pandas
        aggregations used for the other weekly features.

    Returns
    -------
    float
        0.0 for an empty input, an all-NaN input or a zero total; otherwise
        sum((2*i - n - 1) * x_i) / (n * sum(x)) over the ascending-sorted
        values x_1..x_n.
    """
    values = np.asarray(arr, dtype=float)
    # Drop NaN first: a single NaN would otherwise turn the whole result into NaN.
    values = np.sort(values[~np.isnan(values)])
    n = values.shape[0]
    total = values.sum()
    if n == 0 or total == 0:
        return 0.0
    rank = np.arange(1, n + 1)
    return float(np.sum((2 * rank - n - 1) * values) / (n * total))

# Concentration of reviews across products within each week.
weekly_features['gini_coefficient'] = weekly_data['reviews'].apply(gini)

# Week-on-week change of the weekly total, in percent. The first week has no
# predecessor and gets 0; infinite values anywhere in the frame are zeroed.
weekly_features['week_on_week_change'] = (
    weekly_features['total_reviews'].pct_change().fillna(0) * 100  # %
)
weekly_features = weekly_features.replace([np.inf, -np.inf], 0)

# Relative deviation of the weekly total from its 4-week rolling mean
# (the window includes the current week), in percent.
rolling_mean = weekly_features['total_reviews'].rolling(window=4, min_periods=1).mean()
deviation_from_mean = weekly_features['total_reviews'] - rolling_mean
weekly_features['spike_from_mean'] = deviation_from_mean / rolling_mean
weekly_features = weekly_features.replace([np.inf, -np.inf], 0)
weekly_features['spike_from_mean'] = weekly_features['spike_from_mean'].fillna(0) * 100

print("Feature engineering selesai.")
print(f"Jumlah fitur yang dibuat: {len(weekly_features.columns)}")

# Persist the engineered weekly features (index = week_start).
weekly_features.to_csv('../dataset/feature_engineering_noduplikat.csv', sep=';')


Memulai proses feature engineering...
Feature engineering selesai.
Jumlah fitur yang dibuat: 8


In [4]:
# Standardise the engineered features.
# The detection cell later adds 'anomaly_flag' and 'anomaly_score' to
# weekly_features. Exclude them here so that re-running this cell after a
# detection run never feeds model outputs back in as model inputs.
feature_columns = [
    col for col in weekly_features.columns
    if col not in ('anomaly_flag', 'anomaly_score')
]

scaler = StandardScaler()
features_scaled = scaler.fit_transform(weekly_features[feature_columns])

# Keep a labelled copy of the scaled matrix (same index and feature names) and export it.
features_scaled_df = pd.DataFrame(features_scaled, columns=feature_columns, index=weekly_features.index)
features_scaled_df.to_csv('../dataset/preprocessed_if_data.csv', sep=';')

ANOMALY DETECTION

In [5]:
print("Memulai deteksi anomali untuk seluruh data...")

# Fit an Isolation Forest on the standardised weekly features.
model = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)
model.fit(features_scaled)

# Attach the model output to the weekly table: predict() gives -1 for weeks
# flagged as anomalous, decision_function() gives the score (lower = more anomalous).
weekly_features['anomaly_flag'] = model.predict(features_scaled)
weekly_features['anomaly_score'] = model.decision_function(features_scaled)

# Summary of everything the model flagged, before the spike filter.
is_flagged = weekly_features['anomaly_flag'] == -1
anomalies_before_filter = weekly_features[is_flagged]
count_before_filter = len(anomalies_before_filter)
flagged_scores = anomalies_before_filter['anomaly_score']
min_score = flagged_scores.min()
max_score = flagged_scores.max()

print(f"Deteksi awal: Ditemukan {count_before_filter} total anomali (sebelum filtering).")
print(f"Rentang skor anomali mentah: {min_score:.6f} (paling anomali) hingga {max_score:.6f}")

# Keep only upward spikes: flagged weeks that are above the rolling mean
# and also higher than the previous week.
above_rolling_mean = weekly_features['spike_from_mean'] > 0
rising_week = weekly_features['week_on_week_change'] > 0
all_anomalies = weekly_features[is_flagged & above_rolling_mean & rising_week].copy()

print(f"Analisis selesai: Ditemukan {len(all_anomalies)} anomali lonjakan ulasan di seluruh data.")

Memulai deteksi anomali untuk seluruh data...
Deteksi awal: Ditemukan 16 total anomali (sebelum filtering).
Rentang skor anomali mentah: -0.126158 (paling anomali) hingga -0.000270
Analisis selesai: Ditemukan 7 anomali lonjakan ulasan di seluruh data.


In [6]:
print("\nDetail Anomali Lonjakan")

# List the spike anomalies chronologically. The week is read from the index
# unless a 'week_start' column is present.
if 'week_start' not in all_anomalies.columns:
    print("Menampilkan anomali (index adalah tanggal):")
    scores_by_week = all_anomalies[['anomaly_score']]
    print(scores_by_week.sort_index())
else:
    scores_by_week = all_anomalies[['week_start', 'anomaly_score']]
    print(scores_by_week.sort_values(by='week_start'))


Detail Anomali Lonjakan
Menampilkan anomali (index adalah tanggal):
            anomaly_score
week_start               
2020-01-13      -0.101185
2020-03-30      -0.012202
2021-06-21      -0.039815
2021-06-28      -0.126158
2021-07-05      -0.092367
2022-02-14      -0.030135
2022-02-28      -0.092110


In [None]:
# Contributor analysis: for every spike-anomaly week, rank the base drug
# categories by the Z-score of their week-on-week change in review count.
print("Memulai analisis Z-score berdasarkan PERUBAHAN ULASAN...")

try:
    # Base categories; a product's `jenis` string may mention several of them.
    list_jenis_dasar = [
        'Herbal', 'Multivitamin', 'Analgesik', 'Antipiretik', 'Dekongestan',
        'Antiseptik', 'Antasida', 'Antidiare', 'Ekspektoran'
    ]
    print(f"✅ Menggunakan {len(list_jenis_dasar)} jenis dasar.")

    # Prefer the long-format table already built in this session: it is the
    # exact data the anomalies were detected on. The CSV export is only a
    # fallback, because nothing in this notebook is guaranteed to have written it.
    if 'df_long' in globals():
        print("✅ Menggunakan 'df_long' dari memori.")
    else:
        df_long = pd.read_csv('../dataset/transformasi_data_if.csv', sep=';', parse_dates=['week_start'])
        print("✅ Berhasil memuat 'transformasi_data_if.csv'.")

    if 'all_anomalies' not in globals() or all_anomalies.empty:
        raise ValueError("DataFrame 'all_anomalies' tidak ditemukan atau kosong.")
    anomaly_dates = all_anomalies.index
    print(f"✅ Menggunakan {len(anomaly_dates)} tanggal anomali dari DataFrame di memori.")

    print("⏳ Memproses data 'jenis'...")
    # Only rows with at least one review contribute. A row is counted once for
    # every base category whose name occurs as a substring of its `jenis` text.
    df_subset = df_long.loc[df_long['reviews'] > 0, ['week_start', 'jenis', 'reviews']]
    jenis_text = df_subset['jenis'].astype(str)
    df_exploded = pd.concat(
        [
            df_subset.loc[
                jenis_text.str.contains(jenis_dasar, regex=False, na=False),
                ['week_start', 'reviews']
            ].assign(jenis_dasar=jenis_dasar)
            for jenis_dasar in list_jenis_dasar
        ],
        ignore_index=True
    )

    df_weekly_individual = df_exploded.groupby(['week_start', 'jenis_dasar'])['reviews'].sum().reset_index()

    print("⏳ Membuat data mingguan penuh (termasuk 0 ulasan)...")
    # Build the full week x category grid so weeks without reviews count as 0.
    all_weeks = df_long['week_start'].unique()
    multi_index = pd.MultiIndex.from_product([all_weeks, list_jenis_dasar], names=['week_start', 'jenis_dasar'])
    df_weekly_individual_full = df_weekly_individual.set_index(['week_start', 'jenis_dasar']).reindex(multi_index, fill_value=0).reset_index()

    print("⏳ Menghitung perubahan ulasan (differencing) per minggu...")
    # First difference within each category. The first week has no predecessor
    # and is recorded as a change of 0, so it is part of the baseline below.
    df_weekly_individual_full = df_weekly_individual_full.sort_values(by=['jenis_dasar', 'week_start'])
    df_weekly_individual_full['reviews_change'] = df_weekly_individual_full.groupby('jenis_dasar')['reviews'].diff().fillna(0)

    print("⏳ Menghitung statistik baseline (mean, std) untuk PERUBAHAN...")
    change_stats = df_weekly_individual_full.groupby('jenis_dasar')['reviews_change'].agg(
        mean_change_jenis='mean',
        std_change_jenis='std'
    ).reset_index()
    # Guard the division below: a zero or undefined std is replaced by 1.
    change_stats['std_change_jenis'] = change_stats['std_change_jenis'].replace(0, 1).fillna(1)

    print("\nStatistik Baseline (Rata-rata & Std Dev) untuk PERUBAHAN MINGGUAN:")
    print(change_stats)

    print("⏳ Menghitung Z-score Perubahan untuk setiap minggu...")
    df_merged = pd.merge(
        df_weekly_individual_full,
        change_stats,
        on='jenis_dasar',
        how='left'
    )
    df_merged['z_score_change'] = (df_merged['reviews_change'] - df_merged['mean_change_jenis']) / df_merged['std_change_jenis']

    print("\n\n==========================================================================================")
    print("HASIL ANALISIS KONTRIBUTOR")
    print("==========================================================================================")

    for date in anomaly_dates:
        print(f"\n\nANOMALI TERDETEKSI PADA MINGGU: {date.strftime('%d %B %Y')}")
        print("------------------------------------------------------------------------------------------")
        print("B. ANALISIS KONTRIBUTOR (Jenis Obat dengan Perubahan Paling Ekstrem)")
        print("------------------------------------------------------------------------------------------")

        data_on_anomaly_week = df_merged[df_merged['week_start'] == date]

        if data_on_anomaly_week.empty:
            print("   Tidak ada data perubahan yang ditemukan untuk minggu ini.")
            continue

        # Largest positive Z-score first.
        top_lonjakan = data_on_anomaly_week.sort_values(by='z_score_change', ascending=False)

        print("   Jenis obat dengan fluktuasi paling signifikan (relatif terhadap normalnya):\n")

        for idx, sub_row in top_lonjakan.iterrows():
            jenis = sub_row['jenis_dasar']
            z = sub_row['z_score_change']
            change = sub_row['reviews_change']
            reviews_now = sub_row['reviews']
            mean_chg = sub_row['mean_change_jenis']

            # Previous week's count is recovered from the current count and the change.
            reviews_prev = reviews_now - change

            print(f"   - {jenis:<12}: Z-score Ubah = {z:<7.2f} (Ulasan: {int(reviews_now):<5} [Ubah: {int(change):+d}], Minggu Lalu: {int(reviews_prev):<5}, Rata-rata Ubah: {mean_chg:+.2f})")

    print("\n\n==========================================================================================")
    print("Analisis Z-score Perubahan selesai.")

except FileNotFoundError as e:
    print(f"\n❌ ERROR: File tidak ditemukan.")
    print(f"   Pastikan Anda telah menjalankan seluruh notebook asli sehingga file-file berikut ada:")
    print(f"   - '../dataset/transformasi_data_if.csv'")
    print(f"\n   Detail error: {e}")
except Exception as e:
    # Best-effort cell: report the problem without aborting the notebook run.
    print(f"\n❌ ERROR: Terjadi kesalahan saat eksekusi.")
    print(f"   Detail error: {e}")

Memulai analisis Z-score berdasarkan PERUBAHAN ULASAN...
✅ Menggunakan 9 jenis dasar.
✅ Berhasil memuat 'transformasi_data_if.csv'.
✅ Menggunakan 7 tanggal anomali dari DataFrame di memori.
⏳ Memproses data 'jenis'...
⏳ Membuat data mingguan penuh (termasuk 0 ulasan)...
⏳ Menghitung perubahan ulasan (differencing) per minggu...
⏳ Menghitung statistik baseline (mean, std) untuk PERUBAHAN...

Statistik Baseline (Rata-rata & Std Dev) untuk PERUBAHAN MINGGUAN:
    jenis_dasar  mean_change_jenis  std_change_jenis
0     Analgesik           0.358974         21.154409
1      Antasida           0.512821         15.435814
2     Antidiare           0.083333          5.553213
3   Antipiretik           0.358974         21.137934
4    Antiseptik           0.083333          9.883814
5   Dekongestan           0.243590         14.287993
6   Ekspektoran           0.134615         12.873121
7        Herbal           3.294872         83.429183
8  Multivitamin           4.474359        138.818677
⏳ Menghit

In [None]:
print("⏳ Memulai visualisasi...")
plt.style.use('seaborn-v0_8-whitegrid')

# One figure per feature pair: (top panel, bottom panel).
feature_groups = [
    ('total_reviews', 'num_products_reviewed'),
    ('max_single_product_reviews', 'gini_coefficient'),
    ('week_on_week_change', 'spike_from_mean'),
    ('mean_reviews', 'std_reviews')
]
# Line colours, aligned position-by-position with feature_groups.
colors = [
    ('royalblue', 'green'),
    ('purple', 'orange'),
    ('darkred', 'darkcyan'),
    ('brown', 'magenta')
]
# Y-axis labels, aligned position-by-position with feature_groups.
ylabels = [
    ('Total Ulasan (Log Scale)', 'Jumlah Produk Diulas'),
    ('Max Ulasan 1 Produk', 'Koefisien Gini'),
    ('Perubahan WoW (%)', 'Lonjakan dari Rata-rata (%)'),
    ('Rata-rata Ulasan', 'Std Dev Ulasan')
]

has_anomalies = not all_anomalies.empty

for (feature1, feature2), panel_colors, panel_ylabels in zip(feature_groups, colors, ylabels):
    fig, axes = plt.subplots(2, 1, figsize=(20, 10), sharex=True)
    fig.suptitle(f'Analisis Fitur {feature1} & {feature2}', fontsize=20, weight='bold')

    panels = zip(axes, (feature1, feature2), panel_colors, panel_ylabels)
    for position, (ax, feature, line_color, ylabel) in enumerate(panels):
        is_top_panel = position == 0

        # Full weekly series for this feature.
        ax.plot(weekly_features.index, weekly_features[feature], color=line_color, label=feature)

        # Mark the spike anomalies; only the top panel's markers get a legend entry.
        if has_anomalies:
            marker_style = {'color': 'red', 's': 100, 'marker': 'X', 'zorder': 5}
            if is_top_panel:
                marker_style['label'] = 'Anomali'
            ax.scatter(all_anomalies.index, all_anomalies[feature], **marker_style)

        ax.set_ylabel(ylabel, fontsize=12)
        if not is_top_panel:
            ax.set_xlabel('Tanggal', fontsize=12)
        ax.legend(loc='upper left')
        ax.grid(True, which='both', linestyle='--', linewidth=0.5)

        # Total reviews is drawn on a log axis, but only when it is the top panel.
        if is_top_panel and feature == 'total_reviews':
            ax.set_yscale('log')

    # Save into the working directory and release the figure.
    file_name = f'visualisasi_{feature1}_vs_{feature2}.png'
    plt.tight_layout(rect=[0, 0.03, 1, 0.96])
    plt.savefig(file_name, dpi=300)
    plt.close(fig)
    print(f"✅ Visualisasi '{file_name}' selesai dibuat.")

⏳ Memulai visualisasi...
✅ Visualisasi 'visualisasi_total_reviews_vs_num_products_reviewed.png' selesai dibuat (tidak disimpan).
✅ Visualisasi 'visualisasi_max_single_product_reviews_vs_gini_coefficient.png' selesai dibuat (tidak disimpan).
✅ Visualisasi 'visualisasi_week_on_week_change_vs_spike_from_mean.png' selesai dibuat (tidak disimpan).
✅ Visualisasi 'visualisasi_mean_reviews_vs_std_reviews.png' selesai dibuat (tidak disimpan).


In [19]:
# Export the spike anomalies (only when there are any) and the full weekly
# feature table. Any failure is reported and the cell still finishes.
try:
    has_anomaly_rows = not all_anomalies.empty
    if has_anomaly_rows:
        all_anomalies.to_csv('../dataset/hasil_deteksi_anomali.csv')
        print("Hasil deteksi anomali berhasil disimpan sebagai 'hasil_deteksi_anomali.csv'")

    weekly_features.to_csv('../dataset/hasil_feature_engineering.csv')
    print("Hasil feature engineering berhasil disimpan sebagai 'hasil_feature_engineering.csv'")
except Exception as save_error:
    print(f"Gagal menyimpan file CSV: {save_error}")

print("\nProses Selesai.")

Hasil deteksi anomali berhasil disimpan sebagai 'hasil_deteksi_anomali.csv'
Hasil feature engineering berhasil disimpan sebagai 'hasil_feature_engineering.csv'

Proses Selesai.


In [20]:
import os

import joblib

# Make sure the target directory exists before writing into it.
os.makedirs('../models', exist_ok=True)

# The model was fitted on standardised features, so the fitted scaler is saved
# next to it; without it, new raw data cannot be scored consistently.
joblib.dump(scaler, '../models/standard_scaler.joblib')

# Kept as the last expression so the cell still displays the model's path.
joblib.dump(model, '../models/isolation_forest_model.joblib')

['../models/isolation_forest_model.joblib']