In [1]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import pickle

#1. Memuat dan Membersihkan Data

In [2]:
df = pd.read_csv('cleaned_data.csv')
df.head()

Unnamed: 0,date,title,category,account,amount,currency,type,transfer-amount,transfer-currency,to-account,receive-amount,receive-currency,description,due-date,id,year,month
0,2024-08-11 13:56:59.652,Karthik,Bills & Fees,Savings Bank,45.0,INR,EXPENSE,,,,,,,,74e78631-db14-4495-bfb9-85546b0bd2fe,2024,August
1,2024-08-10 16:09:55.986,Juice,Food & Drinks,Cash,40.0,INR,EXPENSE,,,,,,,,65e12e62-9f63-4c6c-b452-6c7b42fbfb7f,2024,August
2,2024-08-09 10:25:21.618,Tire,Transport,Cash,10.0,INR,EXPENSE,,,,,,,,9ecd93bd-a835-4263-86e2-99fea475fa37,2024,August
3,2024-08-07 03:57:24.944,Baba,Bills & Fees,Savings Bank,200.0,INR,EXPENSE,,,,,,,,00d39b2c-e722-485a-85ca-28f6506dc674,2024,August
4,2024-08-04 13:09:08.452,Reward,Bills & Fees,Salary Bank,4.0,INR,INCOME,,,,,,,,3861d205-3245-4926-ad69-4491b0bff547,2024,August


In [3]:
df['date'] = pd.to_datetime(df['date'], errors='coerce')

In [4]:
initial_rows = len(df)
df.dropna(subset=['date', 'category', 'account', 'type', 'amount'], inplace=True)
df['title'].fillna('Unknown', inplace=True)
print(f"Membersihkan data: {initial_rows - len(df)} baris dihapus karena data tidak lengkap.")

Membersihkan data: 0 baris dihapus karena data tidak lengkap.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['title'].fillna('Unknown', inplace=True)


#2. Feature Engineering

In [5]:
df['hour'] = df['date'].dt.hour
df['day_of_week'] = df['date'].dt.dayofweek  # Senin=0, Minggu=6
df['day_of_month'] = df['date'].dt.day

In [6]:
df['daily_account_frequency'] = df.groupby([df['date'].dt.date, 'account'])['date'].transform('count')
print("Fitur baru (jam, hari, frekuensi) telah dibuat.")

Fitur baru (jam, hari, frekuensi) telah dibuat.


#3. Pra-pemrosesan Data

In [7]:
numerical_features = ['amount', 'hour', 'day_of_week', 'day_of_month', 'daily_account_frequency']
categorical_features = ['category', 'account', 'type']

print(f"Fitur numerik yang digunakan: {numerical_features}")
print(f"Fitur kategorikal yang digunakan: {categorical_features}")

Fitur numerik yang digunakan: ['amount', 'hour', 'day_of_week', 'day_of_month', 'daily_account_frequency']
Fitur kategorikal yang digunakan: ['category', 'account', 'type']


In [14]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='drop'
)

#4. Pelatihan Model Deteksi Anomali

**a. Isolation Forest**

In [9]:
print("\n--- Melatih Model Isolation Forest ---")
iso_forest_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', IsolationForest(contamination='auto', random_state=42))
])
iso_forest_pipeline.fit(df)

df['iso_forest_anomaly'] = iso_forest_pipeline.predict(df)
print("Model Isolation Forest selesai dilatih.")


--- Melatih Model Isolation Forest ---
Model Isolation Forest selesai dilatih.


**b. One-Class SVM (Sebagai Alternatif)**

In [13]:
print("\n--- Melatih Model One-Class SVM ---")
svm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', OneClassSVM(nu=0.05, kernel='rbf', gamma='auto'))
])
svm_pipeline.fit(df)

df['svm_anomaly'] = svm_pipeline.predict(df)
print("Model One-Class SVM selesai dilatih.")


--- Melatih Model One-Class SVM ---
Model One-Class SVM selesai dilatih.


#5. Analisis Hasil

In [11]:
print("\n--- Ringkasan Hasil Deteksi Anomali ---")

iso_anomalies_count = (df['iso_forest_anomaly'] == -1).sum()
svm_anomalies_count = (df['svm_anomaly'] == -1).sum()
print(f"Isolation Forest mendeteksi: {iso_anomalies_count} anomali.")
print(f"One-Class SVM mendeteksi: {svm_anomalies_count} anomali.")

print("\n--- Contoh Anomali Terdeteksi oleh Isolation Forest (diurutkan berdasarkan jumlah) ---")
iso_anomalies_df = df[df['iso_forest_anomaly'] == -1].sort_values(by='amount', ascending=False)
print(iso_anomalies_df[['date', 'title', 'category', 'amount', 'type', 'account']].head())

print("\n--- Contoh Anomali Terdeteksi oleh One-Class SVM (diurutkan berdasarkan jumlah) ---")
svm_anomalies_df = df[df['svm_anomaly'] == -1].sort_values(by='amount', ascending=False)
print(svm_anomalies_df[['date', 'title', 'category', 'amount', 'type', 'account']].head())


--- Ringkasan Hasil Deteksi Anomali ---
Isolation Forest mendeteksi: 389 anomali.
One-Class SVM mendeteksi: 47 anomali.

--- Contoh Anomali Terdeteksi oleh Isolation Forest (diurutkan berdasarkan jumlah) ---
                       date          title      category  amount     type  \
436 2024-02-08 05:00:00.000          Shirt  Bills & Fees   999.0  EXPENSE   
433 2024-02-09 13:17:00.000          Pants  Bills & Fees   998.0  EXPENSE   
444 2024-02-03 15:15:18.470  Baba birthday  Bills & Fees   839.0  EXPENSE   
839 2023-06-07 15:19:49.292      Exam fees  Bills & Fees   800.0  EXPENSE   
618 2023-11-25 14:23:29.453      Exam fees  Bills & Fees   800.0  EXPENSE   

          account  
436  Savings Bank  
433  Savings Bank  
444  Savings Bank  
839  Savings Bank  
618  Savings Bank  

--- Contoh Anomali Terdeteksi oleh One-Class SVM (diurutkan berdasarkan jumlah) ---
                       date          title      category  amount     type  \
436 2024-02-08 05:00:00.000          Shirt  Bi

In [12]:
#Menyimpan model yang sudah dilatih ke file .pkl
output_model_iso = 'isolation_forest_model.pkl'
with open(output_model_iso, 'wb') as file:
    pickle.dump(iso_forest_pipeline, file)
print(f"Model Isolation Forest telah disimpan ke file '{output_model_iso}'")

output_model_svm = 'one_class_svm_model.pkl'
with open(output_model_svm, 'wb') as file:
    pickle.dump(svm_pipeline, file)
print(f"Model One-Class SVM telah disimpan ke file '{output_model_svm}'")

print("\nProses selesai.")

Model Isolation Forest telah disimpan ke file 'isolation_forest_model.pkl'
Model One-Class SVM telah disimpan ke file 'one_class_svm_model.pkl'

Proses selesai.
