In [2]:
# Import libraries
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers, losses

# Function to preprocess the data
def preprocess_data(data):
    """
    Preprocesses the input data by extracting date-based features and normalizing values using MinMaxScaler.

    Parameters:
        data (pd.DataFrame): Input dataframe with 'date' and 'amount' columns.

    Returns:
        np.ndarray: Scaled feature matrix.
        pd.DataFrame: Original data with extracted features.
        MinMaxScaler: Fitted scaler for potential reuse.
    """
    # Convert 'date' to datetime and extract features
    data['date'] = pd.to_datetime(data['date'])
    data['day_of_week'] = data['date'].dt.dayofweek  # Day of the week (0=Monday, 6=Sunday)
    data['month'] = data['date'].dt.month           # Month of the year (1=January, 12=December)
    data['day_of_month'] = data['date'].dt.day      # Day of the month (1-31)
    data['year'] = data['date'].dt.year             # Year
    data['day_of_year'] = data['date'].dt.dayofyear # Day of the year (1-365 or 366)

    # Select relevant features
    features = ['amount', 'day_of_week', 'month', 'day_of_month', 'year', 'day_of_year']
    data = data[features]

    # Normalize the data using MinMaxScaler
    scaler = MinMaxScaler()
    data_scaled = scaler.fit_transform(data)

    return data_scaled, data, scaler


# Function to build the autoencoder model
def build_autoencoder(input_dim):
    """
    Builds and compiles an autoencoder model.

    Parameters:
        input_dim (int): Number of input features.

    Returns:
        tf.keras.Model: Compiled autoencoder model.
    """
    autoencoder = models.Sequential([
        layers.Input(shape=(input_dim,)),       # Input layer
        layers.Dense(32, activation='relu'),    # Encoder
        layers.Dense(16, activation='relu'),    # Encoder
        layers.Dense(8, activation='relu'),     # Bottleneck
        layers.Dense(16, activation='relu'),    # Decoder
        layers.Dense(32, activation='relu'),    # Decoder
        layers.Dense(input_dim, activation='sigmoid')  # Output layer
    ])
    autoencoder.compile(optimizer=optimizers.Adam(), loss=losses.MeanSquaredError())
    return autoencoder

# Function to detect anomalies
def detect_anomalies(autoencoder, data_scaled, original_data, scaler, percentile=99):
    """
    Detects anomalies using the trained autoencoder.

    Parameters:
        autoencoder (tf.keras.Model): Trained autoencoder model.
        data_scaled (np.ndarray): Normalized input data.
        original_data (pd.DataFrame): Original data with features.
        scaler (MinMaxScaler): Scaler used for normalization.
        percentile (float): Percentile threshold for anomaly detection.

    Returns:
        pd.DataFrame: Data with anomaly status and reconstruction errors.
        pd.DataFrame: Detected anomalies.
    """
    # Reconstruct the input data
    reconstructed = autoencoder.predict(data_scaled)
    mse = np.mean(np.power(data_scaled - reconstructed, 2), axis=1)  # Reconstruction error

    # Set anomaly threshold
    threshold = np.percentile(mse, percentile)

    # Flag anomalies
    original_data['reconstruction_error'] = mse
    original_data['is_anomaly'] = mse > threshold

    # Reconstruct 'date' column
    original_data['date'] = pd.to_datetime(
        original_data['year'] * 1000 + original_data['day_of_year'],
        format='%Y%j'
    )

    # Filter anomalies
    anomalies = original_data[original_data['is_anomaly']]

    return original_data, anomalies

# Function to plot the data
def plot_results(processed_data, anomalies):
    """
    Plots the data, highlighting anomalies in red and normal data in blue, with 'date' on the x-axis and 'amount' on the y-axis.

    Parameters:
        processed_data (pd.DataFrame): Dataframe with anomaly status.
        anomalies (pd.DataFrame): Detected anomalies.
    """
    # Reconstruct a synthetic 'date' from year and day_of_year
    processed_data['date'] = pd.to_datetime(processed_data['year'] * 1000 + processed_data['day_of_year'], format='%Y%j')
    anomalies['date'] = pd.to_datetime(anomalies['year'] * 1000 + anomalies['day_of_year'], format='%Y%j')

    plt.figure(figsize=(12, 6))

    # Plot normal data
    normal_data = processed_data[~processed_data['is_anomaly']]
    plt.scatter(normal_data['date'], normal_data['amount'], color='blue', label='Normal', alpha=0.6)

    # Plot anomalies
    plt.scatter(anomalies['date'], anomalies['amount'], color='red', label='Anomaly', alpha=0.8)

    # Add labels and legend
    plt.title('Anomaly Detection in Financial Transactions')
    plt.xlabel('Date')
    plt.ylabel('Amount')
    plt.legend()
    plt.grid(True)
    plt.show()

# Main function to run the pipeline
def main_pipeline(data, epochs=50, batch_size=16, percentile=99):
    """
    Main pipeline for anomaly detection using autoencoder.

    Parameters:
        data (pd.DataFrame): Input dataframe with 'date' and 'amount' columns.
        epochs (int): Number of training epochs.
        batch_size (int): Batch size for training.
        percentile (float): Percentile threshold for anomaly detection.

    Returns:
        tf.keras.Model: Trained autoencoder model.
        pd.DataFrame: Processed data with anomaly status.
        pd.DataFrame: Detected anomalies.
        MinMaxScaler: Scaler used for data normalization.
        float: Threshold for anomaly detection based on reconstruction error.
    """
    # Step 1: Preprocess the data
    data_scaled, processed_data, scaler = preprocess_data(data)

    # Step 2: Build the autoencoder
    input_dim = data_scaled.shape[1]
    autoencoder = build_autoencoder(input_dim)

    # Step 3: Train the autoencoder
    history = autoencoder.fit(data_scaled, data_scaled, epochs=epochs, batch_size=batch_size, shuffle=True, verbose=1)

    # Step 4: Detect anomalies
    processed_data, anomalies = detect_anomalies(autoencoder, data_scaled, processed_data, scaler, percentile=percentile)

    # Step 5: Financial metrics for cashflow analysis
    total_income = data[data['amount'] > 0]['amount'].sum()
    total_expense = data[data['amount'] > 0]['amount'].sum()
    advice_list = [
        "Reduce unnecessary spending",
        "Plan your spendings wisely"
    ]
    anomalies['description'] = "Unusual spending detected"

    # Step 6: Generate JSON report
    json_report = generate_json(processed_data, anomalies, total_income, total_expense, advice_list)
    with open("result_data.json", "w") as json_file:
        json_file.write(json_report)

    # Step 7: Plot results
    plot_results(processed_data, anomalies)

    return autoencoder, processed_data, anomalies, scaler

def predict_pipeline(model, data, scaler, percentile):
    """
    Pipeline to preprocess data, predict anomalies, and plot results using a trained model.

    Parameters:
        model (tf.keras.Model): Trained autoencoder model.
        data (pd.DataFrame): New data with the same structure as training data.
        scaler (MinMaxScaler): Scaler used to preprocess the training data.
        percentile (float): Percentile for anomaly detection based on reconstruction error.

    Returns:
        pd.DataFrame: New data with anomaly status and reconstruction errors.
        pd.DataFrame: Detected anomalies.
    """
    # Step 1: Preprocess new data
    # Extract date-based features if not already extracted
    if 'day_of_week' not in data.columns:
        data['date'] = pd.to_datetime(data['date'])
        data['day_of_week'] = data['date'].dt.dayofweek  # Day of the week (0=Monday, 6=Sunday)
        data['month'] = data['date'].dt.month           # Month of the year (1=January, 12=December)
        data['day_of_month'] = data['date'].dt.day      # Day of the month (1-31)
        data['year'] = data['date'].dt.year             # Year
        data['day_of_year'] = data['date'].dt.dayofyear # Day of the year (1-365 or 366)

    # Select relevant features
    features = ['amount', 'day_of_week', 'month', 'day_of_month', 'year', 'day_of_year']
    data = data[features]

    # Normalize the data using the scaler from training
    data_scaled = scaler.transform(data)

    # Step 2: Use model to reconstruct data
    reconstructed = model.predict(data_scaled)
    mse = np.mean(np.power(data_scaled - reconstructed, 2), axis=1)

    # Step 3: Detect anomalies
    # Set anomaly threshold
    threshold = np.percentile(mse, percentile)

    # Flag the anomalies
    data['reconstruction_error'] = mse
    data['is_anomaly'] = mse > threshold

    # Step 4: Filter anomalies
    anomalies = data[data['is_anomaly']]

    # Step 5: Plot results
    plot_results(data, anomalies)

    return data, anomalies

def training_pipeline(model, data, scaler, epochs=50, batch_size=16):
    """
    Simplified pipeline to preprocess data and retrain an existing autoencoder model.

    Parameters:
        model (tf.keras.Model): Pre-trained autoencoder model.
        data (pd.DataFrame): Input data with 'date' and 'amount' columns.
        scaler (MinMaxScaler): Scaler used to preprocess the data during initial training.
        epochs (int): Number of epochs for retraining.
        batch_size (int): Batch size for training.

    Returns:
        tf.keras.Model: Retrained autoencoder model.
    """
    # Step 1: Preprocess the data using the existing scaler
    if 'day_of_week' not in data.columns:
        data['date'] = pd.to_datetime(data['date'])
        data['day_of_week'] = data['date'].dt.dayofweek  # Day of the week (0=Monday, 6=Sunday)
        data['month'] = data['date'].dt.month           # Month of the year (1=January, 12=December)
        data['day_of_month'] = data['date'].dt.day      # Day of the month (1-31)
        data['year'] = data['date'].dt.year             # Year
        data['day_of_year'] = data['date'].dt.dayofyear # Day of the year (1-365 or 366)
        data = data[['amount', 'day_of_week', 'month', 'day_of_month', 'year', 'day_of_year']]

    data_scaled = scaler.transform(data)

    # Step 2: Retrain the autoencoder
    model.fit(data_scaled, data_scaled, epochs=epochs, batch_size=batch_size, shuffle=True, verbose=1)

    return model

def generate_json(processed_data, anomalies, total_income, total_expense, advice_list):
    """
    Generates a JSON report from the processed data, detected anomalies, and financial insights.

    Parameters:
        processed_data (pd.DataFrame): Dataframe with processed financial data.
        anomalies (pd.DataFrame): Dataframe containing detected anomalies.
        total_income (float): Total income in the dataset.
        total_expense (float): Total expense in the dataset.
        advice_list (list): List of financial advice as strings.

    Returns:
        str: JSON-formatted report as a string.
    """

    net_cashflow = total_income - total_expense

    anomaly_list = anomalies[['description', 'date', 'amount']].to_dict(orient='records')

    report = {
        "cashflow_analysis": {
            "total_income": total_income,
            "total_expense": total_expense,
            "net_cashflow": net_cashflow
        },
        "financial_advice": advice_list,
        "anomaly_detection": anomaly_list
    }

    return json.dumps(report, indent=2)

# Example usage
if __name__ == "__main__":
    # Example dataset
    data = pd.read_json('pengeluaran.json')

    # Run the pipeline
    autoencoder, processed_data, anomalies, scaler = main_pipeline(data, epochs=50, batch_size=4, percentile=99)

    # Display results
    print("Processed Data with Anomaly Status:")
    print(processed_data)

    print("\nDetected Anomalies:")
    print(anomalies)


Epoch 1/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.1146
Epoch 2/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.0605
Epoch 3/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0351
Epoch 4/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0183
Epoch 5/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0154
Epoch 6/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0047
Epoch 7/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0036
Epoch 8/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0027
Epoch 9/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0026
Epoch 10/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - lo

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anomalies['description'] = "Unusual spending detected"


KeyError: "['date'] not in index"

## Checking New Data

In [None]:
bri_data = pd.read_json('bri_pengeluaran.json')

predict_pipeline(model=autoencoder, data=bri_data, scaler=scaler, percentile=99)

In [None]:
pf_data = pd.read_json('pf_pengeluaran.json')

predict_pipeline(model=autoencoder, data=pf_data, scaler=scaler, percentile=99)

## Save Model

In [None]:
import pickle

# Simpan model dalam format HDF5
# Menyimpan model ke file
autoencoder.save('autoencoder_model.h5')  # Format HDF5
# atau menggunakan format SavedModel
autoencoder.export('autoencoder_model/')


# Simpan scaler menggunakan pickle
scaler_pickle_path = "scaler.pkl"
with open(scaler_pickle_path, 'wb') as f:
    pickle.dump(scaler, f)

print(f"Scaler saved to: {scaler_pickle_path}")
