# Data Preprocessing untuk Project SafeFood

Import library yang dibutuhkan:

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Confirmation message: reaching this line means the imports above succeeded.
print("Library berhasil di import")

Library berhasil di import


Load dataset:

In [None]:
# Read the raw donor/recipient matching dataset; the path is relative to the
# notebook's working directory.
raw_csv_path = "../data/raw/data_donor_recipient_matching.csv"
data = pd.read_csv(raw_csv_path)

print("Data berhasil di load")

Data berhasil di load


## 1. Pra-Pemrosesan Data

### a. One-Hot Encoding

In [3]:
# Categorical columns that get_dummies replaces with one indicator column per
# distinct value; all remaining columns of `data` are carried over unchanged.
kolom_kategori = [
    'makanan_disumbangkan', 'kondisi_makanan',
    'makanan_dibutuhkan', 'kondisi_makanan_diterima',
    'status_penerima',
]

data_encoded = pd.get_dummies(data=data, columns=kolom_kategori)

print("One-Hot Encoding selesai.")

One-Hot Encoding selesai.


### b. Perhitungan Jarak dan Normalisasi

In [4]:
def haversine(lat1, lon1, lat2, lon2, radius=6371):
    """Return the great-circle distance between two points (haversine formula).

    All four coordinates are given in degrees and may be scalars or
    array-likes (NumPy arrays, pandas Series); array inputs are processed
    element-wise.

    Args:
        lat1: Latitude of the first point, in degrees.
        lon1: Longitude of the first point, in degrees.
        lat2: Latitude of the second point, in degrees.
        lon2: Longitude of the second point, in degrees.
        radius: Radius of the sphere. Defaults to 6371, the Earth's mean
            radius in kilometres.

    Returns:
        The distance along the sphere's surface, expressed in the same unit
        as ``radius``.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt/arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return c * radius

# The four coordinate columns, listed in the positional order haversine()
# expects (lat1, lon1, lat2, lon2). They are removed once the distance is derived.
columns_to_drop = ['lokasi_lat_penyumbang', 'lokasi_lon_penyumbang', 'lokasi_lat_penerima', 'lokasi_lon_penerima']

# Distance between the donor ("penyumbang") and recipient ("penerima") locations.
data_encoded['jarak'] = haversine(*(data_encoded[name] for name in columns_to_drop))

data_encoded.drop(columns=columns_to_drop, inplace=True)

print("Kolom latitude dan longitude dihapus.")

# Numeric columns that are rescaled by min_max_scaling further down.
columns_to_normalize = ['jumlah_disumbangkan', 'jumlah_dibutuhkan', 'frekuensi_menerima', 'jarak']

def min_max_scaling(df, columns):
    """Rescale the given columns of ``df`` to the [0, 1] range (min-max scaling).

    The frame is modified in place and also returned, so existing callers
    that use either the argument or the return value keep working.

    Args:
        df: DataFrame holding the columns to rescale.
        columns: Iterable of column names to rescale.

    Returns:
        A tuple ``(df, min_max_values)`` where ``min_max_values`` maps each
        column name to ``{'min': ..., 'max': ...}`` as observed before
        scaling, so the same transform can be re-applied or inverted later.
    """
    min_max_values = {}
    for col in columns:
        col_min = df[col].min()
        col_max = df[col].max()
        min_max_values[col] = {'min': col_min, 'max': col_max}

        col_range = col_max - col_min
        if col_range == 0:
            # Constant column: dividing by a zero range would turn every value
            # into NaN. Map it to 0.0 instead; subtracting keeps existing NaNs.
            df[col] = (df[col] - col_min).astype(float)
        else:
            df[col] = (df[col] - col_min) / col_range
    return df, min_max_values

# Rescale the numeric feature columns and keep the per-column min/max that
# min_max_scaling returns alongside the frame.
# NOTE(review): the min/max are taken over the full dataset, before the
# train/validation/test split performed later in this notebook, so the
# held-out rows influence the scaling - consider fitting on the training
# split only.
df_normalized, min_max_dict = min_max_scaling(
    df=data_encoded,
    columns=columns_to_normalize,
)

print("Data selesai dinormalisasi")

Kolom latitude dan longitude dihapus.
Data selesai dinormalisasi


### c. Ubah nilai Boolean ke 0/1

In [5]:
# Columns cast to integers below.
# NOTE(review): two names contain a space ('..._tidak_layak konsumsi' and
# '..._tidak mendesak'); presumably they mirror the raw category values -
# confirm against the dataset before "fixing" them.
boolean_columns = [
    # Flag columns carried over from the raw dataset.
    'is_halal_donor',
    'is_for_child_donor',
    'is_for_elderly_donor',
    'is_alergan',
    'is_halal_receiver',
    'is_for_child_receiver',
    'is_for_elderly_receiver',
    'is_alergan_free',
    # Indicator columns produced by the one-hot encoding step.
    'makanan_disumbangkan_makanan',
    'makanan_disumbangkan_makanan_minuman',
    'makanan_disumbangkan_minuman',
    'kondisi_makanan_hampir_kadaluarsa',
    'kondisi_makanan_layak_konsumsi',
    'kondisi_makanan_tidak_layak_konsumsi',
    'makanan_dibutuhkan_makanan',
    'makanan_dibutuhkan_makanan_minuman',
    'makanan_dibutuhkan_minuman',
    'kondisi_makanan_diterima_hampir_kadaluarsa',
    'kondisi_makanan_diterima_layak_konsumsi',
    'kondisi_makanan_diterima_layak_konsumsi_hampir_kadaluarsa',
    'kondisi_makanan_diterima_tidak_layak konsumsi',
    'status_penerima_mendesak',
    'status_penerima_normal',
    'status_penerima_tidak mendesak',
]

# Same object as df_normalized (no copy is made), so the cast below is
# visible through both names.
df_for_model = df_normalized

# Cast to int so the flags are stored as integers (True/False become 1/0).
df_for_model[boolean_columns] = df_for_model[boolean_columns].astype(int)

df_for_model.to_csv("../data/processed/data_for_model.csv", index=False)

print("Proses preprocessing selesai.")
print("Data berhasil disimpan ke file CSV.")


Proses preprocessing selesai.
Data berhasil disimpan ke file CSV.


## 2. Pemisahan Data

**Pemisahan data berdasarkan persentase:**

- Training Set: 80% x 10,000 = 8,000 data
- Validation Set: 10% x 10,000 = 1,000 data
- Test Set: 10% x 10,000 = 1,000 data

Training set digunakan untuk melatih model.

Validation set digunakan untuk memantau kinerja model selama pelatihan dan mencegah overfitting.

Test set digunakan untuk mengukur performa akhir model setelah pelatihan selesai.


In [6]:
# First split: 80% training, 20% held out. Second split: the held-out part is
# halved into validation and test. A fixed random_state keeps both splits
# reproducible.
train_set, temp_set = train_test_split(df_for_model, test_size=0.2, random_state=42)
validation_set, test_set = train_test_split(temp_set, test_size=0.5, random_state=42)

print(f"Training Set: {train_set.shape[0]} data")
print(f"Validation Set: {validation_set.shape[0]} data")
print(f"Test Set: {test_set.shape[0]} data")

# Destination file for each subset, written in training/validation/test order.
output_files = {
    '../data/processed/training_set.csv': train_set,
    '../data/processed/validation_set.csv': validation_set,
    '../data/processed/test_set.csv': test_set,
}
for csv_path, subset in output_files.items():
    subset.to_csv(csv_path, index=False)

print("Dataset telah disimpan ke file CSV:")
print("- training_set.csv", "- validation_set.csv", "- test_set.csv", sep="\n")

print("Proses pemisahan dataset selesai.")

Training Set: 8000 data
Validation Set: 1000 data
Test Set: 1000 data
Dataset telah disimpan ke file CSV:
- training_set.csv
- validation_set.csv
- test_set.csv
Proses pemisahan dataset selesai.


In [7]:
# Final status message marking the end of the data-preparation notebook.
print("Proses persiapan data selesai.")

Proses persiapan data selesai.
