<a href="https://colab.research.google.com/github/arimbawa/Big-Data/blob/main/Big_Data_Pertemuan_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [None]:
# === Simulation configuration ===
n_sensors = 5                                # number of simulated sensors
n_minutes = 120                              # minutes of data to generate (2 hours)
start_time = datetime(2025, 9, 6, 8, 0)      # timestamp of the first reading

# Sensor identifiers: sensor_1 .. sensor_<n_sensors>
sensor_ids = [f"sensor_{number}" for number in range(1, n_sensors + 1)]

# === Generate data ===
# One timestamp per minute, starting at start_time.
timestamps = [start_time + timedelta(minutes=offset) for offset in range(n_minutes)]

# Seed the legacy global RNG so the generated readings are reproducible.
np.random.seed(24453)

# One row per (timestamp, sensor) pair. The list literal is evaluated left to
# right, so the draws happen in the order normal -> uniform -> randint for
# every row, timestamp-major and sensor-minor.
data = [
    [
        moment,
        sensor,
        round(np.random.normal(30, 2), 2),    # temperature (°C)
        round(np.random.uniform(40, 70), 2),  # humidity (%)
        np.random.randint(200, 800),          # light intensity (lux), upper bound exclusive
    ]
    for moment in timestamps
    for sensor in sensor_ids
]

# === Build the DataFrame ===
column_names = ["timestamp", "sensor_id", "temperature", "humidity", "light"]
df = pd.DataFrame(data, columns=column_names)

# Show the first 10 rows.
print(df.head(10))

# Persist to CSV (optional); later cells read this file back.
df.to_csv("sensor_data.csv", index=False)
print("\nData sensor berhasil dibuat dan disimpan ke sensor_data.csv")


            timestamp sensor_id  temperature  humidity  light
0 2025-09-06 08:00:00  sensor_1        28.84     63.19    732
1 2025-09-06 08:00:00  sensor_2        32.52     66.43    799
2 2025-09-06 08:00:00  sensor_3        30.49     66.14    419
3 2025-09-06 08:00:00  sensor_4        27.65     40.94    219
4 2025-09-06 08:00:00  sensor_5        28.72     65.31    739
5 2025-09-06 08:01:00  sensor_1        34.42     47.26    789
6 2025-09-06 08:01:00  sensor_2        31.93     58.76    250
7 2025-09-06 08:01:00  sensor_3        32.57     55.04    203
8 2025-09-06 08:01:00  sensor_4        29.85     54.89    324
9 2025-09-06 08:01:00  sensor_5        28.78     58.37    304

Data sensor berhasil dibuat dan disimpan ke sensor_data.csv


In [None]:
import sqlite3

Pipeline Sederhana di Python (ETL)

In [None]:
# ETL: extract the raw CSV, aggregate it in pandas, then load only the
# aggregated result into the SQLite warehouse.

# === 1. Extract ===
df = pd.read_csv("sensor_data.csv")
# columns: timestamp, sensor_id, temperature, humidity, light

# === 2. Transform ===
# Parse the timestamp strings so time-based resampling works.
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Hourly mean of every numeric column (temperature, humidity, light), per sensor.
df_hourly = df.set_index('timestamp').groupby(['sensor_id']).resample('1h').mean().reset_index()

# === 3. Load ===
conn = sqlite3.connect("warehouse.db")
try:
    df_hourly.to_sql("sensor_hourly", conn, if_exists="replace", index=False)

    print("ETL selesai! Data masuk ke SQLite warehouse.")
    df_result = pd.read_sql("SELECT * FROM sensor_hourly", conn)
finally:
    # Always release the database handle, even if the load or read-back fails;
    # re-running the cell would otherwise leak one open connection per run.
    conn.close()

print(df_result.head())

ETL selesai! Data masuk ke SQLite warehouse.
  sensor_id            timestamp  temperature   humidity       light
0  sensor_1  2025-09-06 08:00:00    30.004667  54.517333  504.833333
1  sensor_1  2025-09-06 09:00:00    29.230000  55.197333  506.216667
2  sensor_2  2025-09-06 08:00:00    30.388167  55.244833  511.983333
3  sensor_2  2025-09-06 09:00:00    29.671500  55.550833  528.066667
4  sensor_3  2025-09-06 08:00:00    29.558833  56.734667  473.433333


Pipeline Sederhana di Python (ELT)

In [None]:
# ELT: extract the raw CSV, load it untransformed into the SQLite warehouse,
# then build the hourly aggregate inside the database with SQL.

# === 1. Extract ===
df = pd.read_csv("sensor_data.csv")

# SQL used in step 3: hourly averages per sensor. strftime truncates each
# timestamp to the start of its hour.
query = """
CREATE TABLE sensor_hourly AS
SELECT sensor_id,
       strftime('%Y-%m-%d %H:00:00', timestamp) AS hour,
       AVG(temperature) AS avg_temp,
       AVG(humidity) AS avg_hum,
       AVG(light) AS avg_light
FROM sensor_raw
GROUP BY sensor_id, hour;
"""

# === 2. Load (raw data, as-is) ===
conn = sqlite3.connect("warehouse_raw.db")
try:
    df.to_sql("sensor_raw", conn, if_exists="replace", index=False)

    # === 3. Transform (inside the warehouse, with SQL) ===
    # Drop first so the cell can be re-run without "table already exists".
    conn.execute("DROP TABLE IF EXISTS sensor_hourly;")
    conn.execute(query)
    # Commit explicitly instead of relying on implicit autocommit of DDL.
    conn.commit()
    print("ELT selesai! Data raw + transformasi ada di SQLite warehouse.")

    df_result = pd.read_sql("SELECT * FROM sensor_hourly", conn)
finally:
    # Always release the database handle, even if a step above fails.
    conn.close()

print(df_result.head())

ELT selesai! Data raw + transformasi ada di SQLite warehouse.
  sensor_id                 hour   avg_temp    avg_hum   avg_light
0  sensor_1  2025-09-06 08:00:00  30.004667  54.517333  504.833333
1  sensor_1  2025-09-06 09:00:00  29.230000  55.197333  506.216667
2  sensor_2  2025-09-06 08:00:00  30.388167  55.244833  511.983333
3  sensor_2  2025-09-06 09:00:00  29.671500  55.550833  528.066667
4  sensor_3  2025-09-06 08:00:00  29.558833  56.734667  473.433333


Pipeline Python Smart City

In [None]:
# Smart-city ELT demo: simulate per-minute traffic counts, load them raw into
# SQLite, aggregate per hour with SQL, then flag congested hours.

# === 1. Simulate big data (Extract) ===
# Random traffic-sensor readings for 3 locations over 1 day.
np.random.seed(42)
start_time = datetime(2025, 9, 6, 0, 0)
timestamps = [start_time + timedelta(minutes=i) for i in range(24*60)]  # 1 day, one reading per minute
locations = ["Jl. Majapahit", "Jl. Sriwijaya", "Jl. Airlangga"]

data = []
for ts in timestamps:
    for loc in locations:
        # Vehicle count: Poisson draw whose rate is itself drawn uniformly from 10..49.
        vehicles = np.random.poisson(lam=np.random.randint(10, 50))
        data.append([ts, loc, vehicles])

df_raw = pd.DataFrame(data, columns=["timestamp", "location", "vehicles"])

# SQL used in step 3: average vehicle count per location and hour.
query = """
CREATE TABLE traffic_hourly AS
SELECT location,
       strftime('%Y-%m-%d %H:00:00', timestamp) AS hour,
       AVG(vehicles) AS avg_vehicles
FROM traffic_raw
GROUP BY location, hour;
"""

# === 2. Load into the data warehouse (SQLite) ===
conn = sqlite3.connect("smartcity_dw.db")
try:
    df_raw.to_sql("traffic_raw", conn, if_exists="replace", index=False)

    # === 3. Transform & analyse (inside the warehouse) ===
    # Drop first so the cell can be re-run without "table already exists".
    conn.execute("DROP TABLE IF EXISTS traffic_hourly;")
    conn.execute(query)
    # Commit explicitly instead of relying on implicit autocommit of DDL.
    conn.commit()

    # Fetch the aggregated result.
    df_result = pd.read_sql("SELECT * FROM traffic_hourly", conn)
finally:
    # Always release the database handle, even if a step above fails.
    conn.close()

print("Contoh hasil analisis (5 baris pertama):")
print(df_result.head())

# Smart-city insight: hours whose average vehicle count exceeds the threshold
# are reported as congestion-prone.
CONGESTION_THRESHOLD = 30
jam_macet = df_result[df_result["avg_vehicles"] > CONGESTION_THRESHOLD]
print("\nJam rawan macet terdeteksi:")
print(jam_macet.head())


Contoh hasil analisis (5 baris pertama):
        location                 hour  avg_vehicles
0  Jl. Airlangga  2025-09-06 00:00:00     29.083333
1  Jl. Airlangga  2025-09-06 01:00:00     28.133333
2  Jl. Airlangga  2025-09-06 02:00:00     28.766667
3  Jl. Airlangga  2025-09-06 03:00:00     32.000000
4  Jl. Airlangga  2025-09-06 04:00:00     28.566667

Jam rawan macet terdeteksi:
         location                 hour  avg_vehicles
3   Jl. Airlangga  2025-09-06 03:00:00     32.000000
8   Jl. Airlangga  2025-09-06 08:00:00     30.166667
11  Jl. Airlangga  2025-09-06 11:00:00     30.683333
12  Jl. Airlangga  2025-09-06 12:00:00     30.816667
14  Jl. Airlangga  2025-09-06 14:00:00     30.783333
