In [31]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [32]:
df = pd.read_excel("/content/drive/MyDrive/DATA FEATURE PREPROCESSING/Dataset Cuaca.xlsx", header=2)
# Promote the first row to be the column names
df.columns = df.iloc[0]
# Drop the row that just became the header
df = df[1:].reset_index(drop=True)
# Clean column names by removing units and extra spaces
df.columns = df.columns.astype(str).str.replace(r' \(.*\)', '', regex=True).str.strip()
print("DATA AWAL:")
display(df.head())

DATA AWAL:


Unnamed: 0,time,temperature_2m,relative_humidity_2m,precipitation,wind_speed_10m,cloud_cover
0,2025-10-30 00:00:00,27.8,84,1.3,13.8,100
1,2025-10-30 01:00:00,27.6,84,0.7,13.7,100
2,2025-10-30 02:00:00,27.2,85,1.1,13.0,100
3,2025-10-30 03:00:00,27.2,85,0.5,12.6,100
4,2025-10-30 04:00:00,27.3,84,0.1,11.9,100


In [33]:
print("CEK MISSING VALUE:")
display(df.isnull().sum())

CEK MISSING VALUE:


Unnamed: 0_level_0,0
0,Unnamed: 1_level_1
time,0
temperature_2m,0
relative_humidity_2m,0
precipitation,0
wind_speed_10m,0
cloud_cover,0


In [34]:
df = df.fillna(df.median(numeric_only=True))

In [35]:
Q1 = df.quantile(0.25, numeric_only=True)
Q3 = df.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1

lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

df = df.clip(lower, upper, axis=1)

In [36]:
numeric_cols = ["temperature_2m", "relative_humidity_2m", "precipitation",
                "wind_speed_10m", "cloud_cover"]

scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

print("HASIL SCALING:")
display(df.head())

HASIL SCALING:


Unnamed: 0,time,temperature_2m,relative_humidity_2m,precipitation,wind_speed_10m,cloud_cover
0,2025-10-30 00:00:00,-2.186874,1.615486,3.221918,2.023666,0.646082
1,2025-10-30 01:00:00,-2.552411,1.615486,1.514628,1.993702,0.646082
2,2025-10-30 02:00:00,-3.283484,1.937243,2.652821,1.783948,0.646082
3,2025-10-30 03:00:00,-3.283484,1.937243,0.945531,1.664088,0.646082
4,2025-10-30 04:00:00,-3.100716,1.615486,-0.192663,1.454335,0.646082


In [37]:
df["rain_status"] = df["precipitation"].apply(lambda x: 1 if x > 0 else 0)
print("Fitur rain_status:")
display(df[["precipitation", "rain_status"]].head())


# ----- 7.2 Ekstraksi Waktu -----
df["time"] = pd.to_datetime(df["time"])
df["hour"] = df["time"].dt.hour
df["day"] = df["time"].dt.day
df["month"] = df["time"].dt.month
df["weekday"] = df["time"].dt.weekday

print("Fitur Waktu:")
display(df[["time", "hour", "day", "month", "weekday"]].head())


# ----- 7.3 Heat Index -----
df["heat_index"] = 0.5 * (df["temperature_2m"] +
                          (df["relative_humidity_2m"] / 100) * df["temperature_2m"])

print("Fitur heat_index:")
display(df[["temperature_2m", "relative_humidity_2m", "heat_index"]].head())


# ----- 7.4 Wind Pressure -----
df["wind_pressure"] = df["wind_speed_10m"] * 0.6

print("Fitur wind_pressure:")
display(df[["wind_speed_10m", "wind_pressure"]].head())


# ----- 7.5 Fitur Interaksi suhu Ã— kelembapan -----
df["temp_humidity_interaction"] = (
    df["temperature_2m"] * df["relative_humidity_2m"]
)

print("Fitur Interaksi:")
display(df[["temp_humidity_interaction"]].head())

Fitur rain_status:


Unnamed: 0,precipitation,rain_status
0,3.221918,1
1,1.514628,1
2,2.652821,1
3,0.945531,1
4,-0.192663,0


Fitur Waktu:


Unnamed: 0,time,hour,day,month,weekday
0,2025-10-30 00:00:00,0,30,10,3
1,2025-10-30 01:00:00,1,30,10,3
2,2025-10-30 02:00:00,2,30,10,3
3,2025-10-30 03:00:00,3,30,10,3
4,2025-10-30 04:00:00,4,30,10,3


Fitur heat_index:


Unnamed: 0,temperature_2m,relative_humidity_2m,heat_index
0,-2.186874,1.615486,-1.111101
1,-2.552411,1.615486,-1.296822
2,-3.283484,1.937243,-1.673547
3,-3.283484,1.937243,-1.673547
4,-3.100716,1.615486,-1.575404


Fitur wind_pressure:


Unnamed: 0,wind_speed_10m,wind_pressure
0,2.023666,1.2142
1,1.993702,1.196221
2,1.783948,1.070369
3,1.664088,0.998453
4,1.454335,0.872601


Fitur Interaksi:


Unnamed: 0,temp_humidity_interaction
0,-3.532865
1,-4.123384
2,-6.360906
3,-6.360906
4,-5.009163


In [38]:
additional_cols = ["heat_index", "wind_pressure", "temp_humidity_interaction"]

df[additional_cols] = scaler.fit_transform(df[additional_cols])

print("Scaling fitur baru:")
display(df[additional_cols].head())

Scaling fitur baru:


Unnamed: 0,heat_index,wind_pressure,temp_humidity_interaction
0,-2.212609,2.023666,-2.548076
1,-2.583793,1.993702,-3.099833
2,-3.336719,1.783948,-5.19048
3,-3.336719,1.664088,-5.19048
4,-3.140569,1.454335,-3.927468


In [39]:
print("INFO DATA AKHIR:")
print(df.info())

print("DATASET FINAL:")
display(df.head())

INFO DATA AKHIR:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288 entries, 0 to 287
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   time                       288 non-null    datetime64[ns]
 1   temperature_2m             288 non-null    float64       
 2   relative_humidity_2m       288 non-null    float64       
 3   precipitation              288 non-null    float64       
 4   wind_speed_10m             288 non-null    float64       
 5   cloud_cover                288 non-null    float64       
 6   rain_status                288 non-null    int64         
 7   hour                       288 non-null    int32         
 8   day                        288 non-null    int32         
 9   month                      288 non-null    int32         
 10  weekday                    288 non-null    int32         
 11  heat_index                 288 non-null    float64    

Unnamed: 0,time,temperature_2m,relative_humidity_2m,precipitation,wind_speed_10m,cloud_cover,rain_status,hour,day,month,weekday,heat_index,wind_pressure,temp_humidity_interaction
0,2025-10-30 00:00:00,-2.186874,1.615486,3.221918,2.023666,0.646082,1,0,30,10,3,-2.212609,2.023666,-2.548076
1,2025-10-30 01:00:00,-2.552411,1.615486,1.514628,1.993702,0.646082,1,1,30,10,3,-2.583793,1.993702,-3.099833
2,2025-10-30 02:00:00,-3.283484,1.937243,2.652821,1.783948,0.646082,1,2,30,10,3,-3.336719,1.783948,-5.19048
3,2025-10-30 03:00:00,-3.283484,1.937243,0.945531,1.664088,0.646082,1,3,30,10,3,-3.336719,1.664088,-5.19048
4,2025-10-30 04:00:00,-3.100716,1.615486,-0.192663,1.454335,0.646082,0,4,30,10,3,-3.140569,1.454335,-3.927468


In [40]:

df.to_excel("/content/drive/MyDrive/DATA FEATURE PREPROCESSING/Dataset Feature Engginering.xlsx", index=False)
print("File berhasil disimpan sebagai dataset_final_feature_engineering.xlsx")

File berhasil disimpan sebagai dataset_final_feature_engineering.xlsx
