Tugas Besar Teknik Penambangan Data - Jakarta Air Quality Index Classification

Kevin Philips Tanamas - 220711789
Richard Angelico - 220711747
Dhiaz Juan - 220711695
Nathanael Esmond - 

In [2]:
# Import semua library
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import tensorflow as tf
import altair as alt
import plotly.graph_objects as go
import plotly.express as px
import missingno as msno

from datetime import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from pandas.api.types import is_numeric_dtype
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import load_model

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE

ModuleNotFoundError: No module named 'imblearn'

DATA LOADING

In [None]:
# Load the raw records from the CSV file located in the working directory
all_data = pd.read_csv("ispu_dki_all.csv")
all_data.head(n=10)

DATA CLEANSING

In [None]:
# Missing-value report (before imputation): schema summary, per-column null
# counts and a missingno matrix plot for every dataset in the mapping.
datasets = {
    "Jakarta Highest AQI Data": all_data
}

for name, data in datasets.items():
    print(f"\n{name} Dataset:")
    # DataFrame.info() writes its summary itself and returns None, so it is
    # called directly; wrapping it in print() would add a stray "None" line.
    data.info()
    print("\nMissing Value Report:")
    print(data.isnull().sum())

    msno.matrix(data, figsize=(10, 5), fontsize=12)
    plt.title(f"Before Imputation Missing Value Matrix: {name}", fontsize=14)
    plt.show()

In [None]:
# Data cleansing on all_data: drop the 'pm25' column, then fill the remaining
# gaps (column mean for numeric columns, most frequent value for text columns).

# errors='ignore' keeps the cell re-runnable: on a second run 'pm25' is already
# gone and a plain drop would raise KeyError.
all_data = all_data.drop(columns=['pm25'], errors='ignore')

numerical_columns = all_data.select_dtypes(include=['float64', 'int64']).columns
for col in numerical_columns:
    all_data[col] = all_data[col].fillna(all_data[col].mean())

categorical_columns = all_data.select_dtypes(include=['object']).columns
for col in categorical_columns:
    modes = all_data[col].mode()
    # mode() is empty when the column contains only NaN; there is no value to
    # impute with in that case, so the column is left as is instead of failing.
    if not modes.empty:
        all_data[col] = all_data[col].fillna(modes.iloc[0])

# Remaining null count per column
print(all_data.isnull().sum())

In [None]:
# Missing-value report (after imputation and column dropping): schema summary,
# per-column null counts and a missingno matrix plot for every dataset.
datasets = {
    "Jakarta Highest AQI Data": all_data
}

for name, data in datasets.items():
    print(f"\n{name} Dataset:")
    # DataFrame.info() writes its summary itself and returns None, so it is
    # called directly; wrapping it in print() would add a stray "None" line.
    data.info()
    print("\nMissing Value Report:")
    print(data.isnull().sum())

    msno.matrix(data, figsize=(10, 5), fontsize=12)
    plt.title(f"After Imputation Missing Value Matrix: {name}", fontsize=14)
    plt.show()

In [None]:
# Derived features: row-wise mean of the five pollutant columns, plus a
# weekday/weekend flag taken from the 'tanggal' date column.

pollutant_readings = all_data[['pm10', 'so2', 'co', 'o3', 'no2']]
all_data['rata_rata_polutan'] = pollutant_readings.mean(axis=1)

all_data['tanggal'] = pd.to_datetime(all_data['tanggal'])
# dayofweek numbers Monday as 0, so values 5 and 6 are Saturday and Sunday
is_weekend = all_data['tanggal'].dt.dayofweek.ge(5)
all_data['weekday_weekend'] = is_weekend.map({True: 'Weekend', False: 'Weekday'})


In [None]:
# Split the date into 'year' / 'month' / 'day' columns, then drop 'tanggal'

tanggal_parsed = pd.to_datetime(all_data['tanggal'])
for part in ('year', 'month', 'day'):
    # Series.dt exposes each calendar component under the same name as the new column
    all_data[part] = getattr(tanggal_parsed.dt, part)
all_data.drop(columns=['tanggal'], inplace=True)

In [None]:
# Preview the first five rows (head() defaults to n=5)
all_data.head()

Exploratory Data Analysis (EDA)

In [None]:
# Numeric columns used by the EDA plots: the five pollutant columns plus their row-wise mean
numerical_cols = ['pm10', 'so2', 'co', 'o3', 'no2', 'rata_rata_polutan']

In [None]:
# Distribution of the numeric columns: the columns are folded into long form
# (Variable / Value), binned, counted and drawn as translucent overlaid areas.
alt.Chart(all_data).transform_fold(
    numerical_cols, as_=['Variable', 'Value']
).mark_area(opacity=0.5).encode(
    x=alt.X('Value:Q', bin=alt.Bin(maxbins=30)),
    y=alt.Y('count()'),
    color=alt.Color('Variable:N'),
).properties(
    title='Distribution of Numerical Columns', width=600, height=400
).interactive()

In [None]:
# Box plots of the numeric columns for visual outlier inspection
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=all_data[numerical_cols], palette='Set3', ax=ax)
ax.set_title('Boxplot untuk Deteksi Outlier', fontsize=14)
# Tilt the column names on the x axis
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

In [None]:
# Heatmap of the pairwise correlations between the numeric columns

corr_matrix = all_data[numerical_cols].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', square=True, ax=ax)
ax.set_title('Correlation Matrix Antar Variabel', fontsize=14)
plt.show()


In [None]:
# Line chart of the mean pollutant value over time.
# NOTE: temp_df is an alias of all_data, not a copy, so the 'datetime' column
# assembled here from year/month/day is also added to all_data. The label
# encoding cell later drops a 'datetime' column, so the aliasing is kept as is.
temp_df = all_data
temp_df['datetime'] = pd.to_datetime(all_data[['year', 'month', 'day']])

alt.Chart(temp_df).mark_line().encode(
    x=alt.X('datetime:T', title='Waktu'),
    y=alt.Y('rata_rata_polutan:Q', title='Rata-rata Polutan'),
    # The tooltip has to name the plotted column; 'average_pollutant' is not a
    # column created anywhere in this notebook, so its tooltip value was empty.
    tooltip=['datetime:T', 'rata_rata_polutan:Q']
).properties(
    title='Rata-rata Polutan dari Waktu ke Waktu',
    width=800,
    height=400
).interactive()

In [None]:
# Number of rows per 'categori' value for each month, one bar-chart facet per year

data_categori = (
    all_data
    .groupby(['year', 'month', 'categori'])
    .size()
    .reset_index(name='count')
)

chart = (
    alt.Chart(data_categori)
    .mark_bar()
    .encode(
        x=alt.X('month:O', title='Month'),
        y=alt.Y('count:Q', title='Count'),
        color='categori:N',
        column='year:O',
    )
    .properties(title='Distribusi Kategori Kualitas Udara Jakarta dari Waktu ke Waktu')
)

chart.show()

PENGECEKAN DATA DUPLIKAT

In [None]:
# Remove fully duplicated rows, keeping the last occurrence of each
print("Before checking for duplicates: ", all_data.shape)
all_data = all_data.drop_duplicates(keep='last')
print("After checking for duplicates: ", all_data.shape)

PEMBERSIHAN OUTLIER

In [None]:
# Outlier removal using the IQR rule

# Columns that are screened for outliers
cols = ['pm10', 'so2', 'co', 'o3', 'no2', 'rata_rata_polutan']

def remove_outlier(df_in, cols):
    """Return a copy of ``df_in`` without rows outside the 1.5 * IQR fences.

    The listed columns are processed one after another: for each numeric
    column the quartiles are computed on the rows that are still left, and
    rows whose value lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are removed
    before the next column is examined. Non-numeric columns are skipped.
    Rows holding NaN in a screened column are removed as well, because NaN
    never satisfies the range check.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Input frame; it is not modified.
    cols : iterable of str
        Names of the columns to screen, in processing order.

    Returns
    -------
    pandas.DataFrame
        The filtered frame; the surviving rows keep their original index labels.
    """
    filtered = df_in.copy()
    for column in cols:
        values = filtered[column]
        if not is_numeric_dtype(values):
            continue  # the IQR rule only applies to numeric data

        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        whisker = (third_quartile - first_quartile) * 1.5

        # between() is inclusive on both ends, i.e. lower <= value <= upper
        inside_fences = values.between(first_quartile - whisker, third_quartile + whisker)
        filtered = filtered[inside_fences]
    return filtered

# Apply the IQR filter to the selected columns
all_data_cleaned = remove_outlier(all_data, cols)

# Compare the row counts before and after the filter
rows_before, rows_after = len(all_data), len(all_data_cleaned)
print("Number of rows before removing outliers: ", rows_before)
print("Number of rows after removing outliers: ", rows_after)

DATA ENCODING

In [None]:
# Label-encode the non-numeric columns.
# One encoder is kept per column so that every integer code can still be mapped
# back to its original label; re-fitting a single shared encoder keeps only the
# mapping of the column fitted last.

# Work on an explicit copy: all_data_cleaned is the result of boolean filtering,
# and assigning columns to it may otherwise raise SettingWithCopyWarning.
all_data_cleaned = all_data_cleaned.copy()

label_encoders = {}
for column in ['stasiun', 'critical', 'weekday_weekend', 'categori']:
    label_encoders[column] = LabelEncoder()
    all_data_cleaned[column] = label_encoders[column].fit_transform(all_data_cleaned[column])

# Kept under the previous name: as before, `label_encoder` ends up fitted on 'categori'
label_encoder = label_encoders['categori']

# 'datetime' is only present when the time-series EDA cell has been run;
# errors='ignore' lets this cell succeed either way and when it is re-run.
all_data_cleaned = all_data_cleaned.drop(columns=['datetime'], errors='ignore')

In [None]:
# Train-test split, feature scaling and class balancing

x = all_data_cleaned.drop(['categori', 'stasiun'], axis = 1).values
y = all_data_cleaned.categori.values

# Split first, so that the test rows cannot influence the scaler statistics.
# The same test_size / random_state on the same number of rows selects the
# same rows as splitting the pre-scaled matrix would.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=15)

# Standardise with the mean / standard deviation of the training rows only
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Whole feature matrix under the train-fitted scaling, kept under its previous
# name in case another cell reads it
X_scaled = scaler.transform(x)

# Oversample only the training split; the test split keeps its real class mix.
# fit_resample must receive the lowercase x_train defined above (there is no
# X_train in this notebook, so that spelling raises NameError).
smote = SMOTE(random_state=42)
x_train, y_train = smote.fit_resample(x_train, y_train)

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Random Forest classifier: fit on the training split (fit() returns the estimator itself)
rf_model = RandomForestClassifier(n_estimators=200, random_state=42).fit(x_train, y_train)

# Predicted labels for the test split
y_pred_rf = rf_model.predict(x_test)

print("Building Model Random Forest...")
print(classification_report(y_test, y_pred_rf))
print("Feature Importances:", rf_model.feature_importances_)

In [None]:
# Support Vector Machine model (RBF kernel, probability estimates enabled)

svm_model = SVC(kernel='rbf', probability=True, random_state=42)

# Model training
svm_model.fit(x_train, y_train)

# Predictions: hard labels and per-class probabilities for the test split
y_pred_svm = svm_model.predict(x_test)
y_pred_svm_proba = svm_model.predict_proba(x_test)

print("Building Model SVM...")
# Evaluate the SVM's own predictions (not the Random Forest ones)
print(classification_report(y_test, y_pred_svm))
# No feature-importance printout here: SVC has no feature_importances_ attribute,
# so reading it raises AttributeError.

In [None]:
# Multi-layer Perceptron classifier (two hidden layers: 64 and 32 units)

mlp_model = MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu', solver='adam', max_iter=300, random_state=42)
mlp_model.fit(x_train, y_train)

# Predicted labels for the test split
y_pred_mlp = mlp_model.predict(x_test)

print("Building Model MLP...")
# Evaluate the MLP's own predictions (not the Random Forest ones)
print(classification_report(y_test, y_pred_mlp))
# No feature-importance printout here: MLPClassifier has no feature_importances_
# attribute, so reading it raises AttributeError.

In [None]:
# Evaluation of the three classifiers: confusion matrices drawn side by side

model_names = ['Random Forest', 'MLP', 'SVM']
# Same order as model_names
conf_matrices = [
    confusion_matrix(y_test, predictions)
    for predictions in (y_pred_rf, y_pred_mlp, y_pred_svm)
]

fig, axes = plt.subplots(1, 3, figsize=(30, 8))
for ax, model_name, conf_matrix in zip(axes, model_names, conf_matrices):
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="YlGnBu", ax=ax)
    ax.set_title(f'{model_name} Confusion Matrix')
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
plt.tight_layout()
plt.show()