# **Data Prep & Cleaning**

In [45]:
import pandas as pd

In [46]:
# Load the .xls file
# df = pd.read_csv('/content/Data Indeks Standar Pencemar Udara (ISPU) di Provinsi DKI Jakarta.csv')
df = pd.read_csv('Data Indeks Standar Pencemar Udara (ISPU) di Provinsi DKI Jakarta.csv')

In [47]:
df.head()

Unnamed: 0,periode_data,bulan,tanggal,stasiun,pm_sepuluh,pm_duakomalima,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,parameter_pencemar_kritis,kategori
0,202401,1,21,DKI3 Jagakarsa,51.0,65.0,45.0,9.0,8.0,79.0,79.0,,SEDANG
1,202401,1,22,DKI3 Jagakarsa,27.0,34.0,45.0,5.0,8.0,56.0,56.0,,SEDANG
2,202401,1,23,DKI3 Jagakarsa,,52.0,46.0,6.0,9.0,51.0,52.0,PM25,SEDANG
3,202401,1,24,DKI3 Jagakarsa,46.0,65.0,46.0,8.0,9.0,38.0,65.0,PM25,SEDANG
4,202401,1,25,DKI3 Jagakarsa,37.0,55.0,47.0,7.0,11.0,28.0,55.0,PM25,SEDANG


In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 760 entries, 0 to 759
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   periode_data               760 non-null    int64  
 1   bulan                      760 non-null    int64  
 2   tanggal                    760 non-null    int64  
 3   stasiun                    760 non-null    object 
 4   pm_sepuluh                 749 non-null    float64
 5   pm_duakomalima             752 non-null    float64
 6   sulfur_dioksida            754 non-null    float64
 7   karbon_monoksida           751 non-null    float64
 8   ozon                       755 non-null    float64
 9   nitrogen_dioksida          744 non-null    float64
 10  max                        756 non-null    float64
 11  parameter_pencemar_kritis  736 non-null    object 
 12  kategori                   760 non-null    object 
dtypes: float64(7), int64(3), object(3)
memory usage: 7

In [49]:
df['periode_data'] = df['periode_data'].apply(lambda x: str(x)[:4])
df = df.rename(columns={'periode_data': 'tahun'})

In [50]:
# Check and count rows with "N/A" values
na_count = df.isna().sum().sum()
print(f'Total rows with "N/A" values: {na_count}')

Total rows with "N/A" values: 83


In [51]:
# Delete rows with "N/A" values
df = df.dropna().reset_index(drop=True)
na_count = df.isna().sum().sum()
print(f'Total rows with "N/A" values: {na_count}')

Total rows with "N/A" values: 0


In [52]:
# Create the "date" column using "periode_data", "bulan", and "tanggal" columns
df.loc[:, 'date'] = pd.to_datetime(df['tahun'].apply(lambda x: str(x)[:4]).astype(str) +
                                   '-' + df['bulan'].astype(str) + '-' + df['tanggal'].astype(str),
                                   format='%Y-%m-%d')
df.sample(5)

Unnamed: 0,tahun,bulan,tanggal,stasiun,pm_sepuluh,pm_duakomalima,sulfur_dioksida,karbon_monoksida,ozon,nitrogen_dioksida,max,parameter_pencemar_kritis,kategori,date
195,2024,1,16,DKI5 Kebon Jeruk Jakarta Barat,23.0,49.0,24.0,5.0,43.0,6.0,49.0,PM25,BAIK,2024-01-16
104,2024,1,15,DKI2 Kelapa Gading,52.0,67.0,58.0,12.0,14.0,32.0,67.0,PM25,SEDANG,2024-01-15
200,2024,1,21,DKI5 Kebon Jeruk Jakarta Barat,26.0,51.0,37.0,10.0,18.0,5.0,51.0,PM25,SEDANG,2024-01-21
242,2024,4,23,DKI4 Lubang Buaya,63.0,84.0,29.0,23.0,11.0,6.0,84.0,PM25,SEDANG,2024-04-23
97,2024,4,11,DKI3 Jagakarsa,20.0,50.0,54.0,11.0,12.0,47.0,54.0,SO2,SEDANG,2024-04-11


In [53]:
# Keep "date", "sulfur_dioksida", and "karbon_monoksida" column, and rename "date" to "tanggal"
df = df.loc[:, ['date', 'sulfur_dioksida', 'karbon_monoksida', 'kategori']]
df = df.rename(columns={'date': 'tanggal'})
df.sample(5)

Unnamed: 0,tanggal,sulfur_dioksida,karbon_monoksida,kategori
182,2024-01-01,40.0,18.0,SEDANG
695,2024-05-24,24.0,22.0,SEDANG
122,2024-01-21,46.0,6.0,BAIK
130,2024-01-31,47.0,19.0,BAIK
401,2024-03-19,23.0,14.0,BAIK


In [54]:
# Remove duplicate rows
print(df.duplicated().sum())
df = df.drop_duplicates().reset_index(drop=True)
print(df.duplicated().sum())

5
0


In [55]:
# Save the cleaned data to a CSV file
cleaned_file_path = 'cleaned_data.csv'
df_save = df.copy()
df_save = df_save[['karbon_monoksida', 'sulfur_dioksida', 'kategori']]
df_save = df_save.rename(columns={'karbon_monoksida': 'mq2', 'sulfur_dioksida': 'mq136', 'kategori': 'category'})
df_save.to_csv(cleaned_file_path, index=False)

In [56]:
df_save

Unnamed: 0,mq2,mq136,category
0,8.0,46.0,SEDANG
1,7.0,47.0,SEDANG
2,7.0,50.0,SEDANG
3,8.0,48.0,SEDANG
4,12.0,48.0,SEDANG
...,...,...,...
696,18.0,28.0,SEDANG
697,21.0,28.0,TIDAK SEHAT
698,19.0,28.0,TIDAK SEHAT
699,17.0,28.0,SEDANG


# **ML Training**

In [57]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

## **Load Dataset**

In [58]:
# Load the data
df_train = pd.read_csv('cleaned_data.csv')
df_train

Unnamed: 0,mq2,mq136,category
0,8.0,46.0,SEDANG
1,7.0,47.0,SEDANG
2,7.0,50.0,SEDANG
3,8.0,48.0,SEDANG
4,12.0,48.0,SEDANG
...,...,...,...
696,18.0,28.0,SEDANG
697,21.0,28.0,TIDAK SEHAT
698,19.0,28.0,TIDAK SEHAT
699,17.0,28.0,SEDANG


## **Encode Target**

In [59]:
# Initialize the LabelEncoder
label_encoder = LabelEncoder()
# Fit and transform the 'kategori' column
df_train['category'] = label_encoder.fit_transform(df_train['category'])
label_encoder.classes_

array(['BAIK', 'SEDANG', 'TIDAK SEHAT'], dtype=object)

In [60]:
# Save the encoded data to a new CSV file
df_train.to_csv('encoded_cleaned_data.csv', index=False)

In [61]:
# Load the encoded data
df_train_encoded = pd.read_csv('encoded_cleaned_data.csv')
df_train_encoded

Unnamed: 0,mq2,mq136,category
0,8.0,46.0,1
1,7.0,47.0,1
2,7.0,50.0,1
3,8.0,48.0,1
4,12.0,48.0,1
...,...,...,...
696,18.0,28.0,1
697,21.0,28.0,2
698,19.0,28.0,2
699,17.0,28.0,1


## **Feature Selection**

In [62]:
# Feature selection and preprocessing
X = df_train_encoded[['mq2', 'mq136']]
y = df_train_encoded['category']
y.value_counts()

category
1    574
0     74
2     53
Name: count, dtype: int64

## **Balancing**

In [63]:
# Balances the value of the target variable
balancer = RandomOverSampler(random_state=42)
X_balanced, y_balanced = balancer.fit_resample(X, y)
y_balanced.value_counts()

category
1    574
0    574
2    574
Name: count, dtype: int64

## **Split Dataset**

In [64]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42, stratify=y_balanced)
print(f"Data training: {len(X_train)}")
print(f"Data testing: {len(X_test)}")

Data training: 1377
Data testing: 345


## **Scaling**

In [65]:
X_train.describe()

Unnamed: 0,mq2,mq136
count,1377.0,1377.0
mean,16.105301,32.177923
std,7.985531,14.897992
min,3.0,7.0
25%,10.0,23.0
50%,15.0,27.0
75%,21.0,48.0
max,70.0,61.0


In [66]:
minmax_scaler = MinMaxScaler()
X_train_scaled = minmax_scaler.fit_transform(X_train)
X_test_scaled = minmax_scaler.transform(X_test)

## **Training**

In [67]:
# Train a random forest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)

In [68]:
y_train_pred = model.predict(X_train_scaled)
print(confusion_matrix(y_train, y_train_pred), "\n")
print(classification_report(y_train, y_train_pred))

[[459   0   0]
 [ 28 398  33]
 [  0   0 459]] 

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       459
           1       1.00      0.87      0.93       459
           2       0.93      1.00      0.97       459

    accuracy                           0.96      1377
   macro avg       0.96      0.96      0.95      1377
weighted avg       0.96      0.96      0.95      1377



## **Evaluate**

In [69]:
# Evaluate the model
y_test_pred = model.predict(X_test_scaled)
print(confusion_matrix(y_test, y_test_pred), "\n")
print(classification_report(y_test, y_test_pred))

[[115   0   0]
 [ 16  82  17]
 [  0   0 115]] 

              precision    recall  f1-score   support

           0       0.88      1.00      0.93       115
           1       1.00      0.71      0.83       115
           2       0.87      1.00      0.93       115

    accuracy                           0.90       345
   macro avg       0.92      0.90      0.90       345
weighted avg       0.92      0.90      0.90       345



# **ML Export**

In [71]:
import pickle
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [72]:
print(sklearn.__version__)

1.3.2


In [73]:
# Create machine learning pipeline
num_attribs = ['mq2', 'mq136']

pipeline = Pipeline([
  ('minmax_scaler', MinMaxScaler()),
  ('ml_model', RandomForestClassifier())
])

# Fit data on model pipeline
pipeline.fit(X_train, y_train)

In [74]:
# Predict on training data
y_train_pred = pipeline.predict(X_train)
print(confusion_matrix(y_train, y_train_pred), "\n")
print(classification_report(y_train, y_train_pred))

[[459   0   0]
 [ 28 398  33]
 [  0   0 459]] 

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       459
           1       1.00      0.87      0.93       459
           2       0.93      1.00      0.97       459

    accuracy                           0.96      1377
   macro avg       0.96      0.96      0.95      1377
weighted avg       0.96      0.96      0.95      1377



In [75]:
# Predict on test data
y_test_pred = pipeline.predict(X_test)
print(confusion_matrix(y_test, y_test_pred), "\n")
print(classification_report(y_test, y_test_pred))

[[115   0   0]
 [ 17  81  17]
 [  0   0 115]] 

              precision    recall  f1-score   support

           0       0.87      1.00      0.93       115
           1       1.00      0.70      0.83       115
           2       0.87      1.00      0.93       115

    accuracy                           0.90       345
   macro avg       0.91      0.90      0.90       345
weighted avg       0.91      0.90      0.90       345



In [31]:
# Save the model
with open('random_forest.pkl', 'wb') as f:
  pickle.dump(pipeline, f)

In [32]:
# Test load model
with open('random_forest.pkl', 'rb') as f:
  loaded_model = pickle.load(f)

In [76]:
# Test loaded model on new data
data = [{
  'mq2':26,
  'mq136':7
}]
feature = pd.DataFrame(data)

AIR_QUALITY = ['BAIK', 'SEDANG', 'TIDAK SEHAT']

# Predict on new data
y_test_pred = pipeline.predict(feature)
print(AIR_QUALITY[y_test_pred[0]])

BAIK
