In [None]:
 #IMPORT LIBRARY

import numpy as np # numerical arrays and operations
import pandas as pd # tabular data loading and manipulation
# statsmodels was previously aliased to `plt`, which the pyplot import below
# immediately overwrote, leaving statsmodels unreachable. It now uses `sm`.
import statsmodels.api as sm # statistical models
import matplotlib.pyplot as plt # plotting
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

import seaborn as sns # statistical data visualization

sns.set() # apply seaborn's default plot theme

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import os

# Switch the working directory to the Colab Notebooks folder on the mounted
# Drive, so relative file paths used afterwards resolve against that folder.
os.chdir(path="/content/drive/My Drive/Colab Notebooks")

In [None]:
# Read the air-quality workbook from the current working directory and
# preview its first rows.
dataset = pd.read_excel(io='AQIJakarta.xlsx')
dataset.head()

Unnamed: 0,tanggal,stasiun,pm10,so2,co,o3,no2,max,critical,categori
0,2019-01-01,DKI1 (Bunderan HI),29.0,15.0,5.0,,13.0,29,PM10,BAIK
1,2019-01-02,DKI1 (Bunderan HI),24.0,17.0,5.0,,6.0,24,PM10,BAIK
2,2019-01-03,DKI1 (Bunderan HI),16.0,16.0,5.0,29.0,4.0,29,O3,BAIK
3,2019-01-04,DKI1 (Bunderan HI),38.0,18.0,8.0,24.0,,38,PM10,BAIK
4,2019-01-05,DKI1 (Bunderan HI),37.0,29.0,16.0,,16.0,37,PM10,BAIK


In [None]:
# Keep only the pollutant readings and the category label; the date
# (tanggal), station (stasiun), max and critical columns are discarded.
columns_to_remove = ['tanggal', 'stasiun', 'max', 'critical']
data_drop = dataset.drop(columns=columns_to_remove)

In [None]:
# Summary statistics for every column; include='all' also reports
# unique/top/freq for the non-numeric categori column.
data_drop.describe(include='all')

Unnamed: 0,pm10,so2,co,o3,no2,categori
count,5182.0,5138.0,5204.0,5204.0,5198.0,5292
unique,,,,,,4
top,,,,,,SEDANG
freq,,,,,,3750
mean,53.413933,24.923122,16.056111,56.065527,14.757791,
std,14.682663,14.796699,11.332665,33.428236,13.225637,
min,5.0,1.0,2.0,3.0,1.0,
25%,45.0,14.0,10.0,30.0,8.0,
50%,55.0,22.0,14.0,51.0,11.0,
75%,63.0,32.0,19.0,75.0,17.0,


In [None]:
# Plain-text dump of the frame that remains after dropping the unused columns.
print(data_drop)

      pm10   so2    co    o3   no2 categori
0     29.0  15.0   5.0   NaN  13.0     BAIK
1     24.0  17.0   5.0   NaN   6.0     BAIK
2     16.0  16.0   5.0  29.0   4.0     BAIK
3     38.0  18.0   8.0  24.0   NaN     BAIK
4     37.0  29.0  16.0   NaN  16.0     BAIK
...    ...   ...   ...   ...   ...      ...
5287  54.0  36.0  14.0  21.0  47.0   SEDANG
5288  44.0  20.0  11.0  21.0  33.0   SEDANG
5289  34.0  28.0   8.0  25.0  29.0   SEDANG
5290  53.0  25.0  15.0  23.0  44.0   SEDANG
5291  60.0  28.0  19.0  30.0  53.0   SEDANG

[5292 rows x 6 columns]


In [None]:
data_drop.isnull().sum() # count the missing (NaN) entries in each column

# isnull() yields False (0) where a value exists and True (1) where it is missing.
# Per the output recorded below, all five pollutant columns contain missing values:
# pm10 (110), so2 (154), co (88), o3 (88) and no2 (94) out of 5292 rows each
# (roughly 1.7%-2.9% per column); categori has no missing values.

Unnamed: 0,0
pm10,110
so2,154
co,88
o3,88
no2,94
categori,0


In [None]:
# Replace every missing (NaN) value with 0.
# NOTE(review): 0 lies below the minimum reported by describe() for each
# pollutant, so zero-filled cells act as an out-of-range "missing" marker
# rather than a plausible reading - confirm this is the intended imputation.
data_filled = data_drop.fillna(value=0)

In [None]:
# Plain-text dump of the frame after the missing values were replaced.
print(data_filled)

      pm10   so2    co    o3   no2 categori
0     29.0  15.0   5.0   0.0  13.0     BAIK
1     24.0  17.0   5.0   0.0   6.0     BAIK
2     16.0  16.0   5.0  29.0   4.0     BAIK
3     38.0  18.0   8.0  24.0   0.0     BAIK
4     37.0  29.0  16.0   0.0  16.0     BAIK
...    ...   ...   ...   ...   ...      ...
5287  54.0  36.0  14.0  21.0  47.0   SEDANG
5288  44.0  20.0  11.0  21.0  33.0   SEDANG
5289  34.0  28.0   8.0  25.0  29.0   SEDANG
5290  53.0  25.0  15.0  23.0  44.0   SEDANG
5291  60.0  28.0  19.0  30.0  53.0   SEDANG

[5292 rows x 6 columns]


In [None]:
# Count the missing values per column again to confirm none remain after the fill.
data_filled.isnull().sum()

Unnamed: 0,0
pm10,0
so2,0
co,0
o3,0
no2,0
categori,0


In [None]:
# Features: the first five columns (positions 0-4) as a NumPy array.
x = data_filled.iloc[:, 0:5].to_numpy()
# Target: the column at position 5 as a NumPy array.
y = data_filled.iloc[:, 5].to_numpy()

In [None]:
# Show the feature matrix.
print(x)

[[29. 15.  5.  0. 13.]
 [24. 17.  5.  0.  6.]
 [16. 16.  5. 29.  4.]
 ...
 [34. 28.  8. 25. 29.]
 [53. 25. 15. 23. 44.]
 [60. 28. 19. 30. 53.]]


In [None]:
# Show the target labels.
print(y)

['BAIK' 'BAIK' 'BAIK' ... 'SEDANG' 'SEDANG' 'SEDANG']


In [None]:
# Prepare the training and test sets: 20% of the samples are held out for
# testing, and the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Show the training features.
print(X_train)

[[ 55.  78.  21.  16.  67.]
 [ 60.  17.  24.  71.   8.]
 [ 68.   8.  21. 108.  15.]
 ...
 [  0.  22.   0.  46.  14.]
 [ 48.  17.  20.  53.   4.]
 [ 65.  19.   6.  72.  12.]]


In [None]:
# Show the test features.
print(X_test)

[[24. 18. 22. 73.  9.]
 [54. 10. 36.  0. 12.]
 [53.  6. 23. 31. 13.]
 ...
 [50.  9.  9. 59.  9.]
 [57. 22. 17. 83. 10.]
 [19.  7.  5. 40.  2.]]


In [None]:
# Show the training labels.
print(y_train)

['SEDANG' 'SEDANG' 'TIDAK SEHAT' ... 'BAIK' 'SEDANG' 'SEDANG']


In [None]:
# Show the test labels.
print(y_test)

['SEDANG' 'SEDANG' 'SEDANG' ... 'SEDANG' 'SEDANG' 'BAIK']


In [None]:
# Build the Random Forest model and fit it on the training split.
# random_state is pinned because a random forest is stochastic (bootstrap
# sampling and random feature selection); without a seed every re-run of the
# notebook yields a different model and different downstream metrics.
random_forest_model = RandomForestClassifier(random_state=0)
random_forest_model.fit(X_train, y_train)

In [None]:
# Predict the category of every sample in the test set.
y_pred = random_forest_model.predict(X_test)

In [None]:
# Build the confusion matrix from the true and predicted test labels.
# scikit-learn puts the true classes on the rows and the predicted classes on
# the columns, with the classes in sorted label order.
conf_matrix = confusion_matrix(y_test, y_pred)

In [None]:
# Print the confusion matrix under a heading.
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[127   0   9   0]
 [  0   3   0   0]
 [ 14   0 728   5]
 [  0   0  32 141]]


In [None]:
# Derive one-vs-rest TP / FN / FP / TN counts from the multiclass confusion matrix.
#
# In a multiclass problem these four counts only make sense relative to ONE
# class treated as "positive" and all remaining classes lumped together as
# "negative". The positive class here is index 3 of the matrix, i.e. the last
# label in scikit-learn's sorted label order ('TIDAK SEHAT' according to the
# classification report in this notebook).
#
# conf_matrix[i, j] = number of samples whose true class is i and whose
# predicted class is j (rows = true, columns = predicted).
POSITIVE_CLASS_INDEX = 3

# True Positive: truly positive and predicted positive (the diagonal cell).
TP = conf_matrix[POSITIVE_CLASS_INDEX, POSITIVE_CLASS_INDEX]
# False Negative: truly positive but predicted as some other class
# (rest of the positive class's row).
FN = conf_matrix[POSITIVE_CLASS_INDEX, :].sum() - TP
# False Positive: truly another class but predicted positive
# (rest of the positive class's column).
FP = conf_matrix[:, POSITIVE_CLASS_INDEX].sum() - TP
# True Negative: every remaining sample - truly not positive and not predicted
# positive, whether or not the predicted negative class was the exact one.
TN = conf_matrix.sum() - TP - FN - FP

In [None]:
# Report the four counts.
print(f"True Positive (TP): {TP}")
print(f"True Negative (TN): {TN}")
print(f"False Positive (FP): {FP}")
print(f"False Negative (FN): {FN}")

True Positive (TP): 141
True Negative (TN): 181
False Positive (FP): 737
False Negative (FN): 37


In [None]:
# Compute the positive and negative totals:
# total_positif sums TP and FN; total_negatif sums TN and FP.
total_positif = TP + FN
total_negatif = TN + FP

In [None]:
# Express each count as a percentage of its group total:
# TP and FN are divided by total_positif, TN and FP by total_negatif.
persentase_TP = (TP / total_positif) * 100
persentase_TN = (TN / total_negatif) * 100
persentase_FP = (FP / total_negatif) * 100
persentase_FN = (FN / total_positif) * 100

In [None]:
# Report the four percentages, rounded to two decimals.
print(f"Persentase True Positive (TP): {persentase_TP:.2f}%")
print(f"Persentase True Negative (TN): {persentase_TN:.2f}%")
print(f"Persentase False Positive (FP): {persentase_FP:.2f}%")
print(f"Persentase False Negative (FN): {persentase_FN:.2f}%")

Persentase True Positive (TP): 79.21%
Persentase True Negative (TN): 19.72%
Persentase False Positive (FP): 80.28%
Persentase False Negative (FN): 20.79%


In [None]:
# Show the classification report (per-class precision, recall, f1-score and support).
class_report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(class_report)


Classification Report:
                    precision    recall  f1-score   support

              BAIK       0.90      0.93      0.92       136
SANGAT TIDAK SEHAT       1.00      1.00      1.00         3
            SEDANG       0.95      0.97      0.96       747
       TIDAK SEHAT       0.97      0.82      0.88       173

          accuracy                           0.94      1059
         macro avg       0.95      0.93      0.94      1059
      weighted avg       0.94      0.94      0.94      1059

