### Overview
 AQI scale reference: https://aqicn.org/scale/
 
 	0 - 50 (Good): Air quality is considered satisfactory, and air pollution poses little or no risk.
	51 - 100 (Moderate): Air quality is acceptable; however, some pollutants might pose a moderate health concern for a very small number of people who are unusually sensitive to air pollution.
	101 - 150 (Unhealthy for Sensitive Groups): Members of sensitive groups may experience health effects. The general public is less likely to be affected.
	151 - 200 (Unhealthy): Everyone may begin to experience health effects, with sensitive groups possibly experiencing more serious effects.
	201 - 300 (Very Unhealthy): Health warnings of emergency conditions. The entire population is more likely to be affected.
	Above 300 (Hazardous): Health alert: everyone may experience more serious health effects.


In [58]:
import pandas as pd

#### Read Data + Cleaning

In [27]:
# Load the raw measurements
file_path = 'data/naivebayespolution.csv'
data = pd.read_csv(file_path)

# Keep only the columns used downstream; .copy() so the edits below
# operate on an independent frame rather than on a slice of `data`.
data_cleaned = data[['date', 'pm 2.5', 'stasiun']].copy()

# Convert 'pm 2.5' to numeric FIRST: errors='coerce' turns every value that
# cannot be parsed as a number into NaN.
data_cleaned['pm 2.5'] = pd.to_numeric(data_cleaned['pm 2.5'], errors='coerce')

# ...THEN drop rows without a usable PM 2.5 value. Doing the drop before the
# conversion would leave the NaNs created by the coercion in the frame.
data_cleaned = data_cleaned.dropna(subset=['pm 2.5'])

print(data_cleaned.head(10))

      date  pm 2.5               stasiun
0  9/27/23    65.0  Balikpapan Sepinggan
1  9/28/23    63.0  Balikpapan Sepinggan
2  9/29/23    57.0  Balikpapan Sepinggan
3  9/30/23    46.0  Balikpapan Sepinggan
4  10/1/23    37.0  Balikpapan Sepinggan
5  10/2/23    55.0  Balikpapan Sepinggan
6  10/3/23    54.0  Balikpapan Sepinggan
7  10/4/23    62.0  Balikpapan Sepinggan
8  10/5/23    64.0  Balikpapan Sepinggan
9  10/6/23    63.0  Balikpapan Sepinggan


#### Labelling AQI Levels

In [59]:
# Define the function to label the pollution level
def label_aqi(pm25):
    """Return the AQI category name for a single PM 2.5 reading.

    Bands (upper bounds inclusive):
        <= 50   'Good'
        <= 100  'Moderate'
        <= 150  'Unhealthy for Sensitive Groups'
        <= 200  'Unhealthy'
        <= 300  'Very Unhealthy'
        >  300  'Hazardous'

    Args:
        pm25: Numeric reading, or a missing value (None / NaN).

    Returns:
        The category name as a string, or None when the reading is missing.
    """
    # NaN fails every `<=` comparison, so without this guard a missing
    # reading would fall through to the last branch and be labelled
    # 'Hazardous'.
    if pd.isna(pm25):
        return None

    if pm25 <= 50:
        return 'Good'
    elif pm25 <= 100:
        return 'Moderate'
    elif pm25 <= 150:
        return 'Unhealthy for Sensitive Groups'
    elif pm25 <= 200:
        return 'Unhealthy'
    elif pm25 <= 300:
        return 'Very Unhealthy'
    else:
        return 'Hazardous'

# Label every reading with its AQI category name
pm25_readings = data_cleaned['pm 2.5']
data_cleaned['Pollution Level'] = pm25_readings.map(label_aqi)

# Preview the labeled rows (last expression, so the notebook renders it)
data_cleaned.head()

Unnamed: 0,date,pm 2.5,stasiun,Pollution Level
0,2023-09-27,65.0,Balikpapan Sepinggan,Moderate
1,2023-09-28,63.0,Balikpapan Sepinggan,Moderate
2,2023-09-29,57.0,Balikpapan Sepinggan,Moderate
3,2023-09-30,46.0,Balikpapan Sepinggan,Good
4,2023-10-01,37.0,Balikpapan Sepinggan,Good


#### Assigning Station IDs (`id_stasiun`)

In [63]:
# Number the stations 1, 2, 3, ... in order of first appearance
# (factorize codes start at 0, hence the + 1).
station_codes = pd.factorize(data_cleaned['stasiun'])[0]
data_cleaned['id_stasiun'] = station_codes + 1

# Put 'id_stasiun' directly after 'stasiun'
column_order = ['date', 'pm 2.5', 'stasiun', 'id_stasiun', 'Pollution Level']
data_cleaned = data_cleaned[column_order]

# Save the cleaned and labeled data to a new CSV file (row index omitted)
output_file_path = 'data/labeled_pollution_data.csv'
data_cleaned.to_csv(output_file_path, index=False)

# Show the first rows of the result
print(data_cleaned.head())

        date  pm 2.5               stasiun  id_stasiun Pollution Level
0 2023-09-27    65.0  Balikpapan Sepinggan           1        Moderate
1 2023-09-28    63.0  Balikpapan Sepinggan           1        Moderate
2 2023-09-29    57.0  Balikpapan Sepinggan           1        Moderate
3 2023-09-30    46.0  Balikpapan Sepinggan           1            Good
4 2023-10-01    37.0  Balikpapan Sepinggan           1            Good


### **Implementasi Naive Bayes**
#### Use the Gaussian Naive Bayes classifier (see the README for details)

In [106]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import numpy as np


#### Load the cleaned data file (from the previous step)

In [107]:

# Load the labeled data written to CSV earlier in this notebook
file_path = 'data/labeled_pollution_data.csv'
data = pd.read_csv(file_path)

# Keep only the columns needed for aggregation; .copy() so the edits below
# operate on an independent frame rather than on a slice of `data`.
data_cleaned = data[['date', 'pm 2.5', 'stasiun']].copy()

# Coerce to numeric first (unparseable values become NaN), then drop the rows
# without a usable PM 2.5 value. Dropping before coercing would leave the
# NaNs produced by the coercion in the frame.
data_cleaned['pm 2.5'] = pd.to_numeric(data_cleaned['pm 2.5'], errors='coerce')
data_cleaned = data_cleaned.dropna(subset=['pm 2.5'])


In [108]:
# Parse the 'date' column into datetimes; the aggregation step relies on the
# `.dt` accessor, which requires a datetime column.
parsed_dates = pd.to_datetime(data_cleaned['date'])
data_cleaned['date'] = parsed_dates

#### Monthly prediction model (period frequency `M`)

In [109]:
# Mean PM 2.5 per (month, station). 'M' buckets the dates by calendar month;
# change 'M' to 'W' for weekly buckets.
period_key = data_cleaned['date'].dt.to_period('M')
data_aggregated = (
    data_cleaned
    .groupby([period_key, 'stasiun'])['pm 2.5']
    .mean()
    .reset_index()
)

# Sanity check: 'stasiun' must still be a column after reset_index()
print("Columns after aggregation:", data_aggregated.columns)


Columns after aggregation: Index(['date', 'stasiun', 'pm 2.5'], dtype='object')


In [110]:
# Integer-encode the station names (done after aggregation, so the codes are
# fitted on the aggregated frame's 'stasiun' column).
# NOTE(review): the encoding is case-sensitive. The printed preview shows
# 'Balikpapan Sepinggan' and 'Balikpapan sepinggan' receiving different codes;
# confirm whether these are the same station and should be merged upstream.
label_encoder = LabelEncoder()
station_codes = label_encoder.fit_transform(data_aggregated['stasiun'])
data_aggregated['stasiun_encoded'] = station_codes


In [111]:
# Sanity check: confirm 'stasiun_encoded' exists and preview the first rows
print("Columns after encoding:", data_aggregated.columns)
aggregated_preview = data_aggregated.head()
print(aggregated_preview)


Columns after encoding: Index(['date', 'stasiun', 'pm 2.5', 'stasiun_encoded'], dtype='object')
      date                          stasiun   pm 2.5  stasiun_encoded
0  2023-09             Balikpapan Sepinggan   57.750                0
1  2023-09             Balikpapan sepinggan  333.000                1
2  2023-09        Banjar Baru Landasan Ulin  161.875                2
3  2023-09           Banjarmasin Kayu Tangi  130.750                3
4  2023-09  Kabupaten Barito Selatan Sanggu  165.250                4


#### Labelling

In [112]:

# Define a function to label pollution levels based on AQI
def label_aqi(pm25):
    """Return the numeric AQI category code for a single PM 2.5 reading.

    Codes (upper bounds inclusive):
        0  Good                            (<= 50)
        1  Moderate                        (<= 100)
        2  Unhealthy for Sensitive Groups  (<= 150)
        3  Unhealthy                       (<= 200)
        4  Very Unhealthy                  (<= 300)
        5  Hazardous                       (>  300)

    Args:
        pm25: Numeric reading, or a missing value (None / NaN).

    Returns:
        An int code in 0..5, or NaN when the reading is missing.
    """
    # NaN fails every `<=` comparison, so without this guard a missing
    # reading would fall through to the last branch and be coded as
    # 5 (Hazardous).
    if pd.isna(pm25):
        return float('nan')

    if pm25 <= 50:
        return 0  # Good
    elif pm25 <= 100:
        return 1  # Moderate
    elif pm25 <= 150:
        return 2  # Unhealthy for Sensitive Groups
    elif pm25 <= 200:
        return 3  # Unhealthy
    elif pm25 <= 300:
        return 4  # Very Unhealthy
    else:
        return 5  # Hazardous

# Code every aggregated (month, station) mean with its AQI level
data_aggregated['Pollution Level'] = data_aggregated['pm 2.5'].map(label_aqi)

# Lookup from numeric level (0-5) to its descriptive name; the position in
# the list is the numeric code.
level_description = dict(enumerate([
    'Good',
    'Moderate',
    'Unhealthy for Sensitive Groups',
    'Unhealthy',
    'Very Unhealthy',
    'Hazardous',
]))

print(data_aggregated['Pollution Level'])

0     1
1     5
2     3
3     2
4     3
5     2
6     0
7     3
8     2
9     1
10    2
11    1
12    2
13    1
14    2
15    1
16    2
17    1
18    1
19    1
20    1
21    1
22    0
23    1
24    1
25    2
26    1
27    1
28    1
29    1
30    1
31    1
32    1
33    2
34    1
35    1
36    1
37    2
38    2
39    2
40    3
41    3
42    3
43    2
44    2
45    1
46    2
47    2
48    1
49    1
50    1
51    1
52    1
53    0
Name: Pollution Level, dtype: int64


In [113]:
# Translate the numeric level codes into their descriptive names
pollution_codes = data_aggregated['Pollution Level']
data_aggregated['Pollution Level Description'] = pollution_codes.map(level_description)

# Sanity check: preview the frame with the new description column
print("Data after adding Pollution Level Description:")
print(data_aggregated.head())


Data after adding Pollution Level Description:
      date                          stasiun   pm 2.5  stasiun_encoded  \
0  2023-09             Balikpapan Sepinggan   57.750                0   
1  2023-09             Balikpapan sepinggan  333.000                1   
2  2023-09        Banjar Baru Landasan Ulin  161.875                2   
3  2023-09           Banjarmasin Kayu Tangi  130.750                3   
4  2023-09  Kabupaten Barito Selatan Sanggu  165.250                4   

   Pollution Level     Pollution Level Description  
0                1                        Moderate  
1                5                       Hazardous  
2                3                       Unhealthy  
3                2  Unhealthy for Sensitive Groups  
4                3                       Unhealthy  


In [114]:
# Feature matrix for the model: encoded station plus the aggregated PM 2.5 value.
# NOTE(review): the target 'Pollution Level' is computed from 'pm 2.5' by
# label_aqi, so the model is given the very value its label was derived
# from - confirm this is intended.
feature_columns = ['stasiun_encoded', 'pm 2.5']
X = data_aggregated[feature_columns]

In [115]:
# Target vector: the numeric AQI level code of each aggregated row
target_column = 'Pollution Level'
y = data_aggregated[target_column]

#### Training Process

In [116]:
# Hold out 20% of the rows for testing; random_state=42 keeps the split
# reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the actual share of rows that ended up in each partition
total_samples = len(X)
train_percentage = len(X_train) / total_samples * 100
test_percentage = len(X_test) / total_samples * 100

print(f"Training Data: {train_percentage:.2f}% ({len(X_train)} samples)")
print(f"Testing Data: {test_percentage:.2f}% ({len(X_test)} samples)\n")

Training Data: 79.63% (43 samples)
Testing Data: 20.37% (11 samples)



In [118]:
# Fit a Gaussian Naive Bayes model on the training partition
# (fit() returns the fitted estimator, so it can be chained).
classifier = GaussianNB().fit(X_train, y_train)

# Predicted pollution levels for the held-out rows
y_pred = classifier.predict(X_test)

# Accuracy on the held-out rows, computed from the predictions made above
print("Model accuracy on test data:", accuracy_score(y_test, y_pred))

Model accuracy on test data: 0.9090909090909091


#### Print the results

In [2]:

# Run the fitted classifier over every aggregated row (training and test rows alike)
predicted_codes = classifier.predict(X)
data_aggregated['Predicted Pollution Level'] = predicted_codes

# Descriptive name for each predicted level code
data_aggregated['Predicted Pollution Level Description'] = (
    data_aggregated['Predicted Pollution Level'].map(level_description)
)

# Sanity check: preview the frame with the prediction columns
print("Data after adding Predicted Pollution Level Description:")
print(data_aggregated.head())

# Final table: one row per period and station, actual level next to predicted level
output_columns = [
    'date',
    'stasiun',
    'pm 2.5',
    'Pollution Level',
    'Pollution Level Description',
    'Predicted Pollution Level',
    'Predicted Pollution Level Description',
]
output = data_aggregated[output_columns]

NameError: name 'classifier' is not defined

In [1]:
# Write the classification result to CSV (row index omitted)
output_file_path = 'data/result_station_classification_monthly.csv'
output.to_csv(path_or_buf=output_file_path, index=False)

# Preview the first rows of what was saved
output_preview = output.head()
print(output_preview)

NameError: name 'output' is not defined