# 1. Process Phase <a class= 'anchor' id = process_phase_4></a>

## 1.1 Installing packages and opening libraries<a class= 'anchor' id = installing_packages_and_opening_libraries_4_1></a>

In [None]:
import pandas as pd
import numpy as np
import requests
import seaborn as sns

In [None]:
# Load the AQI dataset (BD_AQI.csv) from the Kaggle input directory into `df`.
df = pd.read_csv('/kaggle/input/aqi-dataset/BD_AQI.csv')

## 1.2 Importing and previewing our dataset<a class= 'anchor' id = importing_datasets_4_2></a>

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
# Show column dtypes and non-null counts.
df.info()

In [None]:
# Count rows per value of the raw 'Location' column (headers are not yet lower-cased here).
df['Location'].value_counts()

## 1.3 Cleaning & Formatting<a class= 'anchor' id = cleaning_and_formatting_4_3></a>


In [None]:
# Normalise every column header to lower case.
df.rename(columns=str.lower, inplace=True)

In [None]:
# Replace spaces in column headers with underscores.
df.rename(columns=lambda header: header.replace(' ', '_'), inplace=True)

In [None]:
# Check the renamed columns on the first rows.
df.head()

In [None]:
# Inspect the last rows of the frame.
df.tail()

In [None]:
# Remove rows whose 'date' holds one of these fragment strings instead of a
# full date; they would not match the date format parsed afterwards.
malformed_dates = ['59:59.5', '59:59.6', '59:59.7', '00:00.0', '00:00.1']
df.drop(df.index[df['date'].isin(malformed_dates)], inplace=True)

In [None]:
# Parse the 'date' strings ("month/day/year hour:minute") into datetime values.
df['date'] = pd.to_datetime(df['date'], format='%m/%d/%Y %H:%M')


In [None]:
# Re-check dtypes after the date conversion.
df.info()

The `date` column has now been converted to a datetime type.

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Discard rows whose 'time' is the literal string '24:00:00'.
df.drop(index=df.index[df['time'] == '24:00:00'], inplace=True)

In [None]:
# Keep only the text before the first ':' of each 'time' value (the hour part).
df['time'] = df['time'].str.split(':', n=1).str.get(0)

In [None]:
# Convert the hour strings to integers.
df['time'] = df['time'].astype(int)

In [None]:
# The column now holds only the hour, so name it accordingly.
df.rename({'time': 'hour'}, axis='columns', inplace=True)

In [None]:
# Confirm the dtype of the new 'hour' column.
df.info()

In [None]:
# Preview the frame after the hour extraction.
df.head()

In [None]:
# Use 'aqi_category' as the name of the AQI severity label column.
df.rename({'aqi_severity': 'aqi_category'}, axis='columns', inplace=True)

In [None]:
# List the final column names.
df.columns

In [None]:
# Derive calendar features (month number and day of month) from the parsed dates.
df['month'], df['day'] = df['date'].dt.month, df['date'].dt.day
df.head()

## 1.4 EDA/Feature Engineering<a class= 'anchor' id = EDA></a>

In [None]:
# Starting point for feature engineering: preview the cleaned frame.
df.head()

## 1.4.1 Handling Categorical Data<a class= 'anchor' id = HCD></a>

**Label encoding of our target label**

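Since the AQI category is an ordinal variable, we encode it as integers. Note that `LabelEncoder` assigns codes in alphabetical order of the labels, which is why a manual mapping is applied in section 1.4.3.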

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encoder used to turn the AQI category strings into integer codes.
LE = LabelEncoder()

In [None]:
# Store the integer code of each AQI category in 'aqi_le'.
# NOTE(review): LabelEncoder orders classes by sorted label text, not by
# severity - confirm this column is not relied on as an ordinal rank.
df['aqi_le'] = LE.fit_transform(df['aqi_category'])

In [None]:
# Class balance: number of rows per AQI category.
df['aqi_category'].value_counts()

In [None]:
# Preview the frame with the new 'aqi_le' column.
df.head()

In [None]:
# Group the rows by their 'location' value; the per-city frames are pulled
# out of this GroupBy object with get_group().
grouped = df.groupby(df['location'])

In [None]:
# Number of rows available for each location.
df['location'].value_counts()

## 1.4.2 Handling Data Based on Locations<a class= 'anchor' id = HCD></a>

In [None]:
# Create one independent DataFrame per monitoring location.
# get_group() returns a selection of `df`; .copy() detaches each frame so the
# in-place column drops and new-column assignments performed on these frames
# afterwards operate on their own data and do not raise SettingWithCopyWarning.
dhk_df = grouped.get_group('Dhaka').copy()
raj_df = grouped.get_group('Rajshahi').copy()
bar_df = grouped.get_group('Barishal').copy()
gazi_df = grouped.get_group('Gazipur').copy()
syl_df = grouped.get_group('Sylhet').copy()
nar_df = grouped.get_group('Narayonganj').copy()
kh_df = grouped.get_group('Khulna').copy()
ctg_df = grouped.get_group('Chittagong').copy()


In [None]:
# Inspect the Dhaka subset.
dhk_df.info()

In [None]:
# Drop the 'location' column: it is constant within each per-city frame.
# The result is re-assigned instead of using inplace=True, because the frames
# come from groupby().get_group() and mutating such a selection in place
# raises SettingWithCopyWarning.
dhk_df = dhk_df.drop(columns='location')
raj_df = raj_df.drop(columns='location')
bar_df = bar_df.drop(columns='location')
gazi_df = gazi_df.drop(columns='location')
syl_df = syl_df.drop(columns='location')
nar_df = nar_df.drop(columns='location')
kh_df = kh_df.drop(columns='location')
ctg_df = ctg_df.drop(columns='location')

In [None]:
# Inspect the Khulna subset after the column drop.
kh_df.info()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of the number of Dhaka observations in each AQI category.
plt.figure(figsize=(17, 5))
sns.countplot(x=dhk_df['aqi_category'])

plt.xlabel('AQI Category', fontweight='bold')
plt.ylabel('Count', fontweight='bold')
plt.title('Count of AQI Categories Dhaka', fontweight='bold')
plt.grid(True)
# plt.legend() is intentionally not called: the plot has no labelled artists,
# so it would only emit a "no artists with labels" warning.
plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of the number of Rajshahi observations in each AQI category.
plt.figure(figsize=(17, 5))
sns.countplot(x=raj_df['aqi_category'])

plt.xlabel('AQI Category', fontweight='bold')
plt.ylabel('Count', fontweight='bold')
plt.title('Count of AQI Categories Rajshahi', fontweight='bold')
plt.grid(True)
# plt.legend() is intentionally not called: the plot has no labelled artists,
# so it would only emit a "no artists with labels" warning.
plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of the number of Barishal observations in each AQI category.
plt.figure(figsize=(17, 5))
sns.countplot(x=bar_df['aqi_category'])

plt.xlabel('AQI Category', fontweight='bold')
plt.ylabel('Count', fontweight='bold')
plt.title('Count of AQI Categories Barishal', fontweight='bold')
plt.grid(True)
# plt.legend() is intentionally not called: the plot has no labelled artists,
# so it would only emit a "no artists with labels" warning.
plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of the number of Gazipur observations in each AQI category.
plt.figure(figsize=(17, 5))
sns.countplot(x=gazi_df['aqi_category'])

plt.xlabel('AQI Category', fontweight='bold')
plt.ylabel('Count', fontweight='bold')
plt.title('Count of AQI Categories Gazipur', fontweight='bold')
plt.grid(True)
# plt.legend() is intentionally not called: the plot has no labelled artists,
# so it would only emit a "no artists with labels" warning.
plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of the number of Sylhet observations in each AQI category.
plt.figure(figsize=(17, 5))
sns.countplot(x=syl_df['aqi_category'])

plt.xlabel('AQI Category', fontweight='bold')
plt.ylabel('Count', fontweight='bold')
plt.title('Count of AQI Categories Sylhet', fontweight='bold')
plt.grid(True)
# plt.legend() is intentionally not called: the plot has no labelled artists,
# so it would only emit a "no artists with labels" warning.
plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of the number of Narayanganj observations in each AQI category.
plt.figure(figsize=(17, 5))
sns.countplot(x=nar_df['aqi_category'])

plt.xlabel('AQI Category', fontweight='bold')
plt.ylabel('Count', fontweight='bold')
plt.title('Count of AQI Categories Narayanganj', fontweight='bold')
plt.grid(True)
# plt.legend() is intentionally not called: the plot has no labelled artists,
# so it would only emit a "no artists with labels" warning.
plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of the number of Khulna observations in each AQI category.
plt.figure(figsize=(17, 5))
sns.countplot(x=kh_df['aqi_category'])

plt.xlabel('AQI Category', fontweight='bold')
plt.ylabel('Count', fontweight='bold')
plt.title('Count of AQI Categories Khulna', fontweight='bold')
plt.grid(True)
# plt.legend() is intentionally not called: the plot has no labelled artists,
# so it would only emit a "no artists with labels" warning.
plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of the number of Chittagong observations in each AQI category.
plt.figure(figsize=(17, 5))
sns.countplot(x=ctg_df['aqi_category'])

plt.xlabel('AQI Category', fontweight='bold')
plt.ylabel('Count', fontweight='bold')
plt.title('Count of AQI Categories Chittagong', fontweight='bold')
plt.grid(True)
# plt.legend() is intentionally not called: the plot has no labelled artists,
# so it would only emit a "no artists with labels" warning.
plt.tight_layout()
plt.show()

## 1.4.3 Manual Label Mapping<a class= 'anchor' id = HCD></a>

In [None]:
# Ordinal rank of each AQI category, from least severe (0) to most severe (5).
# On the AQI scale 'Moderate' is less severe than
# 'Unhealthy for Sensitive Groups', so it takes the lower rank.
aqi_ranking = {
    'Good': 0,
    'Moderate': 1,
    'Unhealthy for Sensitive Groups': 2,
    'Unhealthy': 3,
    'Very Unhealthy': 4,
    'Hazardous': 5,
}

# Add the 'aqi_map' column holding the numeric rank of each row's category.
# Series.map() is used instead of replace(): it yields a numeric column
# directly (NaN for any category missing from `aqi_ranking`), and assign()
# returns a new frame, so no write is made into a slice of the original data.
dhk_df = dhk_df.assign(aqi_map=dhk_df['aqi_category'].map(aqi_ranking))
raj_df = raj_df.assign(aqi_map=raj_df['aqi_category'].map(aqi_ranking))
bar_df = bar_df.assign(aqi_map=bar_df['aqi_category'].map(aqi_ranking))
gazi_df = gazi_df.assign(aqi_map=gazi_df['aqi_category'].map(aqi_ranking))
syl_df = syl_df.assign(aqi_map=syl_df['aqi_category'].map(aqi_ranking))
nar_df = nar_df.assign(aqi_map=nar_df['aqi_category'].map(aqi_ranking))
kh_df = kh_df.assign(aqi_map=kh_df['aqi_category'].map(aqi_ranking))
ctg_df = ctg_df.assign(aqi_map=ctg_df['aqi_category'].map(aqi_ranking))

In [None]:
# Check the new 'aqi_map' column on the Dhaka frame.
dhk_df.head()

In [None]:
# Remove the label columns that 'aqi_map' supersedes: the LabelEncoder output
# ('aqi_le') and the original category text ('aqi_category').
dhk_df = dhk_df.drop(['aqi_le', 'aqi_category'], axis=1)
raj_df = raj_df.drop(['aqi_le', 'aqi_category'], axis=1)
bar_df = bar_df.drop(['aqi_le', 'aqi_category'], axis=1)
gazi_df = gazi_df.drop(['aqi_le', 'aqi_category'], axis=1)
syl_df = syl_df.drop(['aqi_le', 'aqi_category'], axis=1)
nar_df = nar_df.drop(['aqi_le', 'aqi_category'], axis=1)
kh_df = kh_df.drop(['aqi_le', 'aqi_category'], axis=1)
ctg_df = ctg_df.drop(['aqi_le', 'aqi_category'], axis=1)

In [None]:
# Display the Dhaka frame that the modelling steps use.
dhk_df

# 2 Analyze & share phase<a class= 'anchor' id = asp></a>

## 2.1 Normalizing Numerical Values<a class= 'anchor' id = HCD></a>

In [None]:
# Min-max scale the numeric measurements of the Dhaka frame into [0, 1]:
# each listed column becomes (x - min) / (max - min) over that column.
columns_to_normalize = [
    'so2', 'no', 'no2', 'nox', 'co', 'o3',
    'pm2.5', 'pm10', 'temperature', 'rh', 'solar_rad', 'aqi',
]
dhk_df[columns_to_normalize] = dhk_df[columns_to_normalize].apply(
    lambda col: (col - col.min()) / (col.max() - col.min())
)

In [None]:
# Display the Dhaka frame after normalisation.
dhk_df

## 2.2 Feature Selection<a class= 'anchor' id = fs></a>

### 2.2.1 Correlation<a class= 'anchor' id = corr></a>


In [None]:
# Target: the numeric AQI rank.  Features: everything except the target and
# the raw timestamp.
y = dhk_df['aqi_map']
X = dhk_df.drop(['aqi_map', 'date'], axis=1)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Adjust the figure size (width, height in inches)
plt.figure(figsize=(14, 10))

# Heatmap of the pairwise correlations of the Dhaka frame.
# numeric_only=True restricts the matrix to numeric columns, so the datetime
# 'date' column still present in dhk_df is excluded explicitly (pandas >= 2.0
# no longer drops non-numeric columns by default).
sns.heatmap(dhk_df.corr(numeric_only=True), annot=True, cmap='viridis')

# Show the plot
plt.show()


## 2.3 Models<a class= 'anchor' id = rfr></a>

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

## 2.3.1 Ensembling(Random Forest Regressor)<a class= 'anchor' id = rfr></a>

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

# Regress the (min-max scaled) AQI value of the Dhaka frame on the remaining
# measurements.  'aqi_map' is derived from the AQI category and 'date' is the
# raw timestamp, so neither is used as a feature.
X = dhk_df.drop(columns=['aqi_map', 'date', 'aqi'])
y = dhk_df['aqi']  # Target variable

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest Regressor
rfr = RandomForestRegressor(random_state=42)
rfr.fit(X_train, y_train)

# Predictions
y_pred_train = rfr.predict(X_train)
y_pred_test = rfr.predict(X_test)

# Evaluate performance (errors are in units of the scaled target)
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)
train_mae = mean_absolute_error(y_train, y_pred_train)
test_mae = mean_absolute_error(y_test, y_pred_test)
train_mse = mean_squared_error(y_train, y_pred_train)
test_mse = mean_squared_error(y_test, y_pred_test)

# Display results; the model is fitted on dhk_df, so the report is labelled Dhaka
print("Model Performance for Predicting AQI(Dhaka):")
print(f"R² on Training Set: {train_r2:.4f}")
print(f"R² on Test Set: {test_r2:.4f}")
print(f"MAE on Training Set: {train_mae:.4f}")
print(f"MAE on Test Set: {test_mae:.4f}")
print(f"MSE on Training Set: {train_mse:.4f}")
print(f"MSE on Test Set: {test_mse:.4f}")

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

# Same regression task as the random-forest cell, fitted with gradient boosting.
X = dhk_df.drop(columns=['aqi_map', 'date', 'aqi'])
y = dhk_df['aqi']  # Target variable

# Hold out 20% of the rows for testing (same split as the random-forest cell)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Gradient Boosting Regressor
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(X_train, y_train)

# Predictions must come from `gbr`, the model fitted in this cell
y_pred_train = gbr.predict(X_train)
y_pred_test = gbr.predict(X_test)

# Evaluate performance (errors are in units of the scaled target)
train_r2 = r2_score(y_train, y_pred_train)
test_r2 = r2_score(y_test, y_pred_test)
train_mae = mean_absolute_error(y_train, y_pred_train)
test_mae = mean_absolute_error(y_test, y_pred_test)
train_mse = mean_squared_error(y_train, y_pred_train)
test_mse = mean_squared_error(y_test, y_pred_test)

# Display results; the model is fitted on dhk_df, so the report is labelled Dhaka
print("Model Performance for Predicting AQI(Dhaka):")
print(f"R² on Training Set: {train_r2:.4f}")
print(f"R² on Test Set: {test_r2:.4f}")
print(f"MAE on Training Set: {train_mae:.4f}")
print(f"MAE on Test Set: {test_mae:.4f}")
print(f"MSE on Training Set: {train_mse:.4f}")
print(f"MSE on Test Set: {test_mse:.4f}")

## 2.3.2 Ensembling(Random Forest Classifier)<a class= 'anchor' id = rfr></a>

In [None]:
# Classification setup: predict the AQI rank ('aqi_map') from the measurements.
# The numeric 'aqi' value and the timestamp are excluded from the features.
y = dhk_df['aqi_map']
X = dhk_df.drop(['aqi_map', 'date', 'aqi'], axis=1)

In [None]:
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

# Random Forest Classifier with depth/leaf limits to curb overfitting
rfc = RandomForestClassifier(
    random_state=42,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=5,
    n_estimators=100
)

# Stratified 5-fold split keeps the class proportions in every fold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold accuracies
train_accuracies = []
test_accuracies = []

# Held-out labels and predictions pooled over all folds
all_true_labels = []
all_pred_labels = []

# Manual cross-validation: refit on each training fold, score on both parts
for train_index, test_index in cv.split(X, y):
    X_train_cv, X_test_cv = X.iloc[train_index], X.iloc[test_index]
    y_train_cv, y_test_cv = y.iloc[train_index], y.iloc[test_index]

    # Train the model (fit() discards the state of the previous fold)
    rfc.fit(X_train_cv, y_train_cv)

    # Predict the held-out fold once and reuse it for accuracy and the report
    y_test_pred = rfc.predict(X_test_cv)

    train_accuracy = rfc.score(X_train_cv, y_train_cv)
    test_accuracy = accuracy_score(y_test_cv, y_test_pred)

    # Store results
    train_accuracies.append(train_accuracy)
    test_accuracies.append(test_accuracy)

    # Store true labels and predictions
    all_true_labels.extend(y_test_cv)
    all_pred_labels.extend(y_test_pred)

# Display cross-validation results
print("\nCross-Validation Results:")
print(f"Training Accuracy for each fold: {train_accuracies}")
print(f"Test Accuracy for each fold: {test_accuracies}")
print(f"Mean Training Accuracy: {np.mean(train_accuracies):.4f}")
print(f"Mean Test Accuracy: {np.mean(test_accuracies):.4f}")
print(f"Standard Deviation of Training Accuracy: {np.std(train_accuracies):.4f}")
print(f"Standard Deviation of Test Accuracy: {np.std(test_accuracies):.4f}")

# Classification report over the pooled held-out predictions
print("\nClassification Report (Overall):")
print(classification_report(all_true_labels, all_pred_labels))

# Confusion matrix over the pooled held-out predictions
overall_cm = confusion_matrix(all_true_labels, all_pred_labels)

# Visualize the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(overall_cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=np.unique(y), yticklabels=np.unique(y))
plt.title("Overall Confusion Matrix (Cross-Validation)")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()

## 2.3.3 Ensembling(Gradient Boosting Classifier)<a class= 'anchor' id = rfr></a>

In [None]:
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

# Gradient Boosting Classifier with shallow trees and minimum split/leaf sizes
gbc = GradientBoostingClassifier(
    max_depth=5,
    min_samples_split=20,
    min_samples_leaf=10,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42
)

# Stratified 5-fold split keeps the class proportions in every fold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold accuracies
train_accuracies = []
test_accuracies = []

# Held-out labels and predictions pooled over all folds
all_true_labels = []
all_pred_labels = []

# Manual cross-validation: refit on each training fold, score on both parts
for train_index, test_index in cv.split(X, y):
    X_train_cv, X_test_cv = X.iloc[train_index], X.iloc[test_index]
    y_train_cv, y_test_cv = y.iloc[train_index], y.iloc[test_index]

    # Train the model (fit() discards the state of the previous fold)
    gbc.fit(X_train_cv, y_train_cv)

    # Predict the held-out fold once and reuse it for accuracy and the report
    y_test_pred = gbc.predict(X_test_cv)

    train_accuracy = gbc.score(X_train_cv, y_train_cv)
    test_accuracy = accuracy_score(y_test_cv, y_test_pred)

    # Store results
    train_accuracies.append(train_accuracy)
    test_accuracies.append(test_accuracy)

    # Store true labels and predictions
    all_true_labels.extend(y_test_cv)
    all_pred_labels.extend(y_test_pred)

# Display cross-validation results
print("\nCross-Validation Results:")
print(f"Training Accuracy for each fold: {train_accuracies}")
print(f"Test Accuracy for each fold: {test_accuracies}")
print(f"Mean Training Accuracy: {np.mean(train_accuracies):.4f}")
print(f"Mean Test Accuracy: {np.mean(test_accuracies):.4f}")
print(f"Standard Deviation of Training Accuracy: {np.std(train_accuracies):.4f}")
print(f"Standard Deviation of Test Accuracy: {np.std(test_accuracies):.4f}")

# Classification report over the pooled held-out predictions
print("\nClassification Report (Overall):")
print(classification_report(all_true_labels, all_pred_labels))

# Confusion matrix over the pooled held-out predictions
overall_cm = confusion_matrix(all_true_labels, all_pred_labels)

# Visualize the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(overall_cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=np.unique(y), yticklabels=np.unique(y))
plt.title("Overall Confusion Matrix (Cross-Validation)")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()