# ML Model Scenario 1: Air Quality Index Data

## 01/01/2015 - 12/31/2019 
## 0% Reduction in Human Activity 

## Import Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import LabelEncoder

from sqlalchemy import create_engine
import psycopg2

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced


from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier

## Read the Data from Postgres

In [4]:
pip install mysql-connector-python

Note: you may need to restart the kernel to use updated packages.


In [5]:
#Create a connection to the Postgres & AWS

# Import API key
from config import db_password

connection = psycopg2.connect(
    host = 'finalproject-ml.cvm9etk63tz8.us-west-1.rds.amazonaws.com',
    port = 5432,
    user = 'postgres',
    password = db_password,
    database = 'postgres'
    )
cursor=connection.cursor()

In [6]:
#Read in Table for co_2015
aqi_all = "select * from aqi_all"

In [7]:
#Check DataFrame
aqi_all = pd.read_sql(aqi_all, con=connection)
aqi_all.head()

Unnamed: 0,Date,aqi_co,aqi_no2,aqi_o3,aqi_pm_2_5,aqi_pm_10
0,2015-01-01,7.0,37.0,26.0,55.0,
1,2015-01-02,8.0,42.0,20.0,73.0,
2,2015-01-03,10.0,45.0,17.0,84.0,
3,2015-01-04,10.0,40.0,19.0,86.0,
4,2015-01-05,9.0,42.0,12.0,69.0,


## Prepare Data

In [8]:
# Convert the Date column to type DATETIME
aqi_all['Date'] = pd.to_datetime(aqi_all['Date'])

In [9]:
# Filter Dates to create scenario #1

start_dateE1 = "2015-1-1"
end_dateE1 = "2019-12-31"

after_start_dateE1 = aqi_all["Date"] >= start_dateE1
before_end_dateE1 = aqi_all["Date"] <= end_dateE1
between_two_datesE1 = after_start_dateE1 & before_end_dateE1
df = aqi_all.loc[between_two_datesE1]

In [10]:
df.tail()

Unnamed: 0,Date,aqi_co,aqi_no2,aqi_o3,aqi_pm_2_5,aqi_pm_10
1821,2019-12-27,8.0,32.0,20.0,34.0,
1822,2019-12-28,8.0,25.0,18.0,38.0,
1823,2019-12-29,5.0,16.0,23.0,27.0,
1824,2019-12-30,7.0,25.0,19.0,31.0,
1825,2019-12-31,9.0,28.0,,41.0,


In [11]:
# Drop Column Date
df = df.drop(columns=["Date"])

In [12]:
# Create AQI_Max Column
df["AQI_Max"] = pd.DataFrame(df.max(axis=1))

In [13]:
df.head()

Unnamed: 0,aqi_co,aqi_no2,aqi_o3,aqi_pm_2_5,aqi_pm_10,AQI_Max
0,7.0,37.0,26.0,55.0,,55.0
1,8.0,42.0,20.0,73.0,,73.0
2,10.0,45.0,17.0,84.0,,84.0
3,10.0,40.0,19.0,86.0,,86.0
4,9.0,42.0,12.0,69.0,,69.0


In [14]:
df.shape

(1826, 6)

In [15]:
# Summary statistics
df.describe()

Unnamed: 0,aqi_co,aqi_no2,aqi_o3,aqi_pm_2_5,aqi_pm_10,AQI_Max
count,1798.0,1798.0,1799.0,1775.0,150.0,1825.0
mean,5.253059,20.679088,27.211784,33.701408,16.166667,39.814795
std,2.564891,13.102605,8.30429,22.039216,9.523843,18.148864
min,0.0,0.0,3.0,0.0,2.0,3.0
25%,3.0,9.0,22.0,19.0,9.25,30.0
50%,5.0,18.0,27.0,30.0,14.0,36.0
75%,7.0,30.0,32.0,45.0,21.0,46.0
max,18.0,71.0,108.0,228.0,61.0,228.0


In [16]:
# Count of AQI_Max where it is Unsafe
print(df[(df.AQI_Max > 50)].count())

aqi_co        344
aqi_no2       346
aqi_o3        342
aqi_pm_2_5    347
aqi_pm_10      27
AQI_Max       348
dtype: int64


In [17]:
# Count of AQI_Max where it is Safe
print(df[(df.AQI_Max <= 50)].count())

aqi_co        1454
aqi_no2       1452
aqi_o3        1457
aqi_pm_2_5    1428
aqi_pm_10      123
AQI_Max       1477
dtype: int64


In [18]:
df.dtypes

aqi_co        float64
aqi_no2       float64
aqi_o3        float64
aqi_pm_2_5    float64
aqi_pm_10     float64
AQI_Max       float64
dtype: object

In [19]:
#Handle missing values
df.isnull().sum()

aqi_co          28
aqi_no2         28
aqi_o3          27
aqi_pm_2_5      51
aqi_pm_10     1676
AQI_Max          1
dtype: int64

In [20]:
# Replace missing values with the mean of the column
df_column = ['aqi_co', 'aqi_no2', 'aqi_o3', 'aqi_pm_2_5', 'aqi_pm_10', 'AQI_Max']

for i in df_column:
    a = df[i].mean()
    df[i].replace(np.nan , a,inplace = True)

In [21]:
df.isnull().sum()

aqi_co        0
aqi_no2       0
aqi_o3        0
aqi_pm_2_5    0
aqi_pm_10     0
AQI_Max       0
dtype: int64

In [22]:
# Convert the target column values to safe and unsafe based on their values

for aqi in df['AQI_Max']:
    if (aqi > float(50)):
        df['AQI_Max']=df['AQI_Max'].replace(aqi, "Unsafe")
    else:
        df['AQI_Max']=df['AQI_Max'].replace(aqi, "Safe")

In [23]:
df.head()

Unnamed: 0,aqi_co,aqi_no2,aqi_o3,aqi_pm_2_5,aqi_pm_10,AQI_Max
0,7.0,37.0,26.0,55.0,16.166667,Unsafe
1,8.0,42.0,20.0,73.0,16.166667,Unsafe
2,10.0,45.0,17.0,84.0,16.166667,Unsafe
3,10.0,40.0,19.0,86.0,16.166667,Unsafe
4,9.0,42.0,12.0,69.0,16.166667,Unsafe


In [24]:
df.dtypes

aqi_co        float64
aqi_no2       float64
aqi_o3        float64
aqi_pm_2_5    float64
aqi_pm_10     float64
AQI_Max        object
dtype: object

In [25]:
columns = ["aqi_co", "aqi_no2", "aqi_o3", "aqi_pm_2_5", "aqi_pm_10"]

target = ["AQI_Max"]

In [26]:
df.head()

Unnamed: 0,aqi_co,aqi_no2,aqi_o3,aqi_pm_2_5,aqi_pm_10,AQI_Max
0,7.0,37.0,26.0,55.0,16.166667,Unsafe
1,8.0,42.0,20.0,73.0,16.166667,Unsafe
2,10.0,45.0,17.0,84.0,16.166667,Unsafe
3,10.0,40.0,19.0,86.0,16.166667,Unsafe
4,9.0,42.0,12.0,69.0,16.166667,Unsafe


## Split the Data into Training and Testing

In [27]:
# Create our features
X = df.drop(columns='AQI_Max')

#clean features
X = pd.get_dummies(X)

# Create our target
y = df.loc[:, target].copy()
X.head()

Unnamed: 0,aqi_co,aqi_no2,aqi_o3,aqi_pm_2_5,aqi_pm_10
0,7.0,37.0,26.0,55.0,16.166667
1,8.0,42.0,20.0,73.0,16.166667
2,10.0,45.0,17.0,84.0,16.166667
3,10.0,40.0,19.0,86.0,16.166667
4,9.0,42.0,12.0,69.0,16.166667


In [28]:
X.describe()

Unnamed: 0,aqi_co,aqi_no2,aqi_o3,aqi_pm_2_5,aqi_pm_10
count,1826.0,1826.0,1826.0,1826.0,1826.0
mean,5.253059,20.679088,27.211784,33.701408,16.166667
std,2.545139,13.001703,8.242633,21.729088,2.721284
min,0.0,0.0,3.0,0.0,2.0
25%,3.0,9.0,22.0,20.0,16.166667
50%,5.0,19.0,27.0,31.0,16.166667
75%,7.0,30.0,32.0,45.0,16.166667
max,18.0,71.0,108.0,228.0,61.0


In [29]:
# Check the balance of our target values
y["AQI_Max"].value_counts()

##We have imbalanced data.  Have significantly more observations in the class Safe.  Therefore we apply resampling
#techniques below

Safe      1478
Unsafe     348
Name: AQI_Max, dtype: int64

In [30]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

## Resampling


### Oversampling

#### Naive Random Oversampling

In [31]:
# Resample the training data with the RandomOversampler

from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'AQI_Max': 1})

In [32]:
# Train the Logistic Regression model using the resampled data

from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [33]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
print(balanced_accuracy_score(y_test, y_pred))

0.9524509803921568


In [34]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,341,31
Actual 1,1,84


In [35]:
cm_df = pd.DataFrame(confusion_matrix(y_test, y_pred),
     index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [36]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       Safe       1.00      0.92      0.99      0.96      0.95      0.90       372
     Unsafe       0.73      0.99      0.92      0.84      0.95      0.91        85

avg / total       0.95      0.93      0.97      0.93      0.95      0.90       457



#### SMOTE Oversampling

In [37]:
# Resample the training data with SMOTE

from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,sampling_strategy='auto').\
                            fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'AQI_Max': 1})

In [38]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [39]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
print(balanced_accuracy_score(y_test, y_pred))

0.9479127134724857


In [40]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [41]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       Safe       0.99      0.92      0.98      0.96      0.95      0.89       372
     Unsafe       0.73      0.98      0.92      0.84      0.95      0.90        85

avg / total       0.95      0.93      0.97      0.93      0.95      0.89       457



#### Undersampling

In [42]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete

from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'AQI_Max': 1})

In [43]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [44]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
print(balanced_accuracy_score(y_test, y_pred))

0.9479127134724857


In [45]:
confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [46]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       Safe       0.99      0.92      0.98      0.96      0.95      0.89       372
     Unsafe       0.73      0.98      0.92      0.84      0.95      0.90        85

avg / total       0.95      0.93      0.97      0.93      0.95      0.89       457



#### Combination (Over and Under) Sampling


In [47]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete

from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

Counter({'AQI_Max': 1})

In [48]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [49]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
print(balanced_accuracy_score(y_test, y_pred))

0.9470746363061353


In [50]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,337,35
Actual 1,1,84


In [51]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       Safe       1.00      0.91      0.99      0.95      0.95      0.89       372
     Unsafe       0.71      0.99      0.91      0.82      0.95      0.90        85

avg / total       0.94      0.92      0.97      0.93      0.95      0.89       457



## Ensemble Learners

### Balanced Random Forest Classifier

In [52]:
# Resample the training data with the BalancedRandomForestClassifier
# Create a random forest classifier.
rf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

# Fitting the model
rf_model.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [53]:
# Calculated the balanced accuracy score
predictions = rf_model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, predictions)
acc_score

1.0

In [54]:
# Display the confusion matrix

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,372,0
Actual 1,0,85


In [55]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

       Safe       1.00      1.00      1.00      1.00      1.00      1.00       372
     Unsafe       1.00      1.00      1.00      1.00      1.00      1.00        85

avg / total       1.00      1.00      1.00      1.00      1.00      1.00       457



In [56]:
# List the features sorted in descending order by feature importance

# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
#importances


# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.7454248366705853, 'aqi_pm_2_5'),
 (0.14038488228474236, 'aqi_no2'),
 (0.07176294434020804, 'aqi_co'),
 (0.034026707082495146, 'aqi_o3'),
 (0.00840062962196921, 'aqi_pm_10')]

### Easy Ensemble AdaBoost Classifier

In [57]:
# Train the EasyEnsembleClassifier
ee_model = EasyEnsembleClassifier(n_estimators=100, random_state=1)

# Fitting the model
ee_model.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [58]:
# Calculated the balanced accuracy score
y_pred = ee_model.predict(X_test)
print(balanced_accuracy_score(y_test, y_pred))

1.0


In [59]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,372,0
Actual 1,0,85


In [60]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       Safe       1.00      1.00      1.00      1.00      1.00      1.00       372
     Unsafe       1.00      1.00      1.00      1.00      1.00      1.00        85

avg / total       1.00      1.00      1.00      1.00      1.00      1.00       457



# Models Performance Summary

In [61]:
Summary_AQI_Max_Prediction_Safe = pd.DataFrame(
    {"Naive Random Oversampling":{'AccuracyScore': '94%', 'Precision': 0.99, "Recall":0.94,"F1-Score":96},
      "SMOTE Oversampling":{'AccuracyScore': '94%', 'Precision': 0.99, "Recall":0.94,"F1-Score":0.96},
     "ClusterCentroids":{'AccuracyScore': '95%', 'Precision': 0.99, "Recall":0.95,"F1-Score":0.97},
     "SMOTEEN":{'AccuracyScore': '94%', 'Precision': 0.99, "Recall":0.93,"F1-Score":0.96},
     "Random Forest Classifier":{'AccuracyScore': '1', 'Precision': 1, "Recall":1,"F1-Score":1},
     "Easy Ensemble Classifier -AdaBoost":{'AccuracyScore': '1', 'Precision': 1, "Recall":1,"F1-Score":1}
     })
    
Summary_AQI_Max_Prediction_Safe

Unnamed: 0,Naive Random Oversampling,SMOTE Oversampling,ClusterCentroids,SMOTEEN,Random Forest Classifier,Easy Ensemble Classifier -AdaBoost
AccuracyScore,94%,94%,95%,94%,1,1
Precision,0.99,0.99,0.99,0.99,1,1
Recall,0.94,0.94,0.95,0.93,1,1
F1-Score,96,0.96,0.97,0.96,1,1


In [62]:
Summary_AQI_Max_Prediction_Unsafe = pd.DataFrame(
    {"Naive Random Oversampling":{'AccuracyScore': '94%', 'Precision': 0.79, "Recall":0.95,"F1-Score":87},
      "SMOTE Oversampling":{'AccuracyScore': '94%', 'Precision': 0.80, "Recall":0.95,"F1-Score":0.86},
     "ClusterCentroids":{'AccuracyScore': '95%', 'Precision': 0.83, "Recall":0.95,"F1-Score":0.88},
     "SMOTEEN":{'AccuracyScore': '94%', 'Precision': 0.78, "Recall":0.96,"F1-Score":0.86},
     "Random Forest Classifier":{'AccuracyScore': '1', 'Precision': 1, "Recall":1,"F1-Score":1},
     "Easy Ensemble Classifier -AdaBoost":{'AccuracyScore': '1', 'Precision': 1, "Recall":1,"F1-Score":1}
     })
    
Summary_AQI_Max_Prediction_Unsafe

Unnamed: 0,Naive Random Oversampling,SMOTE Oversampling,ClusterCentroids,SMOTEEN,Random Forest Classifier,Easy Ensemble Classifier -AdaBoost
AccuracyScore,94%,94%,95%,94%,1,1
Precision,0.79,0.8,0.83,0.78,1,1
Recall,0.95,0.95,0.95,0.96,1,1
F1-Score,87,0.86,0.88,0.86,1,1
