## Scenario 3: 03/04/2020 - 08/29/2020 
## 80% Reduction in Human Activity - Widespread

## Import Libraries

In [63]:
import warnings
warnings.filterwarnings('ignore')

In [64]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import LabelEncoder

from sqlalchemy import create_engine
import psycopg2

In [65]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced


from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier

## Read the Data from Postgres

In [66]:
pip install mysql-connector-python

Note: you may need to restart the kernel to use updated packages.


In [67]:
#Create a connection to the Postgres & AWS

# Import API key
from config import db_password

connection = psycopg2.connect(
    host = 'finalproject-ml.cvm9etk63tz8.us-west-1.rds.amazonaws.com',
    port = 5432,
    user = 'postgres',
    password = db_password,
    database = 'postgres'
    )
cursor=connection.cursor()

In [68]:
#Read in Table for co_2015
aqi_all = "select * from aqi_all"

In [69]:
#Check DataFrame
aqi_all = pd.read_sql(aqi_all, con=connection)
aqi_all.head()

Unnamed: 0,Date,aqi_co,aqi_no2,aqi_o3,aqi_pm_2_5,aqi_pm_10
0,2015-01-01,7.0,37.0,26.0,55.0,
1,2015-01-02,8.0,42.0,20.0,73.0,
2,2015-01-03,10.0,45.0,17.0,84.0,
3,2015-01-04,10.0,40.0,19.0,86.0,
4,2015-01-05,9.0,42.0,12.0,69.0,


## Prepare Data

In [70]:
# Convert the Date column to type DATETIME
aqi_all['Date'] = pd.to_datetime(aqi_all['Date'])

In [71]:
## Applying filters to dates

start_dateE3 = "2020-03-04"
end_dateE3 = "2020-08-29"

after_start_dateE3 = aqi_all["Date"] >= start_dateE3
before_end_dateE3 = aqi_all["Date"] <= end_dateE3
between_two_datesE3 = after_start_dateE3 & before_end_dateE3
df = aqi_all.loc[between_two_datesE3]
df.shape

(179, 6)

In [72]:
df.tail()

Unnamed: 0,Date,aqi_co,aqi_no2,aqi_o3,aqi_pm_2_5,aqi_pm_10
2063,2020-08-25,7.0,7.0,27.0,77.0,
2064,2020-08-26,2.0,4.0,25.0,33.0,
2065,2020-08-27,3.0,5.0,23.0,33.0,
2066,2020-08-28,13.0,6.0,24.0,96.0,
2067,2020-08-29,6.0,6.0,23.0,57.0,


In [73]:
# Drop Column Date
df = df.drop(columns=["Date"])

In [74]:
# Create AQI_Max Column
df["AQI_Max"] = pd.DataFrame(df.max(axis=1))

In [75]:
df.head()

Unnamed: 0,aqi_co,aqi_no2,aqi_o3,aqi_pm_2_5,aqi_pm_10,AQI_Max
1889,8.0,25.0,34.0,54.0,28.0,54.0
1890,6.0,21.0,34.0,39.0,,39.0
1891,6.0,6.0,33.0,10.0,,33.0
1892,6.0,8.0,36.0,4.0,,36.0
1893,6.0,18.0,36.0,15.0,,36.0


In [76]:
df.shape

(179, 6)

In [77]:
# Summary statistics
df.describe()

Unnamed: 0,aqi_co,aqi_no2,aqi_o3,aqi_pm_2_5,aqi_pm_10,AQI_Max
count,179.0,178.0,179.0,179.0,15.0,179.0
mean,3.111732,9.505618,29.849162,30.513966,17.933333,38.067039
std,1.695608,7.636699,6.097028,18.419383,11.859093,13.733252
min,1.0,1.0,14.0,4.0,4.0,15.0
25%,2.0,4.0,25.5,18.0,9.5,29.0
50%,3.0,6.5,30.0,25.0,18.0,35.0
75%,3.0,12.75,35.0,41.5,22.0,42.0
max,13.0,34.0,44.0,111.0,52.0,111.0


In [78]:
# Count of AQI_Max where it is Unsafe
print(df[(df.AQI_Max > 50)].count())

aqi_co        31
aqi_no2       31
aqi_o3        31
aqi_pm_2_5    31
aqi_pm_10      3
AQI_Max       31
dtype: int64


In [79]:
# Count of AQI_Max where it is Safe
print(df[(df.AQI_Max <= 50)].count())

aqi_co        148
aqi_no2       147
aqi_o3        148
aqi_pm_2_5    148
aqi_pm_10      12
AQI_Max       148
dtype: int64


In [80]:
df.dtypes

aqi_co        float64
aqi_no2       float64
aqi_o3        float64
aqi_pm_2_5    float64
aqi_pm_10     float64
AQI_Max       float64
dtype: object

In [81]:
#Handle missing values
df.isnull().sum()

aqi_co          0
aqi_no2         1
aqi_o3          0
aqi_pm_2_5      0
aqi_pm_10     164
AQI_Max         0
dtype: int64

In [82]:
# Replace missing values with the mean of the column
df_column = ['aqi_co', 'aqi_no2', 'aqi_o3', 'aqi_pm_2_5', 'aqi_pm_10', 'AQI_Max']

for i in df_column:
    a = df[i].mean()
    df[i].replace(np.nan , a,inplace = True)

In [83]:
df.isnull().sum()

aqi_co        0
aqi_no2       0
aqi_o3        0
aqi_pm_2_5    0
aqi_pm_10     0
AQI_Max       0
dtype: int64

In [84]:
# Convert the target column values to safe and unsafe based on their values

for aqi in df['AQI_Max']:
    if (aqi > float(50)):
        df['AQI_Max']=df['AQI_Max'].replace(aqi, "Unsafe")
    else:
        df['AQI_Max']=df['AQI_Max'].replace(aqi, "Safe")

In [85]:
df.head()

Unnamed: 0,aqi_co,aqi_no2,aqi_o3,aqi_pm_2_5,aqi_pm_10,AQI_Max
1889,8.0,25.0,34.0,54.0,28.0,Unsafe
1890,6.0,21.0,34.0,39.0,17.933333,Safe
1891,6.0,6.0,33.0,10.0,17.933333,Safe
1892,6.0,8.0,36.0,4.0,17.933333,Safe
1893,6.0,18.0,36.0,15.0,17.933333,Safe


In [86]:
df.dtypes

aqi_co        float64
aqi_no2       float64
aqi_o3        float64
aqi_pm_2_5    float64
aqi_pm_10     float64
AQI_Max        object
dtype: object

In [87]:
columns = ["aqi_co", "aqi_no2", "aqi_o3", "aqi_pm_2_5", "aqi_pm_10"]

target = ["AQI_Max"]

In [88]:
df.head()

Unnamed: 0,aqi_co,aqi_no2,aqi_o3,aqi_pm_2_5,aqi_pm_10,AQI_Max
1889,8.0,25.0,34.0,54.0,28.0,Unsafe
1890,6.0,21.0,34.0,39.0,17.933333,Safe
1891,6.0,6.0,33.0,10.0,17.933333,Safe
1892,6.0,8.0,36.0,4.0,17.933333,Safe
1893,6.0,18.0,36.0,15.0,17.933333,Safe


## Split the Data into Training and Testing

In [89]:
# Create our features
X = df.drop(columns='AQI_Max')

#clean features
X = pd.get_dummies(X)

# Create our target
y = df.loc[:, target].copy()
X.head()

Unnamed: 0,aqi_co,aqi_no2,aqi_o3,aqi_pm_2_5,aqi_pm_10
1889,8.0,25.0,34.0,54.0,28.0
1890,6.0,21.0,34.0,39.0,17.933333
1891,6.0,6.0,33.0,10.0,17.933333
1892,6.0,8.0,36.0,4.0,17.933333
1893,6.0,18.0,36.0,15.0,17.933333


In [90]:
X.describe()

Unnamed: 0,aqi_co,aqi_no2,aqi_o3,aqi_pm_2_5,aqi_pm_10
count,179.0,179.0,179.0,179.0,179.0
mean,3.111732,9.505618,29.849162,30.513966,17.933333
std,1.695608,7.615218,6.097028,18.419383,3.325872
min,1.0,1.0,14.0,4.0,4.0
25%,2.0,4.0,25.5,18.0,17.933333
50%,3.0,7.0,30.0,25.0,17.933333
75%,3.0,12.5,35.0,41.5,17.933333
max,13.0,34.0,44.0,111.0,52.0


In [91]:
# Check the balance of our target values
y["AQI_Max"].value_counts()

##We have imbalanced data.  Have significantly more observations in the class Safe.  Therefore we apply resampling
#techniques below

Safe      148
Unsafe     31
Name: AQI_Max, dtype: int64

In [92]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

## Resampling


### Oversampling

#### Naive Random Oversampling

In [93]:
# Resample the training data with the RandomOversampler

from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'AQI_Max': 1})

In [94]:
# Train the Logistic Regression model using the resampled data

from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [95]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
print(balanced_accuracy_score(y_test, y_pred))

0.9714285714285714


In [96]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,33,2
Actual 1,0,10


In [97]:
cm_df = pd.DataFrame(confusion_matrix(y_test, y_pred),
     index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [98]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       Safe       1.00      0.94      1.00      0.97      0.97      0.94        35
     Unsafe       0.83      1.00      0.94      0.91      0.97      0.95        10

avg / total       0.96      0.96      0.99      0.96      0.97      0.94        45



#### SMOTE Oversampling

In [99]:
# Resample the training data with SMOTE

from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,sampling_strategy='auto').\
                            fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'AQI_Max': 1})

In [100]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [101]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
print(balanced_accuracy_score(y_test, y_pred))

0.9214285714285715


In [102]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,33,2
Actual 1,1,9


In [103]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       Safe       0.97      0.94      0.90      0.96      0.92      0.85        35
     Unsafe       0.82      0.90      0.94      0.86      0.92      0.84        10

avg / total       0.94      0.93      0.91      0.93      0.92      0.85        45



#### Undersampling

In [104]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete

from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'AQI_Max': 1})

In [105]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [106]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
print(balanced_accuracy_score(y_test, y_pred))

0.9714285714285714


In [107]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,33,2
Actual 1,0,10


In [108]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       Safe       1.00      0.94      1.00      0.97      0.97      0.94        35
     Unsafe       0.83      1.00      0.94      0.91      0.97      0.95        10

avg / total       0.96      0.96      0.99      0.96      0.97      0.94        45



#### Combination (Over and Under) Sampling


In [109]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete

from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

Counter({'AQI_Max': 1})

In [110]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [111]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
print(balanced_accuracy_score(y_test, y_pred))

0.9571428571428571


In [112]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,32,3
Actual 1,0,10


In [113]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       Safe       1.00      0.91      1.00      0.96      0.96      0.91        35
     Unsafe       0.77      1.00      0.91      0.87      0.96      0.92        10

avg / total       0.95      0.93      0.98      0.94      0.96      0.91        45



## Ensemble Learners

### Balanced Random Forest Classifier

In [114]:
# Resample the training data with the BalancedRandomForestClassifier
# Create a random forest classifier.
rf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

# Fitting the model
rf_model.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [115]:
# Calculated the balanced accuracy score
predictions = rf_model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, predictions)
acc_score

0.9714285714285714

In [116]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,33,2
Actual 1,0,10


In [117]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

       Safe       1.00      0.94      1.00      0.97      0.97      0.94        35
     Unsafe       0.83      1.00      0.94      0.91      0.97      0.95        10

avg / total       0.96      0.96      0.99      0.96      0.97      0.94        45



In [118]:
# List the features sorted in descending order by feature importance

# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
#importances


# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.7473556737022067, 'aqi_pm_2_5'),
 (0.08974938799724547, 'aqi_co'),
 (0.07882665386221087, 'aqi_o3'),
 (0.0728319700814875, 'aqi_no2'),
 (0.011236314356849556, 'aqi_pm_10')]

### Easy Ensemble AdaBoost Classifier

In [119]:
# Train the EasyEnsembleClassifier
ee_model = EasyEnsembleClassifier(n_estimators=100, random_state=1)

# Fitting the model
ee_model.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [120]:
# Calculated the balanced accuracy score
y_pred = ee_model.predict(X_test)
print(balanced_accuracy_score(y_test, y_pred))

0.9714285714285714


In [121]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,33,2
Actual 1,0,10


In [122]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       Safe       1.00      0.94      1.00      0.97      0.97      0.94        35
     Unsafe       0.83      1.00      0.94      0.91      0.97      0.95        10

avg / total       0.96      0.96      0.99      0.96      0.97      0.94        45



# Models Performance Summary

In [125]:
Summary_AQI_Max_Prediction_Safe = pd.DataFrame(
    {"Naive Random Oversampling":{'AccuracyScore': '97%', 'Precision': 1, "Recall":0.94,"F1-Score":0.97},
      "SMOTE Oversampling":{'AccuracyScore': '92%', 'Precision': 0.97, "Recall":0.94,"F1-Score":0.96},
     "ClusterCentroids":{'AccuracyScore': '97%', 'Precision': 1, "Recall":0.94,"F1-Score":0.97},
     "SMOTEEN":{'AccuracyScore': '96%', 'Precision': 1, "Recall":0.91,"F1-Score":0.96},
     "Random Forest Classifier":{'AccuracyScore': '97%', 'Precision': 1, "Recall":0.94,"F1-Score":0.97},
     "Easy Ensemble Classifier -AdaBoost":{'AccuracyScore': '97%', 'Precision': 1, "Recall":0.94,"F1-Score":0.97}
     })
    
Summary_AQI_Max_Prediction_Safe

Unnamed: 0,Naive Random Oversampling,SMOTE Oversampling,ClusterCentroids,SMOTEEN,Random Forest Classifier,Easy Ensemble Classifier -AdaBoost
AccuracyScore,97%,92%,97%,96%,97%,97%
Precision,1,0.97,1,1,1,1
Recall,0.94,0.94,0.94,0.91,0.94,0.94
F1-Score,0.97,0.96,0.97,0.96,0.97,0.97


In [126]:
Summary_AQI_Max_Prediction_Unsafe = pd.DataFrame(
    {"Naive Random Oversampling":{'AccuracyScore': '97%', 'Precision': 0.83, "Recall":1,"F1-Score":0.91},
      "SMOTE Oversampling":{'AccuracyScore': '92%', 'Precision': 0.82, "Recall":0.90,"F1-Score":0.93},
     "ClusterCentroids":{'AccuracyScore': '97%', 'Precision': 0.83, "Recall":1,"F1-Score":0.91},
     "SMOTEEN":{'AccuracyScore': '96%', 'Precision': 0.77, "Recall":1,"F1-Score":0.87},
     "Random Forest Classifier":{'AccuracyScore': '97%', 'Precision': 0.83, "Recall":1,"F1-Score":0.91},
     "Easy Ensemble Classifier -AdaBoost":{'AccuracyScore': '97%', 'Precision': 0.83, "Recall":1,"F1-Score":0.91}
     })
    
Summary_AQI_Max_Prediction_Unsafe

Unnamed: 0,Naive Random Oversampling,SMOTE Oversampling,ClusterCentroids,SMOTEEN,Random Forest Classifier,Easy Ensemble Classifier -AdaBoost
AccuracyScore,97%,92%,97%,96%,97%,97%
Precision,0.83,0.82,0.83,0.77,0.83,0.83
Recall,1,0.9,1,1,1,1
F1-Score,0.91,0.93,0.91,0.87,0.91,0.91
