## Scenario 2: 08/09/2020 - 10/06/2020 
## 15% Reduction in Human Activity - Minimal Risk Tier Level

## Import Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import LabelEncoder

from sqlalchemy import create_engine
import psycopg2

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced


from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier

## Read the Data from Postgres

In [4]:
pip install mysql-connector-python

Note: you may need to restart the kernel to use updated packages.


In [5]:
#Create a connection to the Postgres & AWS

# Import API key
from config import db_password

connection = psycopg2.connect(
    host = 'finalproject-ml.cvm9etk63tz8.us-west-1.rds.amazonaws.com',
    port = 5432,
    user = 'postgres',
    password = db_password,
    database = 'postgres'
    )
cursor=connection.cursor()

In [6]:
#Read in Table for co_2015
aqi_all = "select * from aqi_all"

In [7]:
#Check DataFrame
aqi_all = pd.read_sql(aqi_all, con=connection)
aqi_all.head()

Unnamed: 0,Date,aqi_co,aqi_no2,aqi_o3,aqi_pm_2_5,aqi_pm_10
0,2015-01-01,7.0,37.0,26.0,55.0,
1,2015-01-02,8.0,42.0,20.0,73.0,
2,2015-01-03,10.0,45.0,17.0,84.0,
3,2015-01-04,10.0,40.0,19.0,86.0,
4,2015-01-05,9.0,42.0,12.0,69.0,


## Prepare Data

In [8]:
# Convert the Date column to type DATETIME
aqi_all['Date'] = pd.to_datetime(aqi_all['Date'])

In [9]:
# Filter Dates to create scenario #2
start_dateE2 = "2020-8-09"
end_dateE2 = "2020-10-06"

after_start_dateE2 = aqi_all["Date"] >= start_dateE2
before_end_dateE2 = aqi_all["Date"] <= end_dateE2
between_two_datesE2 = after_start_dateE2 & before_end_dateE2
df = aqi_all.loc[between_two_datesE2]
df.shape

(59, 6)

In [10]:
df.tail()

Unnamed: 0,Date,aqi_co,aqi_no2,aqi_o3,aqi_pm_2_5,aqi_pm_10
2101,2020-10-02,11.0,32.0,33.0,126.0,
2102,2020-10-03,7.0,18.0,30.0,55.0,
2103,2020-10-04,3.0,5.0,27.0,34.0,
2104,2020-10-05,6.0,17.0,25.0,36.0,
2105,2020-10-06,6.0,13.0,32.0,54.0,19.0


In [11]:
# Drop Column Date
df = df.drop(columns=["Date"])

In [12]:
# Create AQI_Max Column
df["AQI_Max"] = pd.DataFrame(df.max(axis=1))

In [13]:
df.head()

Unnamed: 0,aqi_co,aqi_no2,aqi_o3,aqi_pm_2_5,aqi_pm_10,AQI_Max
2047,2.0,3.0,23.0,25.0,,25.0
2048,2.0,6.0,25.0,29.0,,29.0
2049,2.0,4.0,25.0,18.0,,25.0
2050,2.0,6.0,23.0,18.0,,23.0
2051,5.0,18.0,31.0,53.0,,53.0


In [14]:
df.shape

(59, 6)

In [15]:
df.shape

(59, 6)

In [16]:
# Summary statistics
df.describe()

Unnamed: 0,aqi_co,aqi_no2,aqi_o3,aqi_pm_2_5,aqi_pm_10,AQI_Max
count,59.0,59.0,59.0,59.0,5.0,59.0
mean,6.288136,12.830508,28.423729,64.779661,37.0,66.016949
std,3.952524,8.979195,7.513673,45.317679,25.612497,44.11251
min,2.0,3.0,10.0,7.0,12.0,10.0
25%,3.0,6.0,23.5,34.5,19.0,35.5
50%,6.0,9.0,27.0,55.0,28.0,55.0
75%,7.0,16.5,32.0,75.0,52.0,75.0
max,18.0,37.0,51.0,198.0,74.0,198.0


In [17]:
# Count of AQI_Max where it is Unsafe
print(df[(df.AQI_Max > 50)].count())

aqi_co        33
aqi_no2       33
aqi_o3        33
aqi_pm_2_5    33
aqi_pm_10      4
AQI_Max       33
dtype: int64


In [18]:
# Count of AQI_Max where it is Safe
print(df[(df.AQI_Max <= 50)].count())

aqi_co        26
aqi_no2       26
aqi_o3        26
aqi_pm_2_5    26
aqi_pm_10      1
AQI_Max       26
dtype: int64


In [19]:
df.dtypes

aqi_co        float64
aqi_no2       float64
aqi_o3        float64
aqi_pm_2_5    float64
aqi_pm_10     float64
AQI_Max       float64
dtype: object

In [20]:
#Handle missing values
df.isnull().sum()

aqi_co         0
aqi_no2        0
aqi_o3         0
aqi_pm_2_5     0
aqi_pm_10     54
AQI_Max        0
dtype: int64

In [21]:
# Replace missing values with the mean of the column
df_column = ['aqi_co', 'aqi_no2', 'aqi_o3', 'aqi_pm_2_5', 'aqi_pm_10', 'AQI_Max']

for i in df_column:
    a = df[i].mean()
    df[i].replace(np.nan , a,inplace = True)

In [22]:
df.isnull().sum()

aqi_co        0
aqi_no2       0
aqi_o3        0
aqi_pm_2_5    0
aqi_pm_10     0
AQI_Max       0
dtype: int64

In [23]:
# Convert the target column values to safe and unsafe based on their values

for aqi in df['AQI_Max']:
    if (aqi > float(50)):
        df['AQI_Max']=df['AQI_Max'].replace(aqi, "Unsafe")
    else:
        df['AQI_Max']=df['AQI_Max'].replace(aqi, "Safe")

In [24]:
df.head()

Unnamed: 0,aqi_co,aqi_no2,aqi_o3,aqi_pm_2_5,aqi_pm_10,AQI_Max
2047,2.0,3.0,23.0,25.0,37.0,Safe
2048,2.0,6.0,25.0,29.0,37.0,Safe
2049,2.0,4.0,25.0,18.0,37.0,Safe
2050,2.0,6.0,23.0,18.0,37.0,Safe
2051,5.0,18.0,31.0,53.0,37.0,Unsafe


In [25]:
df.dtypes

aqi_co        float64
aqi_no2       float64
aqi_o3        float64
aqi_pm_2_5    float64
aqi_pm_10     float64
AQI_Max        object
dtype: object

In [26]:
columns = ["aqi_co", "aqi_no2", "aqi_o3", "aqi_pm_2_5", "aqi_pm_10"]

target = ["AQI_Max"]

In [27]:
df.head()

Unnamed: 0,aqi_co,aqi_no2,aqi_o3,aqi_pm_2_5,aqi_pm_10,AQI_Max
2047,2.0,3.0,23.0,25.0,37.0,Safe
2048,2.0,6.0,25.0,29.0,37.0,Safe
2049,2.0,4.0,25.0,18.0,37.0,Safe
2050,2.0,6.0,23.0,18.0,37.0,Safe
2051,5.0,18.0,31.0,53.0,37.0,Unsafe


## Split the Data into Training and Testing

In [28]:
# Create our features
X = df.drop(columns='AQI_Max')

#clean features
X = pd.get_dummies(X)

# Create our target
y = df.loc[:, target].copy()
X.head()

Unnamed: 0,aqi_co,aqi_no2,aqi_o3,aqi_pm_2_5,aqi_pm_10
2047,2.0,3.0,23.0,25.0,37.0
2048,2.0,6.0,25.0,29.0,37.0
2049,2.0,4.0,25.0,18.0,37.0
2050,2.0,6.0,23.0,18.0,37.0
2051,5.0,18.0,31.0,53.0,37.0


In [29]:
X.describe()

Unnamed: 0,aqi_co,aqi_no2,aqi_o3,aqi_pm_2_5,aqi_pm_10
count,59.0,59.0,59.0,59.0,59.0
mean,6.288136,12.830508,28.423729,64.779661,37.0
std,3.952524,8.979195,7.513673,45.317679,6.726171
min,2.0,3.0,10.0,7.0,12.0
25%,3.0,6.0,23.5,34.5,37.0
50%,6.0,9.0,27.0,55.0,37.0
75%,7.0,16.5,32.0,75.0,37.0
max,18.0,37.0,51.0,198.0,74.0


In [30]:
# Check the balance of our target values
y["AQI_Max"].value_counts()

##We have imbalanced data.  Have significantly more observations in the class Safe.  Therefore we apply resampling
#techniques below

Unsafe    33
Safe      26
Name: AQI_Max, dtype: int64

In [31]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

## Resampling


### Oversampling

#### Naive Random Oversampling

In [32]:
# Resample the training data with the RandomOversampler

from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'AQI_Max': 1})

In [33]:
# Train the Logistic Regression model using the resampled data

from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [34]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
print(balanced_accuracy_score(y_test, y_pred))

1.0


In [35]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,8,0
Actual 1,0,7


In [36]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       Safe       1.00      1.00      1.00      1.00      1.00      1.00         8
     Unsafe       1.00      1.00      1.00      1.00      1.00      1.00         7

avg / total       1.00      1.00      1.00      1.00      1.00      1.00        15



#### SMOTE Oversampling

In [37]:
# Resample the training data with SMOTE

from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,sampling_strategy='auto').\
                            fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'AQI_Max': 1})

In [38]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [39]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
print(balanced_accuracy_score(y_test, y_pred))

1.0


In [40]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,8,0
Actual 1,0,7


In [41]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       Safe       1.00      1.00      1.00      1.00      1.00      1.00         8
     Unsafe       1.00      1.00      1.00      1.00      1.00      1.00         7

avg / total       1.00      1.00      1.00      1.00      1.00      1.00        15



#### Undersampling

In [42]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete

from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'AQI_Max': 1})

In [43]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [44]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
print(balanced_accuracy_score(y_test, y_pred))

1.0


In [45]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,8,0
Actual 1,0,7


In [46]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       Safe       1.00      1.00      1.00      1.00      1.00      1.00         8
     Unsafe       1.00      1.00      1.00      1.00      1.00      1.00         7

avg / total       1.00      1.00      1.00      1.00      1.00      1.00        15



#### Combination (Over and Under) Sampling


In [47]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete

from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

Counter({'AQI_Max': 1})

In [48]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [49]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
print(balanced_accuracy_score(y_test, y_pred))

1.0


In [50]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,8,0
Actual 1,0,7


In [51]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       Safe       1.00      1.00      1.00      1.00      1.00      1.00         8
     Unsafe       1.00      1.00      1.00      1.00      1.00      1.00         7

avg / total       1.00      1.00      1.00      1.00      1.00      1.00        15



## Ensemble Learners

### Balanced Random Forest Classifier

In [52]:
# Resample the training data with the BalancedRandomForestClassifier
# Create a random forest classifier.
rf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

# Fitting the model
rf_model.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [53]:
# Calculated the balanced accuracy score
predictions = rf_model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, predictions)
acc_score

1.0

In [54]:
# Display the confusion matrix

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,8,0
Actual 1,0,7


In [55]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

       Safe       1.00      1.00      1.00      1.00      1.00      1.00         8
     Unsafe       1.00      1.00      1.00      1.00      1.00      1.00         7

avg / total       1.00      1.00      1.00      1.00      1.00      1.00        15



In [56]:
# List the features sorted in descending order by feature importance

# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
#importances


# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.6117439140227342, 'aqi_pm_2_5'),
 (0.22714804501082247, 'aqi_co'),
 (0.1091280073179922, 'aqi_o3'),
 (0.0496980844315175, 'aqi_no2'),
 (0.0022819492169337363, 'aqi_pm_10')]

### Easy Ensemble AdaBoost Classifier

In [57]:
# Train the EasyEnsembleClassifier
ee_model = EasyEnsembleClassifier(n_estimators=100, random_state=1)

# Fitting the model
ee_model.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [58]:
# Calculated the balanced accuracy score
y_pred = ee_model.predict(X_test)
print(balanced_accuracy_score(y_test, y_pred))

1.0


In [59]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,8,0
Actual 1,0,7


In [60]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

       Safe       1.00      1.00      1.00      1.00      1.00      1.00         8
     Unsafe       1.00      1.00      1.00      1.00      1.00      1.00         7

avg / total       1.00      1.00      1.00      1.00      1.00      1.00        15



# Models Performance Summary