In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE


In [2]:
# Connect to database: load the connection settings from a local CSV file.

import pandas as pd

conn_path = "/Users/megsmacbookpro/Desktop/final_project/final_project/Resources/db_conn_info.csv"

conn_info = pd.read_csv(conn_path)

# Pull each setting from the row labelled 0, one CSV column per setting.
db_name, db_pw, db_host, db_port, db_user = (
    conn_info.at[0, column]
    for column in ("db_name", "password", "host", "port", "user")
)

In [3]:
from urllib.parse import quote

from sqlalchemy import create_engine

# SQLAlchemy database URLs have the form
#     dialect://username:password@host:port/database
# The previous URL placed db_name in the username slot and db_user in the
# database slot; that only connects when the two values happen to be equal.
# NOTE(review): confirm the 'user' and 'db_name' columns of the CSV hold what
# their names say before relying on this ordering.
#
# The username and password are percent-encoded so that characters such as
# '@', '/' or ':' inside the credentials cannot corrupt URL parsing.
engine = create_engine(
    f"postgresql://{quote(str(db_user), safe='')}:{quote(str(db_pw), safe='')}"
    f"@{db_host}:{db_port}/{db_name}"
)

In [20]:
# Read every row of the model_dataset table into a DataFrame.
df = pd.read_sql(
    sql="select * from model_dataset",
    con=engine,
)

# Preview the first rows.
df.head()

Unnamed: 0,index,FIPS,recent_trend,PM25_max_pred,PM25_med_pred,PM25_mean_pred,O3_max_pred,O3_med_pred,O3_mean_pred,PM25_max_pred_2001,...,O3_med_pred_2005,O3_med_pred_2006,O3_med_pred_2007,O3_med_pred_2008,O3_med_pred_2009,O3_med_pred_2010,O3_med_pred_2011,O3_med_pred_2012,O3_med_pred_2013,O3_med_pred_2014
0,0,1101,falling,12.555164,12.013325,11.958884,41.874165,39.939759,39.977164,13.365598,...,40.567937,43.333084,43.589498,40.750847,36.326936,41.478252,41.055674,39.470118,36.512861,37.457633
1,1,1103,stable,13.287901,12.68461,12.647047,43.30817,41.90169,41.888594,15.564124,...,43.22668,44.78666,46.47012,42.158683,38.435499,42.927004,42.164915,41.93758,38.47392,38.021493
2,2,1105,stable,11.426605,11.07215,11.079387,39.805057,38.986625,39.022229,12.76843,...,40.612558,43.447473,43.062623,38.970321,34.915892,39.396689,38.999311,37.908802,35.517161,35.654156
3,3,1107,stable,11.594499,11.146401,11.155309,39.979953,39.173487,39.137133,13.19924,...,41.574587,43.677365,42.648266,38.709791,35.148054,39.154417,38.971029,38.468993,36.022568,35.750203
4,4,1109,stable,11.352271,10.939546,10.932512,40.690546,39.856677,39.866958,12.164619,...,40.58995,43.572161,43.102934,40.4639,36.734288,41.022877,40.996091,38.420984,36.382191,37.12724


In [5]:
# Number of rows loaded from the table.
df.shape[0]

2096

In [6]:
# Target: the trend label of each row.
y = df['recent_trend']
# Features: every remaining column except the 'index' and 'FIPS' columns.
X = df.drop(columns=['index', 'FIPS', 'recent_trend'])

In [7]:
# Class balance of the target: number of rows per recent_trend label.
y.value_counts()

stable     1922
falling     136
rising       38
Name: recent_trend, dtype: int64

In [58]:
# Stratified train/test split: stratify=y keeps the class proportions of y
# in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=142,
)

# Fit the scaler on the training rows only, then apply that same
# transformation to both the training and the test features.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(subset) for subset in (X_train, X_test)
)


In [59]:
# Resample the scaled training data with SMOTE (the test data is left untouched).
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

# Show the per-class counts after resampling.
print(f'y_resampled: {Counter(y_resampled)}')


y_resampled: Counter({'stable': 1441, 'falling': 1441, 'rising': 1441})


In [62]:
from imblearn.ensemble import BalancedRandomForestClassifier

# Train a 120-estimator balanced random forest on the resampled training data.
brc = BalancedRandomForestClassifier(
    n_estimators=120,
    random_state=142,
).fit(X_resampled, y_resampled)

In [63]:
# Predict labels for the scaled test rows and for the scaled (un-resampled)
# training rows.
y_pred, y_pred_train = (
    brc.predict(features) for features in (X_test_scaled, X_train_scaled)
)

In [64]:
# Balanced accuracy on the training split.
balanced_accuracy_score(y_true=y_train, y_pred=y_pred_train)


1.0

In [66]:
# Balanced accuracy on the held-out test split.
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.8057061586473351

In [67]:
# Confusion matrix for the test split: rows are the actual classes and
# columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[ 30,   0,   4],
       [  0,   5,   4],
       [  8,   2, 471]])

In [68]:
# Table of predicted vs. actual labels for the test split; the index taken
# from y_test is discarded so the rows are renumbered from 0.
brc_df = (
    pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
    .reset_index(drop=True)
)
brc_df

Unnamed: 0,Prediction,Actual
0,stable,stable
1,stable,stable
2,stable,stable
3,stable,stable
4,stable,stable
...,...,...
519,stable,stable
520,stable,stable
521,stable,stable
522,stable,stable


In [69]:
# How many test rows were predicted as each class.
print(brc_df["Prediction"].value_counts())

stable     479
falling     38
rising       7
Name: Prediction, dtype: int64


In [70]:
# How many test rows actually belong to each class.
print(brc_df["Actual"].value_counts())

stable     481
falling     34
rising       9
Name: Actual, dtype: int64


In [71]:
# Per-class report of the test-split predictions from imbalanced-learn.
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

                   pre       rec       spe        f1       geo       iba       sup

    falling       0.79      0.88      0.98      0.83      0.93      0.86        34
     rising       0.71      0.56      1.00      0.63      0.74      0.53         9
     stable       0.98      0.98      0.81      0.98      0.89      0.81       481

avg / total       0.97      0.97      0.83      0.97      0.89      0.81       524

