## Vanilla Classification on Full Dataset
This notebook contains the first vanilla run of several classification models on the full dataset. 

In [1]:
# import libraries
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
import sklearn.metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from joblib import dump, load

from yellowbrick.classifier import ConfusionMatrix



### Data Preprocessing

In [2]:
# Load the full dataset from the parquet file into a pandas DataFrame,
# using pyarrow as the parquet engine.
dataset_v1 = pd.read_parquet(
    'dataset_v1.parquet.gzip',
    engine='pyarrow',
)

In [3]:
# Show the on-disk size of the parquet file (IPython shell escape).
!ls -lh dataset_v1.parquet.gzip

-rw-r--r-- 1 jupyter jupyter 220M Jan 17 18:14 dataset_v1.parquet.gzip


In [4]:
# Sanity check: display the first five rows to confirm the file was read correctly
dataset_v1.head(n=5)

Unnamed: 0,lat,lon,MaxFRP,et_500m,_1_km_16_days_EVI,_1_km_16_days_EVI2,_1_km_16_days_NDVI,_1_km_16_days_NIR_reflectance,_1_km_16_days_SWIR1_reflectance,_1_km_16_days_SWIR2_reflectance,...,_1_km_16_days_blue_reflectance,_1_km_16_days_green_reflectance,_1_km_16_days_red_reflectance,_DC,_DMC,_FFMC,_ISI,_BUI,_DSR,label
0,4.3625,96.329167,0.0,3276.5,0.611,0.6071,0.846,0.3885,0.349,0.1735,...,0.0168,0.0518,0.0324,6.153957,1.339365,43.675632,0.137625,1.721346,0.000183,fwi_low
1,4.3625,96.3375,0.0,3276.5,0.5452,0.5392,0.8948,0.3081,0.2868,0.135,...,0.0102,0.0379,0.0171,6.153957,1.339365,43.675632,0.137625,1.721346,0.000183,fwi_low
2,4.3625,96.345833,0.0,3276.5,0.4867,0.4878,0.6774,0.3734,0.3358,0.1938,...,0.034,0.0663,0.0718,6.153957,1.339365,43.675632,0.137625,1.721346,0.000183,fwi_low
3,4.3625,96.354167,0.0,33.700001,0.6779,0.6552,0.9191,0.3916,0.382,0.1855,...,0.0143,0.0471,0.0165,6.153957,1.339365,43.675632,0.137625,1.721346,0.000183,fwi_low
4,4.3625,96.3625,0.0,27.0,0.6779,0.6552,0.9191,0.3916,0.382,0.1855,...,0.0143,0.0471,0.0165,6.153957,1.339365,43.675632,0.137625,1.721346,0.000183,fwi_low


In [20]:
# Count rows per label to see how imbalanced the classes are
dataset_v1.loc[:, 'label'].value_counts()

water           30991646
fwi_extreme      9308511
burned           7813581
fwi_low          3031671
fwi_veryhigh      610905
fwi_high          410397
land              351369
fwi_moderate      211015
active_fire         1353
Name: label, dtype: int64

In [10]:
# Names of the dataset columns used as model features (18 in total).
# 'lat', 'lon' and 'label' are deliberately not part of this list.
col = [
    'MaxFRP',
    'et_500m',
    '_1_km_16_days_EVI',
    '_1_km_16_days_EVI2',
    '_1_km_16_days_NDVI',
    '_1_km_16_days_NIR_reflectance',
    '_1_km_16_days_SWIR1_reflectance',
    '_1_km_16_days_SWIR2_reflectance',
    '_1_km_16_days_SWIR3_reflectance',
    '_1_km_16_days_blue_reflectance',
    '_1_km_16_days_green_reflectance',
    '_1_km_16_days_red_reflectance',
    '_DC',
    '_DMC',
    '_FFMC',
    '_ISI',
    '_BUI',
    '_DSR',
]

In [11]:
# Separate the feature matrix (X) from the target labels (y).
# The lat/lon coordinates are not part of X; they need to be saved AFTER the
# train/test split so that predictions can be merged back onto them at the end.

X = dataset_v1.loc[:, col]
y = dataset_v1.loc[:, 'label']

# Predicting categories: ['fwi_low', 'burned', 'water', 'land', 'active_fire', 'fwi_moderate', 'fwi_high', 'fwi_veryhigh', 'fwi_extreme']

In [23]:
# Stratified 80/20 train/test split with a fixed random seed, so every label
# keeps the same proportion in both splits and the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1,
    stratify=y,
)

In [27]:
# Preprocessing steps

# Standardize the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
x_train = scaler.fit_transform(X_train)  # fit on training rows, then transform them
x_test = scaler.transform(X_test)        # reuse the statistics fitted on the training rows

# No manual label binarization: that is part of the OneVsRestClassifier implementation

### Multiclass Logistic Regression

In [28]:
# Base binary estimator: L1-penalised logistic regression fitted with the
# 'saga' solver; n_jobs=-1 asks scikit-learn to use all available cores.
lr = LogisticRegression(
    penalty='l1',
    solver='saga',
    n_jobs=-1,
)

In [29]:
# Wrap the base estimator in a one-vs-rest scheme and fit it on the scaled
# training split.
# NOTE(review): the recorded output of this cell is
# "OSError: [Errno 12] Cannot allocate memory", i.e. the fit on the full
# dataset did not complete in the saved run.
ovr_classifier = OneVsRestClassifier(lr)
trained_model = ovr_classifier.fit(x_train, y_train)

OSError: [Errno 12] Cannot allocate memory

In [None]:
# Persist the fitted one-vs-rest model to disk with joblib
dump(value=trained_model, filename='lg_v1_ovr.joblib')

In [None]:
# Predict labels for the scaled training split.
# The fitted estimator in this notebook is `trained_model`; the previous name
# `lg` is never defined, so the original call raised NameError.
y_pred_train = trained_model.predict(x_train)

In [None]:
# Plain accuracy on the training split.
# Only `import sklearn.metrics` is in scope in this notebook, so the metric is
# called through the module (a bare `accuracy_score` raises NameError).
# NOTE(review): balanced_accuracy_score reportedly failed to import here; it
# would be the more informative metric given the class imbalance — presumably
# the installed scikit-learn predates it, confirm the version.
sklearn.metrics.accuracy_score(y_train, y_pred_train)

In [None]:
# Predict labels for the scaled test split and report plain accuracy.
# Uses `trained_model` (the fitted estimator; `lg` is never defined) and the
# module-qualified metric, since only `import sklearn.metrics` is in scope.
y_pred_test = trained_model.predict(x_test)
sklearn.metrics.accuracy_score(y_test, y_pred_test)

In [None]:
# Confusion matrix for the fitted one-vs-rest model, scored on the test split
# (despite the `cm_train` name). `trained_model` replaces the undefined `lg`.
# NOTE(review): `classes` lists only the five fwi_* labels, while the target
# has nine labels — confirm whether restricting to this subset is intended and
# supported by the installed yellowbrick version.
cm_train = ConfusionMatrix(
    trained_model,
    classes=['fwi_low', 'fwi_high', 'fwi_veryhigh', 'fwi_moderate', 'fwi_extreme'],
)
cm_train.score(x_test, y_test)
cm_train.show()

### Check Trained Model

In [12]:
# Load a previously saved model from disk with joblib.
# NOTE(review): this reads 'lg_v1.joblib', whereas the save cell in this
# notebook writes 'lg_v1_ovr.joblib' — confirm which artifact is intended.
# joblib files are pickle-based, so only load files from a trusted source.
lg_v1 = load(filename='lg_v1.joblib')

In [13]:
# Predict a label for every row of the full feature matrix X.
# NOTE(review): X is passed unscaled here, while the model fitted in this
# notebook is trained on StandardScaler output — confirm that lg_v1 expects
# raw features (e.g. that it scales internally) before trusting these labels.
y_pred_all = lg_v1.predict(X)

In [15]:
# Start the prediction table from the coordinate columns of the full dataset
coord_cols = ['lat', 'lon']
dataset_v1_pred = dataset_v1.loc[:, coord_cols]

In [16]:
# Attach the predicted labels as a 'y_pred' column; values are assigned by
# position, so y_pred_all must hold one prediction per row.
dataset_v1_pred.loc[:, 'y_pred'] = y_pred_all

In [19]:
# Count how many rows were assigned to each predicted label
dataset_v1_pred.loc[:, 'y_pred'].value_counts()

water           35031498
fwi_extreme      9627363
burned           7620688
fwi_low           288418
fwi_veryhigh      162481
Name: y_pred, dtype: int64

In [None]:
# Encode each predicted label as an integer code in a new 'map' column.
# Any label not listed below gets the default code 0 (the same code as 'water').
label_to_code = {
    'water': 0,
    'fwi_extreme': 4,
    'burned': 2,
    'fwi_low': 1,
    'fwi_veryhigh': 3,
}
dataset_v1_pred['map'] = (
    dataset_v1_pred['y_pred']
    .map(label_to_code)   # unlisted labels become NaN ...
    .fillna(0)            # ... and fall back to the default code
    .astype('int64')
)

In [None]:
# Intended to export the prediction table as NetCDF.
# NOTE(review): dataset_v1_pred is a pandas DataFrame, and pandas does not
# appear to provide a `to_netcdf4` method — NetCDF export usually goes through
# xarray (e.g. DataFrame.to_xarray() followed by Dataset.to_netcdf(path)).
# The output path is also still an empty string. TODO: confirm the approach
# and fill in the target filename.
dataset_v1_pred.to_netcdf4('')