## Vanilla Classification on Full Dataset
This notebook contains the first vanilla run of several classification models on the full dataset.

In [27]:
# import libraries
import pandas as pd
import xarray as xr
import numpy as np
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
import sklearn.metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from yellowbrick.classifier import ConfusionMatrix

### Data Preprocessing

In [None]:
# Load the merged NetCDF file and flatten it into a pandas DataFrame.
# reset_index() moves the index levels into ordinary columns
# (later cells read 'lat' and 'lon' as columns).
data = xr.open_mfdataset('final_merge.nc').to_dataframe().reset_index()

In [None]:
# Check class imbalance: number of rows per distinct value of the 'label' column
data['label'].value_counts()

In [None]:
# Separate the target, the spatial coordinates and the feature matrix.
# The coordinates are kept aside (not used as features) so they can be merged
# back at the end or used for feature engineering later.
y = data['label']
coord = data[['lat', 'lon']]
X = data.drop(columns=['label', 'lat', 'lon'])

In [None]:
# Hold out 20% of the rows as the test split.
# random_state is fixed so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1,
)

In [None]:
# Logistic-regression pipeline: standardise the features, then fit the classifier.
# The step names ('scale', 'lr') are the prefixes scikit-learn uses when
# addressing a step's parameters, e.g. 'lr__C'.
logistic = Pipeline(
    steps=[
        ('scale', StandardScaler()),
        ('lr', LogisticRegression(max_iter=1000)),
    ]
)

In [None]:
# Hyper-parameter grid for the `logistic` pipeline.
# Keys follow scikit-learn's '<step name>__<parameter>' convention. The
# classifier step of the pipeline is named 'lr', so its regularisation strength
# is addressed as 'lr__C'; a key such as 'logistic__C' matches no step and
# would be rejected as an invalid parameter by GridSearchCV.
# np.logspace(-4, 4, 4) yields four values from 1e-4 to 1e4, evenly spaced on
# a log scale.
log_param_grid = {
    'lr__C': np.logspace(-4, 4, 4),
}

In [None]:
# Standardise the features.
# The scaler learns its statistics from the training split only; the same
# fitted scaler is then applied to both splits.
# Note: the scaled outputs are named x_train / x_test (lower case), distinct
# from the unscaled X_train / X_test.
scaler = StandardScaler()
scaler.fit(X_train)
x_train = scaler.transform(X_train)
x_test = scaler.transform(X_test)

In [None]:
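# TODO: decide whether y needs one-hot encoding (the LogisticRegression fit below is given y_train as class labels directly)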

### Multiclass Logistic Regression

In [None]:
# Fit a logistic-regression classifier on the scaled training features,
# then predict the labels of those same training rows.
lg = LogisticRegression(max_iter=10000).fit(x_train, y_train)
y_pred_train = lg.predict(x_train)

In [None]:
# Accuracy on the training split.
# The import cell only brings in the `sklearn.metrics` module, so the metric is
# called through the module; a bare `accuracy_score` name is undefined.
# TODO: balanced_accuracy_score was reported as failing to import; presumably
# the installed scikit-learn predates it -- confirm the version.
sklearn.metrics.accuracy_score(y_train, y_pred_train)

In [None]:
# Predict on the held-out test split and report its accuracy.
# The metric is called through the imported `sklearn.metrics` module because
# the bare name `accuracy_score` is not imported by the import cell.
y_pred_test = lg.predict(x_test)
sklearn.metrics.accuracy_score(y_test, y_pred_test)

In [None]:
# Confusion matrix for the fitted logistic-regression model `lg`.
# Despite the `cm_train` name, it is scored on the test split (x_test, y_test).
# NOTE(review): the `classes` labels are listed in a hand-written order; confirm
# it matches the order yellowbrick expects (presumably the sorted order of the
# fitted classes), otherwise the matrix axes could be mislabelled.
cm_train = ConfusionMatrix(lg, classes =['fwi_low', 'fwi_high', 'fwi_veryhigh', 'fwi_moderate', 'fwi_extreme'])
cm_train.score(x_test, y_test)
cm_train.show()