In [1]:
import sys
sys.path.insert(1, './imports')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from exploratory_analysis import *
from preprocessing import *
from model_selection import *
from model_end_to_end import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVC
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
pd.options.display.max_rows = 30

# X $\in$ [1, 5]

## Classifier

In [2]:
fires_jun_15 = pd.read_csv("fires_jun_dec_xy15.csv")
drop_cols = ['Unnamed: 0', 'area_bool']
X = fires_jun_15.drop(drop_cols, axis=1)
y = fires_jun_15.loc[:,"area_bool"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=3558)
X_test_with_area = X_test.copy()
X_test = X_test.drop("area", axis=1)
X_with_area = X.copy()
X = X.drop("area", axis=1)
X_train_with_area = X_train.copy()
X_train = X_train.drop("area", axis=1)
filter_15 = y==1

In [3]:
areas_burnt_kernels, test_accuracies_kernels = select_kernel_kcross(X, X_with_area, y, repeats=3)

linear
rbf
poly


In [4]:
areas_burnt_gammas, test_accuracies_gamma = select_gamma_kcross(X, X_with_area, y, -10, 3)

In [5]:
areas_burnt_gammas_refinement, test_accuracies_gammas_refinement = select_gamma_kcross(X, X_with_area, y, 1, 3)

local minima on area burnt around gamma = 50

In [6]:
areas_burnt_c, test_accuracies_c = select_c_kcross(X, X_with_area, y, gamma = 50)

Almost all entries classified as 0, but low associated number of false negatives. Lots of redundant values of C

In [7]:
rfc_jun_15 = SVC(kernel='rbf', gamma = 50, C = 10)
rfc_jun_15.fit(X, y)

SVC(C=10, gamma=50)

In [8]:
with open("classification_jun_dec_15.pickle", 'wb') as f:
    pickle.dump(rfc_jun_15, f)

Searching the model space for the highest accuracy:

In [9]:
result = SVM_hyperpar_skopt(X, y, kernels=['linear', 'poly', 'rbf', 'sigmoid'], max_degree = 3)



In [10]:
print("accuracy of ", 1 - result.fun, "with associated parameters ", result.x)

accuracy of  0.6114 with associated parameters  [8.41850316809234, 'linear', 3, 0.04349773501643578]


In [11]:
rfc_jun_15_skopt = SVC(C = result.x[0], kernel=result.x[1],degree = result.x[2], gamma = result.x[3])
rfc_jun_15_skopt.fit(X, y)

SVC(C=8.41850316809234, gamma=0.04349773501643578, kernel='linear')

In [12]:
with open("classification_jun_dec_15_skopt.pickle", 'wb') as f:
    pickle.dump(rfc_jun_15_skopt, f)

## Regressor

In [13]:
X = X[filter_15]
y = fires_jun_15.loc[:,"area"][filter_15]
losses_lasso_df1 = hyperpar_grid_lasso(X, y, degrees = [1, 2, 3])
losses_ridge_df1 = hyperpar_grid_ridge(X, y, degrees = [1, 2, 3])
gbr_dict1 = hyper_opt_gbr(X, y)

In [14]:
print(gbr_dict1)

{'loss': 1180.7747924603523, 'deg': 1, 'lr': 0.46249999999999997, 'n_estimators': 20}


## X $\in$ [6, 9]

In [15]:
fires_jun_69 = pd.read_csv("fires_jun_dec_xy69.csv")
drop_cols = ['Unnamed: 0', 'area_bool']
X = fires_jun_69.drop(drop_cols, axis=1)
y = fires_jun_69.loc[:,"area_bool"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=3558)
X_test_with_area = X_test.copy()
X_test = X_test.drop("area", axis=1)
X_with_area = X.copy()
X = X.drop("area", axis=1)
X_train_with_area = X_train.copy()
X_train = X_train.drop("area", axis=1)
filter_69 = y==1

In [16]:
areas_burnt_kernels, test_accuracies_kernels = select_kernel_kcross(X, X_with_area, y, repeats=3)

linear
rbf
poly


In [17]:
areas_burnt_gammas, test_accuracies_gamma = select_gamma_kcross(X, X_with_area, y, min_g=-6, max_g = 4)

In [18]:
areas_burnt_c, test_accuracies_c = select_c_kcross(X, X_with_area, y, gamma=90)

In [19]:
rfc_jun_69 = SVC(C=10.326154432909957, gamma=100.0, kernel='linear')
rfc_jun_69.fit(X_train, y_train)
with open("classification_jun_dec_69.pickle", 'wb') as f:
    pickle.dump(rfc_jun_69, f)

### Accuracy optimization using scikit optimize

In [20]:
result = SVM_hyperpar_skopt(X, y, kernels=['linear', 'poly', 'rbf', 'sigmoid'], max_degree = 3)
print("accuracy of ", 1 - result.fun, "with associated parameters ", result.x)

accuracy of  0.5988888888888888 with associated parameters  [0.26869573356106563, 'poly', 2, 0.9600363819999834]


In [21]:
rfc_jun_69_skopt = SVC(C = result.x[0], kernel=result.x[1],degree = result.x[2], gamma = result.x[3])
rfc_jun_69_skopt.fit(X, y)

SVC(C=0.26869573356106563, degree=2, gamma=0.9600363819999834, kernel='poly')

In [22]:
with open("classification_jun_dec_69_skopt.pickle", 'wb') as f:
    pickle.dump(rfc_jun_69_skopt, f)

### Regression

In [23]:
y = fires_jun_69.loc[:,"area"]
losses_lasso_df2 = hyperpar_grid_lasso(X, y, degrees = [1, 2, 3])
losses_ridge_df2 = hyperpar_grid_ridge(X, y, degrees = [1, 2, 3])
gbr_dict2 = hyper_opt_gbr(X, y)

In [24]:
gbr_dict2

{'loss': 10799.217352201546, 'deg': 3, 'lr': 0.6, 'n_estimators': 20}