<a href="https://colab.research.google.com/github/alonsocampana/fire-montesinho/blob/main/Custom_SVM_Cutoff_squared_hinge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [55]:
import pandas as pd
import numpy as np
from scipy.optimize import minimize
from sklearn.model_selection import KFold

fires = pd.read_csv("fire_data_processed_after_pca.csv")
X = fires
cut = 1
y = np.exp(X.loc[:,"area"]) - 1
X = X.drop(['Unnamed: 0', "area", "area_bool"], axis=1)
feat_drop = ['apr_bool', '66', '73', '65', '74', '64', '75', '95', '63', '99', '84', '55', '13', 'aug_bool', '76', '43', '44', '85', 'rain', '94', 'oct_bool', '45', '33', 'nov_bool', 'feb_bool', '23',
 'wed', 'mar_bool', 'fri', 'jun_bool', '14', 'tue', '24', '88', '46', 'sat']

features_added = [["22","sat"], ["wed","pc5"], ["tue","pc2"], ["94", "pc6"], ['sep_bool','pc4']]
  

def add_feature(df, col1, col2):
    df = df.copy()
    new_name = col1 + " * " + col2
    map_names = {"col_name":new_name}
    return (df.assign(col_name = df.loc[:,col1] * df.loc[:,col2])).rename(map_names, axis=1)

def l2(theta):
    return theta@theta.T

def exp_loss(weights, X, y):
    return sum(np.exp(-y*(X@weights.T)))

def hinge_loss(weights, X, y):
    loss = (-(weights@X.T)+1) * np.sign(y) * (y**2)
    loss[loss<0] = 0
    return np.sum(loss)

def objective(X, y, weights, lambd =0.5, loss = exp_loss, regularizer=l2):
    return loss(weights, X, y) + lambd*regularizer(weights)

for feat in features_added:
    X = add_feature(X, feat[0], feat[1])
X = X.drop(feat_drop, axis=1)

regs = np.linspace(0.001, 3, 5)
cutoffs = np.linspace(0.001, 2, 5)
rskf = KFold(n_splits=4, shuffle=True,random_state=3558)
dic = {}
n = 0
for r in regs:
    for c in cutoffs:
        n += 1
        if n > 30:
          break
        success = 0
        missclass = 0
        burnt = 0
        cum_par = np.zeros([1, X.shape[1]])
        i = 0
        splits = []
        y_temp = y - c
        for train_index, test_index in rskf.split(X, y):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y_temp.iloc[train_index], y_temp.iloc[test_index]
            x0 = np.random.rand(1, X_train.shape[1])
            res = minimize(lambda weights: objective(X_train, y_train, weights, loss=hinge_loss), x0 = x0, method = 'Nelder-Mead')
            if res.success:
                print("success!")
                splits.append(i)
                success += 1
                pred = ((1-(X_test @ res.x)) < 0)
                burnt += y[test_index][pred]
                missclass += sum(pred* y_test <0)
                cum_par += res.x
            i+= 1
        temp_str = "R: " + str(r) + " Cut: " + str(c)
        if success != 0:
          dic[temp_str] = {"missclassified": missclass/success, "parameters":cum_par/success, "successfull iterations":success, "successful_splits":splits, "area_burnt":burnt/success}
        else:
          dic[temp_str] = {"missclassified": 0, "parameters":0, "successfull iterations":0, "successful_splits":[], "area_burnt":0}

success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!
success!


In [13]:
missclass

58

In [15]:
len(test_index)

129

In [19]:
pred.to_numpy()

array([ True,  True,  True, False,  True,  True, False, False, False,
        True, False,  True,  True,  True,  True, False, False, False,
        True,  True,  True,  True, False, False,  True,  True,  True,
       False,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True, False,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [18]:
y[test_index].to_numpy()

array([  0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,
         0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,
         0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,   0.  ,
         0.  ,   0.  ,   0.  ,   0.71,   1.19,   1.56,   1.61,   2.51,
         2.53,   2.57,   4.61,   4.88,   6.38,   6.96,   7.3 ,   8.31,
         8.71,  13.7 ,  14.57,  31.72,  32.07,  36.85,  37.02,  71.3 ,
       105.66, 154.88,   0.76,   1.52,  10.34,   8.02,   0.68,   3.3 ,
         0.79,   4.4 ,   9.27,   8.98,  11.19,   5.38,  10.73,   1.1 ,
         0.  ,   8.  ,   2.64,   0.  ,   3.52,   0.  ,   0.41,   0.  ,
         0.  ,  14.29,   0.  ,   0.  ,   1.58,   2.18,   0.  ,   0.  ,
         2.13,   1.47,   2.18,   3.94,   2.93,   0.  ,   0.  ,   0.  ,
         0.  ,   0.  , 174.63,  42.87,  28.74,   9.96,  51.78,   4.95,
         0.  ,   0.  ,   0.  ,   1.63,   3.05,   0.72,   0.  ,   3.2 ,
         0.  ,   0.  ,   0.  ,   0.54,   6.43,   3.35,   0.  ,   0.  ,
      

In [56]:
dic

{'R: 0.001 Cut: 0.001': {'area_burnt': 0,
  'missclassified': 0,
  'parameters': 0,
  'successful_splits': [],
  'successfull iterations': 0},
 'R: 0.001 Cut: 0.50075': {'area_burnt': 0,
  'missclassified': 0,
  'parameters': 0,
  'successful_splits': [],
  'successfull iterations': 0},
 'R: 0.001 Cut: 1.0005': {'area_burnt': 0,
  'missclassified': 0,
  'parameters': 0,
  'successful_splits': [],
  'successfull iterations': 0},
 'R: 0.001 Cut: 1.5002499999999999': {'area_burnt': 0,
  'missclassified': 0,
  'parameters': 0,
  'successful_splits': [],
  'successfull iterations': 0},
 'R: 0.001 Cut: 2.0': {'area_burnt': 0     NaN
  3     NaN
  21    NaN
  24    NaN
  30    NaN
         ..
  493   NaN
  495   NaN
  504   NaN
  507   NaN
  513   NaN
  Name: area, Length: 109, dtype: float64,
  'missclassified': 62.5,
  'parameters': array([[ 1.04419135,  0.12465796,  1.00816547,  0.95247006,  0.16845528,
           0.12170256,  0.76791772,  1.37811817,  0.83609231, -0.06817323,
           0

In [108]:
w1 = [-0.87862448,  1.97404325, -0.13497517, -0.02130116,  0.5117702 ,
           0.25108615,  0.53530561,  2.55351245, -0.22633704, -0.27658311,
          -0.8094667 , -0.69106749, -0.39751548, -0.04551013,  0.14003454,
           0.8674327 , -0.84412217,  0.11748659,  2.13460362, -0.04497207,
           0.948089  ,  0.1993878 , -0.13037691,  1.08104865,  0.81237291,
           0.17284016, -0.22081886,  0.19071623, -0.04131854, -0.44709271,
          -0.21776269,  2.1094772 , -0.321198]
for train_index, test_index in rskf.split(X, y):
  pred = ((1-(X.iloc[test_index] @ w1)) < 0)
  index = (pred*y.iloc[test_index] > 0)
  index2 = (pred*y.iloc[test_index] <= 0)
  print(sum(y.iloc[test_index][index]))
  print(sum(y.iloc[test_index][index2]))
  print(sum(index))
  print(sum(index2))

2332.55
0.95
58
71
1530.4900000000005
0.0
71
58
1125.2299999999996
19.350000000000005
71
58
1633.4800000000005
0.0
65
64


In [106]:
w2 = [0.09425426,  0.15358488,  0.14300342,  1.94322951,  0.11605437,
           0.80226791,  0.10738175,  0.07029594,  0.14531806,  0.39690908,
           0.0547255 ,  1.11408851,  0.70218894,  0.12827347,  0.45126381,
           0.25693549,  0.59348093,  1.18210427,  0.23155322,  0.53361798,
           0.4951322 ,  0.09833719,  0.12499564, -0.24962275,  1.28750076,
           0.02766565,  0.11580788, -0.05866233,  0.38728217, -0.60101379,
           0.48830209,  0.34225204,  0.08600402]
for train_index, test_index in rskf.split(X, y):
  pred = ((1-(X.iloc[test_index] @ w2)) < 0)
  index = (pred*y.iloc[test_index] > 0)
  index2 = (pred*y.iloc[test_index] <= 0)
  print(sum(y.iloc[test_index][index]))
  print(sum(y.iloc[test_index][index2]))
  print(sum(index))
  print(sum(index2))

2314.89
18.610000000000007
56
73
1376.33
154.16
65
64
1110.2399999999993
34.34000000000002
69
60
1628.8600000000004
4.62
64
65


In [105]:
for train_index, test_index in rskf.split(X, y):
  pred = ((1-(X.iloc[test_index] @ ((np.array(w1)+np.array(w2))/2))) < 0)
  index = (pred*y.iloc[test_index] > 0)
  index2 = (pred*y.iloc[test_index] <= 0)
  print(sum(y.iloc[test_index][index]))
  print(sum(y.iloc[test_index][index2]))
  print(sum(index))
  print(sum(index2))

2314.89
18.610000000000007
56
73
1437.46
93.02999999999997
66
63
1138.8099999999995
5.7700000000000005
72
57
1628.8600000000004
4.62
64
65
