In [1]:
import h5py
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sb

from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics

In [2]:
TRAIN_DATASET_PATH = 'data/uci_ml_hackathon_fire_dataset_2012-05-09_2013-01-01_10k_train.hdf5'
TEST_DATASET_PATH = 'data/uci_ml_hackathon_fire_dataset_2013-01-01_2014-01-01_5k_test.hdf5'

def getDataDict(DatasetPath):
    with h5py.File(DatasetPath, 'r') as f:
        data = {}
        for k in list(f):
            data[k] = f[k][:]
        return data

train_data = getDataDict(TRAIN_DATASET_PATH)

train_data.keys()

def transformDateTime(datetime):
    ret = np.asarray([x * np.ones((1,30, 30)) for x in datetime])
    return ret


def transformLandCover(landCover):
    nanConvert = {
        0: 0,
        1: 0,
        2: -1,
        3: 0,
        4: -1,
        5: 0,
        6: 0,
        16:0
    }
    ret=[]
    
    for datapoint in landCover:
        for i in range(17):
            if i in nanConvert.keys():
                datapoint[i][np.isnan(datapoint[i])]= nanConvert[i] 
        ret.append(datapoint)
    return np.asarray(ret)


def transformLatAndLong(val):
    ret = np.asarray([x * np.ones((1,30, 30)) for x in val])
    return ret

#TODO : define temperature according to datetime average
def transformMet(met):
    nanConvert = {
        0: 290,
        1: 26,
        2: 0,
        3: 0,
        4: 0,
    }
    met0=[]
    met1=[]
    for datapoint in met:
        for i in range(5):
            datapoint[0][i][np.isnan(datapoint[0][i])]= nanConvert[i] 
            datapoint[1][i][np.isnan(datapoint[1][i])]= nanConvert[i] 
        met0.append(datapoint[0])
        met1.append(datapoint[1])
    return np.asarray(met0),np.asarray(met1)

def transformFire(fire):
    return np.asarray(fire)

#transform all of them into dict of 3d np arrays.
#Augmentation step must take place after this.
#Can store this in h5py file after this.
def transformAndClean(data):
    X = {}
    Y= {}
    X['datetime'] = transformDateTime(data['datetime'])
    X['landCover'] = transformLandCover(data['land_cover'])
    X['latitude'] = transformLatAndLong(data['latitude'])
    X['longitude'] = transformLatAndLong(data['longitude'])
    X['met0'], X['met1'] = transformMet(data['meteorology'])
    X['observed'] = transformFire(data['observed'])
    Y['target'] = transformFire(data['target'])
    return X,Y

X,Y = transformAndClean(train_data)
train_data=None

startDictionary={
    'datetime':0,
    'landCover':1,
    'latitude':18,
    'longitude':19,
    'met0':20,
    'met1':25,
    'observed':30,
    'target':0
}

lengthDictionary={
    'datetime':1,
    'landCover':17,
    'latitude':1,
    'longitude':1,
    'met0':5,
    'met1':5,
    'observed':5,
    'target':2
}
def flattenData(data):
    length =0
    for key,value in data.items():
        length += value.shape[1]
        n=value.shape[0]
    ret = np.zeros((n,length,30,30))
    for key,arr in data.items():     
        for index,datapoint in enumerate(arr):
            ret[index][startDictionary[key]: startDictionary[key]+lengthDictionary[key]][:][:]=datapoint        
    return ret;

flatX = flattenData(X)
flatY = flattenData(Y)  

X=None 
Y=None

trainX,testX,trainY,testY = train_test_split(flatX,flatY,test_size=0.2,random_state = 42,shuffle=True)
flatX=None
flatY=None

In [3]:
#expects 4D input for X, Y
def filterZeroData (X, Y, minFires=30):
    count=0
    for i in range(Y.shape[0]):
        _, counts = np.unique(Y[i], return_counts=True)
        if len(counts)<=1 or counts[1]<=minFires:
            count+=1
            
    newX = np.zeros([X.shape[0] - count, X.shape[1], X.shape[2], X.shape[3]])
    newY = np.zeros([Y.shape[0] - count, Y.shape[1], Y.shape[2], Y.shape[3]])
    
    j=0
    for i in range(Y.shape[0]):
        _, counts = np.unique(Y[i], return_counts=True)
        if len(counts)==2:
            if (counts[1]>minFires):
                newX[j] = X[i]
                newY[j] = Y[i]
                j+=1
    
    return newX, newY

In [4]:
def kernelTransform(X, kernel_dim):
    n_features = X.shape[1]
    pad_width = (kernel_dim-1)//2
    resultX = np.zeros([X.shape[0]*30*30, n_features*kernel_dim*kernel_dim])
    for data in range(0, X.shape[0]):
    #     print(feature_img.shape)
    #     print(feature_img[features.index(9)].shape)
        f_img_pad = np.pad(X[data], ((0,0), (pad_width, pad_width), (pad_width, pad_width)), 'edge')
    #     print(f_img_pad[features.index(9)])
    #     feature_line = np.zeros(n_features*kernel_dim*kernel_dim)
    #     print(features.shape)
        k=0
        for i in range(30):
            for j in range(30):
                resultX[data*900+k] = f_img_pad[:,i:i+kernel_dim,j:j+kernel_dim].flatten()
                k+=1
    
    return resultX

In [5]:
trainX.shape

(8000, 35, 30, 30)

In [6]:
trainY.shape

(8000, 2, 30, 30)

## Normalization

In [13]:
columns = ['datetime', 'Aspect', 'CBD', 'CBH', 'CC', 'CH', 'Elevation', 
           'No Data', 'Sparse', 'Tree', 'Shrub', 'Herb', 'Water', 'Barren', 
           'Developed', 'Snow-Ice', 'Agriculture', 'Slope', 'latitude', 'longitude',
           'Temp0', 'RelHumid0','UWind0', 'VWind0', 'Precipitate0',
           'Temp12', 'RelHumid12', 'UWind12', 'VWind12', 'Precipitate12',
           'observed1','observed2', 'observed3','observed4', 'observed5']
len(columns)

35

In [7]:
flat_train_X = trainX.transpose(0,2,3,1).reshape(trainX.shape[0]*trainX.shape[2]*trainX.shape[3],-1)
flat_train_X.shape

(7200000, 35)

startDictionary={
    'datetime':0,
    'landCover':1,
    'latitude':18,
    'longitude':19,
    'met0':20,
    'met1':25,
    'observed':30,
    'target':0
}

lengthDictionary={
    'datetime':1,
    'landCover':17,
    'latitude':1,
    'longitude':1,
    'met0':5,
    'met1':5,
    'observed':5,
    'target':2
}

#### Layers
* 0: Aspect 
* 1: Canopy Bulk Density
* 2: Canopy Base Height
* 3: Canopy Cover
* 4: Canopy Height
* 5: Elevelation
* 6 to 15: Vegetation (Fractional Veg Class per layer)
* 16: Slope

#### Vegetation Layers
* 6: No Data
* 7: Sparse
* 8: Tree
* 9: Shrub
* 10: Herb
* 11: Water
* 12: Barren
* 13: Developed
* 14: Snow-Ice
* 15: Agriculture


In [14]:
df_X = pd.DataFrame(flat_train_X, columns=columns)

In [15]:
df_X.describe()

Unnamed: 0,datetime,Aspect,CBD,CBH,CC,CH,Elevation,No Data,Sparse,Tree,...,Temp12,RelHumid12,UWind12,VWind12,Precipitate12,observed1,observed2,observed3,observed4,observed5
count,7200000.0,7200000.0,7200000.0,7200000.0,7200000.0,7200000.0,7200000.0,7200000.0,7200000.0,7200000.0,...,7200000.0,7200000.0,7200000.0,7200000.0,7200000.0,7200000.0,7200000.0,7200000.0,7200000.0,7200000.0
mean,1.345524e+18,144.3239,4.348882,9.040829,25.34833,147.4523,1233.443,0.0143957,0.01244688,0.5615111,...,296.5911,35.96852,1.407599,0.4988841,8.150948e-06,0.072895,0.04347153,0.04760125,0.03125556,0.03433069
std,2498228000000000.0,85.4438,3.880711,12.37087,21.42645,126.0076,580.267,0.1176224,0.05425514,0.3787194,...,8.397094,21.28647,2.425977,2.162564,5.457135e-05,0.2599641,0.2039161,0.2129211,0.1740076,0.1820772
min,1.336594e+18,-1.0,0.0,-1.0,0.0,-1.0,-70.5632,0.0,0.0,0.0,...,267.8546,4.0,-6.283561,-10.39583,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.344768e+18,84.3104,0.6496,1.6256,3.6,17.96,854.5632,0.0,0.0,0.16,...,289.3503,19.0,-0.2941132,-0.9544525,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.345371e+18,149.6192,3.792,6.496,22.88,131.32,1327.39,0.0,0.0,0.6544,...,299.2336,29.0,1.609131,0.5146055,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.3461e+18,211.0928,6.9904,9.8496,43.512,261.24,1661.222,0.0,0.0,0.9328,...,303.7559,50.0,3.263401,1.999557,0.0,0.0,0.0,0.0,0.0,0.0
max,1.356988e+18,358.0,36.5792,100.0,91.0,461.2,3851.539,1.0,1.0,1.0,...,318.5415,100.0,10.43746,10.37196,0.00135,1.0,1.0,1.0,1.0,1.0


In [8]:
# features = [2,4,5,9,10,11,30,31,32,33,34]
features = [1,2,4,5,6,9,10,11,17,20,21,22,23,30,31,32,33,34]
print("N Features: ",len(features))

modelTrainX =  np.array([x[features] for x in trainX])
modelTrainY =  np.array([y[[0]] for y in trainY])
modelTestX =  np.array([x[features] for x in testX])
modelTestY =  np.array([y[[0]] for y in testY])
print(modelTrainX.shape)

N Features:  18
(8000, 18, 30, 30)


In [18]:
filterTrainX, filterTrainY = filterZeroData(modelTrainX, modelTrainY, minFires=30)
print(filterTrainX.shape)
print(filterTrainY.shape)

(3944, 18, 30, 30)
(3944, 1, 30, 30)


# Tabularizing

In [17]:
kernel_dim=5
# mtrX = filterTrainX.transpose(0,2,3,1).reshape(filterTrainX.shape[0]*filterTrainX.shape[2]*filterTrainX.shape[3],-1)
mtrX = kernelTransform(filterTrainX, 3)
mtrY = filterTrainY.transpose(0,2,3,1).flatten()
print(mtrX.shape, mtrY.shape)

(3549600, 162) (3549600,)


In [19]:
mteX = kernelTransform(modelTestX, 3)
mteY = modelTestY.transpose(0,2,3,1).flatten()
print(mteX.shape, mteY.shape)

(1800000, 162) (1800000,)


### Sub Sampling

In [31]:
nTrainSamples=10000
nTestSamples=1000
start = int(np.random.rand()*(mtrX.shape[0]-nTrainSamples))

In [32]:
smallX = mtrX[start:start+nTrainSamples]
smallY = mtrY[start:start+nTrainSamples]
smallTestX = mteX[:nTestSamples]
smallTestY = mteY[:nTestSamples]
# np.unique(smallY)
unique_elements, counts_elements = np.unique(smallY, return_counts=True)
print("Frequency of unique values of the said array:")
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[0.000e+00 1.000e+00]
 [9.076e+03 9.240e+02]]


## SVM begins

In [33]:
kernels = ('linear', 'poly', 'rbf')
weights = {0:counts_elements[1], 1:counts_elements[0]}
print(weights)

{0: 924, 1: 9076}


In [None]:
clf = svm.SVC( gamma='scale', kernel=kernels[2], class_weight=weights)
clf.fit(smallX, smallY)

In [None]:
y_tr_p = clf.predict(smallX)

In [27]:
y_pred = clf.predict(smallTestX)

In [28]:
print("Accuracy:",metrics.accuracy_score(smallY, y_tr_p))
print("Precision:",metrics.precision_score(smallY, y_tr_p))
print("Recall:",metrics.recall_score(smallY, y_tr_p))
print("F1 Score:",metrics.f1_score(smallY, y_tr_p, average='weighted'))

Accuracy: 0.766
Precision: 0.1956521739130435
Recall: 0.8181818181818182
F1 Score: 0.8230230461557998


In [29]:
print("Accuracy:",metrics.accuracy_score(smallTestY, y_pred))
print("Precision:",metrics.precision_score(smallTestY, y_pred))
print("Recall:",metrics.recall_score(smallTestY, y_pred))
print("F1 Score:",metrics.f1_score(smallTestY, y_pred, average='weighted'))

Accuracy: 0.975
Precision: 0.0
Recall: 0.0
F1 Score: 0.9626582278481012


  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
np.unique(y_pred)

array([0.])

#### Even after improving subsampling and training with 10,000 data points we observe a bad performance

#### Appendix: POC for how to transpose the Data Matrix

In [159]:
a = np.zeros([2,2,2,2])
a[0][0][0][0] = 1
a[0][0][0][1] = 2
a[0][0][1][0] = 3
a[0][0][1][1] = 4
a[0][1][0][0] = 5
a[0][1][0][1] = 6
a[0][1][1][0] = 7
a[0][1][1][1] = 8
a[1][0][0][0] = 9
a[1][0][0][1] = 10
a[1][0][1][0] = 11
a[1][0][1][1] = 12
a[1][1][0][0] = 13
a[1][1][0][1] = 14
a[1][1][1][0] = 15
a[1][1][1][1] = 16
a

array([[[[ 1.,  2.],
         [ 3.,  4.]],

        [[ 5.,  6.],
         [ 7.,  8.]]],


       [[[ 9., 10.],
         [11., 12.]],

        [[13., 14.],
         [15., 16.]]]])

In [47]:
c = a.transpose(0,2,3,1)
b = c.reshape(c.shape[0]*c.shape[1]*c.shape[2],-1)
b.shape

(8, 4)

In [48]:
b

array([[ 1.,  5.,  0.,  0.],
       [ 2.,  6.,  0.,  0.],
       [ 3.,  7.,  0.,  0.],
       [ 4.,  8.,  0.,  0.],
       [ 9., 13.,  0.,  0.],
       [10., 14.,  0.,  0.],
       [11., 15.,  0.,  0.],
       [12., 16.,  0.,  0.]])

In [67]:
for i in range(8000):
    if modelTrainX[i][2][0][0] == 1:
        print (i)
        print(modelTrainX[i][0][0][0])
        print(modelTrainX[i][1][0][0])
        print(modelTrainX[i][2][0][0])
        print(modelTrainX[i][3][0][0])
        print(modelTrainX[i][4][0][0])
        break

24
0.0
0.0
1.0
1.0
0.0


In [68]:
mtrX[24*30*30]

array([0., 0., 1., 1., 0.])

They match!!

#### POC on how to filter zero vlaues prediction images

In [None]:
count=0
for i in range(len(filterTrainY)):
    if len(np.unique(filterTrainY[i]))<=1:
#         print(i)
        count+=1

print (count)

In [58]:
len(np.unique(filterTrainY))

2

#### POC on generating new features (like kernels)

In [112]:
filterTrainX.shape

(6852, 13, 30, 30)

In [114]:
mtrX.shape

(6166800, 13)

(13, 32, 32)


In [131]:
filterTrainX.shape

(6852, 13, 30, 30)

In [164]:
pad_width=1
pad = np.pad(a[0], ((0,0), (pad_width, pad_width), (pad_width, pad_width)), 'edge')
pad

array([[[1., 1., 2., 2.],
        [1., 1., 2., 2.],
        [3., 3., 4., 4.],
        [3., 3., 4., 4.]],

       [[5., 5., 6., 6.],
        [5., 5., 6., 6.],
        [7., 7., 8., 8.],
        [7., 7., 8., 8.]]])

In [199]:
f_img_pad[:,i:i+kernel_dim,j:j+kernel_dim].shape

(13, 3, 3)

In [211]:
f_img_pad[:,i:i+kernel_dim,j:j+kernel_dim].flatten().shape

(117,)

In [203]:
rx = np.ones([1, 117, 30, 30])
rx = 

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])