In [1]:
# conda install opencv

In [None]:
# import glob
# import cv2

# images = [cv2.imread(file) for file in glob.glob('./data/train/nowildfire/*.jpg')]

In [None]:
# len(images)

In [None]:
# images[0]

# Part 3: Image Classification Convolutional Neural Network

---

## Notebook Summary

This notebook explores our second model in which satellite images are classified to identify whether an image has or has not experienced a wildfire. This second model assumes that wildfire areas exhibit many similar terrestrial characteristics to areas prone to drought, such a drier vegetation and soils as well as potentially sparser vegetation. If a convolutional nearual network model could be trained on wildfire and no wildfire satellite images, then a similar model coud be used on satellite images to predict drought. In this notebook, the reader will find:

* Data Collection Methods
* Image Preprocessing
* Baseline CNN Model
* Model Tuning
* Production Model & Evaluation
* Notebook Conclusion

---

## Data Collection Methods

The following dataset was collected from a Kaggle Wildfire Prediction Dataset [source](https://www.kaggle.com/datasets/abdelghaniaaba/wildfire-prediction-dataset?select=train). The original dataset was compiled by the Kaggle contributor from Canada's government website, sourced from the government and municipalities of Quebec, which compiled images primarily from southern Quebec, dating back to 1976 [source](https://open.canada.ca/data/en/dataset/9d8f219c-4df0-4481-926f-8a2a532ca003). According to the Kaggle webpage, the wildfire images include those which contain greater than 0.01 acres burned. Upon cursory review of the images in both the "wildfire" and "nowildfire" classes, the "nowildfire" class contains images of both forested or green area as well as images of urban landscapes and human settlements, albeit not exclusively.

Although the original Kaggle data contains three separate train, test, and validation datasets, we have downloaded only the train dataset since it contains 30,250 satellite images in total, of which 15,750 are classified as "wildfire" (\~52%) and 14,500 are classified as "nowildfire" (\~48%). The train dataset will be read into this notebook and then split into train and test datasets for training and evaluating our image classification CNN.

In the subsequent section, we shall read in our image data and begin preprocessing any images, as necessary.

---

## Image Preprocessing

In this section, we will begin reading in the dataset and then conducting any requisite cleaning of the data and preprocessing to make our models train and fit correctly. We will begin by importing the requisite libraries and using the image_dataset_from_directory model to create a train/test split and preprocess data.

In [64]:
# import requisite libraries
import pandas as pd
import numpy as np
import os

from tensorflow.keras.utils import image_dataset_from_directory, load_img, img_to_array
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, Rescaling

In [65]:
# create list
nowildfire_arrays = []
# define filepath for Dog class
nowildfire_path = './data/train/nowildfire/'

# convert each image to normalized array and store
for file in os.listdir(nowildfire_path):
    try:
        # target_size automatically resizes each img on import
        nowildfire_img = load_img(nowildfire_path + file, target_size=(35, 35))
        nowildfire_arr = img_to_array(nowildfire_img) / 255
        nowildfire_arrays.append(nowildfire_arr)
    except:
        print(f'Error for file: {file}')

print(f'{len(nowildfire_arrays)} pictures converted.')

Error for file: .ipynb_checkpoints
14499 pictures converted.


In [66]:
nowildfire_arrays[0]

array([[[0.627451  , 0.63529414, 0.5921569 ],
        [0.67058825, 0.6627451 , 0.6156863 ],
        [0.36078432, 0.3647059 , 0.29411766],
        ...,
        [0.7294118 , 0.72156864, 0.67058825],
        [0.78039217, 0.77254903, 0.72156864],
        [0.76862746, 0.7607843 , 0.7137255 ]],

       [[0.6156863 , 0.62352943, 0.5803922 ],
        [0.68235296, 0.6745098 , 0.627451  ],
        [0.34117648, 0.34509805, 0.27450982],
        ...,
        [0.75686276, 0.7490196 , 0.69803923],
        [0.7607843 , 0.7529412 , 0.7019608 ],
        [0.75686276, 0.7490196 , 0.7019608 ]],

       [[0.61960787, 0.62352943, 0.5921569 ],
        [0.65882355, 0.6666667 , 0.6156863 ],
        [0.30980393, 0.34117648, 0.25882354],
        ...,
        [0.7647059 , 0.75686276, 0.70980394],
        [0.75686276, 0.7490196 , 0.7019608 ],
        [0.7176471 , 0.7176471 , 0.6784314 ]],

       ...,

       [[0.4627451 , 0.4862745 , 0.47843137],
        [0.65882355, 0.65882355, 0.6117647 ],
        [0.58431375, 0

In [67]:
nowildfire_arrays[0].shape

(35, 35, 3)

In [68]:
nowildfire_arrays[0].size

3675

In [69]:
nowildfire_arrays[0].reshape()

TypeError: reshape() takes exactly 1 argument (0 given)

In [70]:
# General Assembly instructor, Alanna Besaw, recommended investigating image_dataset_from_directory module for manipulating this dataset
# original image sizes are 350 x 350 pixels
# create a training set of the data representing 75% of the images
img_train = image_dataset_from_directory('./data/train/',  
                                         validation_split = 0.25,
                                         image_size = (35, 35), # resized to exactly one tenth in size
                                         subset = 'training', 
                                         seed = 42)

# create a test set of the representing 25% of the images
img_test = image_dataset_from_directory('./data/train/', 
                                         validation_split = 0.25, 
                                         image_size = (35, 35),  # resized to exactly one tenth in size
                                         subset = 'validation', 
                                         seed = 42)

Found 30250 files belonging to 2 classes.
Using 22688 files for training.
Found 30250 files belonging to 2 classes.
Using 7562 files for validation.


In [71]:
# check the datatype of the train set
img_train

<_BatchDataset element_spec=(TensorSpec(shape=(None, 35, 35, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

In [72]:
# check the datatype of the test set
img_test

<_BatchDataset element_spec=(TensorSpec(shape=(None, 35, 35, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

We now have the full image dataset in the form of a \_BatchDataset data type and split into bothtrain and test sets. Upon trying to train our first CNN model, it kept throwing an error which was indicating a corrupted image file, so we searched Stack Overflow for a method to make a list of all the files in a directory. The code is attributed and executed below.

In [73]:
# this block of code was adapted from a Stack Overflow post at https://stackoverflow.com/questions/3207219/how-do-i-list-all-files-of-a-directory
directory = './data/train/nowildfire'

# Get a list of all items (files and directories) in the directory
all_items = os.listdir(directory)

# Filter out only the file names from the list
file_names = [item for item in all_items if os.path.isfile(os.path.join(directory, item))]

After creating the list of files in the directory, we needed to iterate through all the files and identify which of the files might be corrupted and remove it from the directory. The following block of code is taken directly from Stack Overflow as well. The authors here have added comments to make sense of the code and how it identifies and removes a corrupted file.

In [74]:
# this block of was code was taken from a Stack Overflow post at https://stackoverflow.com/questions/62586443/tensorflow-error-when-trying-transfer-learning-invalid-jpeg-data-or-crop-windo

# imports required to read in and unpack files
from struct import unpack
import os

# maps parts of image onto markers
marker_mapping = {
    0xffd8: "Start of Image",
    0xffe0: "Application Default Header",
    0xffdb: "Quantization Table",
    0xffc0: "Start of Frame",
    0xffc4: "Define Huffman Table",
    0xffda: "Start of Scan",
    0xffd9: "End of Image"
}

# establishes the JPEG class
class JPEG:
    # initiates an object within the class by opening an image file and reading it in
    def __init__(self, image_file):
        with open(image_file, 'rb') as f:
            self.img_data = f.read()
    
    # decodes an image by checking multiple markers to verify whether they are true
    def decode(self):
        data = self.img_data
        while(True):
            marker, = unpack(">H", data[0:2])
            # print(marker_mapping.get(marker))
            if marker == 0xffd8:
                data = data[2:]
            elif marker == 0xffd9:
                return
            elif marker == 0xffda:
                data = data[-2:]
            else:
                lenchunk, = unpack(">H", data[2:4])
                data = data[2+lenchunk:]            
            if len(data)==0:
                break        

# creates an empty list of bad images (corrupted data)
bads = []

# loops through the list of file names in the nowildfire directory
# tries to decode the image to verify it is an uncorrupted jpg
# if it fails to decode, adds corrupted jpg file to the bads list
for img in file_names:
  image = os.path.join(directory, img)
  image = JPEG(image) 
  try:
    image.decode()   
  except:
    bads.append(img)

# loops through the bads list and removes corrupted files from directory
for name in bads:
  os.remove(os.path.join(directory,name))

In [75]:
# in the first execution of this code, only one file was added to the list but has since been removed
bads

[]

In [76]:
class_names = img_train.class_names
print(class_names)

['nowildfire', 'wildfire']


In [77]:
normalization_layer = Rescaling(1./255)


In [78]:
normalized_ds = img_train.map(lambda x, y: (normalization_layer(x), y))
image_train_batch, labels_train_batch = next(iter(normalized_ds))
first_image = image_train_batch[0]
# Notice the pixel values are now in `[0,1]`.
print(np.min(first_image), np.max(first_image))

0.0 0.83921576


In [79]:
image_batch

<tf.Tensor: shape=(32, 35, 35, 3), dtype=float32, numpy=
array([[[[0.1637255 , 0.21666668, 0.12843138],
         [0.00490196, 0.1392157 , 0.01176471],
         [0.21862747, 0.34509805, 0.18235295],
         ...,
         [0.47156864, 0.5852941 , 0.4539216 ],
         [0.5784314 , 0.5735294 , 0.454902  ],
         [0.47450984, 0.4156863 , 0.3421569 ]],

        [[0.18529412, 0.3147059 , 0.19705884],
         [0.3421569 , 0.39705884, 0.32450983],
         [0.31274512, 0.36666667, 0.33235297],
         ...,
         [0.37549022, 0.47450984, 0.34607846],
         [0.08431373, 0.2529412 , 0.07843138],
         [0.15784314, 0.34803924, 0.15882353]],

        [[0.41176474, 0.46274513, 0.3647059 ],
         [0.2911765 , 0.39607847, 0.2901961 ],
         [0.5294118 , 0.56274515, 0.49509805],
         ...,
         [0.1382353 , 0.3284314 , 0.14803922],
         [0.12843138, 0.30882356, 0.12745099],
         [0.1382353 , 0.32156864, 0.13431373]],

        ...,

        [[0.48823532, 0.5254902 , 0

In [80]:
labels_batch

<tf.Tensor: shape=(32,), dtype=int32, numpy=
array([0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 0, 1, 0, 1], dtype=int32)>

In [81]:
normalized_ds = img_test.map(lambda x, y: (normalization_layer(x), y))
image_test_batch, labels_test_batch = next(iter(normalized_ds))
first_image = image_test_batch[0]
# Notice the pixel values are now in `[0,1]`.
print(np.min(first_image), np.max(first_image))

0.021568628 0.9088236


In [82]:
model = Sequential()

# Make a convolutional layer with 32 filters.
model.add(Conv2D(32, 3, activation='relu', input_shape=(35, 35, 3)))

# MaxPool the results (basically a requirement)
model.add(MaxPooling2D(2))

# Let's add another convolution block
model.add(Conv2D(64, 3, activation='relu'))
model.add(MaxPooling2D(2))

# Finally, flatten the output and make a predictions through a dense layer.
model.add(Flatten())
model.add(Dense(2, activation='sigmoid'))

In [83]:
model.compile(
    loss='sparse_categorical_crossentropy', 
    optimizer='rmsprop',
    metrics=['acc']
)

In [84]:

# Fit!
history = model.fit(
    image_train_batch,
    validation_data=(image_test_batch),
    batch_size=512,
    epochs=10
)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [37]:

# # Fit!
# history = model.fit(
#     img_train,
#     validation_data=(img_test),
#     batch_size=128,
#     epochs=10
# )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
