In [1]:
# conda install opencv

In [None]:
# import glob
# import cv2

# images = [cv2.imread(file) for file in glob.glob('./data/train/nowildfire/*.jpg')]

In [None]:
# len(images)

In [None]:
# images[0]

# Part 3: Image Classification Convolutional Neural Network

---

## Notebook Summary

This notebook explores our second model, in which satellite images are classified to identify whether the pictured area has or has not experienced a wildfire. This second model assumes that wildfire areas exhibit many terrestrial characteristics similar to those of areas prone to drought, such as drier vegetation and soils as well as potentially sparser vegetation. If a convolutional neural network model could be trained on wildfire and no-wildfire satellite images, then a similar model could be used on satellite images to predict drought. In this notebook, the reader will find:

* Data Collection Methods
* Image Preprocessing
* Baseline CNN Model
* Model Tuning
* Production Model & Evaluation
* Notebook Conclusion

---

## Data Collection Methods

The following dataset was collected from a Kaggle Wildfire Prediction Dataset [source](https://www.kaggle.com/datasets/abdelghaniaaba/wildfire-prediction-dataset?select=train). The original dataset was compiled by the Kaggle contributor from Canada's government website, sourced from the government and municipalities of Quebec, which compiled images primarily from southern Quebec, dating back to 1976 [source](https://open.canada.ca/data/en/dataset/9d8f219c-4df0-4481-926f-8a2a532ca003). According to the Kaggle webpage, the wildfire images include those which contain greater than 0.01 acres burned. Upon cursory review of the images in both the "wildfire" and "nowildfire" classes, the "nowildfire" class contains images of forested or green areas as well as images of urban landscapes and human settlements, albeit not exclusively.

Although the original Kaggle data contains three separate train, test, and validation datasets, we have downloaded only the train dataset since it contains 30,250 satellite images in total, of which 15,750 are classified as "wildfire" (\~52%) and 14,500 are classified as "nowildfire" (\~48%). The train dataset will be read into this notebook and then split into train and test datasets for training and evaluating our image classification CNN.

In the subsequent section, we shall read in our image data and begin preprocessing any images, as necessary.

---

## Image Preprocessing

In this section, we will read in the dataset and carry out any cleaning and preprocessing required for our models to train and fit correctly. We will begin by importing the requisite libraries and using the `image_dataset_from_directory` function to create a train/test split and preprocess the data.

In [35]:
# import requisite libraries
import pandas as pd
import numpy as np
import os

from tensorflow.keras.utils import image_dataset_from_directory, load_img
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, Rescaling

In [36]:
# General Assembly instructor, Alanna Besaw, recommended investigating image_dataset_from_directory module for manipulating this dataset
# Options shared by both subsets: the same split fraction and seed are passed to
# each call, and the original 350 x 350 pixel images are resized to exactly one
# tenth of that size.
split_options = dict(
    validation_split=0.25,
    image_size=(35, 35),
    seed=42,
)

# training set: 75% of the images
img_train = image_dataset_from_directory('./data/train/', subset='training', **split_options)

# test set: the remaining 25% of the images
img_test = image_dataset_from_directory('./data/train/', subset='validation', **split_options)

Found 30250 files belonging to 2 classes.
Using 22688 files for training.
Found 30250 files belonging to 2 classes.
Using 7562 files for validation.


In [37]:
# display the training dataset object to inspect its element spec
# (batch shapes and dtypes of the image and label tensors)
img_train

<_BatchDataset element_spec=(TensorSpec(shape=(None, 35, 35, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

In [38]:
# display the test dataset object to inspect its element spec
# (batch shapes and dtypes of the image and label tensors)
img_test

<_BatchDataset element_spec=(TensorSpec(shape=(None, 35, 35, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

In [24]:
# folder whose images will be checked for corrupt JPEG data
directory = './data/train/nowildfire'

# every entry in the folder, files and sub-directories alike
all_items = os.listdir(directory)


def _is_regular_file(entry):
    """Return True when `entry` inside `directory` is a file rather than a folder."""
    return os.path.isfile(os.path.join(directory, entry))


# keep only the entries that are actual files
file_names = list(filter(_is_regular_file, all_items))

In [25]:
from struct import unpack
import os

# Human-readable names for the two-byte JPEG segment markers, keyed by marker value.
marker_mapping = dict([
    (0xFFD8, "Start of Image"),
    (0xFFE0, "Application Default Header"),
    (0xFFDB, "Quantization Table"),
    (0xFFC0, "Start of Frame"),
    (0xFFC4, "Define Huffman Table"),
    (0xFFDA, "Start of Scan"),
    (0xFFD9, "End of Image"),
])


class JPEG:
    """Minimal JPEG marker walker used to flag truncated or malformed files.

    The whole file is read into memory on construction. ``decode`` then steps
    through the marker segments; it never decodes pixel data.
    """

    def __init__(self, image_file):
        """Read the raw bytes of ``image_file`` into ``self.img_data``.

        Args:
            image_file: path of the file to open in binary mode.
        """
        with open(image_file, 'rb') as f:
            self.img_data = f.read()

    def decode(self):
        """Walk the marker segments of ``self.img_data``.

        Returns:
            None. The method returns as soon as an End of Image marker
            (0xFFD9) is read, or when the data is used up exactly at a
            segment boundary.

        Raises:
            struct.error: fewer than two bytes are available where a marker
                or a segment length is expected (e.g. a truncated file).
            ValueError: the data at a Start of Scan marker also ends with a
                Start of Scan marker, so the walk could never finish.
        """
        data = self.img_data
        while True:
            marker, = unpack(">H", data[0:2])
            if marker == 0xffd8:
                # Start of Image carries no payload: step over the marker itself.
                data = data[2:]
            elif marker == 0xffd9:
                # End of Image reached.
                return
            elif marker == 0xffda:
                # Start of Scan: jump straight to the final two bytes of the
                # file, which are then examined as the next marker.
                if data[-2:] == b"\xff\xda":
                    # Jumping to the tail would land on Start of Scan again and
                    # repeat forever, so report the file as malformed instead.
                    raise ValueError("JPEG data ends with a Start of Scan marker")
                data = data[-2:]
            else:
                # Any other marker is followed by a two-byte big-endian length;
                # skip the marker plus that many bytes.
                lenchunk, = unpack(">H", data[2:4])
                data = data[2 + lenchunk:]
            if len(data) == 0:
                # Data used up without reading End of Image; this is not
                # treated as an error.
                break


# names of the files in `directory` whose JPEG marker walk raised an error
bads = []


for img in file_names:
    image_path = os.path.join(directory, img)
    jpeg = JPEG(image_path)
    try:
        jpeg.decode()
    except Exception:
        # Only ordinary errors mark a file as bad; KeyboardInterrupt and
        # SystemExit are allowed to propagate so the scan can be interrupted.
        bads.append(img)


# WARNING: destructive step - every file flagged above is permanently deleted
# from the dataset folder.
for name in bads:
    os.remove(os.path.join(directory, name))

In [26]:
# show the file names that failed the JPEG check and were removed (an empty list means none)
bads

[]

In [27]:
# class labels attached to the training dataset; printed to confirm which classes were found
class_names = img_train.class_names
print(class_names)

['nowildfire', 'wildfire']


In [28]:
# layer that multiplies every pixel value by 1/255
normalization_layer = Rescaling(1./255)


In [29]:
def _rescale_batch(images, labels):
    """Apply the rescaling layer to the images and pass the labels through unchanged."""
    return normalization_layer(images), labels


normalized_ds = img_train.map(_rescale_batch)

# pull a single batch and inspect its first image
image_batch, labels_batch = next(iter(normalized_ds))
first_image = image_batch[0]

# after rescaling, the smallest and largest pixel values should fall within `[0,1]`
print(np.min(first_image), np.max(first_image))

0.0 0.84583336


In [30]:
model = Sequential()

# Rescale raw 0-255 pixel values to [0, 1] inside the model, so the datasets can
# be passed to fit/evaluate/predict without a separate normalization step.
# The input shape matches the 35 x 35 RGB images produced when the datasets are loaded.
model.add(Rescaling(1./255, input_shape=(35, 35, 3)))

# Make a convolutional layer with 32 filters.
model.add(Conv2D(32, 3, activation='relu'))

# MaxPool the results (basically a requirement)
model.add(MaxPooling2D(2))

# Let's add another convolution block
model.add(Conv2D(64, 3, activation='relu'))
model.add(MaxPooling2D(2))

# Finally, flatten the output and make predictions through a dense layer.
# Two output units with softmax give one probability per class, which is the
# form a sparse categorical cross-entropy loss expects for integer labels.
model.add(Flatten())
model.add(Dense(2, activation='softmax'))

In [31]:
# Configure training: RMSprop optimizer, a sparse categorical cross-entropy loss
# (the dataset labels are integers), and accuracy reported under the name 'acc'.
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['acc'])

In [32]:

# Fit!
# img_train and img_test are already-batched datasets, so no batch_size is passed
# here: Keras documents that batch_size must not be specified for dataset inputs.
# To train with a different batch size, set it where the datasets are created.
history = model.fit(
    img_train,
    validation_data=img_test,
    epochs=10
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [37]:

# # Fit!
# history = model.fit(
#     img_train,
#     validation_data=(img_test),
#     batch_size=128,
#     epochs=10
# )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
