In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn.utils import resample

In [3]:
# Reading from HDF5
df = pd.read_hdf('../../data/CNN Disaster/data.h5', 'df')
df 

Unnamed: 0,disaster,image,label,height,width,size
18769,1.0,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",0,124,124,15376
23335,0.0,"[[[13, 22, 15], [13, 22, 15], [13, 22, 15], [1...",0,124,124,15376
18627,1.0,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",0,124,124,15376
18367,1.0,"[[[2, 3, 2], [2, 3, 2], [3, 3, 3], [2, 3, 2], ...",0,124,124,15376
14623,1.0,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",0,124,124,15376
...,...,...,...,...,...,...
20951,0.0,"[[[19, 22, 24], [18, 21, 24], [14, 17, 19], [1...",0,124,124,15376
15603,1.0,"[[[2, 3, 3], [3, 4, 3], [8, 12, 11], [27, 37, ...",0,124,124,15376
23468,0.0,"[[[67, 89, 96], [67, 89, 96], [67, 89, 96], [6...",0,124,124,15376
14357,1.0,"[[[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], ...",3,124,124,15376


### Split, Balance and normalizing the dataset

This code splits the dataset into a 20/80 test train split. It also normalizes the RGB values from 0-1 and applies sampling to balance the two labels found in the dataset.

In [4]:
def balance_data(df, n_samples=None, random_state=42):
    new_df = pd.DataFrame()
    for label in df['disaster'].unique():
        label_df = df[df['disaster'] == label]
        if n_samples is None:
            n_samples = len(label_df)
        resampled_df = resample(label_df, replace=True, n_samples=n_samples, random_state=random_state)
        new_df = pd.concat([new_df, resampled_df], axis=0)
    return new_df

min_samples = min(df['disaster'].value_counts())

balanced__df = balance_data(df, n_samples=min_samples)

print("-------------------------------------")
print("After sampling data")

disaster_counts = balanced__df['disaster'].value_counts()
print("\nCounts for each disaster type:\n", disaster_counts)

total_records = disaster_counts.sum()
proportions = disaster_counts / total_records
print("\nProportions for each disaster type:\n", proportions)

print("\nDamage level distribution for Midwest flooding:")
print(balanced__df[balanced__df['disaster'] == 0]['label'].value_counts(normalize=True))

print("\nDamage level distribution for SoCal fire:")
print(balanced__df[balanced__df['disaster'] == 1]['label'].value_counts(normalize=True))

-------------------------------------
After sampling data

Counts for each disaster type:
 1.0    7004
0.0    7004
Name: disaster, dtype: int64

Proportions for each disaster type:
 1.0    0.5
0.0    0.5
Name: disaster, dtype: float64

Damage level distribution for Midwest flooding:
0    0.960166
1    0.017561
2    0.013135
3    0.009138
Name: label, dtype: float64

Damage level distribution for SoCal fire:
0    0.866077
3    0.121216
1    0.008709
2    0.003998
Name: label, dtype: float64


In [5]:
float_images = []
for image in balanced__df["image"].values:
    float_image = np.array(image).astype(np.float32)
    float_images.append(float_image)

float_images = np.array(float_images)/255

In [6]:
X_train_cnn, X_test_cnn, y_train_cnn, y_test_cnn = train_test_split(float_images, balanced__df[["disaster", "label"]], test_size=0.2)

### Rotate The Images

Rotating the images 180 degrees and then 90* degrees gives us the images rotated at 0, 90, 180, and 270 degrees giving us a lot more data to work with. 

\* On bigger images the 90 degree rotation were disabled due to the large amounts of data created and limits on GPU memory size.

In [7]:
y_train_cnn = y_train_cnn["disaster"]

In [8]:
from skimage.transform import rotate

def rotate_images(X_train, angle):
    num_images = X_train.shape[0]
    rotated_images = []

    for i in range(num_images):
        image = X_train[i]
        rotated_image = rotate(image, angle, preserve_range=True).astype(np.float32)  # Rotate image
        rotated_images.append(rotated_image)

    return np.array(rotated_images)

In [9]:
X_train_rotated = rotate_images(X_train_cnn, angle=180)

X_train_combined = np.concatenate([X_train_cnn, X_train_rotated], axis=0)

num_rotated_images = X_train_rotated.shape[0]
rotated_labels = y_train_cnn[:num_rotated_images] 

y_train_combined = np.concatenate([y_train_cnn, rotated_labels], axis=0)
X_train_combined, y_train_combined = shuffle(X_train_combined, y_train_combined, random_state=42)


In [10]:
"""
X_train_rotated = rotate_images(X_train_combined, angle=90)

X_train_combined = np.concatenate([X_train_combined, X_train_rotated], axis=0)

num_rotated_images = X_train_rotated.shape[0]
rotated_labels = y_train_combined[:num_rotated_images]

y_train_combined = np.concatenate([y_train_combined, rotated_labels], axis=0)
X_train_combined, y_train_combined = shuffle(X_train_combined, y_train_combined, random_state=42)   
""";

In [11]:
X_train_combined.shape

(22412, 124, 124, 3)

In [12]:
len(X_train_combined), len(X_test_cnn), len(X_test_cnn)/(len(X_train_combined)+ len(X_test_cnn))*100

(22412, 2802, 11.112873800269691)

### Rezising the images
Optinally resize the images, this operating is ran on multiple threads to utelize full system resources

In [13]:
import concurrent.futures
import numpy as np
from skimage.transform import resize

def resize_image(image):
    # Resize Image to (224, 224)
    return resize(image, (224, 224), anti_aliasing=True, preserve_range=True).astype(image.dtype)

def process_images(image_list):
    # Using threads, resize every image and return a list with all images
    with concurrent.futures.ThreadPoolExecutor() as executor:
        return list(executor.map(resize_image, image_list))

In [14]:
X_train_combined = process_images(X_train_combined)
X_test_cnn = process_images(X_test_cnn)

### Save Dataset

Test/Train dataset is saved to NPZ. The damage labels are also saved used to analyze the results of our predictions.

In [15]:
# Saving to NPZ
np.savez('../../data/CNN Disaster/train_data.npz', images=X_train_combined, labels=y_train_combined)
np.savez('../../data/CNN Disaster/test_data.npz', images=X_test_cnn, labels=y_test_cnn["disaster"].values, damage_labels=y_test_cnn["label"].values)