<a href="https://colab.research.google.com/github/ashley-ferreira/PHYS449_FinalProject/blob/main/notebooks/KerasC1_our_data_40x_trial1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install astroNN

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting astroNN
  Downloading astroNN-1.0.1.tar.gz (9.3 MB)
[K     |████████████████████████████████| 9.3 MB 16.0 MB/s 
Collecting astroquery
  Downloading astroquery-0.4.6-py3-none-any.whl (4.5 MB)
[K     |████████████████████████████████| 4.5 MB 76.4 MB/s 
Collecting keyring>=4.0
  Downloading keyring-23.11.0-py3-none-any.whl (36 kB)
Collecting pyvo>=1.1
  Downloading pyvo-1.4-py3-none-any.whl (885 kB)
[K     |████████████████████████████████| 885 kB 80.1 MB/s 
Collecting jeepney>=0.4.2
  Downloading jeepney-0.8.0-py3-none-any.whl (48 kB)
[K     |████████████████████████████████| 48 kB 6.6 MB/s 
[?25hCollecting jaraco.classes
  Downloading jaraco.classes-3.2.3-py3-none-any.whl (6.0 kB)
Collecting SecretStorage>=3.2
  Downloading SecretStorage-3.3.3-py3-none-any.whl (15 kB)
Collecting cryptography>=2.0
  Downloading cryptography-38.0.4-cp36-abi3-manylinux_2_24_x86_64.whl (4.0 MB)

In [2]:
#from astroNN.datasets import galaxy10
#from astroNN.datasets.galaxy10 import galaxy10cls_lookup
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization
from keras.utils import to_categorical
from keras.preprocessing import image
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [3]:
#Import google drive (need to put data folder as shortcut in your local drive My Drive):
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Full data**

In [4]:
#LOAD THE DATA FROM TXT FILE INTO A BATCH:
def data_batch(datafile_index, num_images, data_file='MyDrive/data_g_band_v2.txt', plotting=False):
    '''
    Description:
        Access datafile.txt, each row is flattened 110x110 image + 1 label string (E, Sp, S0, Irr+Misc).
        Returns an augmented batch of num_images X 40.
        The labels are converted to 1D vectors (ex: Sp = [0,0,1,0])
        Need to give a datafile_index that tells which rows to pick.
    Inputs:
        datafile_index: index of row in datafile to load. loads rows datafile_index to datafile_index+num_images.
        num_images: number of different images to load per batch, total batch size 
        is 40 X num_images. (default: 10 (for 40X10 = 400 batch size like in paper)
        data_file: datafile full path, need to add shortcut to local Drive. (default: '/content/drive/MyDrive/data/data_g_band.txt')
    Outputs:
        tensor_input_batch_aug: dimensions: (100, 100, num_images X 40).
        tensor_label_batch_aug: dimensions: (num_images X 40, 4)
    '''

    #Take batch of num_images rows from datafile:
    with open(data_file, 'r') as f:
        rows = f.readlines()[datafile_index:(datafile_index+num_images)]

    #for batch size of 400 (augmented), need 10 images
    data_batch = np.zeros((num_images,12101), dtype=np.dtype('U10'))
    count = 0
    for row in rows:
        data_batch[count,:] = row.split()
        count += 1

    #separate label and input:
    input_batch_flat = np.array(data_batch[:,:12100], dtype=float)#, dtype=int)
    label_batch = np.array(data_batch[:,-1])

    #convert input batch back to a 2D array:
    input_batch = np.zeros((110,110,np.shape(input_batch_flat)[0]))#, dtype=int)
    for ii in range(np.shape(input_batch_flat)[0]):
        input_batch[:,:,ii] = np.reshape(input_batch_flat[ii,:], (110,110))


    #convert label batch into into 1D vector: 
    #E=0, S0=1, Sp=2, Irr+Misc=3
    #ex: label = [0,0,1,0] ==> Sp galagy
    arr_label_batch = np.zeros((np.shape(label_batch)[0],4), dtype=int)
    arr_label_batch[:,0] = np.array([label_batch == 'E'], dtype=int)
    arr_label_batch[:,1] = np.array([label_batch == 'Sp'], dtype=int)
    arr_label_batch[:,2] = np.array([label_batch == 'S0'], dtype=int)
    arr_label_batch[:,3] = np.array([label_batch == 'Irr+Misc'], dtype=int)

    if plotting == True:
      #test with image plotted
      plt.imshow(input_batch[:,:,0])
      plt.show()

    #NOW AUGMENT THE BATCH (40X more):
    input_batch_aug = np.empty((100,100,np.shape(input_batch)[2]*40), dtype=int)
    arr_label_batch_aug = np.empty((np.shape(arr_label_batch)[0]*40, 4), dtype=int)

    count = 0
    for ll in range(np.shape(input_batch)[2]):
        #Crop 5X more image (100X100 pixels)
        C1 = input_batch[:100,:100,ll]
        C2 = input_batch[10:,:100,ll]
        C3 = input_batch[:100,10:,ll]
        C4 = input_batch[10:,10:,ll]
        C5 = input_batch[5:105,5:105,ll]

        C = [C1, C2, C3, C4, C5]

        for kk in range(5):
            #Rotate 4X more image (by 90 deg)
            for jj in range(4):
                C_R = np.rot90(C[kk], k=jj)
                input_batch_aug[:,:,count] = C_R
                arr_label_batch_aug[count,:] = arr_label_batch[ll,:]
                count += 1
                
                input_batch_aug[:,:,count] = np.swapaxes(C_R,0,1)
                arr_label_batch_aug[count,:] = arr_label_batch[ll,:]
                count += 1


    #PUT THE DATA AS A PYTORCH TENSOR:
    #tensor_input_batch_aug = torch.Tensor(input_batch_aug)
    #tensor_label_batch_aug = torch.Tensor(arr_label_batch_aug)
    
    return input_batch_aug, arr_label_batch_aug

# **Our augmented data**

In [5]:
# #LESS DATA AUGMENTION: crop only = X5 augmentation:

# #AUGMENT ONLY X5 (ONLY BY CROPPING)
# def data_batch_aug5(datafile_index, num_images=10,  data_file='/content/drive/MyDrive/data_g_band.txt', plotting=False):
#     '''
#     Description:
#         Access datafile.txt, each row is flattened 110x110 image + 1 label string (E, Sp, S0, Irr+Misc).
#         Returns an augmented batch of num_images X 5.
#         The labels are converted to 1D vectors (ex: Sp = [0,0,1,0])
#         Need to give a datafile_index that tells which rows to pick.
#     Inputs:
#         datafile_index: index of row in datafile to load. loads rows datafile_index to datafile_index+num_images.
#         num_images: number of different images to load per batch, total batch size 
#         is 5 X num_images. (default: 10 (for 5X10 = 400 batch size like in paper)
#         data_file: datafile full path, need to add shortcut to local Drive. (default: '/content/drive/MyDrive/data/data_g_band.txt')
#     Outputs:
#         tensor_input_batch_aug: dimensions: (100, 100, num_images X 5). 
#         tensor_label_batch_aug: dimensions: (num_images X 5, 4)
#     '''

#     #data_file = 'data_g_band.txt'

#     #Take batch of num_images rows from datafile:
#     with open(data_file, 'r') as f:
#         rows = f.readlines()[datafile_index:(datafile_index+num_images)]

#     #for batch size of 400 (augmented), need 10 images
#     data_batch = np.zeros((num_images,12101), dtype=np.dtype('U10'))
#     count = 0
#     for row in rows:
#         data_batch[count,:] = row.split()
#         count += 1

#     #separate label and input:
#     input_batch_flat = np.array(data_batch[:,:12100], dtype=int)
#     label_batch = np.array(data_batch[:,-1])

#     #convert input batch back to a 2D array:
#     input_batch = np.empty((110,110,np.shape(input_batch_flat)[0]), dtype=int)
#     for ii in range(np.shape(input_batch_flat)[0]):
#         input_batch[:,:,ii] = np.reshape(input_batch_flat[ii,:], (110,110))


#     #convert label batch into into 1D vector: 
#     #E=0, S0=1, Sp=2, Irr+Misc=3
#     #ex: label = [0,0,1,0] ==> Sp galagy
#     arr_label_batch = np.empty((np.shape(label_batch)[0],4), dtype=int)
#     arr_label_batch[:,0] = np.array([label_batch == 'E'], dtype=int)
#     arr_label_batch[:,1] = np.array([label_batch == 'Sp'], dtype=int)
#     arr_label_batch[:,2] = np.array([label_batch == 'S0'], dtype=int)
#     arr_label_batch[:,3] = np.array([label_batch == 'Irr+Misc'], dtype=int)

#     #test with image plotted
#     if plotting == True:
#       plt.imshow(input_batch[:,:,0])
#       plt.show()

#     #NOW AUGMENT THE BATCH (5X more):
#     how_much_augment = 5
#     input_batch_aug = np.empty((100,100,np.shape(input_batch)[2]*how_much_augment), dtype=int)
#     arr_label_batch_aug = np.empty((np.shape(arr_label_batch)[0]*how_much_augment, 4), dtype=int)

#     count = 0
#     for ll in range(np.shape(input_batch)[2]):
#         #Crop 5X more image (100X100 pixels)
#         C1 = input_batch[:100,:100,ll]
#         C2 = input_batch[10:,:100,ll]
#         C3 = input_batch[:100,10:,ll]
#         C4 = input_batch[10:,10:,ll]
#         C5 = input_batch[5:105,5:105,ll]

#         C = [C1, C2, C3, C4, C5]

#         for kk in range(5):
#             input_batch_aug[:,:,count] = C[kk]
#             arr_label_batch_aug[count,:] = arr_label_batch[ll,:]
#             count += 1

#     #PUT THE DATA AS A PYTORCH TENSOR:
#     #tensor_input_batch_aug = torch.Tensor(input_batch_aug)
#     #tensor_label_batch_aug = torch.Tensor(arr_label_batch_aug)
    
#     return input_batch_aug, arr_label_batch_aug

# '''
# #Test above function:
# rand_index = np.random.permutation(1403) #10 images
# rand_train = rand_index[:200] #arbitrary values
# rand_test = rand_index[200:]

# #Use this loop for training over entire dataset at each epochs
# for ii in range(np.shape(rand_train)[0]):
#   image_batch, label_batch = data_batch_aug5(datafile_index=10*rand_train[ii], num_images=10)
#   ##print(np.shape(image_batch))
#   ##print(np.shape(label_batch))
#   ##print(label_batch)
#   #Check: 10 images X 5 augmentation = 100 x 100 x 50 tensor size
#   #check: label size is 10 x 5 = 50 x 4 (4 labels)
#   #check: label is 5 type in a row then another 5 in a row etc ...
# '''

# **Get data for a select index**

In [6]:
input_batch_aug, arr_label_batch_aug = data_batch(datafile_index=0, num_images=7500,  data_file='/content/drive/MyDrive/data_g_band_v2.txt', plotting=False)
print(input_batch_aug.shape)
len_input = len(input_batch_aug.T)
#input_batch_aug.reshape((1, 100, 100, 9000))
#print(input_batch_aug.shape)
#print(arr_label_batch_aug.shape)

(100, 100, 300000)


In [7]:
#print(input_batch_aug.shape)
input_batch_aug_final = input_batch_aug.reshape(1, 100, 100, len_input)
print(input_batch_aug_final.shape)
#print(arr_label_batch_aug.shape)

(1, 100, 100, 300000)


# **Data split**

In [8]:
X_train, X_test, y_train, y_test = train_test_split(input_batch_aug_final.T, arr_label_batch_aug, test_size = 0.15)
print(X_train.shape)
print(y_train.shape)

(255000, 100, 100, 1)
(255000, 4)


In [9]:
model = Sequential()
model.add(Conv2D(filters=32, input_shape=(100,100,1), activation='relu', kernel_size=(5,5)))
model.add(Conv2D(filters=64, input_shape=(100,100,1), activation='relu', kernel_size=(5,5)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dense(4, activation='softmax')) 
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 96, 96, 32)        832       
                                                                 
 conv2d_1 (Conv2D)           (None, 92, 92, 64)        51264     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 46, 46, 64)       0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 135424)            0         
                                                                 
 dropout (Dropout)           (None, 135424)            0         
                                                                 
 dense (Dense)               (None, 256)               34668800  
                                                        

In [10]:
optimizer = tf.keras.optimizers.Adadelta(learning_rate=0.0002)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.fit(X_train, y_train, epochs=20, validation_split=0.15)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fcd0d744910>

In [13]:
plt.plot(model.history.history['loss'],color='b', label='Training Loss')
plt.plot(model.history.history['val_loss'],color='r', label='Validation Loss')
plt.legend()
plt.show()
plt.plot(model.history.history['accuracy'],color='b', label='Training  Accuracy')
plt.plot(model.history.history['val_accuracy'],color='r', label='Validation Accuracy')
plt.legend()
plt.show()

KeyError: ignored

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from pylab import rcParams
rcParams['figure.figsize'] = 10,10
rcParams.update({'font.size': 15})

predictions = model.predict(X_test)
predictions = (predictions > 0.5) 
labels = ['E', 'S0', 'Sp', 'Irr']

cm = confusion_matrix(y_test.argmax(axis=1), predictions.argmax(axis=1), normalize='true')
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot(cmap=plt.cm.Blues)
plt.title('C1 Confusion Matrix')
#plt.facecolor("white")
plt.show()

results = model.evaluate(X_test, y_test)
print("C1 --> test loss, test acc:", results)