In [12]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import pandas as pd
import keras
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, losses
from tensorflow.keras.datasets import fashion_mnist
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Model


In [2]:
from tensorflow.keras.datasets import mnist
from pyspark.sql.types import StructType,StructField, StringType, FloatType, ArrayType

In [3]:
import pandas as pd

In [4]:
# Load the MNIST train and test splits.
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Normalize values to [0, 1]: cast to float32 first, then divide by 255.
x_train = x_train.astype('float32') / 255.0
x_test = x_test.astype('float32') / 255.0

In [5]:
# PY SPARK VERSION OF THE DATAFRAME
# def get_ps_dataFrame(x_data,y_data):
#     data_unified = [( x_data[i].tolist(), str(y_data[i])) for i in range(len(x_data))]
#     schema = StructType([ 
#         StructField("image",ArrayType(ArrayType(FloatType())),False), 
#         StructField("class", StringType(), False) 
#       ])
#     df = spark.createDataFrame(data=data_unified,schema=schema)
#     return df
# train_df = get_ps_dataFrame(x_train, y_train)

In [6]:
# print((df.count(), len(df.columns)))

In [7]:
def get_pd_dataFrame(x_data,y_data):
    """Bundle samples and labels into a two-column pandas DataFrame.

    Args:
        x_data: Array-like exposing ``.tolist()``; each top-level entry
            becomes one cell of the ``image`` column (as nested lists).
        y_data: Array-like exposing ``.tolist()``; each entry becomes one
            cell of the ``class`` column.

    Returns:
        pd.DataFrame with the columns ``image`` and ``class``, one row per
        sample.
    """
    columns = {
        'image': x_data.tolist(),
        'class': y_data.tolist(),
    }
    return pd.DataFrame(columns)
# Training split as a DataFrame with an 'image' and a 'class' column.
train_df = get_pd_dataFrame(x_train, y_train)

In [8]:
def augment_data_cross_same_class_pair(df:pd.DataFrame, random_state=None):
    """Pair every image with a randomly chosen image of the same class.

    The frame is sorted by ``class`` once in its given order and once after a
    random shuffle. Because both copies contain the same number of rows per
    class, placing them side by side lines up each row with a random partner
    that carries the same label.

    Args:
        df: DataFrame with an ``image`` and a ``class`` column. It is not
            modified.
        random_state: Optional seed (or random generator accepted by
            ``DataFrame.sample``) used for both shuffles. Leave as ``None``
            for a different pairing on every call; pass an int to make the
            pairing and the final row order reproducible.

    Returns:
        pd.DataFrame with the columns ``image``, ``imagePair`` and ``class``,
        rows in random order and a fresh 0..n-1 index.
    """
    sorted_df = df.sort_values("class").reset_index(drop=True)

    # Shuffle first so that the within-class order after sorting is random,
    # then rename without mutating in place so the step is a pure expression.
    partner_df = (
        sorted_df
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
        .sort_values("class")
        .reset_index(drop=True)
        .rename(columns={'image': 'imagePair', "class": "classPair"})
    )

    # Both frames share the index 0..n-1, so axis=1 concatenation pairs rows
    # positionally.
    paired = pd.concat([sorted_df, partner_df], axis=1)
    return (
        paired[['image', 'imagePair', 'class']]
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )

# Same-class pairs built from the training set ('image', 'imagePair', 'class' columns).
same_class_cross_df = augment_data_cross_same_class_pair(train_df)

In [13]:
import tensorflow as tf
def series_to_tensor(series):
    """Convert a pandas Series of equally shaped samples into one tensor.

    The whole column is converted in a single call instead of creating one
    tensor per row and stacking them afterwards, which avoids allocating a
    separate eager tensor for every sample.

    Args:
        series: pandas Series whose cells are array-likes (e.g. nested lists)
            that all share the same shape.

    Returns:
        tf.Tensor whose first axis indexes the rows of ``series``. The dtype
        is inferred by TensorFlow from the cell values.
    """
    return tf.convert_to_tensor(series.tolist())
    
def prepare_train_traditional_autoencoder_data(train_df):
    """Build the (input, target) pair for a plain reconstruction autoencoder.

    Args:
        train_df: DataFrame with an ``image`` column.

    Returns:
        Two-element list ``[inputs, targets]``; both entries are the same
        tensor produced from the ``image`` column.
    """
    images = series_to_tensor(train_df['image'])
    # Input and target are one and the same tensor object.
    return [images, images]

def prepare_train_cross_same_class_autoencoder_data(train_df, additional_same_class_df):
    """Build (input, target) tensors mixing identity and same-class pairs.

    Args:
        train_df: DataFrame with an ``image`` column; every row contributes a
            pair whose target is the image itself. The frame is copied, not
            modified.
        additional_same_class_df: DataFrame with ``image`` and ``imagePair``
            columns whose rows are appended as extra pairs.

    Returns:
        Two-element list ``[inputs, targets]`` built from the ``image`` and
        ``imagePair`` columns of the shuffled, concatenated frame.
    """
    # Identity pairs: each original image is paired with itself.
    identity_pairs = train_df.copy()
    identity_pairs["imagePair"] = identity_pairs["image"]

    combined = pd.concat([identity_pairs, additional_same_class_df])
    shuffled = combined.sample(frac=1).reset_index(drop=True)

    inputs, targets = (
        series_to_tensor(shuffled[column]) for column in ('image', 'imagePair')
    )
    return [inputs, targets]

In [14]:
# Plain reconstruction pairs: input and target are the same image tensor.
[trad_ae_x_in,  trad_ae_x_out]= prepare_train_traditional_autoencoder_data(train_df)

# Identity pairs plus same-class pairs, shuffled together.
[cross_same_class_ae_x_in,  cross_same_class_ae_x_out]= prepare_train_cross_same_class_autoencoder_data(train_df, same_class_cross_df)



In [15]:

# Convolutional autoencoder built with the Keras functional API.
input_img = keras.Input(shape=(28, 28, 1))

# Encoder: three Conv2D + MaxPooling2D stages (28x28 -> 14x14 -> 7x7 -> 4x4).
x = layers.Conv2D(16, (3, 3), activation='relu', padding='same')(input_img)
x = layers.MaxPooling2D((2, 2), padding='same')(x)
x = layers.Conv2D(8, (3, 3), activation='relu', padding='same')(x)
x = layers.MaxPooling2D((2, 2), padding='same')(x)
x = layers.Conv2D(8, (3, 3), activation='relu', padding='same')(x)
encoded = layers.MaxPooling2D((2, 2), padding='same')(x)

# at this point the representation is (4, 4, 8) i.e. 128-dimensional

# Decoder: Conv2D + UpSampling2D stages (4x4 -> 8x8 -> 16x16 -> 28x28).
x = layers.Conv2D(8, (3, 3), activation='relu', padding='same')(encoded)
x = layers.UpSampling2D((2, 2))(x)
x = layers.Conv2D(8, (3, 3), activation='relu', padding='same')(x)
x = layers.UpSampling2D((2, 2))(x)
# No padding='same' here on purpose: the unpadded 3x3 conv trims 16x16 to
# 14x14, so the final upsampling lands exactly on 28x28.
x = layers.Conv2D(16, (3, 3), activation='relu')(x)
x = layers.UpSampling2D((2, 2))(x)
# Sigmoid keeps the reconstruction in [0, 1], matching the normalized inputs.
decoded = layers.Conv2D(1, (3, 3), activation='sigmoid', padding='same')(x)

autoencoder = keras.Model(input_img, decoded)
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

In [16]:
# Print the layer-by-layer structure of the functional autoencoder.
autoencoder.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 28, 28, 1)]       0         
                                                                 
 conv2d (Conv2D)             (None, 28, 28, 16)        160       
                                                                 
 max_pooling2d (MaxPooling2  (None, 14, 14, 16)        0         
 D)                                                              
                                                                 
 conv2d_1 (Conv2D)           (None, 14, 14, 8)         1160      
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 7, 7, 8)           0         
 g2D)                                                            
                                                                 
 conv2d_2 (Conv2D)           (None, 7, 7, 8)           584   

In [17]:
# Train the functional autoencoder on the prepared (input, target) pair; the
# MNIST test images are reconstructed as validation data.
# The target is passed as `trad_ae_x_out` so that switching to another prepared
# pair (e.g. the cross-same-class tensors) only requires swapping both names.
autoencoder.fit(trad_ae_x_in, trad_ae_x_out,
                epochs=2,#100
                batch_size=256,
                shuffle=True,
                validation_data=(x_test, x_test))

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7f0a28c45690>

In [33]:
from keras import layers
class Autoencoder(keras.Model):
    """Convolutional autoencoder split into an encoder and a decoder model.

    Ports the functional-API autoencoder defined earlier in this notebook:
    three Conv2D + MaxPooling2D stages compress the image and three
    Conv2D + UpSampling2D stages followed by a sigmoid Conv2D rebuild it.

    Args:
        latent_dim: Stored as ``self.latent_dim`` only; no layer is sized
            from it.
        input_shape: Symbolic input tensor (the notebook passes
            ``keras.Input(shape=(28, 28, 1))``), not a plain shape tuple. It
            becomes the first entry of the encoder.

    Attributes:
        encoder: ``tf.keras.Sequential`` mapping images to the encoded
            representation.
        decoder: ``tf.keras.Sequential`` mapping the encoded representation
            back to an image.
    """

    def __init__(self, latent_dim, input_shape ):
        # NOTE(review): the Input tensor is handed to keras.Model as both
        # `inputs` and `outputs`, which looks like a functional-style init.
        # The summary printed right after this cell lists the InputLayer, so
        # later cells may depend on it -- confirm before simplifying this to
        # a bare super().__init__().
        super().__init__(input_shape, input_shape)
        self.latent_dim = latent_dim
        self.shape = input_shape
        self.encoder = tf.keras.Sequential([
            input_shape,
            # padding='same' matches the first layer of the functional model
            # this class ports; the encoder output stays (4, 4, 8).
            layers.Conv2D(16, (3, 3), activation='relu', padding='same'),
            layers.MaxPooling2D((2,2), padding='same'),
            layers.Conv2D(8, (3, 3), activation='relu', padding='same'),
            layers.MaxPooling2D((2,2), padding='same'),
            layers.Conv2D(8, (3, 3), activation='relu', padding='same'),
            layers.MaxPooling2D((2,2), padding='same'),
        ])
        # The decoder's input shape is read from the encoder (batch axis dropped).
        encoder_output_shape = self.encoder.layers[-1].output_shape
        self.decoder = tf.keras.Sequential([
            layers.Conv2D(8, (3, 3), activation='relu', padding='same', input_shape=encoder_output_shape[1:]),
            layers.UpSampling2D((2, 2)),
            layers.Conv2D(8, (3, 3), activation='relu', padding='same'),
            layers.UpSampling2D((2, 2)),
            # Unpadded on purpose: trims 16x16 to 14x14 so the last upsampling
            # yields 28x28.
            layers.Conv2D(16, (3, 3), activation='relu'),
            layers.UpSampling2D((2, 2)),
            layers.Conv2D(1, (3, 3), activation='sigmoid', padding='same'),
        ])

    def call(self, inputs,training = False):
        """Encode ``inputs`` and decode them again.

        Args:
            inputs: Batch of images accepted by the encoder.
            training: Forwarded to both sub-models so that any
                training-dependent layer added later behaves correctly.

        Returns:
            The decoder output for the encoded ``inputs``.
        """
        encoded = self.encoder(inputs, training=training)
        decoded = self.decoder(encoded, training=training)
        return decoded


# NOTE: despite its name, `input_shape` holds a symbolic Input tensor, not a
# shape tuple; Autoencoder uses it as the first entry of its encoder.
input_shape = keras.Input(shape=(28, 28, 1))
# NOTE: this value is only stored on the model; the encoder output reported by
# ae.summary() is (4, 4, 8), independent of latent_dim.
latent_dim = 2
ae = Autoencoder(latent_dim, input_shape)
ae.compile(optimizer='adam', loss='binary_crossentropy')

In [34]:
# Print the structure of the subclassed autoencoder (encoder and decoder sub-models).
ae.summary()

Model: "autoencoder_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_15 (InputLayer)       [(None, 28, 28, 1)]       0         
                                                                 
 sequential_2 (Sequential)   (None, 4, 4, 8)           1904      
                                                                 
 sequential_3 (Sequential)   (None, 28, 28, 1)         2481      
                                                                 
Total params: 4385 (17.13 KB)
Trainable params: 4385 (17.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [36]:

# Train the subclassed autoencoder on the prepared (input, target) pair; the
# MNIST test images are reconstructed as validation data.
# `workers` / `use_multiprocessing` were removed: Keras documents them as
# applying to generator / keras.utils.Sequence input only, so they did nothing
# for in-memory tensors, and Keras 3's `fit` no longer accepts them.
ae.fit(trad_ae_x_in, trad_ae_x_out,
                epochs=3,#100
                batch_size=256,
                shuffle=True,
                validation_data=(x_test, x_test))

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7f0a1948e860>

In [27]:
# train autoencoder
# plot loss ae




In [96]:
# prepare augmented data for same-class matching
# train VAE 
# plot loss vae

In [97]:
# calculate encoded_vector_ae using autoencoder
# calculate encoded_vector_vae using Variational autoencoder

In [99]:
# create a function to calculate the top-K class metric
# calculateMetric(encoded_vector_ae)
# calculateMetric(encoded_vector_vae)

In [100]:
# data visualisation metrics

In [101]:
# discuss results

In [None]:
# if there is time, apply to another dataset