In [None]:
import pandas as pd
import kagglehub
import os
import matplotlib.pyplot as plt
import cv2
import numpy as np
import random
from collections import defaultdict
import tensorflow as tf
from tensorflow.keras.applications import VGG16 , ResNet50
from tensorflow.keras.layers import GlobalAveragePooling2D , Input , Dense , Dropout , LayerNormalization , Add
from tensorflow.keras import Model , layers
from tensorflow.keras.callbacks import EarlyStopping , ReduceLROnPlateau
from tensorflow.keras.utils import plot_model

In [None]:

# Download the latest version of the signature dataset through kagglehub.
# `path` is the local directory returned by the download call; the next cell
# reads the images from its `extract` sub-folder.
path = kagglehub.dataset_download("akashgundu/signature-verification-dataset")

print("Path to dataset files:", path)

In [None]:
# Index every .jpg under <path>/extract/<person_id>/ and group the file paths
# per folder. `data` has one row per image; `grouped` has one row per folder
# with the list of its image paths.
extract_root = os.path.join(path, 'extract')
image_records = []
# os.listdir returns entries in arbitrary order, so sort both levels to make
# the per-folder image lists (and anything indexing into them) reproducible.
for person_dir in sorted(os.listdir(extract_root)):
    person_path = os.path.join(extract_root, person_dir)
    if not os.path.isdir(person_path):
        # Stray files next to the person folders would make os.listdir() fail.
        continue
    for file_name in sorted(os.listdir(person_path)):
        if file_name.endswith('.jpg'):
            image_records.append({
                'image_path': os.path.join(person_path, file_name),
                'person_id': person_dir,
            })

# Explicit columns keep the groupby below working even when no image was found.
data = pd.DataFrame(image_records, columns=['image_path', 'person_id'])
grouped = data.groupby('person_id').agg(list).reset_index()

In [None]:
def load_image(image_path, label=''):
    """Read an image file and return it as a 224x224 RGB float32 array in [0, 1].

    Args:
        image_path: path of the image file to read with OpenCV.
        label: accepted but not used by this function.

    Returns:
        numpy float32 array: the image converted from BGR to RGB, resized to
        224x224 and divided by 255.

    Raises:
        ValueError: if OpenCV could not read the file (``cv2.imread`` gave None).
    """
    bgr_pixels = cv2.imread(image_path)
    if bgr_pixels is None:
        raise ValueError(f"Image not found at {image_path}")

    rgb_pixels = cv2.cvtColor(bgr_pixels, cv2.COLOR_BGR2RGB)
    resized_pixels = cv2.resize(rgb_pixels, (224, 224))
    # Scale 8-bit intensities to the unit interval before the float32 cast.
    return np.array(resized_pixels / 255.0, dtype=np.float32)

def generate_triplets(number_of_triplets):
    """Build (anchor, positive, negative) image triplets from ``grouped``.

    Rows of the module-level ``grouped`` frame whose ``person_id`` contains no
    underscore are treated as genuine; every other row is treated as forged.
    For each triplet, one random genuine row supplies two distinct images
    (anchor and positive) and one random forged row supplies the negative.
    The forged row is drawn independently of the genuine row.

    Args:
        number_of_triplets: number of triplets to generate.

    Returns:
        Tuple of three numpy arrays (anchors, positives, negatives), each
        holding ``number_of_triplets`` images as returned by ``load_image``.

    Raises:
        ValueError: if no genuine row has at least two images, or if there is
            no forged row to sample from.
    """
    if number_of_triplets <= 0:
        # Same result as running the sampling loop zero times.
        return np.array([]), np.array([]), np.array([])

    # The genuine/forged partition does not change between triplets, so it is
    # computed once instead of on every iteration.
    genuine_rows = grouped[grouped["person_id"].apply(lambda x: len(x.split("_")) == 1)]
    forged_rows = grouped[grouped["person_id"].apply(lambda x: len(x.split("_")) != 1)]

    # Anchor and positive must be two different images of the same row.
    genuine_path_lists = [paths for paths in genuine_rows["image_path"] if len(paths) >= 2]
    forged_path_lists = [paths for paths in forged_rows["image_path"] if len(paths) >= 1]
    if not genuine_path_lists:
        raise ValueError("Not enough genuine rows to sample from.")
    if not forged_path_lists:
        raise ValueError("Not enough forged rows to sample from.")

    anchor = []
    positive = []
    negative = []
    for _ in range(number_of_triplets):
        genuine_paths = random.choice(genuine_path_lists)
        forged_paths = random.choice(forged_path_lists)

        # Sample from the actual list lengths rather than assuming that every
        # folder holds exactly ten images.
        anchor_path, positive_path = random.sample(genuine_paths, 2)
        negative_path = random.choice(forged_paths)

        anchor.append(load_image(anchor_path))
        positive.append(load_image(positive_path))
        negative.append(load_image(negative_path))

    return np.array(anchor), np.array(positive), np.array(negative)

In [None]:
def triplet_loss(y_true, y_pred, alpha=0.5):
    """Triplet margin loss: mean(max(d(a, p) - d(a, n) + alpha, 0)).

    Args:
        y_true: not used; present so the function has the
            ``loss(y_true, y_pred)`` signature.
        y_pred: object indexed as ``y_pred[0]``, ``y_pred[1]``, ``y_pred[2]``
            to obtain the anchor, positive and negative embeddings (for
            example a tuple of three tensors). NOTE: a single 2-D tensor would
            be indexed along its first axis, i.e. its first three rows.
        alpha: margin added to the distance difference before clamping at 0.

    Returns:
        Scalar tensor: the mean clamped margin over all triplets.
    """
    anchor, positive, negative = y_pred[0], y_pred[1], y_pred[2]

    # The derivative of sqrt is unbounded at 0, which produces NaN gradients
    # when two embeddings coincide; clamp the squared distance slightly above 0.
    min_squared_distance = 1e-12

    # Euclidean distances anchor<->positive and anchor<->negative.
    squared_pos = tf.reduce_sum(tf.square(anchor - positive), axis=-1)
    squared_neg = tf.reduce_sum(tf.square(anchor - negative), axis=-1)
    distance1 = tf.sqrt(tf.maximum(squared_pos, min_squared_distance))
    distance2 = tf.sqrt(tf.maximum(squared_neg, min_squared_distance))

    # Hinge on the margin, averaged over the triplets.
    loss = tf.reduce_mean(tf.maximum(distance1 - distance2 + alpha, 0.0))
    return loss

In [None]:
def create_cnn_embedding_model(embedding_dim=128, input_shape=(224, 224, 3)):
    """Build the convolutional network that maps an image to an embedding.

    The network is five 3x3 'same'-padded ReLU convolution stages with 64, 128,
    256, 512 and 1024 filters, each followed by 2x2 max pooling; the last two
    stages apply dropout (0.4 and 0.5) between the convolution and the
    pooling. Global average pooling over the final feature map is the output.

    Args:
        embedding_dim: accepted but not referenced by the body; the output
            width is the filter count of the last convolution.
        input_shape: shape of a single input image (without the batch axis).

    Returns:
        A Keras ``Model`` from the image input to the pooled feature vector.
    """
    # (number of filters, dropout rate or None) for each stage, in order.
    stage_specs = (
        (64, None),
        (128, None),
        (256, None),
        (512, 0.4),
        (1024, 0.5),
    )

    inputs = tf.keras.Input(shape=input_shape)
    features = inputs
    for n_filters, dropout_rate in stage_specs:
        features = layers.Conv2D(n_filters, 3, activation='relu', padding='same')(features)
        if dropout_rate is not None:
            features = layers.Dropout(dropout_rate)(features)
        features = layers.MaxPooling2D(2, 2)(features)

    embeddings = GlobalAveragePooling2D()(features)
    return Model(inputs=inputs, outputs=embeddings)

# Instantiate the embedding network with its default arguments and print a
# layer-by-layer summary. `model` is the module-level network that the training
# cell and the embedding-extraction cell below operate on.
model = create_cnn_embedding_model()
model.summary()

In [None]:
# Halve the learning rate when the training loss stops improving for 2 epochs.
reduce_lr = ReduceLROnPlateau(
    monitor='loss',
    factor=0.5,
    patience=2,
    verbose=1,
    min_lr=1e-6
)
batch_size = 100
n_of_samples = 1000
a , p , n = generate_triplets(n_of_samples)  # anchors, positives, negatives

# Siamese wrapper: the same `model` (shared weights) embeds all three images of
# a triplet, so training this wrapper trains `model` itself. The three
# embeddings are concatenated along the feature axis into one output tensor.
anchor_in = Input(shape=a.shape[1:], name='anchor')
positive_in = Input(shape=p.shape[1:], name='positive')
negative_in = Input(shape=n.shape[1:], name='negative')
concatenated_embeddings = layers.Concatenate(axis=-1)(
    [model(anchor_in), model(positive_in), model(negative_in)]
)
triplet_model = Model(
    inputs=[anchor_in, positive_in, negative_in],
    outputs=concatenated_embeddings,
)


def _concatenated_triplet_loss(y_true, y_pred):
    """Split the concatenated output back into three embeddings and score them."""
    anchor_emb, positive_emb, negative_emb = tf.split(y_pred, 3, axis=-1)
    return triplet_loss(y_true, (anchor_emb, positive_emb, negative_emb))


triplet_model.compile(
    loss=_concatenated_triplet_loss,
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
)

# The loss ignores its targets; zeros are supplied only because fit() expects
# (inputs, targets) pairs. A bare (a, p, n) element would instead be read by
# fit() as (inputs, targets, sample_weights), leaving the positives and
# negatives out of the forward pass.
y_train = np.zeros((n_of_samples, 3))
dataset = (
    tf.data.Dataset.from_tensor_slices(
        ({'anchor': a, 'positive': p, 'negative': n}, y_train)
    )
    .shuffle(buffer_size=1024)  # shuffle individual triplets, then batch
    .batch(batch_size)
)
history = triplet_model.fit(dataset, epochs=30, callbacks=[reduce_lr])

***Using the trained CNN to get the embeddings***

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import roc_curve, auc

def _embed_first_images(rows):
    """Return one embedding per row, computed from the row's first image.

    Args:
        rows: frame with an ``image_path`` column holding a list of paths.

    Returns:
        List of 1-D embedding arrays, in row order (empty if ``rows`` is empty).
    """
    first_images = [load_image(paths[0]) for paths in rows['image_path']]
    if not first_images:
        return []
    # One batched predict call instead of one call per image.
    return list(model.predict(np.stack(first_images)))


# Rows whose person_id has no underscore are genuine; the rest are forged.
genuine_rows = grouped[grouped["person_id"].apply(lambda x: len(x.split("_")) == 1)]
forged_rows = grouped[grouped["person_id"].apply(lambda x: len(x.split("_")) != 1)]

# Label 1 for genuine rows, 0 for forged rows (one label per row).
genuine_labels = np.ones(len(genuine_rows))
forged_labels = np.zeros(len(forged_rows))

genuine_embedding = _embed_first_images(genuine_rows)
forged_embedding = _embed_first_images(forged_rows)
  

In [None]:
# Turn the label vectors into single-column matrices so they can be appended
# to the embedding matrices as the last column.
genuine_labels = np.array(genuine_labels).reshape(-1, 1)
forged_labels = np.array(forged_labels).reshape(-1, 1)

# Force a (n_rows, n_features) layout explicitly. np.squeeze would also drop
# the row axis when a group contains exactly one embedding, which breaks the
# horizontal stack with the (1, 1) label column.
genuine_features = np.array(genuine_embedding).reshape(len(genuine_embedding), -1)
forged_features = np.array(forged_embedding).reshape(len(forged_embedding), -1)

# Each row: embedding values followed by the label.
genuine_signatures = np.hstack((genuine_features, genuine_labels))
forged_signatures = np.hstack((forged_features, forged_labels))

In [None]:
# Combine both datasets (rows: embedding values followed by the label)
final_dataset = np.vstack((genuine_signatures, forged_signatures))
# Shuffle the rows in place with a fixed seed; an unseeded shuffle here would
# make the split below differ between runs despite its random_state.
np.random.default_rng(42).shuffle(final_dataset)
# Separate features (X) and labels (y)
X = final_dataset[:, :-1]  # Features
y = final_dataset[:, -1]   # Labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Write the splits to Google Drive when it is available; otherwise fall back
# to the working directory instead of failing on a missing folder.
output_dir = '/content/drive/MyDrive'
if not os.path.isdir(output_dir):
    output_dir = os.getcwd()
    print(f"Drive folder not found; writing CSV files to {output_dir}")

df_train = pd.DataFrame(X_train)
df_train['label'] = y_train
df_train.to_csv(os.path.join(output_dir, 'train_data.csv'), index=False)

df_test = pd.DataFrame(X_test)
df_test['label'] = y_test
df_test.to_csv(os.path.join(output_dir, 'test_data.csv'), index=False)

In [None]:

# Fit a 100-tree random forest (fixed seed) on the training embeddings;
# fit() returns the estimator itself, so the two steps are chained.
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Hard predictions on the held-out split and their accuracy.
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1.
print("Classification Report:")
print(classification_report(y_test, y_pred))

# ROC curve from the predicted probability of the second class column.
positive_class_scores = rf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_class_scores)
plt.plot(fpr, tpr)
plt.title("ROC Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.show()
