In [504]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np


## Data exploration and preparation

>#### Load the 'A_Z Handwritten Data.csv' dataset

In [505]:
# Read the handwritten-letters CSV into a DataFrame.
# The path is relative to the directory the notebook is run from.
DATASET_PATH = 'A_Z Handwritten Data.csv'
df = pd.read_csv(DATASET_PATH)

In [506]:
# Preprocess an independent (deep) copy so that `df` stays exactly as loaded.
df_pre = df.copy(deep=True)

In [None]:
# Per-column summary statistics of the working copy.
display(df_pre.describe())

print("-" * 80)

# Number of rows in the dataset as loaded.
print("total records:", len(df), "\n")


>#### Identify the number of unique classes and show their distribution.


In [None]:
# The first column is the target: numeric labels 0-25 standing for the letters A-Z.
# It is selected by position rather than by the header name "0", so this cell
# does not depend on how the CSV header happens to be spelled.
labels = df_pre.iloc[:, 0].to_numpy()

# Count how many samples each label has.
unique_classes, counts = np.unique(labels, return_counts=True)

# Human-readable letter for every label: 0 -> 'A', 1 -> 'B', ...
class_letters = [chr(int(cls) + ord('A')) for cls in unique_classes]

print("Unique classes and their counts:")
for letter, count in zip(class_letters, counts):
    print(f"{letter}: {count}")
print("-" * 80)

# Show the distribution. The ticks are relabelled with the letters so the axis
# really shows A-Z (as its label says) instead of the raw numbers 0-25.
fig, ax = plt.subplots(figsize=(12, 5))
ax.bar(unique_classes, counts)
ax.set_xticks(unique_classes)
ax.set_xticklabels(class_letters)
ax.set_title("Distribution of Classes")
ax.set_xlabel("Class Labels (A-Z)")
ax.set_ylabel("Frequency")
plt.show()


>#### Normalize each image. 


In [509]:
# Convert to float32 and divide by 255 so the values fall between 0 and 1.
# Note: the whole frame is divided, so the label column (position 0) is scaled
# as well -- the labels must keep being read from `df_pre`, not from this frame.
df_normalized = df_pre.astype("float32").div(255.0)


>#### Reshape the flattened vectors to reconstruct and display the corresponding images while testing the models. 

In [None]:

# Labels: first column of the un-normalised frame
# (iloc[rows, columns]: ":" selects every row, 0 selects the first column).
df_targets = df_pre.iloc[:, 0].to_numpy()

# Pixels: every column after the first ("1:"), taken from the normalised frame.
pixel_matrix = df_normalized.iloc[:, 1:].to_numpy()

# Turn each flat row into a 28x28 grid. "-1" lets NumPy work out the number of
# images from the total size, so the result is a 3-D array in which every
# index along the first axis holds one 2-D (28 x 28) image.
df_images = pixel_matrix.reshape(-1, 28, 28)

print("Images shape:", df_images.shape)

print("-" * 80)

# The first image as a 2-D array.
print(df_images[0])


>#### Letters Visualization

In [None]:
# Pick one example image per letter.
# np.unique(..., return_index=True) yields, for every class, the position of its
# first occurrence in `df_targets`. Offsets derived from cumulative class counts
# point at the right image only when the rows are sorted by label; this lookup
# is correct for any row order.
letters, first_indices = np.unique(df_targets, return_index=True)

# Start offset of every class under the assumption that rows are grouped by
# label (the last entry is the total row count). It is no longer used to pick
# the images; the name stays defined for any other cell that may read it.
cumulative_counts = [0, *np.cumsum(counts).tolist()]

# One figure holding all letters on a 4 x 7 grid (28 slots for 26 letters).
fig = plt.figure(figsize=(15, 10))

for position, (letter, idx) in enumerate(zip(letters, first_indices), start=1):
    ax = fig.add_subplot(4, 7, position)

    # imshow renders the 2-D array as a grayscale image.
    ax.imshow(df_images[idx], cmap='gray')

    # Show the letter as a character: offset the numeric label from ord('A').
    ax.set_title(f"Letter: {chr(int(letter) + ord('A'))}")

    # Axes carry no information for an image thumbnail.
    ax.axis("off")

fig.tight_layout()
plt.show()

>#### Split the data into training and testing datasets

In [None]:

# Hold out 20% of the images as the test set; the remaining 80% are for training.
# random_state fixes the shuffle so every run returns the same split.
# stratify keeps each letter's share approximately the same in both parts, so
# rare letters are not left out of one side by chance.
df_images_train, df_images_test, df_targets_train, df_targets_test = train_test_split(
    df_images,
    df_targets,
    test_size=0.2,
    random_state=42,
    stratify=df_targets,
)

# Check that the training set and the testing set contain all the unique classes.
targets_train_unique_classes = np.unique(df_targets_train)

print("No. Of Unique classes in targets train : \n" ,len(targets_train_unique_classes) )
print("-"*80)

targets_test_unique_classes = np.unique(df_targets_test)

print("No. Of Unique classes in targets test : \n" ,len(targets_test_unique_classes) )
print("-"*80)

# Fail loudly instead of relying on someone reading the two printed numbers.
assert np.array_equal(targets_train_unique_classes, targets_test_unique_classes), \
    "training and testing sets do not contain the same classes"



In [513]:
# Carve a validation set (20%) out of the training portion. The split is
# stratified on the training labels so the per-letter proportions stay
# approximately the same in the training and validation parts.
X_train, X_val, y_train, y_val = train_test_split(
    df_images_train,
    df_targets_train,
    test_size=0.2,
    random_state=42,
    stratify=df_targets_train,
)

In [514]:
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

    Parameters
    ----------
    z : scalar or array-like
        Input value(s).

    Returns
    -------
    Scalar for scalar input, otherwise an array of the same shape as ``z``.

    The direct formula computes ``exp(-z)``, which overflows for large
    negative ``z``. Here only ``exp(-|z|)`` is evaluated, which never exceeds 1.
    """
    z = np.asarray(z)
    e = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z))            (same expression as the direct formula)
    # z <  0: exp(z) / (1 + exp(z))        (algebraically identical, overflow-free)
    out = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # arrays with one or more dimensions unchanged.
    return out[()]

In [515]:
def cost_function(X, y, theta):
    """Mean binary cross-entropy of a logistic model.

    Parameters
    ----------
    X : array of shape (m, n)
        Design matrix.
    y : array of shape (m,)
        Binary targets (0 or 1).
    theta : array of shape (n,)
        Model parameters.

    Returns
    -------
    float
        (-1/m) * sum(y*log(h) + (1-y)*log(1-h)) with h = sigmoid(X @ theta).

    The value is computed from the logits z = X @ theta through the identity
    -[y*log(h) + (1-y)*log(1-h)] = log(1 + exp(z)) - y*z.
    Evaluating log(h) and log(1-h) directly yields log(0) = -inf (and then
    inf/nan costs) as soon as the sigmoid saturates to exactly 0 or 1;
    np.logaddexp(0, z) computes log(1 + exp(z)) without that problem.
    """
    m = len(y)
    z = X @ theta
    per_sample_loss = np.logaddexp(0, z) - y * z
    return np.sum(per_sample_loss) / m

In [516]:
def gradient_ascent(X, y, theta, learning_rate, iterations, X_val=None, y_val=None, eval_every=20):
    """Fit a binary logistic-regression model by batch gradient ascent.

    Each iteration moves ``theta`` along ``(1/m) * X.T @ (y - sigmoid(X @ theta))``.

    Parameters
    ----------
    X : array of shape (m, n)
        Training design matrix.
    y : array of shape (m,)
        Binary training targets (0 or 1).
    theta : array of shape (n,)
        Initial parameters. The array passed in is NOT modified.
    learning_rate : float
        Step size of the update.
    iterations : int
        Number of update steps.
    X_val, y_val : arrays, optional
        Validation data. Validation metrics are tracked only when both are given;
        ``y_val`` must be binary, like ``y``.
    eval_every : int, default 20
        Accuracies are recorded on iterations ``i`` with ``i % eval_every == 0``.

    Returns
    -------
    theta : fitted parameters
    costs : training cost after every iteration
    val_costs : validation cost after every iteration (empty without validation data)
    train_accuracies : training accuracy, one value per recorded iteration
    val_accuracies : validation accuracy, one value per recorded iteration

    Raises
    ------
    ValueError
        If ``eval_every`` is smaller than 1.
    """
    if eval_every < 1:
        raise ValueError("eval_every must be >= 1")

    m = len(y)
    # Work on a float copy: the caller's array stays untouched, and an
    # integer-typed initial theta cannot break the in-place update below.
    theta = np.array(theta, dtype=float)
    has_validation = X_val is not None and y_val is not None

    costs = []
    val_costs = []
    train_accuracies = []
    val_accuracies = []

    for i in range(iterations):
        h = sigmoid(X @ theta)
        gradient = (1 / m) * (X.T @ (y - h))
        theta += learning_rate * gradient
        costs.append(cost_function(X, y, theta))

        record_accuracy = i % eval_every == 0

        if record_accuracy:
            # Use the freshly updated theta so that training accuracy, training
            # cost and the validation metrics all describe the same parameters.
            train_pred = (sigmoid(X @ theta) >= 0.5).astype(int)
            train_accuracies.append(np.mean(train_pred == y))

        if has_validation:
            val_costs.append(cost_function(X_val, y_val, theta))
            if record_accuracy:
                val_pred = (sigmoid(X_val @ theta) >= 0.5).astype(int)
                val_accuracies.append(np.mean(val_pred == y_val))

    return theta, costs, val_costs, train_accuracies, val_accuracies

In [517]:
def one_vs_all(X, y, num_classes, learning_rate=0.01, iterations=1000, X_val=None, y_val=None):
    """Train one binary logistic classifier per class (one-vs-all).

    Parameters
    ----------
    X : array of shape (m, n)
        Training design matrix.
    y : array of shape (m,)
        Multiclass training labels; class ``c`` is encoded as the integer ``c``.
    num_classes : int
        Number of classes; classifiers are trained for labels 0 .. num_classes-1.
    learning_rate, iterations :
        Passed through to ``gradient_ascent``.
    X_val, y_val : arrays, optional
        Validation data with multiclass labels encoded like ``y``.

    Returns
    -------
    all_theta : array of shape (num_classes, n), row ``c`` is classifier ``c``
    all_costs, all_val_costs, all_train_accuracies, all_val_accuracies :
        lists with one entry per class, each being the corresponding history
        returned by ``gradient_ascent``.
    """
    n = X.shape[1]
    all_theta = np.zeros((num_classes, n))
    all_costs = []
    all_val_costs = []
    all_train_accuracies = []
    all_val_accuracies = []

    for c in range(num_classes):
        print(f"Training classifier for class {c}...")
        y_binary = (y == c).astype(int)
        # The validation labels must be binarised for class c as well. Handing
        # the raw multiclass labels to a binary classifier would make its
        # validation cost and accuracy meaningless.
        y_val_binary = None if y_val is None else (np.asarray(y_val) == c).astype(int)

        theta, costs, val_costs, train_accuracies, val_accuracies = gradient_ascent(
            X, y_binary, np.zeros(n), learning_rate, iterations, X_val, y_val_binary
        )

        all_theta[c] = theta
        all_costs.append(costs)
        all_val_costs.append(val_costs)
        all_train_accuracies.append(train_accuracies)
        all_val_accuracies.append(val_accuracies)

    return all_theta, all_costs, all_val_costs, all_train_accuracies, all_val_accuracies

In [518]:
def predict(X, all_theta):
    """Return, for every row of ``X``, the index of the best-scoring classifier.

    Parameters
    ----------
    X : array of shape (m, n)
        Design matrix.
    all_theta : array of shape (num_classes, n)
        One parameter row per class.

    Returns
    -------
    Array of shape (m,) with the predicted class index of each row.

    The argmax is taken over the raw scores ``X @ all_theta.T``. The sigmoid is
    strictly increasing, so it cannot change which class is largest; applying
    it can only lose information, because large scores all round to exactly
    1.0 and the resulting tie is then resolved in favour of the lowest class index.
    """
    scores = X @ all_theta.T
    return np.argmax(scores, axis=1)

In [None]:
# Prepare the design matrices for logistic regression and train the model.
# NOTE: this cell overwrites X_train / X_val in place, so running it a second
# time without re-running the split cell would prepend a second bias column.

# Flatten every image into a single row of pixels.
n_train = X_train.shape[0]
n_val = X_val.shape[0]
n_test = df_images_test.shape[0]
X_train = X_train.reshape(n_train, -1)
X_val = X_val.reshape(n_val, -1)
X_test = df_images_test.reshape(n_test, -1)

# Prepend a column of ones that acts as the intercept (bias) term.
X_train = np.column_stack((np.ones(n_train), X_train))
X_test = np.column_stack((np.ones(n_test), X_test))
X_val = np.column_stack((np.ones(n_val), X_val))

# Training settings.
num_classes = len(unique_classes)
learning_rate = 0.1
iterations = 1000

# Number of training rows and of training labels.
print(len(X_train))
print(len(y_train))

# One binary classifier per class; validation data is passed for monitoring.
all_theta, all_costs, val_costs, all_train_accuracies, all_val_accuracies = one_vs_all(
    X_train, y_train, num_classes, learning_rate, iterations, X_val, y_val
)

In [None]:
# Plot the per-class training cost and validation cost, one figure each.
# Both figures are drawn by the same loop and both end with plt.show(), so the
# cell does not leave a stray Legend repr as its output.
cost_figures = (
    ("Cost Function", all_costs),
    ("Validation Cost", val_costs),
)

for title, curves in cost_figures:
    fig, ax = plt.subplots(figsize=(10, 6))
    for c in range(num_classes):
        ax.plot(curves[c], label=f"Class {c}")
    ax.set_title(title)
    ax.set_xlabel("Iterations")
    ax.set_ylabel("Cost")
    ax.legend()
    plt.show()

In [None]:
# Plot the per-class training and validation accuracy, one figure each.
# gradient_ascent records an accuracy only on iterations where i % 20 == 0, so
# the k-th stored value belongs to iteration k * 20. Plotting against that
# value makes the "Iterations" axis show real iteration numbers instead of
# sample indices. Keep this constant in sync with the training loop.
ACCURACY_EVAL_EVERY = 20

accuracy_figures = (
    ("Training Accuracy", all_train_accuracies),
    ("Validation Accuracy", all_val_accuracies),
)

for title, curves in accuracy_figures:
    fig, ax = plt.subplots(figsize=(10, 6))
    for c in range(num_classes):
        iteration_axis = np.arange(len(curves[c])) * ACCURACY_EVAL_EVERY
        ax.plot(iteration_axis, curves[c], label=f"Class {c}")
    ax.set_title(title)
    ax.set_xlabel("Iterations")
    ax.set_ylabel("Accuracy")
    ax.legend()
    plt.show()

In [None]:
import seaborn as sns

# Predict on the training, validation and testing data.
y_pred_train = predict(X_train, all_theta)
y_pred_val = predict(X_val, all_theta)
y_pred_test = predict(X_test, all_theta)
print(len(all_theta))

# Evaluate the model. Every accuracy is computed on the split it is named
# after: the figure reported as "Testing Accuracy" comes from the held-out test
# set, and the validation accuracy is reported under its own name.
train_accuracy = accuracy_score(y_train, y_pred_train)
val_accuracy = accuracy_score(y_val, y_pred_val)
test_accuracy = accuracy_score(df_targets_test, y_pred_test)

print(f"Training Accuracy: {train_accuracy * 100:.2f}%")
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")
print(f"Testing Accuracy: {test_accuracy * 100:.2f}%")

# class_ids are the numeric labels, class_labels the matching letters A to Z.
class_ids = list(range(26))
class_labels = [chr(i + ord('A')) for i in class_ids]

# Confusion matrix and F1 score on the test set. Passing `labels` pins the
# matrix to one row/column per letter, so it always lines up with the tick
# labels of the heatmap, even if some letter never occurs in the test data.
conf_matrix = confusion_matrix(df_targets_test, y_pred_test, labels=class_ids)
print("Confusion Matrix:")
print(conf_matrix)

f1 = f1_score(df_targets_test, y_pred_test, average='weighted')
print(f"Average F1 Score: {f1:.2f}")

# Visualize the confusion matrix as a heatmap.
plt.figure(figsize=(18, 10))
sns.heatmap(conf_matrix,
            annot=True,
            fmt='g',
            cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels)

plt.ylabel('Actual', fontsize=13)
plt.title('Confusion Matrix', fontsize=17, pad=20)
# Put the prediction axis (label and ticks) on top of the matrix.
plt.gca().xaxis.set_label_position('top')
plt.xlabel('Prediction', fontsize=13)
plt.gca().xaxis.tick_top()

plt.show()