In [None]:
# Mount Google Drive at /content/drive (Colab only); every dataset path used
# below lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Preprocessing

In [None]:
import os

import cv2
import numpy as np
import pandas as pd
import skimage.io as io
from skimage.filters import threshold_otsu
from skimage.morphology import erosion, dilation
from sklearn.metrics import classification_report, precision_recall_fscore_support

In [None]:
# Paths to the folders containing the raw input images.
# The preprocessing loop scans each of these folders for '.png' files.
folder_paths = [
    '/content/drive/MyDrive/JPMC_casestudy/samples1-200/',
    '/content/drive/MyDrive/JPMC_casestudy/samples201-400/',
    '/content/drive/MyDrive/JPMC_casestudy/samples401-600/',
    '/content/drive/MyDrive/JPMC_casestudy/samples601-800/',
    '/content/drive/MyDrive/JPMC_casestudy/samples801-1000/'
]

In [None]:
# Preprocessing steps (iterate over the folders)
#
# For every '.png' file in each input folder: read it as grayscale, denoise it,
# apply one erosion and two dilations, binarise it with Otsu's threshold and
# write the result under the same file name into one shared output folder.

# The output folder is identical for every input folder, so create it once.
output_folder = '/content/drive/MyDrive/JPMC_casestudy/processed_images/'
os.makedirs(output_folder, exist_ok=True)

# The structuring elements do not depend on the image; build them once.
kernel_square = np.ones((2, 2), np.uint8)    # used by the first dilation
kernel_vertical = np.ones((4, 1), np.uint8)  # used by the second dilation

for folder_path in folder_paths:
    for filename in os.listdir(folder_path):
        if not filename.endswith('.png'):
            continue

        image_path = os.path.join(folder_path, filename)
        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if image is None:
            # cv2.imread signals an unreadable/corrupt file by returning None
            # instead of raising; skip it rather than crash in the denoiser.
            print(f"Skipping unreadable image: {image_path}")
            continue

        # Applying non-local means denoising to the grayscale image.
        # Positional arguments: h=15, templateWindowSize=25, searchWindowSize=30.
        # NOTE(review): OpenCV documents both window sizes as "should be odd";
        # 30 is even -- confirm this value is intentional.
        image_denoised = cv2.fastNlMeansDenoising(image, None, 15, 25, 30)

        # Step 1 (erosion, library-default footprint)
        img_eroded = erosion(image_denoised)

        # Step 2 (dilation with the 2x2 kernel)
        img_dilated = dilation(img_eroded, kernel_square)

        # Step 3 (dilation with the 4x1 kernel)
        img_dilated_2 = dilation(img_dilated, kernel_vertical)

        # Normalizing the image values between 0 and 1
        img_normalized = img_dilated_2 / 255.0

        # Step 4 (threshold): pixels above Otsu's threshold become 255, the rest 0
        threshold = threshold_otsu(img_normalized)
        img_binary = (img_normalized > threshold).astype(np.uint8) * 255

        # Saving the processed image in the output folder.
        # cv2.imwrite reports failure through its return value, not an exception.
        output_path = os.path.join(output_folder, filename)
        if not cv2.imwrite(output_path, img_binary):
            print(f"Failed to write processed image: {output_path}")

In [None]:
import os
from PIL import Image

def split_images_in_folder(folder_path, rows, columns, delete_original=False, save_folder=None):
    """Split every image file in ``folder_path`` into a ``rows`` x ``columns`` grid of tiles.

    Each tile is ``width // columns`` by ``height // rows`` pixels; any
    remainder pixels on the right/bottom edge are dropped. For a single-row
    split a tile is saved as ``{image_file[col]}_{col}_{image_file}.png``, so
    the first character of the tile name is the ``col``-th character of the
    source file name. When ``rows > 1`` the row index is added
    (``{image_file[col]}_{row}_{col}_{image_file}.png``) so tiles from
    different rows no longer overwrite each other.

    Args:
        folder_path: Directory whose regular files are opened as images.
        rows: Number of tile rows (>= 1).
        columns: Number of tile columns (>= 1).
        delete_original: If True, remove each source image after its tiles
            have been written.
        save_folder: Directory to write tiles into (created if missing). If
            falsy, tiles are written to the current working directory.

    Raises:
        ValueError: If ``rows`` or ``columns`` is smaller than 1.
        IndexError: If a source file name has fewer than ``columns`` characters.
    """
    if rows < 1 or columns < 1:
        raise ValueError(f"rows and columns must be >= 1, got rows={rows}, columns={columns}")

    # Create the destination once, up front, instead of re-checking per image.
    if save_folder:
        os.makedirs(save_folder, exist_ok=True)

    image_files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]

    for image_file in image_files:
        image_path = os.path.join(folder_path, image_file)

        # The context manager closes the file handle before the optional
        # os.remove below and avoids leaking one handle per image.
        with Image.open(image_path) as image:
            # width and height of the image
            width, height = image.size

            # Calculating the width and height of each cropped image
            crop_width = width // columns
            crop_height = height // rows

            for row in range(rows):
                for col in range(columns):
                    # Calculating the coordinates for cropping the image
                    left = col * crop_width
                    upper = row * crop_height
                    right = left + crop_width
                    lower = upper + crop_height

                    cropped_image = image.crop((left, upper, right, lower))

                    # Single-row names are unchanged; multi-row names carry
                    # the row index so each tile gets a unique file name.
                    if rows == 1:
                        tile_name = f"{image_file[col]}_{col}_{image_file}.png"
                    else:
                        tile_name = f"{image_file[col]}_{row}_{col}_{image_file}.png"

                    if save_folder:
                        cropped_image.save(os.path.join(save_folder, tile_name))
                    else:
                        cropped_image.save(tile_name)

        # Delete the original image
        if delete_original:
            os.remove(image_path)


# Split every processed image into 1 row x 5 columns and write the tiles to
# save_folder; the processed originals are kept (delete_original=False).
folder_path = "/content/drive/MyDrive/JPMC_casestudy/processed_images"
save_folder = "/content/drive/MyDrive/JPMC_casestudy/cropped_image"
split_images_in_folder(folder_path, 1, 5, False, save_folder)

# Random Forest

In [None]:
from skimage.transform import resize
from skimage.feature import hog
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Build the HOG feature vectors and the label list from the cropped images.
folder_path = '/content/drive/MyDrive/JPMC_casestudy/cropped_image'

images = []  # one HOG feature vector per cropped image (name kept for later cells)
labels = []  # first character of each file name

# os.listdir returns entries in arbitrary order; sorting makes the sample
# order, and therefore the seeded train/validation splits below, reproducible.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.png'):
        continue

    image_path = os.path.join(folder_path, filename)
    image = io.imread(image_path)

    # Extracting HOG features directly from the image (the earlier
    # flatten-then-reshape round trip returned the same array and was dropped).
    features = hog(image, orientations=12, pixels_per_cell=(4, 4), cells_per_block=(1, 1))

    # Getting the label from the filename (label is the first character of the filename)
    label = filename[0]

    images.append(features)
    labels.append(label)

In [None]:
# Stack the per-image HOG vectors and their labels into NumPy arrays
X_RF, y_RF = np.array(images), np.array(labels)

# Hold out 20% of the samples for validation (fixed seed)
X_train_RF, X_val_RF, y_train_RF, y_val_RF = train_test_split(
    X_RF,
    y_RF,
    test_size=0.2,
    random_state=100,
)

In [None]:
# Random Forest (200 trees, fixed seed) fitted on the training split;
# fit() returns the estimator itself, so construction and fitting are chained.
classifier = RandomForestClassifier(n_estimators=200, random_state=100).fit(X_train_RF, y_train_RF)

# Validation predictions and their accuracy
y_pred_RF = classifier.predict(X_val_RF)
accuracy = accuracy_score(y_val_RF, y_pred_RF)
print("Validation Accuracy_RF:", accuracy)

Validation Accuracy_RF: 0.639


# SVM

In [None]:
# Stack the per-image HOG vectors and their labels into NumPy arrays
X_SVM, y_SVM = np.array(images), np.array(labels)

# Hold out 20% of the samples for validation (fixed seed)
X_train_SVM, X_val_SVM, y_train_SVM, y_val_SVM = train_test_split(
    X_SVM,
    y_SVM,
    test_size=0.2,
    random_state=100,
)

In [None]:
from sklearn.svm import SVC

# Linear-kernel SVM fitted on the training split;
# fit() returns the estimator itself, so construction and fitting are chained.
classifier = SVC(kernel='linear', random_state=100).fit(X_train_SVM, y_train_SVM)

# Validation predictions and their accuracy
y_pred_SVM = classifier.predict(X_val_SVM)
accuracy = accuracy_score(y_val_SVM, y_pred_SVM)
print("Validation Accuracy_SVC:", accuracy)

Validation Accuracy_SVC: 0.586


# Logistic Regression

In [None]:
# Stack the per-image HOG vectors and their labels into NumPy arrays
X_LR, y_LR = np.array(images), np.array(labels)

# Hold out 20% of the samples for validation (fixed seed)
X_train_LR, X_val_LR, y_train_LR, y_val_LR = train_test_split(
    X_LR,
    y_LR,
    test_size=0.2,
    random_state=100,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression (up to 1000 iterations, fixed seed) fitted on the training split;
# fit() returns the estimator itself, so construction and fitting are chained.
classifier = LogisticRegression(random_state=100, max_iter=1000).fit(X_train_LR, y_train_LR)

# Validation predictions and their accuracy
y_pred_LR = classifier.predict(X_val_LR)
accuracy = accuracy_score(y_val_LR, y_pred_LR)
print("Validation Accuracy_LR:", accuracy)

Validation Accuracy_LR: 0.563


# Q1. Which of the above classifiers yields the best accuracy on the validation set? Do you think accuracy is the best metric to compare the algorithms? If not, which other metric could be used?

Random Forest yields the best validation accuracy of the three classifiers: 0.659 after the hyperparameter tuning described in Q2.

While accuracy is a commonly used metric to evaluate classification models, it may not always be the best metric, especially in cases where the classes are imbalanced or have different levels of importance. In such scenarios, other metrics like precision, recall, and F1-score can provide additional insights.

In the given classification report, we can observe that the precision, recall, and F1-score values vary across different classes. This indicates that the performance of the classifier is not consistent for all classes. Therefore, solely relying on accuracy may not be sufficient to compare the algorithms.

If a single alternative metric has to be chosen and the goal is to minimize false negatives, recall is the more relevant one.

# Q2. Tweak the hyperparameters of the classifier selected in Q1 and try to improve the accuracy. What hyperparameters did you change, and why do you think it worked?

# **RF Model**

**earlier** --> classifier = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=5, random_state=100)

**updated** --> classifier = RandomForestClassifier(n_estimators=300, max_depth=15, min_samples_split=10, random_state=100)

Validation Accuracy changed from 0.597 to 0.659

# **SVM Model**

**earlier** --> classifier = SVC(kernel='linear', random_state=100)


**updated** --> classifier = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=100)

Validation Accuracy changed from 0.586 to 0.65

# **LR Model**

**earlier** --> classifier = LogisticRegression(random_state=100, max_iter=1000)

**updated** --> classifier = LogisticRegression(C=1.0, penalty='l2', solver='liblinear', random_state=100, max_iter=1000)

Validation Accuracy changed from 0.563 to 0.553

# Q3. Find Precision, Recall and F1 score in all cases.

In [None]:
# Per-class precision / recall / F1 for the Random Forest validation predictions
report = classification_report(y_val_RF, y_pred_RF, digits=3)

# Header line and report body, each followed by a newline
print("Classification Report_RF:", report, sep="\n")

Classification Report_RF:
              precision    recall  f1-score   support

           a      0.851     0.769     0.808        52
           b      0.520     0.788     0.627        33
           c      0.605     0.622     0.613        37
           d      0.643     0.643     0.643        28
           e      0.659     0.674     0.667        43
           f      0.600     0.525     0.560        40
           g      0.833     0.690     0.755        29
           h      0.667     0.343     0.453        35
           i      0.559     0.594     0.576        32
           j      0.527     0.929     0.672        42
           k      0.643     0.562     0.600        32
           l      0.447     0.553     0.494        38
           m      0.632     0.500     0.558        48
           n      0.600     0.628     0.614        43
           o      0.744     0.707     0.725        41
           p      0.761     0.814     0.787        43
           q      0.742     0.767     0.754        30
 

In [None]:
# Per-class precision / recall / F1 for the SVM validation predictions
report = classification_report(y_val_SVM, y_pred_SVM, digits=3)

# Header line and report body, each followed by a newline
print("Classification Report_SVM:", report, sep="\n")

Classification Report_SVM:
              precision    recall  f1-score   support

           a      0.850     0.654     0.739        52
           b      0.537     0.667     0.595        33
           c      0.469     0.622     0.535        37
           d      0.487     0.679     0.567        28
           e      0.683     0.651     0.667        43
           f      0.467     0.525     0.494        40
           g      0.550     0.759     0.638        29
           h      0.545     0.514     0.529        35
           i      0.519     0.438     0.475        32
           j      0.644     0.690     0.667        42
           k      0.487     0.594     0.535        32
           l      0.477     0.553     0.512        38
           m      0.429     0.375     0.400        48
           n      0.472     0.581     0.521        43
           o      0.600     0.512     0.553        41
           p      0.789     0.698     0.741        43
           q      0.590     0.767     0.667        30


In [None]:
# Per-class precision / recall / F1 for the Logistic Regression validation predictions
report = classification_report(y_val_LR, y_pred_LR, digits=3)

# Header line and report body, each followed by a newline
print("Classification Report_LR:", report, sep="\n")

Classification Report_LR:
              precision    recall  f1-score   support

           a      0.795     0.596     0.681        52
           b      0.600     0.636     0.618        33
           c      0.486     0.486     0.486        37
           d      0.500     0.500     0.500        28
           e      0.658     0.581     0.617        43
           f      0.368     0.350     0.359        40
           g      0.750     0.724     0.737        29
           h      0.472     0.486     0.479        35
           i      0.714     0.469     0.566        32
           j      0.732     0.714     0.723        42
           k      0.568     0.656     0.609        32
           l      0.478     0.579     0.524        38
           m      0.418     0.479     0.447        48
           n      0.559     0.442     0.494        43
           o      0.619     0.634     0.627        41
           p      0.700     0.651     0.675        43
           q      0.564     0.733     0.638        30
 

# Q4. a. Find the confusion matrix using the best classifier

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix of the Random Forest on the validation set.
# `metrics` was never imported in this notebook, so the function is imported
# by name. In scikit-learn's convention entry [i, j] counts samples whose true
# class is i and whose predicted class is j.
print(confusion_matrix(y_val_RF, y_pred_RF))

[[40  0  2  0  1  0  1  0  0  0  1  1  0  0  0  0  1  0  0  1  0  0  1  0
   0  3]
 [ 0 26  0  0  0  1  0  3  1  1  0  0  0  0  0  0  0  0  0  0  0  0  1  0
   0  0]
 [ 0  0 23  0  3  0  0  0  0  0  0  1  0  0  0  1  0  4  0  1  0  0  0  2
   0  2]
 [ 1  3  0 18  0  0  0  0  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  4]
 [ 1  0  5  0 29  0  0  0  0  0  0  0  1  1  2  0  1  0  0  0  0  0  1  0
   0  2]
 [ 0  0  1  2  2 21  0  0  1  2  0  4  0  0  0  1  0  0  1  0  0  1  0  1
   0  3]
 [ 1  0  0  0  0  0 20  0  0  3  1  0  1  0  0  2  0  0  1  0  0  0  0  0
   0  0]
 [ 0 14  0  2  0  0  0 12  0  2  0  1  0  0  0  0  1  0  2  0  0  0  1  0
   0  0]
 [ 0  2  0  0  0  1  0  0 19  1  0  3  0  1  1  0  0  0  1  1  0  0  0  0
   0  2]
 [ 1  0  0  0  0  0  0  0  2 39  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]
 [ 0  0  1  0  2  1  2  0  2  1 18  0  0  0  0  0  0  0  0  0  0  2  0  1
   1  1]
 [ 1  0  0  1  0  0  0  0  0  3  0 21  0  0  0  0  0  0  1  0  0  0  0  2
   2  7]
 [ 1

# Q4. b. Which characters have low accuracy?

In [None]:
# Per-class metrics for the Random Forest, computed directly from its
# validation predictions. Parsing the text in `report` was wrong here: by this
# point `report` holds the Logistic Regression report (the last one computed),
# and slicing its text with [2:-5] depends on the exact layout.
class_labels = np.unique(np.concatenate([y_val_RF, y_pred_RF]))
precision, recall, f1, support = precision_recall_fscore_support(
    y_val_RF, y_pred_RF, labels=class_labels
)

columns = ['class', 'precision', 'recall', 'f1-score', 'support']
df = pd.DataFrame(dict(zip(columns, [class_labels, precision, recall, f1, support])))
score_columns = ['precision', 'recall', 'f1-score']
df[score_columns] = df[score_columns].round(3)

# Calculating accuracy for each class: the fraction of a class's validation
# samples that were predicted correctly, which is exactly that class's recall
# (the previous code copied the precision column instead).
df['accuracy'] = df['recall']

print("Classification Report_RF :")
sorted_df = df.sort_values(by='accuracy', ascending=False)
# Sorted in descending order, so the tail holds the five lowest-accuracy classes.
sorted_df.tail()

Classification Report_RF :


Unnamed: 0,class,precision,recall,f1-score,support,accuracy
9,j,0.527,0.929,0.672,42,0.527
1,b,0.52,0.788,0.627,33,0.52
23,x,0.515,0.436,0.472,39,0.515
11,l,0.447,0.553,0.494,38,0.447
25,z,0.446,0.694,0.543,36,0.446


# Q4. c. What can be the reason for this?

Inherent similarities between characters: Some characters like 'f', 't', 'l' have visual similarities or share common features that make it challenging for the model to distinguish between them.

Limited feature representation: The features extracted from the images of characters like 'm' may not adequately capture the discriminative information for certain characters.

# Q5. Try any other techniques/algorithms in your research that could improve the accuracy.

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Stack the per-image HOG vectors and their labels into NumPy arrays
X_KNN, y_KNN = np.array(images), np.array(labels)

# Hold out 20% of the samples for validation (fixed seed)
X_train_KNN, X_val_KNN, y_train_KNN, y_val_KNN = train_test_split(
    X_KNN,
    y_KNN,
    test_size=0.2,
    random_state=100,
)

# 10-nearest-neighbours classifier fitted on the training split;
# fit() returns the estimator itself, so construction and fitting are chained.
knn_classifier = KNeighborsClassifier(n_neighbors=10).fit(X_train_KNN, y_train_KNN)

# Validation predictions and their accuracy
y_pred_KNN = knn_classifier.predict(X_val_KNN)
accuracy_KNN = accuracy_score(y_val_KNN, y_pred_KNN)
print("Validation Accuracy_KNN:", accuracy_KNN)

Validation Accuracy_KNN: 0.53
