In [133]:
import cv2 as cv
from glob import glob
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from mpl_toolkits.axes_grid1 import ImageGrid
import math
import os
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sn
from sklearn.model_selection import train_test_split


# initialize data frames for import
asl_alpha_df = pd.DataFrame({"images":[], "label":[]})
asl_digits_df = pd.DataFrame({"images":[], "label":[]})
lg_df = pd.DataFrame({"images":[], "label":[]})

# import paths
asl_path = '../raw data/asl_dataset/'
lg_path = '../raw data/asl_lg/'
asl_folders = glob(asl_path+'/*')
lg_folders = glob(lg_path+'/*')

# class set variations
letter_classes = [i for i in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"]
all_classes = [i for i in "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"]

# import asl data

def import_data(df, folders, isDigits):
    for path in folders:
        img_path = glob(path+'/*')
        _,label = os.path.split(path)
        files = os.listdir(path)
        for filename in files:
            im = cv.imread(path+'/'+filename, 64).flatten() # 64 is for IMREAD_REDUCED_GRAYSCALE_8 (grayscale, 1/8th)
            if (path[-1].isdigit() == isDigits):
                data = pd.DataFrame({"images":[im], "label":[label]})
                df = df.append(data, ignore_index = True)
    return df


In [134]:
asl_alpha_df = import_data(asl_alpha_df, asl_folders, False)
asl_alpha_df.to_csv('./dataframes/asl_alpha_df.csv')

asl_digits_df = import_data(asl_digits_df, asl_folders, True)
asl_digits_df.to_csv('./dataframes/asl_digits_df.csv')

lg_df = import_data(lg_df, lg_folders, False)
lg_df.to_csv('./dataframes/lg_df.csv')

asl_alpha_digits_df = asl_digits_df.append(asl_alpha_df, ignore_index = True)
asl_alpha_digits_df.to_csv('./dataframes/asl_alpha_digits_df.csv')

asl_alpha_lg_df = asl_alpha_df.append(lg_df, ignore_index = True)
asl_alpha_lg_df.to_csv('./dataframes/asl_alpha_lg_df.csv')

def save_data(arr, prefix):
    np.savetxt('./train_test/{}/X_train.csv'.format(prefix), arr[0], delimiter=',')
    np.savetxt('./train_test/{}/X_test.csv'.format(prefix), arr[1], delimiter=',')
    np.savetxt('./train_test/{}/y_train.csv'.format(prefix), arr[2], delimiter=',', fmt='%s')
    np.savetxt('./train_test/{}/y_test.csv'.format(prefix), arr[3], delimiter=',', fmt='%s')
    print('saved data in ./train_test/{}'.format(prefix))

# Base dataset - test split on alpha and digits asl data
X = np.vstack(asl_alpha_digits_df['images'])
y = asl_alpha_digits_df['label']
X_train_ad, X_test_ad, y_train_ad, y_test_ad = train_test_split(X, y, test_size=0.25, random_state=0)
save_data([X_train_ad, X_test_ad, y_train_ad, y_test_ad], 'base')


# Combined dataset - test split on alpha asl + lg
X_alpha_lg = np.vstack(asl_alpha_lg_df['images'])
y_alpha_lg = asl_alpha_lg_df['label']
X_train_alpha_lg, X_test_alpha_lg, y_train_alpha_lg, y_test_alpha_lg = train_test_split(X_alpha_lg, y_alpha_lg, test_size=0.25, random_state=0)
save_data([X_train_alpha_lg, X_test_alpha_lg, y_train_alpha_lg, y_test_alpha_lg], 'combo')

# Alpha train - lg test
X_alpha_train = np.vstack(asl_alpha_df['images'])
y_alpha_train = asl_alpha_df['label']
X_lg_test = np.vstack(lg_df['images'])
y_lg_test = lg_df['label']
save_data([X_alpha_train, X_lg_test, y_alpha_train, y_lg_test], 'alphalg')

saved data in ./train_test/base
saved data in ./train_test/combo
saved data in ./train_test/alphalg


In [136]:
gridsize = 5
numfigs = gridsize**2
figsize = (10, 10)
cols = gridsize
rows = gridsize

def trim(axs, N):
    axs = axs.flat
    n = min(N, len(axs))
    for ax in axs[n:]:
        ax.remove()
    return axs[:n]

def printFigs(X, y, path):
    plot, axs = plt.subplots(rows, cols, figsize=figsize)
    axs = trim(axs, numfigs)
    for ax, fig in zip(axs, range(0, (y.size - 1), math.floor((y.size - 1)/numfigs))):
        sign = X[fig]
        sign_pixels = sign.reshape(50, 50)
        ax.imshow(sign_pixels)
        ax.axis('off')
        ax.set_title('label = {}'.format(y[fig]))
    plt.savefig(path)
    plt.close()
    print('fig saved under {}'.format(path))

printFigs(X, y, './previews/asl.jpg')
printFigs(X_lg_test, y_lg_test, './previews/lg.jpg')
        

fig saved under ./previews/asl.jpg
fig saved under ./previews/lg.jpg


In [137]:
scores = {}

def generate_heatmap(logisticRegr, classes, prefix):
    scale = np.max(np.abs(logisticRegr.coef_))
    p = plt.figure(figsize=(15,15));
    grid = ImageGrid(p, 111,  # similar to subplot(111)
                     nrows_ncols=(6, 6),
                     axes_pad=0.25,
                    )
    nclasses = len(classes)
    for ax, im in zip(grid, [i for i in range(nclasses)]):
        ax.imshow(logisticRegr.coef_[im].reshape(50, 50),
                      cmap=plt.cm.RdBu, vmin=-scale, vmax=scale);
        ax.axis('off')
        ax.set_title(classes[im])
    plt.savefig('./heatmaps/{}.png'.format(prefix))
    plt.close()

def generate_classification_report(y_test, predictions, prefix):
    print(
        f"Classification report for {prefix}:\n"
        f"{classification_report(y_test, predictions)}\n"
    )

def generate_confusion_matrix(y_test, predictions, classes, prefix):
    array = confusion_matrix(y_test, predictions)
    df_cm = pd.DataFrame(array, index = classes,
                  columns = classes)
    plt.figure(figsize = (10,7))
    sn.heatmap(df_cm, annot=True)
    plt.savefig('./cmatrix/{}.png'.format(prefix))
    plt.close()
    
def generate_heatmap_overlay(misclassifiedIndexes, test_lbl, test_img, predictions, logisticRegr, classes, prefix):
    index = misclassifiedIndexes[10]
    actual = test_lbl[index]
    pred = predictions[index]
    img_arr = test_img[index]
    img = img_arr.reshape(50, 50)

    mask = logisticRegr.coef_[classes.index(actual.upper())].reshape(50, 50);
    plt.imshow(img)
    plt.imshow(mask, alpha=0.5, cmap=plt.cm.RdBu) #alpha sets transparency, cmap can choose color
    plt.savefig('./heatmaps/overlay_{}_actual.png'.format(prefix))
    plt.close()
    
    mask_2 = logisticRegr.coef_[classes.index(pred.upper())].reshape(50, 50);
    plt.imshow(img)
    plt.imshow(mask_2, alpha=0.5, cmap=plt.cm.RdBu) #alpha sets transparency, cmap can choose color
    plt.savefig('./heatmaps/overlay_{}_pred.png'.format(prefix))
    plt.close()
    
def generate_misclassified_matrix(X_test, y_test, predictions, logisticRegr, classes, prefix):
    index = 0
    misclassifiedIndexes = []
    for label, predict in zip(y_test, predictions):
        if label != predict:
            misclassifiedIndexes.append(index)
        index +=1
        
    missedCount = len(misclassifiedIndexes)
    print('Count of misclassified: {}, total test imgs: {}'.format(missedCount, len(predictions)))
    print('Percent misclassified: {}%'.format(missedCount / len(predictions) * 100))
    print('\nFirst 25 misclassifications:')

    test_img = np.array(X_test)
    test_lbl = np.array(y_test)

    plot, axs = plt.subplots(rows, cols, figsize=figsize)
    axs = trim(axs, numfigs)
    for ax, badindex in zip(axs, misclassifiedIndexes[0:25]):
        digit = test_img[badindex]
        digit_pixels = digit.reshape(50, 50)
        ax.imshow(digit_pixels)
        ax.axis('off')
        ax.set_title('pre: {}, act: {}'.format(predictions[badindex], test_lbl[badindex]), fontsize = 15)
    plt.savefig('./misclassified/{}.png'.format(prefix))
    plt.close()
    generate_heatmap_overlay(misclassifiedIndexes, test_lbl, test_img, predictions, logisticRegr, classes, prefix)

def run_regression(X, X_test, y, y_test, classes, prefix):
    logisticRegr = LogisticRegression(C=50/len(classes), penalty='l1', solver='saga', tol=0.1)
    logisticRegr.fit(X, y)
    predictions = logisticRegr.predict(X_test)
    # show score to confirm success 
    score = logisticRegr.score(X_test, y_test)
    scores[prefix] = score
    print(prefix, score)
    generate_heatmap(logisticRegr, classes, prefix)
    generate_classification_report(y_test, predictions, prefix)
    generate_confusion_matrix(y_test, predictions, classes, prefix)
    generate_misclassified_matrix(X_test, y_test, predictions, logisticRegr, classes, prefix)

In [138]:
run_regression(X_train_ad, X_test_ad, y_train_ad, y_test_ad, all_classes, 'base')

base 0.8934817170111288
Classification report for base:
              precision    recall  f1-score   support

           0       0.69      0.85      0.76        13
           1       1.00      0.93      0.97        15
           2       0.83      0.79      0.81        19
           3       0.92      1.00      0.96        11
           4       0.95      0.83      0.88        23
           5       0.89      0.94      0.92        18
           6       0.43      0.64      0.51        14
           7       1.00      0.76      0.87        17
           8       0.83      1.00      0.91        15
           9       1.00      1.00      1.00        22
           a       1.00      1.00      1.00        13
           b       0.80      0.92      0.86        13
           c       1.00      1.00      1.00        19
           d       0.91      0.91      0.91        22
           e       1.00      0.95      0.98        22
           f       0.94      1.00      0.97        17
           g       1.00  

In [139]:
run_regression(X_train_alpha_lg, X_test_alpha_lg, y_train_alpha_lg, y_test_alpha_lg, letter_classes, 'combo')

combo 0.9154013015184381
Classification report for combo:
              precision    recall  f1-score   support

           a       1.00      0.82      0.90        22
           b       0.86      1.00      0.92        12
           c       1.00      0.88      0.94        26
           d       0.95      0.95      0.95        20
           e       0.96      0.96      0.96        27
           f       1.00      0.95      0.97        19
           g       0.88      0.93      0.90        15
           h       1.00      0.82      0.90        11
           i       0.96      1.00      0.98        22
           j       0.94      0.94      0.94        18
           k       0.94      0.94      0.94        18
           l       0.95      0.95      0.95        20
           m       0.62      1.00      0.76         8
           n       1.00      0.68      0.81        22
           o       0.86      1.00      0.92        18
           p       0.95      1.00      0.97        19
           q       1.00

In [140]:
run_regression(X_alpha_train, X_lg_test, y_alpha_train, y_lg_test, letter_classes, 'lg')

lg 0.19230769230769232


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification report for lg:
              precision    recall  f1-score   support

           a       0.00      0.00      0.00         1
           b       0.00      0.00      0.00         1
           c       0.00      0.00      0.00         1
           d       0.00      0.00      0.00         1
           e       0.00      0.00      0.00         1
           f       0.00      0.00      0.00         1
           g       0.00      0.00      0.00         1
           h       0.00      0.00      0.00         1
           i       0.00      0.00      0.00         1
           j       0.00      0.00      0.00         1
           k       0.00      0.00      0.00         1
           l       0.50      1.00      0.67         1
           m       0.00      0.00      0.00         1
           n       0.00      0.00      0.00         1
           o       0.00      0.00      0.00         1
           p       0.00      0.00      0.00         1
           q       0.00      0.00      0.00        

In [141]:
print(scores)

{'base': 0.8934817170111288, 'combo': 0.9154013015184381, 'lg': 0.19230769230769232}
