In [34]:
from PIL import Image
import numpy as np

First, load the training images:

In [35]:
# Each non-blank line of task/train.txt names one image file (without the
# ".jpg" extension). Every image is flattened to a row vector of shape (1, n).
with open("task/train.txt", "r") as f:
    train_imgs = [
        np.array(Image.open(f"images/{name}.jpg")).reshape(1, -1)
        for name in (line.strip() for line in f)
        if name  # skip blank lines (e.g. a trailing newline) rather than opening "images/.jpg"
    ]

Next, use Otsu's method to binarize all training images:

In [36]:
def binarize(i, t):
    """Return a float array shaped like ``i``: 1.0 where ``i >= t``, else 0.0.

    Args:
        i (np.array): the image to binarize
        t (int or float): threshold; pixels at or above it become 1

    Returns:
        (np.array) float64 array of zeros and ones with the same shape as ``i``
    """
    return np.where(i >= t, 1.0, 0.0)


def otsu_threshold_value(i, t):
    """
    Compute Otsu's intra-class variance of an image for one candidate threshold.

    Adapted from https://en.wikipedia.org/wiki/Otsu's_method#Python_implementation

    Pixels with value >= t form class 1 and all other pixels form class 0. The
    result is the sum of the two class variances, each weighted by the fraction
    of pixels that fall in that class. Otsu's method picks the threshold that
    minimizes this value.

    Args:
        i (np.array): the image that should be binarized (any shape)
        t (int): threshold to test

    Returns:
        (float) weighted intra-class variance of the two pixel classes, or
        np.inf when the threshold puts every pixel in the same class (or the
        image has no pixels), so that such a threshold never wins a
        minimization.
    """
    # Boolean class-1 mask; equivalent to comparing a 0/1 thresholded copy
    # against 1, without allocating the intermediate float image.
    above = i >= t

    # Count every element: len(i) would only give the size of the first axis,
    # which is 1 for an image flattened to shape (1, n).
    nb_pixels = i.size
    if nb_pixels == 0:
        return np.inf

    nb_pixels1 = np.count_nonzero(above)
    if nb_pixels1 == 0 or nb_pixels1 == nb_pixels:
        # One class is empty, so this threshold does not separate anything.
        return np.inf

    weight1 = nb_pixels1 / nb_pixels
    weight0 = 1 - weight1

    # Both classes are non-empty here, so np.var is well defined.
    var1 = np.var(i[above])
    var0 = np.var(i[~above])

    return weight0 * var0 + weight1 * var1


def best_threshold(i):
    """
    Pick the binarization threshold for an image using Otsu's criterion.

    Twenty evenly spaced candidates in [0, 255) are scored with
    otsu_threshold_value, which returns the intra-class variance. Otsu's
    method selects the threshold with the *smallest* intra-class variance,
    hence argmin rather than argmax.

    Args:
        i (np.array): the image that should be binarized

    Returns:
        (float) the candidate threshold with the lowest intra-class variance
    """
    # Use a distinct loop name so the image parameter `i` is not shadowed.
    thresholds = [(step * 0xff) / 20 for step in range(20)]
    variances = [otsu_threshold_value(i, t) for t in thresholds]
    return thresholds[np.argmin(variances)]


# Binarize every training image at its own individually selected threshold.
binarized_train_imgs = [
    binarize(train_img, best_threshold(train_img))
    for train_img in train_imgs
]