In [1]:
from PIL import Image
import os
import glob
import math
import shutil
import numpy as np
import imageio
import keras
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
import matplotlib.pyplot as plt
import tensorflow as tf
%matplotlib inline

Using TensorFlow backend.


In [2]:
source_path = './src_captchas/'
whole_captchas_path = './processed_captchas/whole_captchas/'
chars_path = './processed_captchas/chars/'
filetype = '*.jpg'

## Captcha Solving using a CNN
![](./src_captchas/HvJC56.jpg)
Writing / using a CNN is probably overkill esp. for a captcha like this

### Motivation
* [Slacknotes](https://slacknotes.com/) is a website visited by thousands of UBC students for getting into classes but also for viewing the grades analyzer. Retrieving grades from UBC Pair is cumbersome. It's interesting to note that one of my classes last semester, CPSC 310, encountered the same problem, and ended up just scraping the grades from Slacknotes. Slacknotes used to use external human captcha solvers via an api who will manually input them, but that costed money and was exceptionally slow.

### Decisions made on the preprocessing of the data
* Divide the captchas into 6 separate pieces, rather than training on the entire captcha
    * Keep it as a categorization problem
* Have to ensure that there are at least n captchas that cover all possible chars on the captchas. I hand-labelled ~90 captchas just in case and are located [in the src_captchas folder](./src_captchas)
* CNNs aren't rotation invariant. Add rotations. I rotated the sliced character images I got and rotated it -25 to 25 degrees (step size of 1 for optimal accuracy, but for the sake of speed and showing mistakes, I made the step size 5). Characters didn't really rotate much past it. 
    * Ex.
        * Original cropped image ![](./readme_examples/3wEYFG-3-4.jpg)
        * Min and max rotations ![](./readme_examples/3wEYFG-3-0.jpg) ![](./readme_examples/3wEYFG-3-9.jpg)
* crop the base image. There's no data of value on most of the image.
    * Ex.
        * Original ![](./src_captchas/3wEYFG.jpg)
        * Cropped ![](./readme_examples/3wEYFG.jpg)

### Things to note
* Prefer to have at least 99.5 percent accuracy
    * 98% accuracy on 1 character means worst case ~ 88% accuracy on 6.
    * 96% accuracy on 1 character means worst case ~78% accuracy on 6.
    
### Things I learned
* The most interesting part to me really was the processing of the data and making decisions on what my input should look like to get the best result. In class, all the data comes perfectly processed, so it was really interesting for me to try to figure out how my data should look like. Should I have put the entire captcha and make the neural network figure it out? Should I split it? Should I delete extraneous parts of the image? If I do split up all the letters, that would mean I'll have to write extra methods to make predictions of entire captchas, calculate loss, make sure that my end-to-end predictor also splits up live captchas, etc. etc. 
* I can't write a better blog post on how a CNN works than the millions that are already out there. This was a fun exercise though, I got to play around with a lot of things, but ultimately kept it as simple as possible because I didn't see a major difference if I made it much more complicated (other than it taking way longer)

In [3]:
class PreProcessor:
    def __init__(self, source_path, whole_captchas_path, chars_path, filetype):
        self.source_path = source_path
        self.whole_captchas_path = whole_captchas_path
        self.chars_path = chars_path
        self.file_list = glob.glob(os.path.join(source_path, filetype))
        self.char_img_width = 23
        self.char_img_height = 40
        
    def preprocess(self):
        self._reset()
        filelist = self.file_list

        for index, filepath in enumerate(filelist):
            self.process_helper(index, filepath)
            
    def process_helper(self, index, filepath):
        filename = self._get_filename(filepath)
        image_processor = ImageProcessor(filepath, trim_width=6, trim_height=20)
        cropped_base_img = image_processor.get_cropped_base_img()
        chars_of_captcha = list(filename)
        self._save_whole_captcha_img(filename, cropped_base_img)

        try:
            divided_imgs = image_processor.divide()
            start_num = 0
            for index_in_captcha, piece in enumerate(divided_imgs, start_num):
                current_char = self._format_if_upper(chars_of_captcha[index_in_captcha])
                char_img = image_processor.get_char_img(piece)
                
                self._mkdir(self.chars_path + current_char)
                self._save_char_img_within_whole_captcha(filename, current_char, char_img)
                rotated_imgs = image_processor.rotate_divided_imgs(char_img)
                for j, rotated_img in enumerate(rotated_imgs):
                    self._save_rotated_img(rotated_img, current_char, filename, index_in_captcha, j)
                    
        except Exception as e:
            print(e)
    
    # Process images into a matrix. X is a w x h x 3 matrix which represents every single
    # pixel in the image. y represents the image's character
    def process_char_imgs_into_matrix(self):
        dir_chars = glob.glob(os.path.join(self.chars_path, '*'))
        char_index_dict = {k: v for v, k in enumerate(os.listdir(self.chars_path)[:])}

        n = self._get_num_files(self.chars_path)
        X = np.empty((n, self.char_img_width, self.char_img_height, 3), dtype='uint8')
        y = np.empty((n,1), dtype='uint8')
        
        count = 0
        i = 0
        
        for dir_char in dir_chars:
            char = self._get_filename(dir_char)
            filelist = glob.glob(os.path.join(self.chars_path + char,"*.jpg"))
            count += len(filelist)
            for filenum, file in enumerate(filelist):
                char_img = imageio.imread(file).reshape(self.char_img_width, self.char_img_height, 3)
                X[i] = char_img
                y[i] = char_index_dict[char]
                i += 1
                
        # Turn y into categorical classes
        d = len(dir_chars)
        y = keras.utils.to_categorical(y, d)

        return X,y
    
        
    # Save rotated image within character folder
    def _save_rotated_img(self, rotated_img, current_char, filename, index_in_captcha, j):
        path = os.path.join(chars_path + "/%s/%s-%d-%d.jpg" % (current_char, filename, index_in_captcha, j))
        rotated_img.save(path)
    
    # Save the whole captcha image in the whole captchas folder
    def _save_whole_captcha_img(self, filename, cropped_base_img):
        whole_captcha_path = os.path.join(self.whole_captchas_path + filename)
        self._mkdir(whole_captcha_path)
        whole_captcha_img_path = os.path.join(whole_captcha_path + "/%s" % (filename + ".jpg"))
        cropped_base_img.save(whole_captcha_img_path)
    
    # Save individual character image within whole captcha path
    def _save_char_img_within_whole_captcha(self, filename, current_char, char_img):
        character_within_whole_captcha_path = os.path.join(self.whole_captchas_path + "%s/%s.jpg" % (filename, current_char))
        char_img.save(character_within_whole_captcha_path)

    # Delete all files in the destination paths
    def _reset(self):
        shutil.rmtree(self.chars_path)
        shutil.rmtree(self.whole_captchas_path)
        os.mkdir(self.chars_path)
        os.mkdir(self.whole_captchas_path)
    
    # Create a directory if it does not exists
    def _mkdir(self, path):
        if not os.path.exists(path):
            os.mkdir(path)
    
    # Get only the filename given a path
    def _get_filename(self, path):
        return path.rsplit("/",1)[-1].split(".")[0]
    
    # Get the number of files within the directory given
    def _get_num_files(self, path):
        total = 0
        for root, dirs, files in os.walk(path):
            total += len(files)
        return total
    
    def _format_if_upper(self, char):
        if char.isupper():
            return "cap-" + char
        return char


In [4]:
class ImageProcessor:
    def __init__(self, path, trim_width, trim_height):
        self.img = Image.open(path)
        self.width = (self.img.size[0] - trim_width * 2) // trim_width
        self.height = (self.img.size[1] - trim_height * 2)
        self.trim_width = trim_width
        self.trim_height = trim_height
        
    def get_char_width(self):
        return self.width
    
    def get_char_height(self):
        return self.height
        
    # Get entire captcha
    def get_char_img(self, char_piece):
        img = Image.new('RGB', (self.width, self.height), 255)
        img.paste(char_piece)
        return img
        
    # Rotate images to provide more data
    def rotate_divided_imgs(self, char_img):
        #rotation step-size is very large (5)
        for angle in range(-25, 25, 5):
            yield self.get_rotated_img(char_img, angle)
            
    # Given an angle, rotate image by that much. Must do this or the trained model won't train on rotations well.
    def get_rotated_img(self, char_img, angle):
        new_img = Image.new("RGBA", (int(self.width),int(self.height)), "red")
        background_red_img = Image.new("RGBA", (self.width,self.height), "red")
        src_img = char_img.convert('RGBA')
        rotated_img = src_img.rotate(angle)
        new_img.paste( rotated_img, (0,0), rotated_img)
        new_img = new_img.convert("RGB")
        return new_img
    
    # Crop original image to not cause divisions that trim too much
    def get_cropped_base_img(self):
        width, height = self.img.size
        box = (self.trim_width, self.trim_height, width - self.trim_width, height - self.trim_height)
        cropped_img = self.img.crop(box)
        return cropped_img
    
    # Divide Image into 6 parts
    def divide(self):
        cropped_img = self.get_cropped_base_img()
        imgwidth, imgheight = cropped_img.size
        for i in range(int(imgheight)//int(self.height)):
            for j in range(int(imgwidth)//int(self.width)):
                box = (j*self.width, i*self.height, (j+1)*self.width, (i+1)*self.height)
                yield cropped_img.crop(box)


## Convolutional Neural Network

In [5]:
#super simple cnn no fancy schmancy anything
class ConvolutionalNeuralNetwork:
    def __init__(self, chars_path, batch_size, epochs):
        self.chars_path = chars_path
        self.batch_size = batch_size
        self.epochs = epochs
        self.num_classes = len(glob.glob(os.path.join(chars_path, '*')))
        self.model = Sequential()
        self.mistakes = None
        self.history = None
    
    def fit(self, Xtrain, ytrain):
        Xtrain = Xtrain.astype('float32') #tf requires float32, no uint no float64
        Xtrain /= 255 
        
        model = self.model
        
        model.add(Conv2D(32, (3, 3), padding='same', input_shape=Xtrain.shape[1:]))
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.5))

        model.add(Flatten())
        model.add(Dense(self.num_classes))
        model.add(Activation('softmax'))

        model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

        history = model.fit(Xtrain, ytrain,validation_split=0.33, batch_size=self.batch_size, epochs=self.epochs, verbose=1)
        self.history = history
        self.model = model
        
    def get_history(self):
        if self.history is None:
            raise Exception('Must run the fit function before getting the history!')
            
        return self.history

        
    def get_model(self):
        if self.model is None:
            raise Exception('Must run the fit function before getting the history!')
            
        return self.model
        
    def get_summary(self):
        return self.model.summary()
    
    def predict(self, Xtest, ytest):
        Xtest = Xtest.astype('float32')
        Xtest /= 255
        scores = self.model.evaluate(Xtest, ytest, verbose=1)
        print('Test loss :', scores[0])
        print('Test accuracy:', scores[1])
    
    def predict_all_whole_captchas(self, src_captchas_path):
        dir_whole_captchas = os.listdir(src_captchas_path)[1:]
        char_val_dict = {k: v for v, k in enumerate(os.listdir(self.chars_path)[:])}
        val_chars_dict = {v: k for v, k in enumerate(list(filter(lambda x:   x.endswith(".DS_Store") == False, os.listdir(self.chars_path)[:])))}
        
        n = len(dir_whole_captchas)
        y = np.zeros((n,6,1), dtype='uint8')
        y_preds = np.zeros((n,6,1), dtype='uint8')
        
        for i, filename in enumerate(dir_whole_captchas):
            path = os.path.join(os.path.join(src_captchas_path, filename))
            image_processor = ImageProcessor(path, 6, 20)
            char_list = list(filename)
            divided_imgs = image_processor.divide()
            preds = []
            
            for k, piece in enumerate(divided_imgs):
                current_char = self._format_if_upper(char_list[k])
                
                img = image_processor.get_char_img(piece)
                
                X = np.empty((1, 23, 40,3), dtype='uint8')
                
                y[i][k] = char_val_dict[current_char]
                
                X[0] = np.array(img).reshape(23,40,3)

                probs_pred = self.model.predict(X)
                pred = val_chars_dict[np.argmax(probs_pred)]
                preds.append(pred[-1])
                
                y_preds[i][k] = char_val_dict[pred]
                
        accuracy = (y == y_preds).all(-2).sum() / len(y)
        loss = 1 - accuracy

        not_equal = (y != y_preds).all(2)
        
        wrong_preds = y_preds[np.where(not_equal)[0]]
        actual = y[np.where(not_equal)[0]]
        self._store_mistakes_from_whole_captchas(actual, wrong_preds)
        
        print('Whole Captcha Test loss:', loss)
        print('Whole Captcha Test accuracy:', accuracy)
        return accuracy, loss
    
    def make_prediction_from_whole_captchas(self, path):
        val_chars_dict = {v: k for v, k in enumerate(list(filter(lambda x:   x.endswith(".DS_Store") == False, os.listdir(self.chars_path)[:])))}
        image_processor = ImageProcessor(path, 6, 20)
        divided_imgs = image_processor.divide()
        preds = []
        for k,piece in enumerate(divided_imgs):
            img = image_processor.get_char_img(piece)
            
            X = np.empty((1, 23, 40,3), dtype='uint8')
            X[0] = np.array(img).reshape(23,40,3)

            probs_pred = self.model.predict(X)
            pred = val_chars_dict[np.argmax(probs_pred)]
            preds.append(pred[-1])
        return "".join(preds)
    
    def get_mistakes(self):
        if self.mistakes is None:
            raise Exception('Must run the predict function first before finding out which predictions you got wrong!')

        self._transform_mistakes()
    
    def _transform_mistakes(self):
        actual, wrong_preds = self.mistakes
        char_val_dict = {k: v for v, k in enumerate(os.listdir(self.chars_path)[:])}
        keys = char_val_dict.keys()
        list_keys = list(keys)
        for key in keys:
            if "cap" in key:
                j = char_val_dict[key]
                list_keys[j] = key[-1]
        keys = np.array(list_keys)
        
        for i in range(len(wrong_preds)):
            flattened_pred = "".join(keys[np.ndarray.flatten(wrong_preds[i])])
            flattened_actual = "".join(keys[np.ndarray.flatten(actual[i])])
            print("Actual: %s   -   Pred: %s" % (flattened_actual, flattened_pred))
    
    def _store_mistakes_from_whole_captchas(self, actual, wrong_preds):
        self.mistakes = (actual, wrong_preds)
    
    def _format_if_upper(self, char):
        if char.isupper():
            return "cap-" + char
        return char

In [6]:
processor = PreProcessor(source_path, whole_captchas_path, chars_path, filetype)
processor.preprocess()
X, y = processor.process_char_imgs_into_matrix()
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size = 0.33, random_state=42)

In [7]:
# For the sake of speed, and showing mistakes, I made the iteration step size of the rotations in the 
# ImageProcessor 5. 
# if you want better accuracy lower the iteration step size of the rotations and/or increase number of epochs
# it'll take longer to preprocess and fit the data though.
# My best and most consistant performance of getting 0 wrong on the test set and solving almost all of the 
# new live catpchas was with a rotation step size of 1, and at least 15 epochs.
cnn = ConvolutionalNeuralNetwork(chars_path, batch_size=32, epochs=15)

In [8]:
cnn.fit(Xtrain,ytrain)

Train on 2342 samples, validate on 1155 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [9]:
cnn.predict(Xtest, ytest)

Test loss : 0.27416828380237124
Test accuracy: 0.9257109693434787


In [10]:
cnn.make_prediction_from_whole_captchas(source_path + "7e5wTG.jpg")

'7e5wTG'

In [11]:
cnn.predict_all_whole_captchas(source_path)

Whole Captcha Test loss: 0.05747126436781613
Whole Captcha Test accuracy: 0.9425287356321839


(0.9425287356321839, 0.05747126436781613)

In [12]:
#note this is out of ~90 captchas
cnn.get_mistakes()

Actual: 6EJrPp   -   Pred: 6FJrPp
Actual: cTaRVd   -   Pred: rTaRVd
Actual: QCsCK3   -   Pred: QCsCKT
Actual: RxeXf4   -   Pred: RxeXY4
Actual: Vce8JC   -   Pred: Hce8JC
