In [1]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as dsets
import matplotlib.pylab as plt
from torch.utils.data import Dataset, DataLoader
from torch.distributions import normal
import time
import os
import numpy as np
import matplotlib.pyplot as plt
import skimage.data as data
import skimage.segmentation as seg
import skimage.filters as filters
import skimage.draw as draw
import skimage.color as color
from skimage import io

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
import cv2
from google.colab.patches import cv2_imshow

def image_processing(image):
  #first convert image to grayscale
  gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
  (thresh, blackAndWhiteImage) = cv2.threshold(gray_image, 127, 255, cv2.THRESH_BINARY_INV)

  #close holes in characters to make continuous lines
  kernel = np.ones((2,3),np.uint8)
  close = cv2.morphologyEx(blackAndWhiteImage, cv2.MORPH_CLOSE, kernel)

  #open up spaces between defined lines
  opening = cv2.morphologyEx(close, cv2.MORPH_OPEN, kernel)

  #thin lines to remove occluding line
  kernel = np.ones((2,2),np.uint8)
  erosion = cv2.erode(opening,kernel,iterations = 1)

  opening2 = cv2.morphologyEx(erosion, cv2.MORPH_OPEN, np.ones((2,1),np.uint8))

  return opening2

In [4]:
def get_letters(image):
  x, y, w, h = 30, 12, 21, 38
  letters = []
  for  i in range(5):
    # get the bounding rect
    #image = cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 1)
    letter = image[y:y+h,x:x+w]
    letter = cv2.resize(letter, (28, 28))
    letters.append(letter)
    x += w
  return letters

In [5]:
import glob
import cv2
from google.colab.patches import cv2_imshow

download_path = '/content/gdrive/My Drive/Colab_Notebooks/input/samples'
#obtain all images using recursive downloading through glob module
images = glob.glob(download_path + '/*.png')

def get_name(image):
  path = image.split('samples/')
  path = path[1].split('.')
  name = path[0]
  if len(name) > 5:
    print(name)
  return name

def build_character_list(image_paths):
  indiv_characters = []
  indiv_labels = []
  for image_path in image_paths:
    #read in captcha image
    image = io.imread(image_path)
    image_processed = image_processing(image)
    letters = get_letters(image_processed)
    #get name
    name = get_name(image_path)
    for i in range(5):
      indiv_characters.append(letters[i])
      indiv_labels.append(str_to_one_hot(name[i]))
    #print("done!")
  return indiv_characters, indiv_labels

#one hot encoding of labels (bc model doesn't like strings, only tensors)
def str_to_one_hot(str):
  vals = ['1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
          'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
  #get the index of the string---this will be its encoded value
  ind = vals.index(str)
  return ind

In [84]:
from collections import Counter
from random import shuffle

def explore(labels):
  tally = Counter(labels)
  return tally

def oversample(images, labels, tally):
  random.seed(42)
  max = tally.most_common()[0][1]
  print(max)
  for i in tally.items():
    count = i[1]
    indices = [j for j, x in enumerate(labels) if x == i[0]]
    while count < max:
      #reshuffle each time to ensure randomness
      shuffle(indices)
      labels.append(labels[indices[0]])
      images.append(np.copy(images[indices[0]]))
      count += 1
  print(Counter(labels))

In [54]:
#takes in a list of images and converts to characters and labels
class CaptchaDataset(Dataset):
    """Captcha dataset."""

    def __init__(self, image_list, transform=None):
        """
        Args:
            image_list (list): list of paths to images
            transforms (list): list of transforms to be applied to the data
        """
        self.characters, self.labels = build_character_list(image_list)
        oversample(self.characters, self.labels, explore(self.labels))
        self.transform = transform
        self.count = 0

    def __len__(self):
        return len(self.characters)

    def __getitem__(self, idx):
        #if the index is a tensor, convert to a list
        if torch.is_tensor(idx):
            idx = idx.tolist()

        x, y = self.characters[idx], self.labels[idx]
        
        if self.transform:
            x, y = self.transform(x,y)
        return x, y

In [7]:
#helper class to convert everything to tensors
class ToTensor(object):
    """Convert ndarrays in sample to Tensors."""

    def __call__(self, x, y):
        letter, solution = x, y

        # the numpy array must first be transposed. This is because numpy arrays are of the form [38, 21, 3]
        # the tensor must be in the form [3, 38, 21]
        letter = np.expand_dims(letter, axis=0)
        #letter = np.transpose(letter, (2, 0, 1))
        transp_letter = torch.Tensor(letter)
        # the solution also needs to be converted to a tensor
        solution = torch.tensor(solution,dtype=torch.float32) 
        return transp_letter, solution

In [86]:
import random
#randomly shuffle images to ensure datasets are unbiased
random.seed(42)
random.shuffle(images)
train_dataset = CaptchaDataset(images[:900], transform=ToTensor())
test_dataset = CaptchaDataset(images[900:], transform=ToTensor())
print("length of training dataset: ", len(train_dataset))
print("length of testing dataset: ", len(test_dataset))

#check that the datatypes and shapes of the first four datapoints are correct
for i in range(4):
    x, y = train_dataset[i]
    print(i, y)
    print(x.size())
    print(type(x))

454
Counter({7: 454, 24: 454, 14: 454, 32: 454, 10: 454, 5: 454, 4: 454, 22: 454, 2: 454, 15: 454, 3: 454, 12: 454, 1: 454, 21: 454, 13: 454, 6: 454, 11: 454, 31: 454, 33: 454})
71
Counter({3: 71, 1: 71, 7: 71, 10: 71, 5: 71, 11: 71, 14: 71, 22: 71, 4: 71, 2: 71, 6: 71, 15: 71, 13: 71, 12: 71, 31: 71, 21: 71, 24: 71, 32: 71, 33: 71})
length of training dataset:  8626
length of testing dataset:  1349
0 tensor(7.)
torch.Size([1, 28, 28])
<class 'torch.Tensor'>
1 tensor(24.)
torch.Size([1, 28, 28])
<class 'torch.Tensor'>
2 tensor(14.)
torch.Size([1, 28, 28])
<class 'torch.Tensor'>
3 tensor(32.)
torch.Size([1, 28, 28])
<class 'torch.Tensor'>


In [88]:
train_dataloader = DataLoader(train_dataset, batch_size=256,
                        shuffle=True, num_workers=2)
test_dataloader = DataLoader(test_dataset, batch_size=256, shuffle=False, num_workers=2)
#there are 1040 images in the dataset and each image has 5 characters so create batch sizes to account for image splitting

In [89]:
class Flatten(torch.nn.Module):
    def forward(self, x):
        print(x.shape)
        print(x.view(x.shape[0],-1))
        return x.view(x.shape[0], -1)

class Reshape(torch.nn.Module):
    def forward(self, x):
        return x.view(-1,1,28,28)

In [99]:
le_net = torch.nn.Sequential(
    Reshape(),
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, padding=2),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    Flatten(),
    nn.Linear(in_features=16*5*5, out_features=120),
    nn.Sigmoid(),
    nn.Linear(120, 84),
    nn.Sigmoid(),
    nn.Linear(84, 35)
)

In [91]:
def evaluate_accuracy_updated(data_iter, net,device=torch.device('cpu')):
    """Evaluate accuracy of a model on the given data set."""
    acc_sum,n = torch.tensor([0],dtype=torch.float32,device=device),0
    for X,y in data_iter:
        # If device is the GPU, copy the data to the GPU.
        X,y = X.to(device),y.to(device)
        net.eval()
        with torch.no_grad():
            y = y.long()
            acc_sum += torch.sum((torch.argmax(net(X), dim=1) == y))
            n += y.shape[0]
    return acc_sum.item()/n

In [92]:
def try_gpu():
    """If GPU is available, return torch.device as cuda:0; else return torch.device as cpu."""
    if torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')
    return device

device = try_gpu()

In [93]:
def train_model_lenet(net, train_iter, test_iter,criterion, num_epochs, batch_size, device,lr=None):
    """Train and evaluate a model with CPU or GPU."""
    print('training on', device)
    net.to(device)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    for epoch in range(num_epochs):
        train_l_sum = torch.tensor([0.0],dtype=torch.float32,device=device)
        train_acc_sum = torch.tensor([0.0],dtype=torch.float32,device=device)
        n, start = 0, time.time()
        for X, y in train_iter:
            net.train()
            #print(X.type)
            #print(y.type)
            
            optimizer.zero_grad()
            X,y = X.to(device),y.to(device) 
            #print(X)
            #print(y)
            y_hat = net(X)
            y = y.type(torch.LongTensor)
            #print(y_hat.dtype)
            #print(y.dtype)
            loss = criterion(y_hat, y)
            loss.backward()
            optimizer.step()
            
            with torch.no_grad():
                y = y.long()
                train_l_sum += loss.float()
                train_acc_sum += (torch.sum((torch.argmax(y_hat, dim=1) == y))).float()
                n += y.shape[0]
        test_acc = evaluate_accuracy_updated(test_iter, net,device)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, '
              'time %.1f sec'
              % (epoch + 1, train_l_sum/n, train_acc_sum/n, test_acc,
                 time.time() - start))

'\ndef train_model_lenet(net, train_iter, test_iter,criterion, num_epochs, batch_size, device,lr=None):\n    """Train and evaluate a model with CPU or GPU."""\n    print(\'training on\', device)\n    net.to(device)\n    optimizer = torch.optim.SGD(net.parameters(), lr=lr)\n    for epoch in range(num_epochs):\n        train_l_sum = torch.tensor([0.0],dtype=torch.float32,device=device)\n        train_acc_sum = torch.tensor([0.0],dtype=torch.float32,device=device)\n        n, start = 0, time.time()\n        for X,y in train_iter:\n            net.train()\n            \n            optimizer.zero_grad()\n            X,y = X.to(device), y.to(device) \n            y_hat = net(X)\n            y = y.type(torch.LongTensor)\n            loss = criterion(y_hat, y)\n            loss.backward()\n            optimizer.step()\n            \n            with torch.no_grad():\n                y = y.long()\n                train_l_sum += loss.float()\n                train_acc_sum += (torch.sum((torch.a

In [102]:
lr, num_epochs = 0.9, 30
batch_size = 256

def init_weights(m):
    if type(m) == nn.Linear or type(m) == nn.Conv2d:
        torch.nn.init.xavier_uniform_(m.weight)

le_net.apply(init_weights)
le_net = le_net.to(device)

criterion = nn.CrossEntropyLoss()

train_model_lenet(le_net, train_dataloader, test_dataloader, criterion,num_epochs, batch_size,device, lr)

#Some suggestions for improvement of model: lower outfeatures of first linear call (and also infeatures of second linear call to match)
#try running again w/out canny in image processing and see if just black and white does anything

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
         7.2273e-01],
        [4.6180e-01, 5.0078e-01, 7.0230e-01,  ..., 9.3144e-03, 7.9540e-01,
         6.6225e-01]], grad_fn=<ViewBackward0>)
torch.Size([256, 16, 5, 5])
tensor([[0.2809, 0.4298, 0.2384,  ..., 0.8446, 0.5902, 0.4486],
        [0.2305, 0.3654, 0.3073,  ..., 0.3676, 0.3724, 0.6560],
        [0.5903, 0.4972, 0.4768,  ..., 0.5516, 0.3685, 0.6153],
        ...,
        [0.0024, 0.0271, 0.0213,  ..., 0.3300, 0.4699, 0.9361],
        [0.1807, 0.2601, 0.0311,  ..., 0.4625, 0.4923, 0.4668],
        [0.6374, 0.8046, 0.2287,  ..., 0.0022, 0.2435, 0.6895]],
       grad_fn=<ViewBackward0>)
torch.Size([256, 16, 5, 5])
tensor([[1.7266e-03, 2.3606e-04, 2.8203e-04,  ..., 2.3071e-01, 2.4974e-01,
         4.5152e-01],
        [4.0585e-04, 9.5421e-03, 2.6450e-01,  ..., 1.0012e-01, 3.9461e-01,
         5.8641e-01],
        [4.6291e-01, 5.1862e-01, 6.6439e-01,  ..., 7.3878e-01, 7.8528e-01,
         8.8412e-01],
        ...,


In [106]:
for X, y in test_dataloader:
    break

def get_labels(label):
  vals = ['1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
          'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
  #get the index of the string---this will be its encoded value
  return [vals[int(i)] for i in label]

#change code to retrieve labels from tensors!
#numpy array of predictions
print(X.shape)
true_labels = get_labels(y.numpy())
pred_labels = get_labels(le_net(X).argmax(dim=1).numpy())
print()
for i in range(5):
  print("True:", true_labels[i], "Predicted:", pred_labels[i])

torch.Size([256, 1, 28, 28])
torch.Size([256, 16, 5, 5])
tensor([[3.8416e-04, 3.8336e-05, 3.2420e-05,  ..., 6.3846e-02, 2.7396e-01,
         5.0172e-01],
        [5.4203e-03, 2.7668e-03, 1.9716e-03,  ..., 7.7206e-01, 7.1479e-02,
         4.3125e-02],
        [1.0371e-04, 4.8152e-03, 2.4820e-02,  ..., 9.9781e-02, 2.2501e-02,
         1.7990e-01],
        ...,
        [3.4316e-01, 3.8932e-01, 4.1750e-01,  ..., 6.0551e-01, 1.7524e-01,
         3.6695e-02],
        [8.5300e-01, 8.6882e-01, 7.5428e-01,  ..., 8.8803e-01, 7.9810e-01,
         4.1504e-01],
        [1.3703e-01, 2.6476e-01, 5.7755e-01,  ..., 2.0053e-01, 2.8340e-01,
         7.9414e-01]], grad_fn=<ViewBackward0>)

True: 4 Predicted: 4
True: 2 Predicted: 2
True: 8 Predicted: 8
True: b Predicted: b
True: 6 Predicted: 6
