In [37]:
# Install Pillow, pytesseract, translate, opencv-python (cv2) and nltk via conda or pip
# Install the Tesseract OCR engine on your PC: https://github.com/UB-Mannheim/tesseract/wiki
from pytesseract import pytesseract

from translate import Translator

import re
import string

import cv2
import os

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\20190735\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [5]:
# Tell pytesseract where the Tesseract executable is installed.
# NOTE(review): this is a machine-specific Windows path; adjust it to your own install location.
pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Helper functions

In [6]:
# OCR helper: image in, text out
def transcribe(img):
    """Return the text that Tesseract recognises in ``img``.

    ``img`` is passed unchanged to ``pytesseract.image_to_string`` and the
    result of that call is returned as-is.
    """
    return pytesseract.image_to_string(img)

In [7]:
# Translation helper: text plus target language code in, translated text out
def translate(text, lang):
    """Translate ``text`` into the language identified by ``lang``.

    A fresh ``Translator`` targeting ``lang`` is created on every call and
    the value returned by its ``translate`` method is handed back unchanged.
    """
    return Translator(to_lang=lang).translate(text)

In [8]:
# Quick sanity check of translate(): English sentence -> German ("de")
translate("Today, we are going to learn how to speak German.", "de")

'Heute lernen wir, Deutsch zu sprechen.'

In [9]:
# Returns the first regex match found in the text, or None when nothing matches
def matches_patterns(text, patterns):
    """Search ``text`` for the first of ``patterns`` that matches.

    Patterns are tried in order with a case-insensitive ``re.search``; the
    search stops at the first hit, which is printed and returned.

    Args:
        text: String to search.
        patterns: Iterable of regular expressions.

    Returns:
        The ``re.Match`` of the first matching pattern, or ``None`` if no
        pattern matches (callers may use the result as a truthy flag).
    """
    # Lazy generator: later patterns are not evaluated once one has matched.
    attempts = (re.search(expr, text, re.IGNORECASE) for expr in patterns)
    first_hit = next((hit for hit in attempts if hit), None)

    if first_hit is None:
        return None

    print(f"Pattern matched: {first_hit.group()}")
    return first_hit

# Classification methods

## Option 1: Transcribe images to text, then use simple regular expression matching

In [27]:
#  
def match_patterns_folder(img_folder, patterns):
    """Report, for each file in ``img_folder``, whether its OCR text matches a pattern.

    Every regular file is read with ``cv2.imread`` and transcribed with
    ``transcribe``; the resulting text is checked with ``matches_patterns``
    and one verdict line is printed per file.

    Side effect: a file whose read/transcription raises ``TypeError`` is
    deleted from ``img_folder`` and skipped.

    Args:
        img_folder: Directory to scan.
        patterns: Iterable of regular expressions handed to
            ``matches_patterns``.
    """
    for img_name in os.listdir(img_folder):
        # Path to this image
        img_path = os.path.join(img_folder, img_name)

        # Sub-directories cannot be transcribed, and os.remove() below would
        # fail on them, so leave them alone.
        if os.path.isdir(img_path):
            continue

        # Try to read image and transcribe it.
        # NOTE(review): TypeError is presumably what transcription raises when
        # cv2.imread could not decode the file as an image; confirm.
        try:
            img = cv2.imread(img_path)
            txt = transcribe(img)
        except TypeError:
            os.remove(img_path)
            # Skip the verdict: otherwise the check below would run on `txt`
            # left over from the previous file (or hit a NameError on the
            # very first one).
            continue

        if matches_patterns(txt, patterns):
            print("We have found a login page.")
        else:
            print("We have not found a login page.")

In [28]:
# Example patterns to match (regular expressions).
# matches_patterns() applies them case-insensitively. Because `.*` accepts any
# run of characters on a line, each pattern also matches text where the two
# words are far apart (e.g. "login_customization_themet.json _login").
ptrns = [
    "log.*in",
    "sign.*in",
    "sign.*up"
]

# Print a login / non-login verdict for every file in the login-image folder
match_patterns_folder(r'data\imgs_login', ptrns)

Pattern matched: Log in
We have found a login page.
We have not found a login page.
Pattern matched: login
We have found a login page.
Pattern matched: Login
We have found a login page.
We have not found a login page.
Pattern matched: Login
We have found a login page.
Pattern matched: LOGIN
We have found a login page.
Pattern matched: Log in
We have found a login page.
Pattern matched: Login
We have found a login page.
Pattern matched: Login
We have found a login page.
We have not found a login page.
Pattern matched: Log In
We have found a login page.
Pattern matched: Login
We have found a login page.
Pattern matched: login_customization_themet.json _login
We have found a login page.
We have not found a login page.
Pattern matched: sign in
We have found a login page.
Pattern matched: Login
We have found a login page.
Pattern matched: Login
We have found a login page.
We have not found a login page.
We have not found a login page.
We have not found a login page.
Pattern matched: Login
W

We have not found a login page.
We have not found a login page.
We have not found a login page.
We have not found a login page.
We have not found a login page.
We have not found a login page.
We have not found a login page.
We have not found a login page.
We have not found a login page.
We have not found a login page.
We have not found a login page.
We have not found a login page.
We have not found a login page.
We have not found a login page.
Pattern matched: Login
We have found a login page.
We have not found a login page.
We have not found a login page.
We have not found a login page.
We have not found a login page.
We have not found a login page.
We have not found a login page.
We have not found a login page.
We have not found a login page.
We have not found a login page.
Pattern matched: Login
We have found a login page.
We have not found a login page.
We have not found a login page.
We have not found a login page.
We have not found a login page.
We have not found a login page.
We

## Option 2: Directly do image classification with ML

## Option 3: Transcribe images to text, then do text classification with ML

In [89]:
def transcribe_folder(img_folder, txt_folder):
    """Transcribe every image in ``img_folder`` to a text file in ``txt_folder``.

    Each regular file is read with ``cv2.imread`` and passed to
    ``transcribe``; the text is written to ``<image name without its
    extension>.txt`` in ``txt_folder``, replacing any existing file of that
    name. ``txt_folder`` is created if it does not exist.

    Side effect: a file whose read/transcription raises ``TypeError`` is
    deleted from ``img_folder`` and no text file is written for it.

    Args:
        img_folder: Directory containing the images.
        txt_folder: Directory that receives the ``.txt`` transcriptions.
    """
    os.makedirs(txt_folder, exist_ok=True)

    for img_name in os.listdir(img_folder):
        img_path = os.path.join(img_folder, img_name)

        # Sub-directories cannot be transcribed, and os.remove() below would
        # fail on them, so leave them alone.
        if os.path.isdir(img_path):
            continue

        # NOTE(review): TypeError is presumably what transcription raises when
        # cv2.imread could not decode the file as an image; confirm.
        try:
            img = cv2.imread(img_path)
            txt = transcribe(img)
        except TypeError:
            os.remove(img_path)
            # Skip writing: otherwise the previous image's text (or an
            # undefined `txt`) would be saved under this file's name.
            continue

        # splitext drops only the final extension, so "a.b.png" and "a.c.png"
        # no longer both collapse to "a.txt".
        txt_name = os.path.splitext(img_name)[0] + ".txt"

        # "w" truncates an existing file; the with-block closes it.
        with open(os.path.join(txt_folder, txt_name), "w") as file:
            file.write(txt)

In [14]:
# OCR every image in data\imgs_login into a matching .txt file in data\txts_login
transcribe_folder(r'data\imgs_login', r'data\txts_login')

In [90]:
# OCR every image in data\imgs_creditcard into a matching .txt file in data\txts_creditcard
transcribe_folder(r'data\imgs_creditcard', r'data\txts_creditcard')

In [95]:
# Lowercase, strip markup/punctuation/digits and drop stop words from transcriptions
def _clean_text(raw_text, stop_words):
    """Normalise ``raw_text`` into newline-separated tokens.

    Steps, in order: lowercase; drop ``<...>`` spans; replace ASCII
    punctuation with spaces; drop any remaining non-word, non-space
    characters; replace digits with spaces; split on whitespace; discard
    one-character tokens and tokens found in ``stop_words``.
    """
    text = raw_text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d', ' ', text)

    # Stop words are filtered last, on cleaned lowercase tokens, so that
    # "The" or "and," are recognised as stop words too.
    kept = [tok for tok in text.split() if len(tok) > 1 and tok not in stop_words]
    return '\n'.join(kept)


def preprocess(txt_folder):
    """Write a cleaned copy of every text file in ``txt_folder``.

    For each regular file ``<name>`` directly inside ``txt_folder``, the
    cleaned text (one token per line) is written to
    ``<txt_folder>/preprocessed/PRE-<name>``, replacing any existing file.
    The ``preprocessed`` sub-folder is created if needed; sub-directories of
    ``txt_folder`` are skipped.

    Args:
        txt_folder: Directory containing the text files to clean.
    """
    # Load the stop-word list once; a set gives constant-time membership tests.
    stop_words = set(stopwords.words('english'))

    out_folder = os.path.join(txt_folder, "preprocessed")
    os.makedirs(out_folder, exist_ok=True)

    for txt_name in os.listdir(txt_folder):
        txt_path = os.path.join(txt_folder, txt_name)

        # Skips the output folder itself as well as any other sub-directory.
        if os.path.isdir(txt_path):
            continue

        # NOTE(review): handler kept from the original; reading a file via a
        # str path is not expected to raise TypeError. Confirm whether it is
        # still needed.
        try:
            with open(txt_path) as f:
                raw_text = f.read()
        except TypeError:
            os.remove(txt_path)
            # Do not write output based on the previous file's text.
            continue

        # "w" truncates an existing file; the with-block closes it.
        with open(os.path.join(out_folder, 'PRE-' + str(txt_name)), "w") as file:
            file.write(_clean_text(raw_text, stop_words))

In [96]:
# Clean the credit-card transcriptions; results are written as PRE-<name> files
# into the "preprocessed" subfolder of data\txts_creditcard
preprocess(r'data\txts_creditcard')