# Download Kaggle W2 forms and Generate Images from PDFs

In [4]:
import locale
import os

import ghostscript
##### must install ghostscript exe on windows https://ghostscript.com/releases/gsdnld.html #####
import numpy
from keras.utils import np_utils
from PIL import Image
import tensorflow as tf
from sklearn.metrics import accuracy_score, classification_report


kaggle_w2_folder = 'enter-path-here'
kaggle_w2_images_folder = 'enter-path-here'

In [6]:
def download_kaggle_dataset():
    os.environ['KAGGLE_USERNAME'] = 'username-here'
    os.environ['KAGGLE_KEY'] = 'key-here'

    from kaggle.api.kaggle_api_extended import KaggleApi

    dataset = 'mcvishnu1/fake-w2-us-tax-form-dataset'
    path = 'datasets/fake_w2'

    api = KaggleApi()
    api.authenticate()

    api.dataset_download_file(dataset, 'W2_Multi_Clean_DataSet_02.ZIP', path)

def extract_kaggle_dataset():
    from zipfile import ZipFile
    file_name = "C:/datasets/fake_w2/W2_Multi_Clean_DataSet_02.ZIP"
    os.chdir('enter-path-here')

    with ZipFile(file_name, 'r') as zip:	
        zip.printdir()	
        print('Extracting all the files now...')
        zip.extractall()
        print('Done!')

def convert_pdfs_to_images():
    def pdf2jpeg(pdf_input_path, jpeg_output_path):
        args = ["pef2jpeg", # actual value doesn't matter
                "-dNOPAUSE",
                "-sDEVICE=jpeg",
                "-r144",
                "-sOutputFile=" + jpeg_output_path,
                pdf_input_path]

        encoding = locale.getpreferredencoding()
        args = [a.encode(encoding) for a in args]

        ghostscript.Ghostscript(*args)

    def get_pdf_files():
        return os.listdir(kaggle_w2_folder) 

    [pdf2jpeg(
        kaggle_w2_folder+pdf_path,
        kaggle_w2_images_folder+pdf_path.replace('pdf','jpeg')) 
            for pdf_path in get_pdf_files() if pdf_path.endswith(".pdf")]

In [7]:

print('downloading kaggle dataset as zip')
download_kaggle_dataset()

print('extracting data from zipped file')
extract_kaggle_dataset()

print('converting pdfs to images')
convert_pdfs_to_images()

print('skip this cell after running it once')

downloading kaggle dataset as zip
extracting data from zipped file
converting pdfs to images
skip this cell after running it once


# Images EDA

In [5]:
from basic_image_eda import BasicImageEDA

BasicImageEDA.explore(kaggle_w2_images_folder)


found 1000 images.
Using 8 threads. (max:8)



100%|██████████| 1000/1000 [00:48<00:00, 20.77it/s]



*--------------------------------------------------------------------------------------*
number of images                         |  1000

dtype                                    |  uint8
channels                                 |  [3]
extensions                               |  ['jpeg']

min height                               |  1584
max height                               |  1584
mean height                              |  1584.0
median height                            |  1584

min width                                |  1224
max width                                |  1224
mean width                               |  1224.0
median width                             |  1224

mean height/width ratio                  |  1.2941176470588236
median height/width ratio                |  1.2941176470588236
recommended input size(by mean)          |  [1584 1224] (h x w, multiples of 8)
recommended input size(by mean)          |  [1584 1216] (h x w, multiples of 16)
recommended input size(

{'dtype': 'uint8',
 'channels': [3],
 'extensions': ['jpeg'],
 'min_h': 1584,
 'max_h': 1584,
 'mean_h': 1584.0,
 'median_h': 1584,
 'min_w': 1224,
 'max_w': 1224,
 'mean_w': 1224.0,
 'median_w': 1224,
 'mean_hw_ratio': 1.2941176470588236,
 'median_hw_ratio': 1.2941176470588236,
 'rec_hw_size_8': array([1584, 1224]),
 'rec_hw_size_16': array([1584, 1216]),
 'rec_hw_size_32': array([1600, 1216]),
 'mean': array([0.9399718 , 0.9399661 , 0.93996894], dtype=float32),
 'std': array([0.23087765, 0.23089147, 0.23088476], dtype=float32)}

# Build TensorFlow Model

# Methods

In [8]:
def get_class_arrays():
    print('sorting w2 class files into arrays')
    adp1s,adp2s,irs1s,irs2s = [],[],[],[]
    for path in os.listdir(kaggle_w2_images_folder):    
        if 'adp1' in path.lower():
            adp1s.append(kaggle_w2_images_folder+path)
        if 'adp2' in path.lower():
            adp2s.append(kaggle_w2_images_folder+path)
        if 'irs1' in path.lower():
            irs1s.append(kaggle_w2_images_folder+path)
        if 'irs2' in path.lower():
            irs2s.append(kaggle_w2_images_folder+path)
    assert len(adp1s) == 250
    assert len(adp2s) == 250
    assert len(irs1s) == 250
    assert len(irs2s) == 250    
    return adp1s,adp2s,irs1s,irs2s

def get_training_features_labels(training_files):
    features = []

    for training_file in training_files:
        features.append(numpy.array(Image.open(training_file)))

    X_train = numpy.array(features).reshape(len(features),1584,1224,3)
    y_train = np_utils.to_categorical(
        numpy.array([0 for i in range(0,25)] + [1 for i in range(0,25)] + [2 for i in range(0,25)] + [3 for i in range(0,25)]))

    assert len(X_train) == len(y_train)
    assert len(X_train) == 100

    return X_train, y_train

def get_test_features_labels(test_files):
    features = []

    for test_file in test_files:
        features.append(numpy.array(Image.open(test_file))) 

    X_train = numpy.array(features).reshape(len(features),1584,1224,3)
    y_train = np_utils.to_categorical(
        numpy.array([0 for i in range(0,25)] + [1 for i in range(0,25)] + [2 for i in range(0,25)] + [3 for i in range(0,25)]))

    assert len(X_train) == len(y_train)
    assert len(X_train) == 100

    return X_train, y_train

def get_label(image_path):
    image_path = image_path.lower()
    
    if 'adp1' in image_path:
        return 0
    if 'adp2' in image_path:
        return 1
    if 'irs1' in image_path:
        return 2
    if 'irs2' in image_path:
        return 3

    raise Exception(f'label not found in {image_path}')

def get_predictions_and_truth(model,_training_files):
    print('making predictions')
    y_predictions = []
    y_true = []

    for path in os.listdir(kaggle_w2_images_folder):
        if kaggle_w2_images_folder + path in _training_files:
            print(f'skipping image used for training: {path}')
            continue
        w2_features = numpy.array(
            Image.open(kaggle_w2_images_folder + path))\
                .reshape(1,1584,1224,3)
        print(f'running prediction on image: {path}')
        y_predictions.append(model.predict(w2_features)[0])
        y_true.append(get_label(path))

    assert len(y_predictions) == len(y_true)
    y_true_categorical = np_utils.to_categorical(y_true)
    return y_true_categorical,numpy.array(y_predictions)

def get_model(num_classes):
    print(f'number of classes in tf.kears: {num_classes}')
    print('preping model')
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(1584,1224,3)),
        tf.keras.layers.Conv2D(9,5,5, padding='same', activation='relu'),
        tf.keras.layers.MaxPooling2D((2,2), padding='same'),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(num_classes, activation='softmax')])

    print('compiling model')
    model.compile(
        loss='categorical_crossentropy', 
        optimizer='adam', 
        metrics=['accuracy'])

    return model

def get_training_test_file_paths(): 
    adp1s,adp2s,irs1s,irs2s = get_class_arrays()
    training_files = adp1s[:25] + adp2s[:25] + irs1s[:25] + irs2s[:25]
    test_files = adp1s[26:51] + adp2s[26:51] + irs1s[26:51] + irs2s[26:51]
    return training_files,test_files

def print_model_results(y_true_categorical,y_predictions):
    y_true_categorical = y_true_categorical.astype('int32')
    y_predictions = y_predictions.astype('int32')
    
    print('getting model stats')
    metric = tf.keras.metrics.CategoricalAccuracy()
    metric.update_state(y_true_categorical,y_predictions)
    print('categorical accuracy')
    print(metric.result().numpy())

    accuracy_score_ = accuracy_score(y_true_categorical, y_predictions)
    print(f"Test Accuracy : {accuracy_score_}")

    print("Classification Report :")
    print(classification_report(
        y_true_categorical, 
        y_predictions, 
        target_names=['adp1','adp2','irs1','irs2']))

# Model Training/Predictions

In [9]:
training_files,test_files = get_training_test_file_paths()
X_train, y_train = get_training_features_labels(training_files)
X_test, y_test = get_test_features_labels(test_files)

num_classes=y_train.shape[1]

model = get_model(num_classes)

print('training model')
model.fit(
    X_train, 
    y_train, 
    batch_size=2,
    epochs=5,
    verbose=1,
    validation_data=(X_test, y_test))

y_true_categorical,y_predictions= get_predictions_and_truth(model,training_files)

sorting w2 class files into arrays
number of classes in tf.kears: 4
preping model
compiling model
training model
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
making predictions
skipping image used for training: W2_Multi_Sample_Data_input_ADP1_clean_15500.jpeg
skipping image used for training: W2_Multi_Sample_Data_input_ADP1_clean_15501.jpeg
skipping image used for training: W2_Multi_Sample_Data_input_ADP1_clean_15502.jpeg
skipping image used for training: W2_Multi_Sample_Data_input_ADP1_clean_15503.jpeg
skipping image used for training: W2_Multi_Sample_Data_input_ADP1_clean_15504.jpeg
skipping image used for training: W2_Multi_Sample_Data_input_ADP1_clean_15505.jpeg
skipping image used for training: W2_Multi_Sample_Data_input_ADP1_clean_15506.jpeg
skipping image used for training: W2_Multi_Sample_Data_input_ADP1_clean_15507.jpeg
skipping image used for training: W2_Multi_Sample_Data_input_ADP1_clean_15508.jpeg
skipping image used for training: W2_Multi_Sample_Data_input_ADP1_clean

# Model Results

Perfect F1-score :)

In [10]:
print_model_results(
    y_true_categorical,
    y_predictions)

getting model stats
categorical accuracy
1.0
Test Accuracy : 1.0
Classification Report :
              precision    recall  f1-score   support

        adp1       1.00      1.00      1.00       225
        adp2       1.00      1.00      1.00       225
        irs1       1.00      1.00      1.00       225
        irs2       1.00      1.00      1.00       225

   micro avg       1.00      1.00      1.00       900
   macro avg       1.00      1.00      1.00       900
weighted avg       1.00      1.00      1.00       900
 samples avg       1.00      1.00      1.00       900

