In [1]:
import os
# Spark release to install. Find the latest version of Spark 3.0 at
# http://www.apache.org/dist/spark/ and enter it as the spark version.
# For example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.0.3'
# Exported to the environment so the `!` shell lines below can expand $SPARK_VERSION
os.environ['SPARK_VERSION']=spark_version

# Install Java 11, download and unpack the Spark distribution, and install findspark
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
# presumably the install location of the openjdk-11 package above — verify on the runtime image
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
# assumes the tarball was extracted with /content as the working directory — TODO confirm
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Initialise findspark so that pyspark can be imported; the SparkSession itself is created in a later cell
import findspark
findspark.init()

0% [Working]            Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com (185.1                                                                               Ign:2 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com (185.10% [1 InRelease gpgv 1,581 B] [Connecting to archive.ubuntu.com] [Connecting to                                                                               Hit:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:4 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Hit:5 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:6 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Get:7 http://archive.ubuntu.com/ubuntu bionic-updates I

In [2]:
# Download the PostgreSQL JDBC driver jar (version 42.2.16) into the current working directory.
# Spark needs this driver to interact with Postgres; the session created later expects it at
# /content/postgresql-42.2.16.jar.
!wget https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

--2022-06-25 20:11:53--  https://jdbc.postgresql.org/download/postgresql-42.2.16.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1002883 (979K) [application/java-archive]
Saving to: ‘postgresql-42.2.16.jar’


2022-06-25 20:11:54 (6.13 MB/s) - ‘postgresql-42.2.16.jar’ saved [1002883/1002883]



In [3]:
from pyspark.sql import SparkSession
# Build (or reuse) the Spark session named "prediction", with the downloaded
# PostgreSQL JDBC jar added to the driver classpath.
spark = (
    SparkSession.builder
    .appName("prediction")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.16.jar")
    .getOrCreate()
)

In [4]:
# Import relevant libraries

from google.colab.patches import cv2_imshow
from google.colab.patches import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
from tensorflow.keras.models import load_model
import base64
from imageio import imread
import io
from PIL import Image

In [5]:
# Mount Google Drive at /content/drive; the model file and validation images are read from there

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Directory (on the mounted Drive) and file name of the trained model
mod_data_file_Path = "/content/drive/MyDrive/Generated_Models/"
mod_data_input_file = "character_recog_model_revision_14.h5"

# Full path: the directory string already ends with "/", so the two parts are simply joined
mod_to_process = f"{mod_data_file_Path}{mod_data_input_file}"
print(mod_to_process)

/content/drive/MyDrive/Generated_Models/character_recog_model_revision_14.h5


In [7]:
# Load the trained Keras model from the .h5 file at mod_to_process
trained_model = load_model(mod_to_process)

In [8]:
# Compile trained model
# NOTE(review): in the cells shown here the model is only used via predict(), so the
# optimizer/loss chosen below should not influence the predictions. "binary_crossentropy"
# looks unusual for a 26-letter classifier — confirm against the training notebook before
# relying on this configuration for evaluate()/fit().
trained_model.compile(optimizer="adam", loss="binary_crossentropy", metrics = ["accuracy"])

In [9]:
# Lookup table that maps the model's predicted class index (0-25) to its letter ('A'-'Z')

words = {index: chr(ord('A') + index) for index in range(26)}

In [10]:
# Function to predict letter characters using the trained model

# Debug scratch list kept so existing references keep working; the function below does not use it.
xx = []
def predict_letter_from_word(in_read_image):
    """Predict the upper-case letter shown in an image using ``trained_model``.

    The image is blurred, converted to greyscale, inverse-thresholded, shrunk to
    28 x 28 and reshaped to ``(1, 28, 28, 1)`` before being passed to the model.

    Parameters
    ----------
    in_read_image : numpy.ndarray
        Three-channel image array (the greyscale conversion requires three channels).
        The array is not modified.

    Returns
    -------
    str
        The entry of ``words`` whose key is the index of the model's highest output.
    """
    # Smooth the image before thresholding. GaussianBlur returns a new array, so the
    # caller's image is left untouched and no defensive copy is required.
    blurred_image = cv2.GaussianBlur(in_read_image, (7, 7), 0)

    # cv2.threshold needs a single-channel image.
    # NOTE(review): COLOR_BGR2GRAY assumes BGR channel order, while the base64 loader in
    # this notebook converts images to RGB — kept unchanged to preserve results; confirm
    # which order the model was trained with.
    grey_image = cv2.cvtColor(blurred_image, cv2.COLOR_BGR2GRAY)

    # Separate object from background pixels using inverse binary thresholding
    # https://docs.opencv.org/4.x/d7/d4d/tutorial_py_thresholding.html
    _, img_thresh = cv2.threshold(grey_image, 100, 255, cv2.THRESH_BINARY_INV)

    # Resize and reshape to the (batch, height, width, channel) layout passed to the model
    final_image = cv2.resize(img_thresh, (28, 28))
    final_image = np.reshape(final_image, (1, 28, 28, 1))

    # The index of the largest model output selects the predicted letter
    predicted_index = np.argmax(trained_model.predict(final_image))

    return words[predicted_index]

In [11]:
def determine_expected_character_from_image_file_name(in_image_file_name):
    """Derive the expected character from an image file name.

    File names follow the pattern ``imgNNN-xxx.png`` where ``img011`` is 'A' and
    ``img036`` is 'Z'. The two digits at positions 4-5 of the name are read as an
    integer, offset so that 11 maps to the ASCII code of 'A', and converted to a
    character.

    Raises ``ValueError`` if those two characters are not an integer.
    """
    sample_number = int(in_image_file_name[4:6])

    # 11 -> 'A' (ASCII 65), 12 -> 'B', ..., 36 -> 'Z'
    return chr(ord("A") + (sample_number - 11))

In [12]:
def convert_image_to_base64(in_image_file_name):
    """Read the file at ``in_image_file_name`` and return its bytes as a base64 text string."""
    with open(in_image_file_name, "rb") as image_stream:
        raw_bytes = image_stream.read()

    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

In [13]:
def convert_from_base_64_to_image(in_base_64_image_string):
    """Decode a base64 string into an image and return it as a float32 array in RGB mode."""
    # Decode the text back into the raw image bytes and expose them as a file-like object
    decoded_bytes = base64.b64decode(in_base_64_image_string)
    byte_stream = io.BytesIO(decoded_bytes)

    # Let PIL parse the bytes, then force three-channel RGB
    rgb_image = Image.open(byte_stream).convert("RGB")

    return np.float32(rgb_image)

In [14]:
def determine_prediction_success_or_fail(in_expected_character,in_predicted_character):
    """Return True when the predicted character equals the expected one, otherwise False."""
    return bool(in_expected_character == in_predicted_character)

In [15]:
# Create log reference: one UTC timestamp shared by every row written for this validation run
validation_run_reference = dt.datetime.now(dt.timezone.utc)

# Column layout of the validation log; each record built in the loop follows this order
column_names = ['run_date_time','image_reference','image_category','image_character','predicted_character','prediction_result','model_reference']

# Read images to be used for validation run
directory_path = '/content/drive/MyDrive/Resized_images/'
# Sorted so repeated runs process the images in the same order (os.listdir order is arbitrary)
directory_listing = sorted(os.listdir(directory_path))

# Records are collected in a plain list and turned into a DataFrame once after the loop.
# Growing a DataFrame row by row with DataFrame.append is quadratic, and that method
# was removed in pandas 2.0.
prediction_records = []

for image_name in directory_listing:

    image_file_name = (directory_path + image_name)

    # Determine expected character from the file name
    expected_character = determine_expected_character_from_image_file_name(image_name)

    # Convert image to base 64 string, then rebuild an image array from that string.
    # NOTE(review): presumably this round trip mirrors how images reach the model elsewhere — confirm.
    base_64_image_string = convert_image_to_base64(image_file_name)
    image_returned = convert_from_base_64_to_image(base_64_image_string)

    # Predict the letter and compare it with the expected character
    predicted_letter = predict_letter_from_word(image_returned)
    prediction_result = determine_prediction_success_or_fail(expected_character,predicted_letter)

    # One log record per image; image_category is left empty
    prediction_records.append([validation_run_reference,image_name,"",expected_character,predicted_letter,prediction_result,mod_data_input_file])

# Build the results frame in a single step (an empty listing still yields the expected columns)
prediction_results_df = pd.DataFrame(prediction_records, columns=column_names)


In [16]:
# Convert the pandas results DataFrame to a Spark DataFrame so it can be written out through Spark

df_spark = spark.createDataFrame(prediction_results_df)

In [18]:
# Configure settings for RDS
import getpass

# Rows are appended to the existing log table rather than replacing it
mode = "append"
jdbc_url="jdbc:postgresql://finalpostgresdb.cxwdymdhaxq6.us-east-1.rds.amazonaws.com:5432/my_final_project"

# The database password must never be stored in the notebook. It is taken from the
# RDS_PASSWORD environment variable when set; otherwise the user is prompted for it
# (the prompt does not echo the input). Any password that was previously committed
# with this notebook should be rotated.
config = {
    "user": "root",
    "password": os.environ.get("RDS_PASSWORD") or getpass.getpass("RDS password: "),
    "driver": "org.postgresql.Driver",
}

# Write the validation results to the Postgres log table over JDBC
df_spark.write.jdbc(url=jdbc_url, table="final_model_validation_run_logs", mode=mode, properties=config)