# Preprocessing Student Answer
Create scoring web apps and validate the scoring result.

Install Linux tools and only required for the first run.

In [1]:
!sudo apt-get update
!sudo apt-get install ffmpeg libsm6 libxext6 -y

Hit:1 http://deb.debian.org/debian bookworm InRelease
Get:2 http://deb.debian.org/debian bookworm-updates InRelease [55.4 kB]
Get:3 http://deb.debian.org/debian-security bookworm-security InRelease [48.0 kB]
Hit:4 https://dl.yarnpkg.com/debian stable InRelease                           
Get:5 https://packages.cloud.google.com/apt cloud-sdk InRelease [1620 B]       
Get:6 https://packages.cloud.google.com/apt cloud-sdk/main amd64 Packages [4064 kB]
Get:7 https://packages.cloud.google.com/apt cloud-sdk/main all Packages [1797 kB]
Fetched 5966 kB in 1s (5242 kB/s)  
Reading package lists... Done
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:5.1.6-0+deb12u1).
libsm6 is already the newest version (2:1.2.3-1).
libxext6 is already the newest version (2:1.3.4-1+b1).
0 upgraded, 0 newly installed, 0 to remove and 14 not upgraded.


In [1]:
pdf_file = "../data/TestScript.pdf"
standard_answer = pdf_file.replace(".pdf", ".xlsx")

In [2]:
import os

file_name = os.path.basename(pdf_file)
file_name = os.path.splitext(file_name)[0]
base_path = "../marking_form/" + file_name
base_path_images = base_path + "/images/"
base_path_annotations = base_path+"/annotations/"
base_path_questions = base_path+"/questions"
base_path_javascript = base_path+"/javascript"

# create directory tree for base_path_images
os.makedirs(base_path_images, exist_ok=True)
os.makedirs(base_path_annotations, exist_ok=True)
os.makedirs(base_path_questions, exist_ok=True)
os.makedirs(base_path_javascript, exist_ok=True)

In [3]:
import json
annotations_path = base_path_annotations + "annotations.json"
with open(annotations_path, "r") as f: 
    annotations = json.load(f)          

#flatten annotations to list 
annotations_list = []
for page in annotations:
    for annotation in annotations[page]:
        annotation["page"] = int(page)
        # x to left, y to top
        annotation["left"] = annotation["x"]
        annotation["top"] = annotation["y"]
        annotation.pop("x")
        annotation.pop("y")
        annotations_list.append(annotation) 
annotations_list

# convert annotations_list to dict with key with label
annotations_dict = {}
for annotation in annotations_list:
    annotations_dict[annotation["label"]] = annotation
# annotations_dict


In [4]:
# extract list of label from annotations as questions
questions = []
for annotation in annotations_list:
    if annotation["label"] not in questions:
        questions.append(annotation["label"])
# remove 'NAME', 'ID', 'CLASS' if exists in questions
if 'NAME' in questions:
    questions.remove('NAME')
if 'ID' in questions:
    questions.remove('ID')
if 'CLASS' in questions:
    questions.remove('CLASS')    

# sort questions 
questions.sort()
question_with_answer = questions.copy()
questions = ['NAME', 'ID', 'CLASS'] + questions
questions

['NAME',
 'ID',
 'CLASS',
 'A1-5',
 'A11-15',
 'A6-10',
 'B1b1',
 'B1b2',
 'B1b3',
 'B1b4',
 'B1b5',
 'B1b6',
 'B2-1',
 'B2-10',
 'B2-2',
 'B2-3',
 'B2-4',
 'B2-5',
 'B2-6',
 'B2-7',
 'B2-8',
 'B2-9',
 'B3-1',
 'B3-10',
 'B3-2',
 'B3-3',
 'B3-4',
 'B3-5',
 'B3-6',
 'B3-7',
 'B3-8',
 'B3-9',
 'B4-1',
 'B4-10',
 'B4-2',
 'B4-3',
 'B4-4',
 'B4-5',
 'B4-6',
 'B4-7',
 'B4-8',
 'B4-9',
 'B5-1',
 'B5-10',
 'B5-2',
 'B5-3',
 'B5-4',
 'B5-5',
 'B5-6',
 'B5-7',
 'B5-8',
 'B5-9',
 'B6-1',
 'B6-10',
 'B6-2',
 'B6-3',
 'B6-4',
 'B6-5',
 'B6-6',
 'B6-7',
 'B6-8',
 'B6-9']

## Validate Provided Standard Answer for each question

In [5]:
## load standard_answer to dataframe
import pandas as pd
name_list_df = pd.read_excel(standard_answer, sheet_name="NameList")
standard_answer_df = pd.read_excel(standard_answer, sheet_name="Answer")
standard_answer_df.head()

Unnamed: 0,Question,Answer,Mark
0,A1-5,0,10
1,A6-10,0,10
2,A11-15,0,10
3,B1b1,0,1
4,B1b2,0,2


Covert Question to str

In [6]:
standard_answer_df["Question"] = standard_answer_df["Question"].astype(str)

In [7]:
from termcolor import colored

# check question_with_answer in standard_answer_df Question column
for question in question_with_answer:
    if question not in standard_answer_df["Question"].values:
        print(colored("Question {} is not in standard_answer!".format(question), 'red'))

for question in standard_answer_df["Question"].values:
    if question not in question_with_answer:
        print(colored("Question {} is not in annotations!".format(question), 'red'))
            

In [8]:
standard_answer = standard_answer_df.set_index("Question").to_dict()["Answer"]
standard_answer

{'A1-5': 0,
 'A6-10': 0,
 'A11-15': 0,
 'B1b1': 0,
 'B1b2': 0,
 'B1b3': 0,
 'B1b4': 0,
 'B1b5': 0,
 'B1b6': 0,
 'B2-1': 0,
 'B2-2': 0,
 'B2-3': 0,
 'B2-4': 0,
 'B2-5': 0,
 'B2-6': 0,
 'B2-7': 0,
 'B2-8': 0,
 'B2-9': 0,
 'B2-10': 0,
 'B3-1': 0,
 'B3-2': 0,
 'B3-3': 0,
 'B3-4': 0,
 'B3-5': 0,
 'B3-6': 0,
 'B3-7': 0,
 'B3-8': 0,
 'B3-9': 0,
 'B3-10': 0,
 'B4-1': 0,
 'B4-2': 0,
 'B4-3': 0,
 'B4-4': 0,
 'B4-5': 0,
 'B4-6': 0,
 'B4-7': 0,
 'B4-8': 0,
 'B4-9': 0,
 'B4-10': 0,
 'B5-1': 0,
 'B5-2': 0,
 'B5-3': 0,
 'B5-4': 0,
 'B5-5': 0,
 'B5-6': 0,
 'B5-7': 0,
 'B5-8': 0,
 'B5-9': 0,
 'B5-10': 0,
 'B6-1': 0,
 'B6-2': 0,
 'B6-3': 0,
 'B6-4': 0,
 'B6-5': 0,
 'B6-6': 0,
 'B6-7': 0,
 'B6-8': 0,
 'B6-9': 0,
 'B6-10': 0}

In [9]:
standard_mark = standard_answer_df.set_index("Question").to_dict()["Mark"]
standard_mark

{'A1-5': 10,
 'A6-10': 10,
 'A11-15': 10,
 'B1b1': 1,
 'B1b2': 2,
 'B1b3': 2,
 'B1b4': 1,
 'B1b5': 2,
 'B1b6': 2,
 'B2-1': 1,
 'B2-2': 1,
 'B2-3': 1,
 'B2-4': 1,
 'B2-5': 1,
 'B2-6': 1,
 'B2-7': 1,
 'B2-8': 1,
 'B2-9': 1,
 'B2-10': 1,
 'B3-1': 1,
 'B3-2': 1,
 'B3-3': 1,
 'B3-4': 1,
 'B3-5': 1,
 'B3-6': 1,
 'B3-7': 1,
 'B3-8': 1,
 'B3-9': 1,
 'B3-10': 1,
 'B4-1': 1,
 'B4-2': 1,
 'B4-3': 1,
 'B4-4': 1,
 'B4-5': 1,
 'B4-6': 1,
 'B4-7': 1,
 'B4-8': 1,
 'B4-9': 1,
 'B4-10': 1,
 'B5-1': 1,
 'B5-2': 1,
 'B5-3': 1,
 'B5-4': 1,
 'B5-5': 1,
 'B5-6': 1,
 'B5-7': 1,
 'B5-8': 1,
 'B5-9': 1,
 'B5-10': 1,
 'B6-1': 1,
 'B6-2': 1,
 'B6-3': 1,
 'B6-4': 1,
 'B6-5': 1,
 'B6-6': 1,
 'B6-7': 1,
 'B6-8': 1,
 'B6-9': 1,
 'B6-10': 1}

Check for the regeneration of question.

In [10]:
import os
import json

questionAndControl = {}
for path, currentDirectory, files in os.walk(base_path_questions):
    for file in files:
        if file == "control.json":
            question = path[len(base_path_questions) + 1 :]
            f = open(os.path.join(path, file))
            data = json.load(f)
            if "regenerate" in data:
                questionAndControl[question] = data
            f.close()

questionAndControl

{}

In [11]:
from distutils.dir_util import copy_tree
import shutil
import os

from_directory = os.path.join(os.getcwd(), "..","templates", "javascript")
copy_tree(from_directory, base_path_javascript)
ico = os.path.join(os.getcwd(), "..","templates", "favicon.ico")
# copy ico file  to base_path
shutil.copyfile(ico, base_path+"/favicon.ico")

  from distutils.dir_util import copy_tree


'../marking_form/TestScript/favicon.ico'

Generate the index.html

In [12]:
from pathlib import Path
from jinja2 import Environment, FileSystemLoader

file_loader = FileSystemLoader("../templates")
env = Environment(loader=file_loader)
template = env.get_template("index.html")

output = template.render(
    studentsScriptFileName=file_name,
    textAnswer=questions,
    optionAnswer=[],
)
# open text file
path = Path(os.path.join(base_path, "index.html"))
text_file = open(path, "w")
text_file.write(output)
text_file.close()

In [13]:
import easyocr
import tempfile
from PIL import Image, ImageEnhance
import os

easyocrLanguages = ["en"]
reader = easyocr.Reader(easyocrLanguages, gpu=True)

def ocr_image_from_file(image_path, left, top, width, height):
    imageFile = tempfile.NamedTemporaryFile(suffix=".png").name
    with Image.open(image_path) as im:
        # The crop method from the Image module takes four coordinates as input.
        # The right can also be represented as (left+width)
        # and lower can be represented as (upper+height).
        (left, top, right, lower) = (
            left,
            top,
            left + width,
            top + height,
        )
        # Here the image "im" is cropped and assigned to new variable im_crop
        im_crop = im.crop((left, top, right, lower))
        imageEnhance = ImageEnhance.Sharpness(im_crop)
        # showing resultant image
        im_crop = imageEnhance.enhance(3)
        im_crop.save(imageFile, format="png")
    result = reader.readtext(imageFile, detail=0)
    easyocrText = "".join(result)
    text = easyocrText   
    os.remove(imageFile)
    return text

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


### Get embedding 
You can change the logic and model according to https://www.sbert.net/ 

In [14]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")

def calculate_similarity(answers, question):
    # Add the standard answer to the head of list.

    if question not in standard_answer:
        ## return list of 0 in len of answers
        return [0] * len(answers)

    answers.insert(0, standard_answer[question])
    # Compute embeddings
    embeddings = model.encode(answers, convert_to_tensor=True)
    # Compute cosine-similarities for each sentence with each other sentence
    cosine_scores = util.pytorch_cos_sim(embeddings, embeddings)
    # Find the pairs with the highest cosine similarity scores
    pairs = []
    for j in range(0, len(cosine_scores)):
        pairs.append(float(cosine_scores[0][j]))
    # Empty answer similarity must be 0.
    l = list(map(lambda x: (x[0], 0) if x[0] == "" else x, zip(answers, pairs)))
    similarties = list(list(zip(*l))[1])
    similarties.pop(0)
    return similarties

In [26]:
import os
import pandas as pd


def get_the_list_of_files(path):
    """
    Get the list of files in the directory
    """
    files = []
    for dirpath, dirnames, filenames in os.walk(path):
        files.extend(filenames)
        break
    return sorted(files)


images = get_the_list_of_files(base_path_images)

# get max page from annotations_list
max_page = 0
for annotation in annotations_list:
    if annotation["page"] > max_page:
        max_page = annotation["page"]
max_page = max_page + (1 if max_page % 2 == 1 else max_page + 2) # Scanner will have a blank page!

# filter images by file name divided by page
images_by_page = []
for page in range(max_page):
    images_by_page.append([])
    for image in images:
        p = int(image.split(".")[0])
        if p % max_page == page:
            images_by_page[page].append(image)


def get_df(question):
    row = annotations_dict[question].copy()

    row["Confidence"] = 0.1
    row["Similarity"] = 0
    row["Image"] = images_by_page[row["page"]]
    # append base_path_images to each image
    row["Image"] = ["images/" + image for image in row["Image"]]

    # expend row to dataframe for each image in row["Image"]
    data = pd.DataFrame(row)
    data = data.explode("Image")
    data = data.reset_index(drop=True)
    print("Size of data:", len(data))

    #need to know structure of data to find id, class, and name. Then get the size of data["Answer"]
    print("Data Label 0:", data["label"][0])
    #print("len of data['Answer'] before:", len(data["Answer"]))
    if data["label"][0] in ["ID", "CLASS", "NAME"]:
        data["Answer"] = data.apply(
            lambda row: ocr_image_from_file(
                base_path + "/" + row["Image"],
                row["left"],
                row["top"],
                row["width"],
                row["height"],
            ),
            axis=1,
        )
    else:
        data["Answer"] = data.apply(lambda row: "", axis=1)

    print("Data type of data['Answer']:", data["Answer"].dtype)
    # check the ocr result data["Answer"]
    print("OCR results for data['Answer']:", data["Answer"])
    print("len of data['Answer']", len(data["Answer"]))
    # add column RowNumber
    data["RowNumber"] = data.index + 1
    data["maskPage"] = data["page"]

    #start to ignore ocr
    #similarties = calculate_similarity(data["Answer"].tolist(), question)
    # need to find the output of similartires and then use a fake one to replace it
    similarties = [0] * len(data["Answer"])
    print("Similarties rows:", len(similarties))
    #print("Similarties cols:", len(similarties[0]))
    print("Number of answers:", data["Answer"].count())


    print("Similarties:", similarties)  # Debugging line to check similarties

    data["Similarity"] = similarties

    data["page"] = data["Image"].apply(
        lambda x: x.replace("images/", "").replace(".jpg", "")
    )
    data["Mark"] = data["Answer"].apply(lambda x: "0" if len(x.strip()) == 0 else "")
    return data


def save_template_output(output, question, filename):
    path = Path(base_path_questions, question)
    path.mkdir(parents=True, exist_ok=True)
    path = Path(os.path.join(path, filename))
    text_file = open(path, "w")
    text_file.write(output)
    text_file.close()


# question = "NAME"
# get_df(question)

Generate individual question page.

In [27]:
from ipywidgets import IntProgress
from IPython.display import display

max_count = len(questions)
f = IntProgress(min=0, max=max_count) # instantiate the bar
display(f) # display the bar

for question in questions:
    dataTable = get_df(question)
    os.makedirs(base_path_questions + "/" + question, exist_ok=True)
    dataTable.to_csv(base_path_questions + "/" + question + "/data.csv", index=False)

    if question == "ID" or question == "NAME" or question == "CLASS":
        template = env.get_template("questions/index-answer.html")
    else:
        template = env.get_template("questions/index.html")
    output = template.render(
        studentsScriptFileName=file_name,
        question=question,
        standardAnswer=standard_answer[question] if question in standard_answer else "",
        standardMark=standard_mark[question] if question in standard_mark else "",
        estimatedBoundingBox=annotations_dict[question],
        dataTable=dataTable,
    )
    save_template_output(output, question, "index.html")

    template = env.get_template("questions/question.js")
    output = template.render(
        dataTable=dataTable,
        estimatedBoundingBox=annotations_dict[question],
    )
    save_template_output(output, question, "question.js")

    template = env.get_template("questions/style.css")
    output = template.render(
        dataTable=dataTable,
    )
    save_template_output(output, question, "style.css")
    f.value += 1

IntProgress(value=0, max=62)

Size of data: 52
Data Label 0: NAME
Data type of data['Answer']: object
OCR results for data['Answer']: 0            WU Chun Chung
1            CHAN Ying Wai
2           LOCK Chun Kwan
3          LEUNG Cheuk Hin
4     CHIK Hei Tung Hailey
5          CHAN Cheuk Kwan
6                LI Bailin
7             WONG Yiu Tin
8              MAK Wing Ho
9            HE Shing Yuen
10            TAM Chi Long
11             HONG Peilin
12           CHAN Chun Nok
13             CHAN For Yu
14           CHAN Hau Kwan
15           CAI Wai Shing
16           LAW Hong Yung
17             YU Tsun Hei
18           CHEUNG Ka Chi
19          KWOK Chi Leong
20              LI Wui Kit
21           CHAN Chi Ning
22             LO Hin Wang
23               WU Bingze
24      NG Lai Sang Stella
25           CHAN Aromwhan
26          ZHENG CheukFan
27           TANG Pak Long
28            LEE Kwok Lun
29             LAM Lik Hei
30           KWAN Chun San
31            HO Cheuk Kei
32     HUI Ho Fung Matthew
33   

## Validate Student ID

In [28]:
# load csv file to dataframe
import pandas as pd

id_from_oscr = pd.read_csv(base_path_questions + "/" + "ID" + "/data.csv")["Answer"].tolist()
id_from_namelist = name_list_df["StudentID"].to_list()

# check duplicate id
duplicate_id = []
for id in id_from_oscr:
    if id_from_oscr.count(id) > 1:
        duplicate_id.append(id)
duplicate_id = list(set(duplicate_id))
if len(duplicate_id) > 0:
    print(colored("Duplicate ID: {}".format(duplicate_id), "red"))

id_from_oscr = [str(id) for id in id_from_oscr]
id_from_namelist = [str(id) for id in id_from_namelist]

# compare oscr_id and validate_id
ocr_missing_id = []
name_list_missing_id = []
for id in id_from_oscr:    
    if id not in id_from_namelist:       
        name_list_missing_id.append(id)

for id in id_from_namelist:
    if id not in id_from_oscr:   
        ocr_missing_id.append(id)

## OCR scan error case

In [29]:
from termcolor import colored
if len(ocr_missing_id) > 0:
    print(colored("Some IDs OCR is not in NameList and you need to fix it manually!", "red"))
    for id in name_list_missing_id:
        print(colored(id, "red"))

[31mSome IDs OCR is not in NameList and you need to fix it manually![0m


## Potential Absent Case

In [30]:
from termcolor import colored

if len(ocr_missing_id) > 0:
    print(colored("Number of absentee {}.".format(len(ocr_missing_id)), "red"))
    print(colored("ID in Name List does not find from OCR!", "red"))
    for id in ocr_missing_id:
        print(colored(id, "red"))

[31mNumber of absentee 10.[0m
[31mID in Name List does not find from OCR![0m
[31m240708094[0m
[31m240073742[0m
[31m240112262[0m
[31m240395417[0m
[31m240000577[0m
[31m240421753[0m
[31m240050716[0m
[31m240106046[0m
[31m230235577[0m
[31m230036411[0m


# Start Python HTTPServer

The webserver log is in output/server.log.

If you are in development and don't want the notebook being blocked by running webserver, you can open a terminal and run the below command.

file_name=XXXX python server.py 8000

In [31]:
print("file_name={} python server.py".format(file_name))

file_name=TestScript python server.py


In [19]:
# You can also uncomment the following line to run the web server but if it crashes, you need to restart the kernel.
!cd .. && file_name=TestScript python server.py

 * Serving Flask app 'server'
 * Debug mode: off
 * Running on http://127.0.0.1:5000
[33mPress CTRL+C to quit[0m
127.0.0.1 - - [23/Jul/2025 05:52:53] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [23/Jul/2025 05:52:56] "GET /questions/CLASS/index.html HTTP/1.1" 200 -
127.0.0.1 - - [23/Jul/2025 05:52:56] "GET /javascript/common.js HTTP/1.1" 200 -
127.0.0.1 - - [23/Jul/2025 05:52:56] "GET /questions/CLASS/style.css HTTP/1.1" 200 -
127.0.0.1 - - [23/Jul/2025 05:52:56] "GET /questions/CLASS/question.js HTTP/1.1" 200 -
[2025-07-23 05:52:57,305] ERROR in app: Exception on /questions/CLASS/control.json [GET]
Traceback (most recent call last):
  File "/home/vscode/.local/lib/python3.11/site-packages/flask/app.py", line 1455, in wsgi_app
    response = self.full_dispatch_request()
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/vscode/.local/lib/python3.11/site-packages/flask/app.py", line 869, in full_dispatch_request
    rv = self.handle_user_exception(e)
         ^^^^^^^^^^^^^^^^^^^^^^^^^

# Post Processing after scoring
1. Check all question has scores.
2. Check ID again.
3. Remove version history.

In [32]:
# check each sub folder of base_path_questions contains file name mark.json, ignore the root folder
import os
import json

unfinsihed_scoring = []
for path, currentDirectory, files in os.walk(base_path_questions):
    if path != base_path_questions:
        # extract question name from path
        question = path[len(base_path_questions) + 1 :]
        if "mark.json" not in files:
            unfinsihed_scoring.append(question)
        else:
            # read mark.json as json
            with open(os.path.join(path, "mark.json"), "r") as f:
                marks = json.load(f)            
            # check each mark in marks that attribute "mark" or "overridedMark" is not empty
            for mark in marks:
                if mark['mark'] == "" and  mark['overridedMark'] == "":
                    # extract question name from path                   
                    unfinsihed_scoring.append(question)
                    break             

if len(unfinsihed_scoring) > 0:            
    print(colored("{} have some question not yet mark!".format(unfinsihed_scoring), "red"))          
else:
    print("All questions have been marked!")

All questions have been marked!


Check ID

In [33]:
import os
import json
from termcolor import colored

with open(os.path.join(base_path_questions,"ID", "mark.json"), "r") as f:
    marks = json.load(f)

id_from_mark = list(map(lambda x: x["overridedMark"] if x["overridedMark"] != "" else x["mark"], marks))
id_from_namelist = name_list_df["StudentID"].to_list()

# convert id_from_mark to string
id_from_mark = [str(id) for id in id_from_mark]
id_from_namelist = [str(id) for id in id_from_namelist]

mark_missing_id = []
for id in id_from_namelist:
    if id not in id_from_mark:   
        mark_missing_id.append(id)
print(colored("In class but not marked - {}!".format(mark_missing_id), "red"))    

marked_but_not_in_namelist = []
for id in id_from_mark:
    if id not in id_from_namelist:   
        marked_but_not_in_namelist.append(id)

print(colored("Marked ID but not in class - {}!".format(marked_but_not_in_namelist), "red"))

[31mIn class but not marked - ['240708094', '240073742', '240112262', '240395417', '240000577', '240421753', '240050716', '240106046', '230235577', '230036411']![0m
[31mMarked ID but not in class - []![0m


### Remove version history
Before you backup.

In [None]:
## remove fill start with control- or mark- and end with .json in base_path_questions recursively.
import os
for path, currentDirectory, files in os.walk(base_path_questions):
    for file in files:
        if file.startswith("control-") or file.startswith("mark-"):
            os.remove(os.path.join(path, file))

### Reset everything (Danger)
Remove mark.js and control.js

In [None]:
# import os
# for path, currentDirectory, files in os.walk(base_path_questions):
    
#     for file in files:       
#         if file == "control.json" or file == "mark.json":
#             os.remove(os.path.join(path, file))