# Load Raw Data

In [14]:
# import modules
import pickle
import numpy as np
import pandas as pd
import os
# import tensorflow as tf

MAX_PRINT = 10

In [9]:

def extract_last_word(line):
  """
  Extracts the last word from a line 
  Input: String with space delimiter
  Output: Word as a string
  """
  words = line.split()
  last_word = words[-1]
  return last_word

def extract_last_words(file_path, MAX_WORDS=np.inf):
  """
  Extracts the last word from each line in the input file and returns the words as a list.
  Input: File path, max number of words (OPTIONAL)
  Output: List of last words
  """
  
  with open(file_path, "r") as f:
    lines = f.readlines()

    last_words = []
  
  for line in lines:
    if "#" in line:
       continue

    last_word = extract_last_word(line)
    last_words.append(last_word)
    if len(last_words) == MAX_WORDS:
      break
  return last_words

words = extract_last_words("./Datasets/ascii/words.txt", 40)
print(words[0:MAX_PRINT])


['A', 'MOVE', 'to', 'stop', 'Mr.', 'Gaitskell', 'from', 'nominating', 'any', 'more']


In [10]:
def extract_filepaths(file_path, MAX_PATHS=float("inf")):
    """
    Takes the words.txt file and extracts all filepaths
    Input: File of all filepaths
    Output: List of unprocessed filepaths (processed with convert_filepath())
    """
    with open(file_path, "r") as f:
        lines = f.readlines()

        file_paths = []
    
        for line in lines:
            if "#" in line:
                continue
            words = line.split()
            current_file_path = words[0] 
            file_paths.append(current_file_path)
            if len(file_paths) == MAX_PATHS:
                break

    return file_paths

paths = extract_filepaths("./Datasets/ascii/words.txt", 40)
print(paths[0:MAX_PRINT])

['a01-000u-00-00', 'a01-000u-00-01', 'a01-000u-00-02', 'a01-000u-00-03', 'a01-000u-00-04', 'a01-000u-00-05', 'a01-000u-00-06', 'a01-000u-01-00', 'a01-000u-01-01', 'a01-000u-01-02']


In [11]:
def convert_filepath(filename):
    """
    Converts filename (string) into proper filepath (string)
    e.g. input: Filename = "a1-00-121-000"
        output: filepath = "a1/a1-00/a1-00-121-000.png"
    """
    parts = filename.split("-")  # Split the filename by "-"
    folder_parts = [parts[0]] + [f"{parts[0]}-{parts[1]}"]  # Exclude the last part (file name with extension)
    
    # Combine the folder parts using "/"
    folder_path = "/".join(folder_parts)
    
    # Join the folder path with the original file name
    filepath = f"{folder_path}/{filename}.png"
    
    return filepath

print(convert_filepath("a01-000u-00-00"))

a01/a01-000u/a01-000u-00-00.png


In [19]:
def extract_filepath_text_dic(file_path, MAX_WORDS=float("inf")):
    """
    Reads a file (like words.txt) and returns a dictionary mapping filepaths (.png) to words (strings)
    Input: File
    Output: Dictionary mapping filepaths to associated words
    """
    with open(file_path, "r") as f:
        lines = f.readlines()

    filepathToWord = {}
  
    for line in lines:
        if "#" in line:
            continue

        text = extract_last_word(line)
        filename = line.split()[0]
        file_path = convert_filepath(filename)
        filepathToWord[file_path] = text

        if len(filepathToWord) == MAX_WORDS:
            break
    return filepathToWord

filepaths_test = extract_filepath_text_dic("./Datasets/ascii/words.txt", 40)
printDic(filepaths_test)

def generate_word_to_filepath(dictionary):
    """
    Creates a word to filepath dictionary
    Input: dictionary that maps filepaths to words
    Output: dictionary that maps words to a list of filepaths
    """
    wordToFilepath = {}

    for filepath in dictionary: #loops by keys
        word = dictionary[filepath]
        if not word in wordToFilepath:
            wordToFilepath[word] = [filepath]
        else:
            wordToFilepath[word].append(filepath)
        
    return wordToFilepath
   
words_test = generate_word_to_filepath(filepaths_test)
printDic(words_test)


a01/a01-000u/a01-000u-00-00.png A
a01/a01-000u/a01-000u-00-01.png MOVE
a01/a01-000u/a01-000u-00-02.png to
a01/a01-000u/a01-000u-00-03.png stop
a01/a01-000u/a01-000u-00-04.png Mr.
a01/a01-000u/a01-000u-00-05.png Gaitskell
a01/a01-000u/a01-000u-00-06.png from
a01/a01-000u/a01-000u-01-00.png nominating
a01/a01-000u/a01-000u-01-01.png any
a01/a01-000u/a01-000u-01-02.png more
a01/a01-000u/a01-000u-01-03.png Labour
A ['a01/a01-000u/a01-000u-00-00.png']
MOVE ['a01/a01-000u/a01-000u-00-01.png']
to ['a01/a01-000u/a01-000u-00-02.png', 'a01/a01-000u/a01-000u-02-01.png', 'a01/a01-000u/a01-000u-05-03.png']
stop ['a01/a01-000u/a01-000u-00-03.png']
Mr. ['a01/a01-000u/a01-000u-00-04.png', 'a01/a01-000u/a01-000u-03-03.png']
Gaitskell ['a01/a01-000u/a01-000u-00-05.png']
from ['a01/a01-000u/a01-000u-00-06.png']
nominating ['a01/a01-000u/a01-000u-01-00.png']
any ['a01/a01-000u/a01-000u-01-01.png']
more ['a01/a01-000u/a01-000u-01-02.png']
Labour ['a01/a01-000u/a01-000u-01-03.png', 'a01/a01-000u/a01-000u-02

# Helper Functions

In [6]:
# ================== DATA Extraction ===================================
# ================== Iterate through all images in subfolders recursively =======================
from tqdm import tqdm

def extract_all(folder_path, MAX_ITER=float('inf')):
  '''
  DESC: for every image report in folder_path, generates a folder per patient with
        OCR, RNFL, and text extractions in .jpg format
  INPUT: folder_path | path to folder with images of reports
  RETURNS: none
  '''
  # Get a list of files in the folder
  file_list = os.listdir(folder_path)

  # Initialize tqdm with the total number of files
  progress_bar = tqdm(file_list, desc='Processing Images', unit='image')
  iter = 0

  for file_name in progress_bar:
    print("Iteration: ", iter, " | File:", file_name)
    if iter >= MAX_ITER:
      break

    # Check if the file has an image extension
    if file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif')):
        img = os.path.splitext(file_name)[0]  # Get the image name without extension
        image_path = os.path.join(folder_path, file_name)
        output_folder = os.path.join(folder_path, img)

        if not os.path.exists(output_folder): # folder for patient i with extracted data
            os.makedirs(output_folder)

        # do processing
        '''
        extract_rnfl(image_path, output_path=output_folder + '/')
        extract_oct(image_path, output_path=output_folder + '/')
        extract_textbox(image_path, output_path=output_folder + '/')
        '''

        iter += 1

  print("Extraction completed.")


In [18]:
# print formatted dictionary

def printDic(dic, MAX_PRINT=10):
    for i, (key, value) in enumerate(dic.items()):
        if i > MAX_PRINT:
            break
        print(key, value)
    print("==================")

# Save and Load Project States



In [7]:
filepaths_lst = extract_filepaths("./Datasets/ascii/words.txt")
words_lst = extract_last_words("./Datasets/ascii/words.txt")
filepaths_dic = extract_filepath_text_dic("./Datasets/ascii/words.txt")
words_dic = generate_word_to_filepath(filepaths_dic)
# Save all variables and data structures to a file
with open("./variables.pkl", "wb") as file:
    vars =  (
            # list variable names here
            filepaths_lst,
            words_lst,
            filepaths_dic,
            words_dic
            )
    pickle.dump(vars, file)
print("Save Complete")

Save Complete


In [8]:
# Load all variables and data structures from a file
with open("./variables.pkl", "rb") as file:
    (
       filepaths_lst,
            words_lst,
            filepaths_dic,
            words_dic
    ) = pickle.load(file)
print("Load Complete")

Load Complete


In [1]:
# test what we loaded was correct: print first 5 in words_dic
#test = [print(key, value) for i, (key, value) in enumerate(words_dic.items()) if i < 5]