Import libraries

In [None]:
import jsonlines
import json
import glob as glob
import pandas as pd
from collections import Counter
import numpy as np
import difflib
import os
import csv
import matplotlib.pyplot as plt
import re
from difflib import SequenceMatcher

Functions

In [None]:
#Doccano output is in jsonl_files. Load jsonl file into a pandas dataframe
#Syntax is chosen_name = loadjsonl_to_df(jsonl_file)

def loadjsonl_to_df(jsonl_file):
    """Load a JSON Lines file into a pandas DataFrame.

    Each non-blank line of the file is parsed as one JSON document and
    becomes one row of the returned DataFrame.

    Args:
        jsonl_file: Path to the .jsonl file.

    Returns:
        A pandas DataFrame with one row per JSON record.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8; be explicit so the result does not depend on the
    # platform's default encoding.
    with open(jsonl_file, 'r', encoding='utf-8') as file:
        # Skip blank lines (e.g. a stray empty line at the end of the file),
        # which json.loads would otherwise reject.
        jsonl_data = [json.loads(line) for line in file if line.strip()]

    return pd.DataFrame(jsonl_data)

In [None]:
#Process reports to replace new lines with spaces (necessary for histopathology)

def preprocess_text(text):
    """Return text with every newline character replaced by a single space.

    The replacement is one character for one character, so the result has
    the same length as the input and character positions are unchanged.
    """
    newline_pattern = re.compile(r'\n')
    return newline_pattern.sub(' ', text)

def apply_preprocessing(row):
    """Run preprocess_text on the 'text' entry of row and return the result."""
    raw_text = row['text']
    return preprocess_text(raw_text)

In [None]:
#Redact text using doccano output
#doccano output is a list of sublists [element 1, element 2, element 3]:
#Element 1: start position (character index into the text)
#Element 2: end position (character index into the text, exclusive)
#Element 3: category of PHI

#Assumes df has a column 'text' which is the original text; 'manual_output' which is doccano

def redact_text(row):
    """Replace each annotated span of row['text'] with its bracketed category.

    Args:
        row: Mapping (e.g. a DataFrame row) with:
            'text': the original report text.
            'manual_output': iterable of [start, end, category] annotations,
                where start/end are character indices into the text and
                category is a string.

    Returns:
        The preprocessed text with every annotated span (except those whose
        category is 'time') replaced by '[category]', and with every
        remaining stand-alone 4-digit number replaced by '[date]'.
    """
    # preprocess_text is applied first; the annotation indices are then used
    # directly against its result.
    text = preprocess_text(row['text'])

    # 'time' is deliberately not treated as PHI, so those spans are left as-is.
    spans = []
    for annotation in row['manual_output']:
        start_token, end_token, string = annotation
        if string != 'time':
            spans.append((start_token, end_token, string))

    # Work through the spans in order of start position. Rebuilding the text
    # piece by piece from the untouched source means the result does not
    # depend on the order in which the annotations were stored.
    spans.sort(key=lambda span: span[0])

    pieces = []
    cursor = 0  # index in `text` up to which output has been produced
    for start_token, end_token, string in spans:
        pieces.append(text[cursor:start_token])  # unredacted text before the span
        pieces.append('[' + string + ']')
        # max() keeps the cursor from moving backwards on overlapping spans.
        cursor = max(cursor, end_token)
    pieces.append(text[cursor:])
    redacted_text = ''.join(pieces)

    # Replace 4-digit numbers that follow whitespace and are followed by
    # whitespace, end of text or punctuation with '[date]'.
    redacted_text = re.sub(r'(?<=\s)\d{4}(?=\s|$|[.,!?();:])', '[date]', redacted_text)

    return redacted_text

In [None]:
#extract lists of redacted words
#i.e. make a list of all words that are in the 'reference1' text but not in the given column

def extract_redacted_words(row, column):
    """List the words of row['reference1'] that do not occur in row[column].

    Both texts are split on whitespace. Words are compared exactly
    (case- and punctuation-sensitive).

    Args:
        row: Mapping (e.g. a DataFrame row) holding the 'reference1' text
            and the text stored under `column`.
        column: Key of the text to compare against 'reference1'.

    Returns:
        The words of row['reference1'], in their original order and with
        repeats kept, that are absent from row[column].
    """
    text_words = row['reference1'].split()
    # A set gives constant-time membership tests instead of rescanning a
    # list for every word.
    column_words = set(row[column].split())
    return [word for word in text_words if word not in column_words]