## Step 1: Pre-Preprocessing

In [1]:
# import preprocessing code
from src.preprocess import PreProcessor, df_to_train_set

Using TensorFlow backend.


In [2]:
# define the folder paths for each available dataset
from typing import NamedTuple, List

class Dataset(NamedTuple):
    """
    Immutable record describing where one dataset lives on disk.

    Attributes:
        title: Short name of the dataset; this notebook passes it to
            ``PreProcessor``.
        preprocessed_folder: Path of the folder handed to ``get_data`` when
            loading already-preprocessed data.
        raw_folders: Paths of the raw-data folders handed to ``get_data``
            when not loading preprocessed data.
    """

    title: str
    preprocessed_folder: str
    raw_folders: List[str]

# Small sample set whose raw files sit under docs/.
SAMPLE_DATA = Dataset(
    title="sample_data",
    preprocessed_folder="data/preprocessed/sample_data/",
    raw_folders=["docs/Track1-de-indentification/PHI/"],
)

# First gold training set only.
GOLD_1 = Dataset(
    title="gold_1",
    preprocessed_folder="data/preprocessed/gold_1/",
    raw_folders=["data/raw/training-PHI-Gold-Set1/"],
)

# Both gold training sets combined.
# NOTE(review): unlike the two entries above, this one has an empty
# preprocessed_folder and its raw paths are prefixed with "../" -- confirm
# both are intended before selecting it.
GOLD_FULL = Dataset(
    title="gold_full",
    preprocessed_folder="",
    raw_folders=[
        "../data/raw/training-PHI-Gold-Set1/",
        "../data/raw/training-PHI-Gold-Set2/",
    ],
)

# Registry of every dataset defined above; later cells pick one by index.
DATASETS = [
    SAMPLE_DATA,
    GOLD_1,
    GOLD_FULL,
]

## Step 2: Preprocessing

In [9]:
# isLoading chooses, in the next cell, between the dataset's preprocessed
# folder (True) and its raw folders (False); data is the dataset to use
# (index 1 of DATASETS is GOLD_1).
isLoading = True
data = DATASETS[1]

In [10]:
# Create the PreProcessor for the selected dataset and fetch X, y and the
# dataframe from either the preprocessed folder or the raw folders,
# depending on isLoading.
pp = PreProcessor(data.title)
X, y, df = pp.get_data(
    data.preprocessed_folder if isLoading else data.raw_folders,
    isLoading=isLoading,
)
print("max length: ", pp.max_len)

Loading preprocessed data...
Shape of X:  (22351, 1565)
Shape of y:  (22351, 1565)
Preprocessing complete.
max length:  1565


In [12]:
# data exploration: preview the first rows of the frame returned by get_data
# (one row per sentence, with token ids, labels and their padded versions).
# NOTE(review): the preview shows document text; confirm it is safe to share
# before committing rendered output.
df.head()

Unnamed: 0.1,Unnamed: 0,docid,sentence,sentence_ids,labels,labels_ids,characters,padded_sentence,padded_labels
0,0,220-01,"['Record', 'date', ':', '2067', '-', '05', '-'...","[19194, 4609, 1268, 17723, 4063, 20430, 4063, ...","['O', 'O', 'O', 'B-DATE', 'I-DATE', 'I-DATE', ...","[1, 1, 1, 34, 12, 12, 12, 12, 1, 1, 9, 1, 1, 1...","[(3, 9), (10, 14), (14, 15), (16, 20), (20, 21...","[19194, 4609, 1268, 17723, 4063, 20430, 4063, ...","[1, 1, 1, 34, 12, 12, 12, 12, 1, 1, 9, 1, 1, 1..."
1,1,220-01,"['They', 'called', 'us', 'and', 'we', 'increas...","[18497, 19396, 5707, 11665, 13338, 19563, 323,...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(149, 153), (154, 160), (161, 163), (164, 167...","[18497, 19396, 5707, 11665, 13338, 19563, 323,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,2,220-01,"['Saw', 'Dr', 'Oakley', '4', '/', '5', '/', '6...","[10697, 1463, 16754, 6584, 20654, 8419, 20654,...","['O', 'O', 'B-DOCTOR', 'B-DATE', 'I-DATE', 'I-...","[1, 1, 3, 34, 12, 12, 12, 12, 1, 1, 1, 1, 1, 1...","[(283, 286), (287, 289), (290, 296), (297, 298...","[10697, 1463, 16754, 6584, 20654, 8419, 20654,...","[1, 1, 3, 34, 12, 12, 12, 12, 1, 1, 1, 1, 1, 1..."
3,3,220-01,"['To', 'f', '/', 'u', '7', '/', '67', '.']","[13816, 1335, 20654, 3592, 4021, 20654, 12589,...","['O', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'I-DA...","[1, 1, 1, 1, 34, 12, 12, 1]","[(356, 358), (359, 360), (360, 361), (361, 362...","[13816, 1335, 20654, 3592, 4021, 20654, 12589,...","[1, 1, 1, 1, 34, 12, 12, 1, 0, 0, 0, 0, 0, 0, ..."
4,4,220-01,"['No', 'CP', ""'"", 's', 'since', 'last', 'admit...","[9379, 3649, 13951, 14662, 11841, 17013, 4180,...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']","[1, 1, 1, 1, 1, 1, 1, 1]","[(370, 372), (373, 375), (375, 376), (376, 377...","[9379, 3649, 13951, 14662, 11841, 17013, 4180,...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."
