## Step 1: Setup (Imports and Dataset Paths)

In [8]:
# import preprocessing code
from src.preprocess import PreProcessor, df_to_train_set

In [2]:
# save paths to the available datasets
from typing import NamedTuple, List

class Dataset(NamedTuple):
    """
    Immutable record describing where one dataset lives on disk.

    Fields, in positional order: ``title``, ``preprocessed_folder``,
    ``raw_folders``.
    """
    # Short identifier of the dataset; passed to PreProcessor(...) below.
    title: str
    # Single folder path used when loading already-preprocessed data.
    preprocessed_folder: str
    # One or more folder paths used when starting from the raw data.
    raw_folders: List[str]

# Sample dataset.
# NOTE(review): unlike the other raw paths, this one does not start with
# "../data/raw/" -- confirm it resolves from the notebook's working directory.
SAMPLE_DATA = Dataset(
    title="sample_data",
    preprocessed_folder="../data/preprocessed/sample_data/",
    raw_folders=[
        "docs/Track1-de-indentification/PHI/",
    ],
)

# First gold training set only.
GOLD_1 = Dataset(
    title="gold_1",
    preprocessed_folder="../data/preprocessed/gold_1/",
    raw_folders=[
        "../data/raw/training-PHI-Gold-Set1/",
    ],
)

# Both gold training sets (Set1 and Set2) combined.
GOLD_FULL = Dataset(
    title="gold_full",
    preprocessed_folder="../data/preprocessed/gold_full/",
    raw_folders=[
        "../data/raw/training-PHI-Gold-Set1/",
        "../data/raw/training-PHI-Gold-Set2/",
    ],
)

# Ordered list of every available dataset; keep the order stable in case
# other cells select an entry by position.
DATASETS = [
    SAMPLE_DATA,
    GOLD_1,
    GOLD_FULL,
]

## Step 2: Preprocessing

In [3]:
# Choose the run configuration:
#   isLoading -- True to read the dataset's preprocessed folder, False to
#                start from its raw folders (name mirrors the `isLoading`
#                keyword of PreProcessor.get_data).
#   data      -- the dataset to work on; any entry of DATASETS can be used.
isLoading = True
data = GOLD_FULL

In [9]:
# Create the PreProcessor for the selected dataset and fetch its data.
pp = PreProcessor(data.title)

# A single get_data call covers both modes: it receives the preprocessed
# folder when isLoading is truthy, otherwise the list of raw folders.
X, y, df = pp.get_data(
    data.preprocessed_folder if isLoading else data.raw_folders,
    isLoading=isLoading,
)
print("max length: ", pp.max_len)

Loading preprocessed data...


  7% |#####                                                                   |

KeyboardInterrupt: 

In [5]:
# Data exploration: preview the first five rows of the loaded frame.
# NOTE(review): the displayed output includes "Unnamed: 0" / "Unnamed: 0.1"
# columns, which look like index columns picked up when re-reading saved
# CSV files -- confirm in src.preprocess.
df.head(5)

Unnamed: 0.1,Unnamed: 0,docid,sentence,sentence_ids,labels,labels_ids,characters,padded_sentence,padded_labels
0,0,220-01,"['Record', 'date', ':', '2067', '-', '05', '-'...","[14437, 9285, 6069, 16831, 1405, 17112, 1405, ...","['O', 'O', 'O', 'B-DATE', 'I-DATE', 'I-DATE', ...","[1, 1, 1, 37, 13, 13, 13, 13, 1, 1, 43, 1, 1, ...","[(3, 9), (10, 14), (14, 15), (16, 20), (20, 21...","[14437, 9285, 6069, 16831, 1405, 17112, 1405, ...","[1, 1, 1, 37, 13, 13, 13, 13, 1, 1, 43, 1, 1, ..."
1,1,220-01,"['They', 'called', 'us', 'and', 'we', 'increas...","[12586, 9364, 7442, 10900, 16167, 7587, 4226, ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(149, 153), (154, 160), (161, 163), (164, 167...","[12586, 9364, 7442, 10900, 16167, 7587, 4226, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,2,220-01,"['Saw', 'Dr', 'Oakley', '4', '/', '5', '/', '6...","[23627, 14084, 17965, 3434, 5236, 18813, 5236,...","['O', 'O', 'B-DOCTOR', 'B-DATE', 'I-DATE', 'I-...","[1, 1, 35, 37, 13, 13, 13, 13, 1, 1, 1, 1, 1, ...","[(283, 286), (287, 289), (290, 296), (297, 298...","[23627, 14084, 17965, 3434, 5236, 18813, 5236,...","[1, 1, 35, 37, 13, 13, 13, 13, 1, 1, 1, 1, 1, ..."
3,3,220-01,"['To', 'f', '/', 'u', '7', '/', '67', '.']","[14421, 10179, 5236, 1655, 10105, 5236, 24350,...","['O', 'O', 'O', 'O', 'B-DATE', 'I-DATE', 'I-DA...","[1, 1, 1, 1, 37, 13, 13, 1]","[(356, 358), (359, 360), (360, 361), (361, 362...","[14421, 10179, 5236, 1655, 10105, 5236, 24350,...","[1, 1, 1, 1, 37, 13, 13, 1, 0, 0, 0, 0, 0, 0, ..."
4,4,220-01,"['No', 'CP', ""'"", 's', 'since', 'last', 'admit...","[15204, 21625, 21518, 13064, 12029, 14440, 111...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']","[1, 1, 1, 1, 1, 1, 1, 1]","[(370, 372), (373, 375), (375, 376), (376, 377...","[15204, 21625, 21518, 13064, 12029, 14440, 111...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."
