## Step 1: Pre-Preprocessing (imports and dataset paths)

In [1]:
# import preprocessing code
from src.preprocess import PreProcessor, df_to_train_set

Using TensorFlow backend.


In [2]:
# save paths to the available datasets
from typing import NamedTuple, List

class Dataset(NamedTuple):
    """Named-tuple record describing where one dataset lives on disk.

    Fields, in positional order:
        title: short name identifying the dataset.
        preprocessed_folder: path of the folder for the preprocessed data.
        raw_folders: list of folder paths holding the raw input files.
    """
    title: str
    preprocessed_folder: str
    raw_folders: List[str]

# One Dataset record per available corpus. Each preprocessed folder is named
# after the dataset title so that no two datasets share a folder.

# NOTE(review): unlike the other datasets, this raw path has no "../" prefix —
# confirm it resolves from the notebook's working directory.
SAMPLE_DATA = Dataset(
    title = "sample_data",
    preprocessed_folder = "../data/preprocessed/sample_data/",
    raw_folders = ["docs/Track1-de-indentification/PHI/"]
)

GOLD_1 = Dataset(
    title = "gold_1",
    preprocessed_folder = "../data/preprocessed/gold_1/",
    raw_folders = ["../data/raw/training-PHI-Gold-Set1/"]
)

# Full training corpus: both gold training sets combined.
GOLD_FULL = Dataset(
    title = "gold_full",
    preprocessed_folder = "../data/preprocessed/gold_full/",
    raw_folders = ["../data/raw/training-PHI-Gold-Set1/","../data/raw/training-PHI-Gold-Set2/"]
)

GOLD_TEST = Dataset(
    title = "gold_test",
    # Dedicated folder, kept separate from GOLD_FULL's, so test and training
    # preprocessed files can never be mixed up or overwrite each other.
    preprocessed_folder = "../data/preprocessed/gold_test/",
    raw_folders = ["../data/raw/testing-PHI-Gold-fixed/"]
)

# Order matters: other cells pick datasets by their index in this list.
DATASETS = [SAMPLE_DATA, GOLD_1, GOLD_FULL, GOLD_TEST]

## Step 2: Preprocessing

In [7]:
# choose the train/test datasets by position in DATASETS and set the loading flag
train_data, test_data = DATASETS[2], DATASETS[3]
# False -> get_data is pointed at the raw folders; True -> at the preprocessed folder
isLoading = False

In [8]:
# build the PreProcessor for the training dataset and fetch its data
pp = PreProcessor(train_data.title)
# when loading, read from the preprocessed folder; otherwise start from the raw folders
train_source = train_data.preprocessed_folder if isLoading else train_data.raw_folders
X_train, y_train, df_train = pp.get_data(train_source, isLoading = isLoading)
print("max length: ",pp.max_len)

  0% |                                                                        |

Preprocessing data...


100% |########################################################################|


# of Tag Processing Errors:  48
Files with errors:  ['226-02.xml', '226-04.xml', '254-05.xml', '255-03.xml', '256-01.xml', '256-03.xml', '257-04.xml', '259-04.xml', '270-04.xml', '272-01.xml', '272-03.xml', '272-04.xml', '274-02.xml', '274-03.xml', '278-03.xml', '283-04.xml', '287-02.xml', '287-03.xml', '291-01.xml', '320-01.xml', '329-04.xml', '332-05.xml', '335-03.xml', '336-03.xml', '336-04.xml', '338-03.xml', '338-05.xml', '356-04.xml', '357-03.xml', '357-05.xml', '361-01.xml', '361-03.xml', '361-04.xml', '361-05.xml', '367-02.xml', '392-03.xml', '392-04.xml', '395-05.xml', '400-05.xml', '100-05.xml', '123-04.xml', '124-04.xml', '129-05.xml', '152-02.xml', '155-05.xml', '179-01.xml', '188-05.xml', '189-03.xml']
Shape of X:  (33807, 1567)
Shape of y:  (33807, 1567)
Preprocessing complete.
max length:  1567


In [9]:
# data exploration: preview the first five rows of the training frame
df_train.head(n=5)

Unnamed: 0,docid,sentence,sentence_ids,labels,labels_ids,characters,padded_sentence,padded_labels
0,220-01,"[Record, date, :, 2067, -, 05, -, 03, Narrativ...","[13111, 22704, 19385, 23427, 2069, 13297, 2069...","[O, O, O, B-DATE, I-DATE, I-DATE, I-DATE, I-DA...","[1, 1, 1, 33, 47, 47, 47, 47, 1, 1, 30, 1, 1, ...","[(3, 9), (10, 14), (14, 15), (16, 20), (20, 21...","[13111, 22704, 19385, 23427, 2069, 13297, 2069...","[1, 1, 1, 33, 47, 47, 47, 47, 1, 1, 30, 1, 1, ..."
1,220-01,"[They, called, us, and, we, increased, her, HC...","[3525, 18585, 7421, 10625, 20307, 22106, 4650,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[(149, 153), (154, 160), (161, 163), (164, 167...","[3525, 18585, 7421, 10625, 20307, 22106, 4650,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,220-01,"[Saw, Dr, Oakley, 4, /, 5, /, 67, -, she, was,...","[447, 11298, 862, 19246, 15876, 16528, 15876, ...","[O, O, B-DOCTOR, B-DATE, I-DATE, I-DATE, I-DAT...","[1, 1, 12, 33, 47, 47, 47, 47, 1, 1, 1, 1, 1, ...","[(283, 286), (287, 289), (290, 296), (297, 298...","[447, 11298, 862, 19246, 15876, 16528, 15876, ...","[1, 1, 12, 33, 47, 47, 47, 47, 1, 1, 1, 1, 1, ..."
3,220-01,"[To, f, /, u, 7, /, 67, .]","[19085, 5148, 15876, 19381, 14960, 15876, 1157...","[O, O, O, O, B-DATE, I-DATE, I-DATE, O]","[1, 1, 1, 1, 33, 47, 47, 1]","[(356, 358), (359, 360), (360, 361), (361, 362...","[19085, 5148, 15876, 19381, 14960, 15876, 1157...","[1, 1, 1, 1, 33, 47, 47, 1, 0, 0, 0, 0, 0, 0, ..."
4,220-01,"[No, CP, ', s, since, last, admit, .]","[7499, 12272, 24817, 16949, 5713, 21909, 12781...","[O, O, O, O, O, O, O, O]","[1, 1, 1, 1, 1, 1, 1, 1]","[(370, 372), (373, 375), (375, 376), (376, 377...","[7499, 12272, 24817, 16949, 5713, 21909, 12781...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ..."


In [10]:
# build the test set with the same PreProcessor that handled the training data
# NOTE(review): the error list printed by this call begins with the same 48 files
# that were reported for the training set, which suggests errors accumulate on
# the PreProcessor between calls — confirm in src.preprocess.
X_test, y_test, df_test = pp.create_test_set(
    test_data.raw_folders,
    isLoading,
    test_data.title,
)

  0% |                                                                        |

Preprocessing data...


100% |########################################################################|


# of Tag Processing Errors:  69
Files with errors:  ['226-02.xml', '226-04.xml', '254-05.xml', '255-03.xml', '256-01.xml', '256-03.xml', '257-04.xml', '259-04.xml', '270-04.xml', '272-01.xml', '272-03.xml', '272-04.xml', '274-02.xml', '274-03.xml', '278-03.xml', '283-04.xml', '287-02.xml', '287-03.xml', '291-01.xml', '320-01.xml', '329-04.xml', '332-05.xml', '335-03.xml', '336-03.xml', '336-04.xml', '338-03.xml', '338-05.xml', '356-04.xml', '357-03.xml', '357-05.xml', '361-01.xml', '361-03.xml', '361-04.xml', '361-05.xml', '367-02.xml', '392-03.xml', '392-04.xml', '395-05.xml', '400-05.xml', '100-05.xml', '123-04.xml', '124-04.xml', '129-05.xml', '152-02.xml', '155-05.xml', '179-01.xml', '188-05.xml', '189-03.xml', '130-04.xml', '198-01.xml', '200-04.xml', '210-03.xml', '212-03.xml', '214-01.xml', '217-02.xml', '232-03.xml', '260-01.xml', '262-01.xml', '312-02.xml', '313-02.xml', '316-01.xml', '319-01.xml', '319-03.xml', '319-04.xml', '319-05.xml', '342-04.xml', '373-04.xml', '381-04.x