# Phase 3 - Endpoints

In the last notebook, we combine and extend the previously developed rules to detect the interesting endpoints. We classify the extracted values in their context in order to decide which endpoint each value belongs to. We do not call the rules from the other notebooks but simply copied them, since they are extended and modified in the context of the final task. In the first cell, we develop the rules using the gold standard. In the following cells, we display the erroneous sentences, investigate the evaluation results for each document, summarize the detected endpoint information in context, and profile the developed rules.

In [None]:
%inputDir data-nlp
%outputDir ./temp/endpoint-out
%displayMode NONE
%evalTypes ORR OSMean OSTime OSRate PFSMean PFSTime PFSRate
%saveTypeSystem ./EndpointsTypeSystem.xml
%writescript ./Endpoints.ruta

TYPESYSTEM TrialsTypeSystem;
TYPESYSTEM DKProCoreTypeSystem;

// Bracket detection; we reuse the rules of Chapter 2 for brackets.
// Open/Close are the parent types of the single bracket characters,
// InBrackets is the parent type of the spans reaching from an opening
// bracket to its closing bracket (both included).

DECLARE Open, Close, InBrackets;
DECLARE InBrackets InRoundBrackets, InSquareBrackets;
DECLARE Open RoundOpen, SquareOpen;
DECLARE Close RoundClose, SquareClose;

// classify every SPECIAL token by its covered text
FOREACH(special) SPECIAL{}{
    special.ct=="("{->RoundOpen};
    special.ct==")"{->RoundClose};
    special.ct=="["{->SquareOpen};
    special.ct=="]"{->SquareClose};
}
// Round brackets: an opening bracket, at most 30 tokens that are not a closing
// bracket, and the closing bracket. Already created InRoundBrackets spans are
// filtered while the loop runs.
// NOTE(review): the second FOREACH argument (false) should reverse the iteration
// direction according to the Ruta documentation, presumably so that inner brackets
// are annotated (and thereby hidden) before the enclosing ones - confirm.
ADDFILTERTYPE(InRoundBrackets);
FOREACH(open, false) RoundOpen{}{
    (open ANY[0,30]{-PARTOF(RoundClose)} RoundClose){-> InRoundBrackets};
}
REMOVEFILTERTYPE(InRoundBrackets);
// same procedure for square brackets
ADDFILTERTYPE(InSquareBrackets);
FOREACH(open, false) SquareOpen{}{
    (open ANY[0,30]{-PARTOF(SquareClose)} SquareClose){-> InSquareBrackets};
}
REMOVEFILTERTYPE(InSquareBrackets);

// Endpoint indicators: phrases that name an endpoint (ORR, OS, PFS) or something
// else (OtherInd). EndpointIndEnum covers an enumeration of several indicators.
DECLARE EndpointInd, EndpointIndEnum;
DECLARE EndpointInd ORRInd, OSInd, PFSInd, OtherInd;
// one dictionary file per indicator type
WORDLIST orrIndList = "orr_ind.txt";
WORDLIST osIndList = "os_ind.txt";
WORDLIST pfsIndList = "pfs_ind.txt";
WORDLIST otherIndList = "other_ind.txt";

// dictionary lookup, the third argument (true) enables case-insensitive matching
MARKFAST(ORRInd, orrIndList, true);
MARKFAST(OSInd, osIndList, true);
MARKFAST(PFSInd, pfsIndList, true);
// it is sometimes easier to detect an entity correctly by also detecting
// something else additionally.
MARKFAST(OtherInd, otherIndList, true);


// this kind of dictionary lookup can create overlapping indicators, remove the smaller ones.
// (inside each indicator, every indicator that starts after at least one token is removed)
EndpointInd->{ANY epi:EndpointInd{-> UNMARK(epi)};};
// remove an indicator whose end offset equals the begin offset of the next word,
// i.e., the indicator is glued to the following word
i:EndpointInd{i.end==w.begin -> UNMARK(i)} w:W;

//median overall survival (OS)
// the first indicator is extended up to the closing bracket and the
// bracketed indicator is removed
e1:EndpointInd{-> e1.end=end.end} Open e2:EndpointInd{-> UNMARK(e2)} end:Close;
// an enumeration of indicators could influence the assignment later
((EndpointInd COMMA)* EndpointInd "and" POS_DET? @EndpointInd){-> EndpointIndEnum};

// hotfix sentences, broken char is a question mark:
// a sentence ending with a question mark is merged into the following sentence
s1:Sentence{ENDSWITH(QUESTION)} s2:@Sentence{->UNMARK(s1),s2.begin=s1.begin};

// Types for values:
//  TimeInd       - indicator of a time unit, "kind" holds the unit kind
//  NumericValue  - a number with its parsed value, interval bounds (min/max) and variance (var)
//  Unit          - the unit of a value, "kind" is e.g. "percent"
//  Value         - a NumericValue combined with its Unit
DECLARE TimeInd (STRING kind);
DECLARE NumericValue (DOUBLE value, DOUBLE min, DOUBLE max, DOUBLE var);
DECLARE Unit (STRING kind);
DECLARE Value (NumericValue value, Unit unit);

// short name for the Ruta NUM type
// NOTE(review): presumably needed because an imported type system declares another NUM - confirm
TYPE RutaNUM = org.apache.uima.ruta.type.NUM;
// receives the result of the PARSE conditions below
DOUBLE num;
// dictionary lookup of numbers: entries of column 2 are annotated and the feature
// "value" is filled from column 1
WORDTABLE NumberTable = "numbers.csv";
MARKTABLE(NumericValue, 2, NumberTable, true, 2, "", 2, "value" = 1);

BLOCK(NumericValues) Document{}{
    // whitespace is retained (made visible) for the rules of this block
    // normal numbers like 1,000.95
    ADDRETAINTYPE(WS);
    (RutaNUM{-PARTOF(NumericValue)} (COMMA RutaNUM{REGEXP("...")}) 
        (PERIOD RutaNUM)?){PARSE(num, "en")-> nv:NumericValue, nv.value=num};
    // numbers without thousands separator like 12 or 12.5
    (RutaNUM{-PARTOF(NumericValue)} (PERIOD RutaNUM)?){PARSE(num, "en")-> nv:NumericValue, nv.value=num};
    // numbers with leading period like .5
    (PERIOD{-PARTOF(NumericValue)} RutaNUM){PARSE(num, "en")-> nv:NumericValue, nv.value=num};

    // like twenty-two: both parts are words, the new value is the sum of both parts
    (nv1:NumericValue{PARTOF(W)-> UNMARK(nv1)} 
        SPECIAL.ct=="-" 
        nv2:NumericValue{PARTOF(W)-> UNMARK(nv2)}){-> nv:NumericValue, nv.value = (nv1.value+nv2.value)};
    // intervals like 39-54: both numbers are replaced by one annotation storing min and max
    (nv1:NumericValue{-> UNMARK(nv1)} SPECIAL?
        SPECIAL.ct=="-" 
        nv2:@NumericValue{-> UNMARK(nv2)}){-> new:NumericValue, new.min=nv1.value, new.max=nv2.value};
    
    // NEW: we also need to detect variance like 3+/-0.4
    // the first number stores the variance and is extended over the second number, which is removed
    (nv1:@NumericValue{-> nv1.var=nv2.value,nv1.end=nv2.end} "+/-" nv2:NumericValue{-> UNMARK(nv2)});
    
    REMOVERETAINTYPE(WS);
}

// indicators for durations like months
// dictionary lookup on column 1 of the table, the feature "kind" is filled from column 2
WORDTABLE TimeIndTable = "time_ind.csv";
MARKTABLE(TimeInd, 1, TimeIndTable, "kind"=2);

// something that could hint at an arm
DECLARE ArmInd;
// we should probably refactor this to a dictionary
// the word "arm" followed by the word a, b or c (both matched case-insensitively),
// an optional number and an optional colon
(W{REGEXP("arm", true)} W{REGEXP("[abc]", true)} RutaNUM? COLON?){-> ArmInd};



// indicators that could be useful
DECLARE VSInd,CIInd;
// we could add a wordlist dictionary, but for now we simply classify the words
// "v", "vs" or "versus", optionally followed by a period
(W{REGEXP("v|vs|versus")} PERIOD?){-> VSInd};
// confidence interval indicator
(W{REGEXP("CI")}){-> CIInd};
    
// annotate the actual Value (also within brackets)
// a Value spans the number and its unit; the Unit annotation is created on the unit token
// 10%
(nv:NumericValue SPECIAL.ct=="%"{-> u:Unit,u.kind="percent"}){-> v:Value, v.value=nv, v.unit=u};
// 12 months (also 12-months), the kind of the unit is taken from the time indicator
(nv:NumericValue SPECIAL.ct=="-"? ti:TimeInd{-> u:Unit,u.kind=ti.kind}){-> v:Value, v.value=nv, v.unit=u};

// from here on, bracketed expressions are filtered (skipped by the rules)
ADDFILTERTYPE(InBrackets);

// again ignoring brackets, only for numbers that are not yet part of a Value
// 10 (...) months
(nv:NumericValue{-PARTOF(Value)} SPECIAL.ct=="%"{-> u:Unit,u.kind="percent"}){-> v:Value, v.value=nv, v.unit=u};
(nv:NumericValue{-PARTOF(Value)} SPECIAL.ct=="-"? ti:TimeInd{-> u:Unit,u.kind=ti.kind}){-> v:Value, v.value=nv, v.unit=u};


// chunks that could be an arm indicator:
// an adposition other than "in" plus one or two words that are no time indicator,
// conjunction or number, either directly after a Value ...
Value (POS_ADP{-REGEXP("in")} W[1,2]{-PARTOF(TimeInd),-PARTOF(POS_CONJ),-PARTOF(NumericValue)}){-> ArmInd};
// ... or in front of a conjunction that is followed by a Value
(POS_ADP{-REGEXP("in")} W[1,2]{-PARTOF(TimeInd),-PARTOF(POS_CONJ),-PARTOF(NumericValue)}){-> ArmInd} POS_CONJ @Value;

// now some additional logic for combined mentions
// ValueEnum spans several values that are mentioned together. Numbers without an own
// unit become a Value that reuses the Unit of the Value at the end of the combination.
DECLARE ValueEnum;

// first pass: bracketed expressions are visible
REMOVEFILTERTYPE(InBrackets);
// 25 vs. 8%
(nv1:NumericValue{-PARTOF(Value)-> v:Value, v.value=nv1, v.unit=v2.unit}
    VSInd v2:Value){-> ValueEnum};
// 2, 3, and 4 months
((NumericValue{-PARTOF(Value) -> v:Value, Value.value=NumericValue, Value.unit=v2.unit} COMMA?)+ 
    W{REGEXP("and")} v2:@Value){->ValueEnum};
// 2- and 3 months
((nv1:NumericValue{-PARTOF(Value)} SPECIAL.ct=="-"?){-> v:Value, v.value=nv1, v.unit=v2.unit}
    W{REGEXP("and")} v2:Value){->ValueEnum};

// again, ignoring brackets
// second pass: same three patterns, but bracketed expressions are skipped and
// numbers that are already part of a ValueEnum are excluded
ADDFILTERTYPE(InBrackets);
(nv1:NumericValue{-PARTOF(Value),-PARTOF(ValueEnum)-> v:Value, v.value=nv1, v.unit=v2.unit}
    VSInd v2:Value){-> ValueEnum};
// 2, 3, and 4 months
((NumericValue{-PARTOF(Value),-PARTOF(ValueEnum) -> v:Value, Value.value=NumericValue, Value.unit=v2.unit} COMMA?)+ 
    W{REGEXP("and")} v2:@Value){->ValueEnum};
// 2- and 3 months
((nv1:NumericValue{-PARTOF(Value),-PARTOF(ValueEnum)} SPECIAL.ct=="-"?){-> v:Value, v.value=nv1, v.unit=v2.unit}
    W{REGEXP("and")} v2:Value){->ValueEnum};
REMOVEFILTERTYPE(InBrackets);

// no unit? like "was 0.89"
// a number between 0 and 1 (exclusive) directly after "was" becomes a Value with unit kind "percent"
W{REGEXP("was")} nv:@NumericValue{-PARTOF(Value), nv.value > 0, nv.value < 1 
    -> u:Unit, u.kind="percent", v:Value, v.value=nv, v.unit=u};

// normal value enum with and without brackets
((Value COMMA)* Value COMMA? "and" @Value{-PARTOF(ValueEnum)}){-> ValueEnum};
// InBrackets stays filtered until the reset at the end of this section
ADDFILTERTYPE(InBrackets);
((Value COMMA)* Value COMMA? "and" @Value{-PARTOF(ValueEnum)}){-> ValueEnum};

// even more distant combinations
// arm indicators, commas and conjunctions are skipped in addition to brackets, so that a
// unit-less number next to a Value (before or after it) inherits the unit of that Value
ADDFILTERTYPE(ArmInd,COMMA,POS_CONJ);
v:Value nv:NumericValue{-PARTOF(Value)-> new:Value, new.value=nv, new.unit=v.unit};
nv:NumericValue{-PARTOF(Value)-> new:Value, new.value=nv, new.unit=v.unit} v:@Value ;

// reset filtering
REMOVEFILTERTYPE(InBrackets,ArmInd,COMMA,POS_CONJ);

// some clean up of false positives
// dictionaries with phrases that indicate that a neighboring Value is not interesting
DECLARE NoValuePrefixInd, NoValueSuffixInd;
WORDLIST noValuePrefixList = "no_value_prefix.txt";
MARKFAST(NoValuePrefixInd,noValuePrefixList,true);
WORDLIST noValueSuffixList = "no_value_suffix.txt";
MARKFAST(NoValueSuffixInd,noValueSuffixList,true);

// remove a Value directly followed by a suffix indicator or directly preceded by a prefix indicator
v:Value{-> UNMARK(v)} NoValueSuffixInd;
NoValuePrefixInd v:@Value{-> UNMARK(v)};

// remove a Value directly in front of a confidence interval indicator (like "95% CI") ...
v:Value{-> UNMARK(v)} CIInd;
// ... and a Value after the indicator if it is followed by a SPECIAL token
// NOTE(review): PM is not declared in this cell, presumably it is provided by an imported type system - confirm
CIInd PM? v:Value{-> UNMARK(v)} SPECIAL;


// also extend enum including value
//PFS, OS, and 1-year survival
((EndpointInd COMMA)* EndpointInd COMMA? "and" Value @EndpointInd){-> EndpointIndEnum};

// now the endpoints

// just two helper types for easier rules:
// Percentage is a Value with unit kind "percent", Duration is a Value with any other unit kind
DECLARE Percentage, Duration;
v:Value{v.unit.kind=="percent" -> Percentage};
v:Value{v.unit.kind!="percent" -> Duration};

DECLARE InCIBracket;
// brackets that define some confidence interval:
// the bracket contains a CI indicator and does not contain 2 to 100 InBrackets annotations
// NOTE(review): presumably the bracket itself counts as one, so this excludes brackets
// with nested brackets - confirm
InBrackets{CONTAINS(CIInd),-CONTAINS(InBrackets,2,100)-> InCIBracket};

//(95% CI, 6.7 to 11.0 months)
// within CI brackets that contain one to three Values: remove the Values that are no
// Duration and the first Value found after the CI indicator
InCIBracket{CONTAINS(Value,1,3)}->{
    v:Value{-PARTOF(Duration)-> UNMARK(v)};
    CIInd # v:@Value{-> UNMARK(v)};
    };



// things that can be ignored concerning the sequential patterns
// phrases of the dictionary are marked as IgnoredInd and then as Ignored (only once per position)
DECLARE Ignored, IgnoredInd;
WORDLIST ignoredList = "ignored_ind.txt";
MARKFAST(IgnoredInd,ignoredList,true);
IgnoredInd{-PARTOF(Ignored)-> Ignored};

// Ignored spans are skipped by all following rules until the filter is removed again
ADDFILTERTYPE(Ignored);


// Writing the rules for each endpoint separately, each including all sequential
// patterns, would lead to too many unclear rules. Therefore the sequential patterns
// are separated from the semantics by an additional construct "Endpoint": a relation
// that combines the indicator with potentially optional information (Values).
DECLARE Endpoint (
    EndpointInd indicator,
    Value mean,
    Value time,
    Value rate
);

// Macro actions that shorten the feature assignments in the pattern rules.
// Each macro creates one Endpoint and sets only the features it names.

// indicator + rate
ACTION Rate(ANNOTATION indicatorAnno, ANNOTATION rateAnno)
    = CREATE(Endpoint, "indicator" = indicatorAnno, "rate" = rateAnno);

// indicator + time
ACTION Time(ANNOTATION indicatorAnno, ANNOTATION timeAnno)
    = CREATE(Endpoint, "indicator" = indicatorAnno, "time" = timeAnno);

// indicator + mean
ACTION Mean(ANNOTATION indicatorAnno, ANNOTATION meanAnno)
    = CREATE(Endpoint, "indicator" = indicatorAnno, "mean" = meanAnno);

// indicator + mean + time + rate
ACTION EP(ANNOTATION indicatorAnno, ANNOTATION meanAnno, ANNOTATION timeAnno, ANNOTATION rateAnno)
    = CREATE(Endpoint,
        "indicator" = indicatorAnno,
        "mean" = meanAnno,
        "time" = timeAnno,
        "rate" = rateAnno);

// we define different stages of sequential patterns, from more specific to more general/simpler examples

// It is sometimes really helpful for the maintainability of the rules to add a representative
// example as a comment where/why the rule should be applied.


// combinations with enums can get complicated
// we start with the simple rules, enums of 2 or 3 should be enough

//The 2-year post-ASCT OS (67% PMLCL vs. 53%, p = 0.78) and PFS (57% PMLCL vs. 36%, p = 0.64)
// one duration is shared as time by two indicators; each indicator takes the first
// unassigned percentage of its own bracket as rate
d:Value{-PARTOF(Endpoint),PARTOF(Duration)}
    ANY[0,5]{-PARTOF(Value),-PARTOF(EndpointInd)}
    e1:EndpointInd{->Time(e1,d)} 
    InBrackets->{v1:@Value{-PARTOF(Endpoint),PARTOF(Percentage)->Rate(e1,v1)};}
    POS_CONJ
    e2:EndpointInd{->Time(e2,d)} 
    InBrackets->{v2:@Value{-PARTOF(Endpoint),PARTOF(Percentage)->Rate(e2,v2)};};

//The 1-year PFS and OS rates were 93% and 100%
// a duration followed by an enum of exactly two indicators and a value enum:
// both indicators get the duration as time and one percentage each as rate
d:Value{-PARTOF(Endpoint),PARTOF(Duration)}
    ANY[0,2]{-PARTOF(Value),-PARTOF(EndpointInd)}
    (@EndpointIndEnum{CONTAINS(EndpointInd,2,2)} ValueEnum{-CONTAINS(Endpoint)}) ->{
        e1:EndpointInd{->Time(e1,d)} # e2:EndpointInd{->Time(e2,d)} # 
        v1:@Value{-PARTOF(Endpoint),PARTOF(Percentage)->Rate(e1,v1)} #
        v2:Value{-PARTOF(Endpoint),PARTOF(Percentage)->Rate(e2,v2)};
    };


// Median PFS and median OS were 3.1 and 13.8 months
// an indicator enum directly followed by a value enum without endpoints: the values are
// assigned as mean to the indicators in order, first for enums of three, then for enums of two
(EndpointIndEnum{CONTAINS(EndpointInd,3,3)} ValueEnum{-CONTAINS(Endpoint)}) ->{
    e1:EndpointInd # e2:EndpointInd # e3:EndpointInd # 
        v1:@Value{-PARTOF(Endpoint)->Mean(e1,v1)} #
        v2:Value{-PARTOF(Endpoint)->Mean(e2,v2)} #
        v3:Value{-PARTOF(Endpoint)->Mean(e3,v3)};
};
(EndpointIndEnum{CONTAINS(EndpointInd,2,2)} ValueEnum{-CONTAINS(Endpoint)}) ->{
    e1:EndpointInd # e2:EndpointInd # 
        v1:@Value{-PARTOF(Endpoint)->Mean(e1,v1)} #
        v2:Value{-PARTOF(Endpoint)->Mean(e2,v2)};
};

//the 1-year, 3-year, and 5-year survival rate was 89.2%, 50.9% and 27.5%
// one indicator (no ORR indicator) between an enum of durations and an enum of percentages:
// every unassigned duration becomes a time and every unassigned percentage a rate of the indicator
ValueEnum->{d:Value{PARTOF(Duration),-PARTOF(Endpoint)-> Time(i,d)};}
    i:@EndpointInd{-PARTOF(ORRInd)}
    ANY[0,2]{-PARTOF(Value),-PARTOF(EndpointInd)}
    ValueEnum->{p:Value{PARTOF(Percentage),-PARTOF(Endpoint)-> Rate(i,p)};};
//The 2-year post-ASCT OS (67% PMLCL vs. 53%, p = 0.78)
// A duration, up to three filler tokens, a non-ORR indicator and a bracket that is no
// confidence interval: the duration becomes the time of the indicator and every
// unassigned percentage in the bracket becomes a rate of the indicator.
// The filler tokens must not be part of a Value, an indicator or a closing bracket.
// The closing bracket condition was not negated before, which restricted the filler to
// closing brackets only, so that the example above ("post-ASCT") could not be matched.
d:Value{PARTOF(Duration)-> Time(i,d)}
    ANY[0,3]{-PARTOF(Value),-PARTOF(EndpointInd),-PARTOF(Close)}
    i:@EndpointInd{-PARTOF(ORRInd)}
    InBrackets{-PARTOF(InCIBracket)}->{p:Value{PARTOF(Percentage),-PARTOF(Endpoint)-> Rate(i,p)};};
// Patterns combining a rate (Percentage) and a time (Duration) with one indicator that is
// no ORR indicator. The element "_{-PARTOF(EndpointInd)}" requires that the neighboring
// position is not part of an indicator.

//The one-year survival rate was 55.8%
_{-PARTOF(EndpointInd)}
    d:Value{PARTOF(Duration),-PARTOF(Endpoint)-> Time(i,d)} 
    i:@EndpointInd{-PARTOF(ORRInd)}
    COMMA?
    p:Value{PARTOF(Percentage),-PARTOF(Endpoint)-> Rate(i,p)};
//a 57.6% overall survival (OS) at 62 months
_{-PARTOF(EndpointInd)}
    p:Value{PARTOF(Percentage),-PARTOF(Endpoint)-> Rate(i,p)}
    i:@EndpointInd{-PARTOF(ORRInd)}
    COMMA?
    d:Value{PARTOF(Duration),-PARTOF(Endpoint)-> Time(i,d)};
//a progression free survival (PFS) of 42% at 74 months
// up to two filler tokens are allowed in front of each value
i:@EndpointInd {-PARTOF(ORRInd)}
    ANY[0,2]{-PARTOF(Value),-PARTOF(EndpointInd)}
    p:Value{PARTOF(Percentage),-PARTOF(Endpoint)-> Rate(i,p)}
    ANY[0,2]{-PARTOF(Value),-PARTOF(EndpointInd)}
    d:Value{PARTOF(Duration),-PARTOF(Endpoint)-> Time(i,d)}
     _{-PARTOF(EndpointInd)};
//a progression free survival (PFS) at 74 months of 42%
i:@EndpointInd{-PARTOF(ORRInd)}
    ANY[0,2]{-PARTOF(Value),-PARTOF(EndpointInd)}
    d:Value{PARTOF(Duration),-PARTOF(Endpoint)-> Time(i,d)}
    ANY[0,2]{-PARTOF(Value),-PARTOF(EndpointInd)}
    p:Value{PARTOF(Percentage),-PARTOF(Endpoint)-> Rate(i,p)}
    _{-PARTOF(EndpointInd)};
    
// Simple patterns with one indicator that is not part of an indicator enum and one value
// that is not yet assigned to an Endpoint.

// ORR (30% vs. 40%)
// the value is searched within the bracket (no confidence interval) following the indicator
i:EndpointInd{-PARTOF(EndpointIndEnum)} 
    InBrackets{-PARTOF(InCIBracket)}->{_{-PARTOF(EndpointInd)} COMMA? v:@Value{-PARTOF(Endpoint)-> Mean(i,v)};};
// ORR 33%
i:EndpointInd{-PARTOF(EndpointIndEnum)}
    v:Value{-PARTOF(Endpoint)-> Mean(i,v)};
// 2-year OS
// a duration that contains a SPECIAL token followed by a time indicator is the time of the endpoint
v:Value{-PARTOF(Endpoint),PARTOF(Duration)-> Time(i,v)}<-{SPECIAL TimeInd;}
    i:@EndpointInd{-PARTOF(EndpointIndEnum)};
// the 30% ORR
v:Value{-PARTOF(Endpoint)-> Mean(i,v)} i:@EndpointInd{-PARTOF(EndpointIndEnum)};
// ORR bla 33%
// up to two filler tokens between the indicator and the value
i:EndpointInd{-PARTOF(EndpointIndEnum)}
    ANY[0,2]{-PARTOF(Value),-PARTOF(EndpointInd)}
    v:Value{-PARTOF(Endpoint)-> Mean(i,v)};

// Ignored spans are visible again for the fallback rules
REMOVEFILTERTYPE(Ignored);
// fallbacks within sentences
// the following rules are applied within each sentence that contains an indicator and
// bridge arbitrary many tokens (ANY+) that are not part of an Endpoint or an indicator
Sentence{CONTAINS(EndpointInd)}->{
    
    // an Endpoint with a time, then a non-ORR indicator and later a bracket with a "vs":
    // the first unassigned value of the bracket is a rate
    ep:Endpoint.time!=null
        i:EndpointInd{-PARTOF(EndpointIndEnum),-PARTOF(ORRInd)}
        ANY+{-PARTOF(Endpoint),-PARTOF(EndpointInd)} 
        @InBrackets{CONTAINS(VSInd)}->{
            v:@Value{-PARTOF(Endpoint)-> Rate(i,v)};
        };
    // an indicator and later a bracket with a "vs": the first unassigned value of the bracket is a mean
    i:EndpointInd{-PARTOF(EndpointIndEnum)}
        ANY+{-PARTOF(Endpoint),-PARTOF(EndpointInd)} 
        @InBrackets{CONTAINS(VSInd)}->{
            v:@Value{-PARTOF(Endpoint)-> Mean(i,v)};
        };
    // an indicator and a later unassigned value outside of brackets: the value is a mean
    i:EndpointInd{-PARTOF(EndpointIndEnum)}
        ANY+{-PARTOF(Endpoint),-PARTOF(EndpointInd)} 
        v:@Value{-PARTOF(Endpoint),-PARTOF(InBrackets)-> Mean(i,v)};
    
    // enum projection
    //The CR and ORR reached 47% and 88% in the CC arm and 46% and 82% in
    // a value enum with exactly two Endpoints passes its indicators to the values
    // of a later value enum that contains no Endpoint yet
    ValueEnum{CONTAINS(Endpoint,2,2)}<-{ep1:Endpoint # ep2:Endpoint;}
        ANY+{-PARTOF(Endpoint)} 
        @ValueEnum{-CONTAINS(Endpoint)}->{
            v1:@Value{-> Mean(ep1.indicator,v1)} # v2:@Value{-> Mean(ep2.indicator,v2)};
        };
    // a later unassigned value outside of brackets is assigned to the indicator of a previous
    // Endpoint: as mean if that Endpoint has a mean, as rate if it has a rate
    ep:Endpoint
        ANY+{-PARTOF(EndpointInd),-PARTOF(Endpoint)} 
        v:@Value{-PARTOF(Endpoint),-PARTOF(InBrackets)}->{
            v{ep.mean!=null-> Mean(ep.indicator,v)};
            v{ep.rate!=null-> Rate(ep.indicator,v)};
        }
        _{-PARTOF(EndpointInd)};
    //ORR was 28% (41% and 19% in patients...
    // the values of an enum right after the opening bracket are means of the preceding Endpoint's indicator
    ep:Endpoint{ep.mean!=null} Open @ValueEnum->{v:Value{-> Mean(ep.indicator,v)};};
    
    // last fallback: an indicator and a later value that is neither assigned nor part of a value enum
    i:EndpointInd{-PARTOF(EndpointIndEnum)}
        ANY+{-PARTOF(Endpoint),-PARTOF(EndpointInd)} 
        v:@Value{-PARTOF(Endpoint),-PARTOF(ValueEnum)-> Mean(i,v)};
    
};

// now we create the actual endpoint annotations based on the relation
// For each Endpoint, the type of its indicator selects the annotations to create. The
// annotation is created on the span of the referenced value (mean, time or rate) and its
// feature "name" is set to a fixed label.
// ORR indicators only use the mean; Endpoints with other indicator types (like OtherInd)
// create nothing.
// NOTE(review): ORR, OSMean, OSTime, OSRate, PFSMean, PFSTime and PFSRate are not declared
// in this cell, presumably they are provided by TrialsTypeSystem - confirm
FOREACH(ep) Endpoint{}{
    ep.indicator.type==ORRInd->{ep.mean{->e:ORR, e.name="ORR"};};
    ep.indicator.type==OSInd-> {
        ep.mean{->e:OSMean, e.name="OS Mean"};
        ep.time{->e:OSTime, e.name="OS Time"};
        ep.rate{->e:OSRate, e.name="OS Rate"};
    };
    ep.indicator.type==PFSInd-> {
        ep.mean{->e:PFSMean, e.name="PFS Mean"};
        ep.time{->e:PFSTime, e.name="PFS Time"};
        ep.rate{->e:PFSRate, e.name="PFS Rate"};
    };
}



Next, we take a look at the sentences that contain errors, i.e., sentences where the rules above do not yet extract the endpoint information correctly. There are still many errors that could be fixed with some simple rule modifications, but the current state shall suffice for now.

In [None]:
%resetCas
%inputDir ./temp/endpoint-out
%outputDir ./temp/trash
%displayMode CSV
%csvConfig SentenceWithError

// Marks every sentence that contains at least one evaluation error,
// i.e., a false positive or a false negative.
DECLARE SentenceWithError;
Sentence{
    OR(
        CONTAINS(FalsePositive),
        CONTAINS(FalseNegative)
    )
    -> SentenceWithError
};

// Highlighting: evaluation results first, then the indicators.
COLOR(FalseNegative, "pink");
COLOR(FalsePositive, "lightblue");
COLOR(TruePositive, "lightgreen");
COLOR(EndpointInd, "#FFFFC0");
// optional highlighting of values, currently disabled
//COLOR(Value, "#D0FFF0");

In the next cell, we investigate the evaluation results for the documents.

In [None]:
%resetCas
%inputDir ./temp/endpoint-out
%outputDir ./temp/trash
%displayMode EVALUATION


Finally, we display and store the extracted information about the endpoints.

In [None]:
%resetCas
%inputDir ./temp/endpoint-out
%outputDir ./temp/trash
%displayMode CSV
%csvConfig SentenceExample kind value 

TYPESYSTEM TrialsTypeSystem;
TYPESYSTEM DKProCoreTypeSystem;
TYPESYSTEM EndpointsTypeSystem;

// Types for the summary:
//  Mention         - one extracted endpoint value, "kind" is the name of the endpoint
//                    annotation and "value" the covered text of the value
//  SentenceExample - the sentence around a Mention, carrying the same kind and value
//  Indicator       - the span of the indicator of an Endpoint (for highlighting)
DECLARE SentenceExample(STRING value, STRING kind);
DECLARE Mention(STRING value, STRING kind);
DECLARE Indicator;

// An Endpoint becomes a Mention if a TrialsEntity is found within it and if one of its
// features mean, rate or time is set. The indicator of the Endpoint is marked as Indicator.
// NOTE(review): TrialsEntity is not declared in this cell, presumably it is the parent type
// of the endpoint annotations in TrialsTypeSystem - confirm
e:Endpoint{-> m:Mention, m.kind=entity.name, m.value=v.ct}
    <-{entity:TrialsEntity;}
    <-{v:e.mean!=null;v:e.rate!=null;v:e.time!=null;}
    ->{e.indicator{->Indicator};};

// within each sentence: the span from the start of the sentence over the Mention to the
// end of the sentence becomes a SentenceExample for that Mention
BLOCK(sentence) Sentence{}{
    (# m:@Mention #){->se:SentenceExample, se.kind=m.kind, se.value=m.value};
}

COLOR(Mention, "lightgreen");
COLOR(Indicator, "yellow");