In [5]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction import DictVectorizer
import pickle as pkl
import numpy as np
import copy

## data load and parameter tuning

In [154]:
'''
7	-76	-12	CHEMOTHERAPY	HODGKIN DISEASE	Drug	Indication	for	admission for	Observation admission for	encounter	encounter .	encounter . History	with	old with	year old with	and	and treatment	and treatment with
'''

def load_RF_data(file, train=True):
    """Read a tab-separated feature file and encode its token columns as ids.

    The token-to-id dictionary is unpickled from a sibling file whose name is
    the part of ``file`` before its first "." followed by
    "_features_dict.pkl".

    Every non-empty line is read as: an integer label (only when ``train`` is
    true), two numeric columns, then any number of token columns.  A token
    that is missing from the dictionary is encoded as 0.

    Args:
        file: Path of the data file.
        train: When true, column 0 is parsed as the label and the numeric
            columns start at column 1; otherwise they start at column 0.

    Returns:
        ``(measures, y)`` where ``measures`` holds one
        ``[float, float, id, id, ...]`` list per line and ``y`` holds the
        integer labels (an empty list when ``train`` is false).
    """
    # NOTE(review): everything after the FIRST "." is dropped, so a path with
    # a dot in a directory name (e.g. "./data/x.rf") resolves to an unexpected
    # dictionary file. Kept as-is because existing file names rely on it.
    pkl_file = file.split(".")[0] + "_features_dict.pkl"
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(pkl_file, "rb") as f:
        t2i = pkl.load(f)

    # Training rows carry the label in column 0, shifting the rest by one.
    offset = 1 if train else 0
    measures = []
    y = []
    with open(file, "r") as f:
        for line in f:
            # Strip only the line terminator. Slicing off the last character
            # unconditionally would truncate the final token of a file that
            # does not end with a newline.
            line = line.rstrip("\n")
            if not line:
                # Tolerate blank lines (typically a trailing one).
                continue
            info = line.split("\t")
            if train:
                y.append(int(info[0]))
            row = [float(info[offset]), float(info[offset + 1])]
            row.extend(t2i.get(token, 0) for token in info[offset + 2:])
            measures.append(row)
    return measures, y

In [155]:
def my_eval(y_true, y_pred):
    """Pooled precision, recall and F-score with label 0 treated as "no relation".

    Counting rules, per (gold, predicted) pair:
      * predicted != 0 and equal to gold   -> true positive
      * predicted != 0 and differs         -> false positive
      * predicted == 0 and gold != 0       -> false negative

    NOTE(review): a wrong non-zero prediction on a non-zero gold label is
    counted as a false positive only, never as a false negative, so recall is
    computed over fewer items than there are gold positives. Confirm this is
    the intended scoring rule.

    Returns:
        ``(precision, recall, fscore)``; each value is the integer 0 when its
        denominator is not positive.

    Raises:
        AssertionError: if the two sequences differ in length.
    """
    assert len(y_true) == len(y_pred), "Input data must have same length"

    hits = 0      # tp
    spurious = 0  # fp
    missed = 0    # fn
    for gold, guess in zip(y_true, y_pred):
        if guess != 0:
            if guess == gold:
                hits += 1
            else:
                spurious += 1
        elif gold != guess:
            missed += 1

    predicted_total = hits + spurious
    gold_total = hits + missed
    precision = float(hits) / predicted_total if predicted_total > 0 else 0
    recall = float(hits) / gold_total if gold_total > 0 else 0

    combined = precision + recall
    fscore = (2 * precision * recall) / combined if combined > 0 else 0
    return precision, recall, fscore

In [162]:
def my_eval_detailed(y_true, y_pred):
    """Per-label precision, recall and F-score.

    Only labels that occur in ``y_true`` are reported; a label that is
    predicted but never appears in the gold data is left out of the result.

    Args:
        y_true: Gold labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        dict mapping each gold label to ``[precision, recall, f_score]``.
        Precision is 0.0 for a label that was never predicted, and the
        F-score is 0.0 when precision and recall are both 0.

    Raises:
        AssertionError: if the two sequences differ in length.
    """
    assert len(y_true) == len(y_pred), "Input data must have same length"
    gold_count = dict()  # label -> number of gold occurrences
    pred_stats = dict()  # label -> [times predicted, times predicted correctly]
    for t, p in zip(y_true, y_pred):
        gold_count[t] = gold_count.get(t, 0) + 1
        stats = pred_stats.setdefault(p, [0, 0])
        stats[0] += 1
        if p == t:
            stats[1] += 1

    res = dict()
    for label, n_gold in gold_count.items():
        # A gold label the classifier never emitted has no prediction stats;
        # indexing the dict directly would raise KeyError.
        n_pred, n_hit = pred_stats.get(label, (0, 0))
        precision = float(n_hit) / n_pred if n_pred else 0.0
        recall = float(n_hit) / n_gold
        # Guard the harmonic mean: 0/0 when the label was never predicted correctly.
        denom = precision + recall
        f_score = (2 * precision * recall) / denom if denom > 0 else 0.0
        res[label] = [precision, recall, f_score]
    return res

In [163]:
def RF_cv(data, **param):
    """Score one random-forest configuration on five shuffled 80/20 splits.

    A ``RandomForestClassifier`` built from ``param`` (plus ``n_jobs=-1``) is
    refitted on the training part of every ``ShuffleSplit`` fold (seed 13).
    The held-out gold labels and predictions of all folds are pooled and then
    scored once with ``my_eval`` and ``my_eval_detailed``.

    Args:
        data: Indexable collection of ``(features, label)`` pairs.
        **param: Keyword arguments forwarded to ``RandomForestClassifier``.

    Returns:
        ``(summary, per_label)`` where ``summary`` is
        ``{"param": "<name: value ...>", "score": <pooled F-score>}`` and
        ``per_label`` is the result of ``my_eval_detailed``.
    """
    print(len(data))
    forest = RandomForestClassifier(**param, n_jobs=-1)
    splitter = ShuffleSplit(n_splits=5, test_size=.2, random_state=13)

    gold = []
    guessed = []
    for fit_idx, held_idx in splitter.split(data):
        train_X = [data[i][0] for i in fit_idx]
        train_y = [data[i][1] for i in fit_idx]
        forest.fit(train_X, train_y)

        held_X = [data[i][0] for i in held_idx]
        gold.extend(data[i][1] for i in held_idx)
        guessed.extend(forest.predict(held_X))

    precision, recall, fscore = my_eval(gold, guessed)
    per_label = my_eval_detailed(gold, guessed)
    print(precision, recall, fscore)

    described = " ".join(
        "{}: {}".format(name, value) for name, value in param.items()
    )
    summary = {"param": described, "score": fscore}
    return summary, per_label

### within sentence

In [293]:
within_file = "SVM_training_data/umass18SVM_within_sent.svm"

In [134]:
def get_feature_num(file):
    """Return the number of entries in the feature dictionary that belongs to ``file``.

    The dictionary is unpickled from the part of ``file`` before its first
    "." followed by "_features_dict.pkl".
    """
    stem = file.split(".")[0]
    with open(stem + "_features_dict.pkl", "rb") as handle:
        return len(pkl.load(handle))

In [294]:
# Load the within-sentence training file: X holds the encoded feature rows,
# y the integer labels.
X, y = load_RF_data("RF_data/umassRF_training_within.rf")

# Pair every feature row with its label; this is the layout RF_cv indexes into.
n_within_data = list(zip(X, y))

# Sanity check: print any example (and its position) whose feature row is not
# exactly 18 values wide.
for i, each in enumerate(n_within_data):
    if len(each[0]) != 18:
        print(each, i)

# Show the first (features, label) pair.
n_within_data[0]

([-27.0, -8.0, 3, 4, 5, 6, 0, 0, 0, 7, 8, 9, 7, 10, 11, 12, 13, 14], 7)

In [295]:
len(n_within_data)

64783

In [47]:
rs = ShuffleSplit(n_splits=5, test_size=.2, random_state=1)
rs.get_n_splits(n_within_data)

5

In [None]:
# 'max_features':['sqrt', 'log2', 10],
#               'max_depth':[5, 7, 9]}

In [296]:
# Grid search on the within-sentence data over forest size, feature
# subsampling rule and split criterion. Each combination is scored by RF_cv;
# the summary and per-label results of the best pooled F-score are kept.
# The comparison is strict, so on a tie the earlier combination wins.
# NOTE(review): the recorded outputs for 'sqrt' and 'log2' are identical —
# presumably both resolve to the same feature count for 18-wide rows; confirm
# before paying for both.
best_f1_within = {"param":"", "score":0}
res_f_within = None

for ne in [250, 500, 750, 1000, 1250, 1500]:
    for mf in ['sqrt', 'log2', None]:
        for ct in ['gini', 'entropy']:
            # min_samples_split is pinned to a single value (2).
            for mss in [2]:
                bf, res = RF_cv(n_within_data, n_estimators=ne, max_features=mf, random_state=13, min_samples_split=mss, criterion=ct)
                print(bf)
                if bf['score'] > best_f1_within['score']:
                    best_f1_within = bf
                    res_f_within = res

64783
0.949672820130872 0.9442975031739315 0.9469775337524204
{'param': 'n_estimators: 250 max_features: sqrt random_state: 13 min_samples_split: 2 criterion: gini', 'score': 0.9469775337524204}
64783
0.9492458322307489 0.9487437185929648 0.948994708994709
{'param': 'n_estimators: 250 max_features: sqrt random_state: 13 min_samples_split: 2 criterion: entropy', 'score': 0.948994708994709}
64783
0.949672820130872 0.9442975031739315 0.9469775337524204
{'param': 'n_estimators: 250 max_features: log2 random_state: 13 min_samples_split: 2 criterion: gini', 'score': 0.9469775337524204}
64783
0.9492458322307489 0.9487437185929648 0.948994708994709
{'param': 'n_estimators: 250 max_features: log2 random_state: 13 min_samples_split: 2 criterion: entropy', 'score': 0.948994708994709}
64783
0.939138625778429 0.9493228946254761 0.9442032988714386
{'param': 'n_estimators: 250 max_features: None random_state: 13 min_samples_split: 2 criterion: gini', 'score': 0.9442032988714386}
64783
0.9401375429821

In [297]:
best_f1_within

{'param': 'n_estimators: 1250 max_features: sqrt random_state: 13 min_samples_split: 2 criterion: entropy',
 'score': 0.949563838223632}

In [298]:
res_f_within

{0: [0.979415612734409, 0.9790100045772576, 0.9792127666532227],
 7: [0.9675421738202007, 0.9630180658873538, 0.9652748189177673],
 5: [0.9614688388931989, 0.9315960912052117, 0.94629676762535],
 1: [0.9632974075152927, 0.9505605058924979, 0.9568865740740741],
 2: [0.9461626575028637, 0.9694835680751174, 0.9576811594202899],
 4: [0.8880503144654088, 0.9417518897287683, 0.9141130772550713],
 3: [0.9634869922409859, 0.9432529043789097, 0.9532625874915331],
 6: [0.9042196918955124, 0.9698275862068966, 0.9358752166377816]}

In [309]:
with open("features_labels/umass18SVM_within_sent_index2label.pkl", "rb") as f:
    x = pkl.load(f)
print(x)

{0: 'NEGATIVE', 1: 'severity_type', 2: 'du', 3: 'manner/route', 4: 'reason', 5: 'fr', 6: 'adverse', 7: 'do'}


### cross data

In [299]:
cross_file = "RF_data/umassRF_training_cross1.rf"
X, y = load_RF_data(cross_file)
len(X)

31406

In [None]:
print(X[:3])

In [300]:
# Pair every feature row with its label. X and y are whatever the kernel
# currently holds — presumably the cross-sentence set loaded in the previous
# cell; verify the execution order.
cross_data = list(zip(X, y))

# Sanity check: print any example (and its position) whose feature row is not
# exactly 18 values wide.
for i, each in enumerate(cross_data):
    if len(each[0]) != 18:
        print(each, i)

# Preview the first three (features, label) pairs.
print(cross_data[:3])

[([171.0, 26.0, 151, 152, 5, 42, 153, 154, 155, 16, 156, 157, 158, 159, 160, 16, 161, 162], 4), ([194.0, 30.0, 163, 152, 5, 42, 164, 165, 157, 16, 166, 167, 158, 159, 160, 16, 161, 162], 4), ([212.0, 32.0, 168, 152, 5, 42, 16, 169, 170, 171, 172, 173, 158, 159, 160, 16, 161, 162], 4)]


In [65]:
rs = ShuffleSplit(n_splits=5, test_size=.2, random_state=13)
rs.get_n_splits(cross_data)

5

In [301]:
# Grid search on the cross-sentence data over forest size, feature
# subsampling rule and split criterion. Each combination is scored by RF_cv;
# the summary and per-label results of the best pooled F-score are kept.
# The comparison is strict, so on a tie the earlier combination wins.
# NOTE(review): the recorded outputs for 'sqrt' and 'log2' are identical —
# presumably both resolve to the same feature count for 18-wide rows; confirm
# before paying for both.
best_f1_cross = {"param":"", "score":0}
res_f_cross = None

for ne in [250, 500, 750, 1000, 1250, 1500]:
    for mf in ['sqrt', 'log2', None]:
        for ct in ['gini', 'entropy']:
            # min_samples_split is pinned to a single value (2).
            for mss in [2]:
                bf, res = RF_cv(cross_data, n_estimators=ne, max_features=mf, random_state=13, min_samples_split=mss, criterion=ct)
                print(bf)
                if bf['score'] > best_f1_cross['score']:
                    best_f1_cross = bf
                    res_f_cross = res

31406
0.8836283185840708 0.7170556552962298 0.7916749256689792
{'param': 'n_estimators: 250 max_features: sqrt random_state: 13 min_samples_split: 2 criterion: gini', 'score': 0.7916749256689792}
31406
0.8820445609436435 0.7244348762109796 0.7955082742316786
{'param': 'n_estimators: 250 max_features: sqrt random_state: 13 min_samples_split: 2 criterion: entropy', 'score': 0.7955082742316786}
31406
0.8836283185840708 0.7170556552962298 0.7916749256689792
{'param': 'n_estimators: 250 max_features: log2 random_state: 13 min_samples_split: 2 criterion: gini', 'score': 0.7916749256689792}
31406
0.8820445609436435 0.7244348762109796 0.7955082742316786
{'param': 'n_estimators: 250 max_features: log2 random_state: 13 min_samples_split: 2 criterion: entropy', 'score': 0.7955082742316786}
31406
0.8505839710028191 0.7583482944344704 0.8018223234624146
{'param': 'n_estimators: 250 max_features: None random_state: 13 min_samples_split: 2 criterion: gini', 'score': 0.8018223234624146}
31406
0.857085

In [302]:
best_f1_cross

{'param': 'n_estimators: 1000 max_features: None random_state: 13 min_samples_split: 2 criterion: entropy',
 'score': 0.8092354277062832}

In [303]:
res_f_cross

{0: [0.9775533497042853, 0.9874576389616742, 0.9824805339265852],
 7: [0.944078947368421, 0.7377892030848329, 0.8282828282828282],
 2: [1.0, 1.0, 1.0],
 6: [0.8661087866108786, 0.5098522167487685, 0.6418604651162791],
 4: [0.7706349206349207, 0.7887896019496344, 0.7796065837013249],
 3: [0.9672727272727273, 0.9204152249134948, 0.9432624113475179],
 5: [0.9614147909967846, 0.8214285714285714, 0.8859259259259259],
 1: [1.0, 1.0, 1.0]}

In [308]:
with open("features_labels/umass18SVM_cross_sent.svm_index2label.pkl", "rb") as f:
    x = pkl.load(f)
print(x)

{0: 'NEGATIVE', 1: 'severity_type', 2: 'du', 3: 'manner/route', 4: 'reason', 5: 'fr', 6: 'adverse', 7: 'do'}


## Train within and cross models

#### param

```json

#within
{'param': [n_estimators: 500, max_features: 0.5, random_state: 13, min_samples_split: 2],
 'score': 0.9480012656892735}
 
{'param': [n_estimators: 750, max_features: 0.3, random_state: 13, min_samples_split: 2, criterion: entropy],
 'score': 0.9499591126117808}

#cross
{'param': [n_estimators: 1000, max_features: 0.8, random_state: 13, min_samples_split: 2],
 'score': 0.8070641853399164}

{'param': [n_estimators: 1250, max_features: 0.7, random_state: 13, min_samples_split: 2, criterion: entropy],
 'score': 0.8139711465451783}

```

In [201]:
def shuffle_data(X, y, seed=None):
    """Return ``X`` and ``y`` shuffled with the same random permutation.

    The inputs are not modified. Pairing uses ``zip``, so if the two
    sequences differ in length the surplus items of the longer one are
    dropped.

    Args:
        X: Sequence of feature rows.
        y: Sequence of labels aligned with ``X``.
        seed: Optional integer seed. When given, a private
            ``numpy.random.RandomState(seed)`` is used so the permutation is
            reproducible and the global NumPy RNG is left untouched. When
            ``None`` (the default) the global ``numpy.random`` state is used,
            as before.

    Returns:
        ``(X_new, y_new)`` — two new lists in shuffled order.
    """
    pairs = list(zip(X, y))
    rng = np.random if seed is None else np.random.RandomState(seed)
    rng.shuffle(pairs)
    X_new = [features for features, _ in pairs]
    y_new = [label for _, label in pairs]
    return X_new, y_new

In [283]:
#within model
X, y = load_RF_data("RF_data/umassRF_training_within.rf")
print(len(X))
# shuffle_data draws from NumPy's global RNG. Seed it first so the row order
# fed to the forest — and therefore the fitted model — is the same on every
# Restart & Run All (random_state alone does not fix the row order).
np.random.seed(13)
X_n, y_n  = shuffle_data(X, y)
#'n_estimators: 750 max_features: 0.3 random_state: 13 min_samples_split: 2 criterion: 'entropy'
clf_within = RandomForestClassifier(n_estimators=500, n_jobs=-1, max_features=0.5, random_state=13, min_samples_split=2)
clf_within.fit(X_n, y_n)

64783


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=13, verbose=0, warm_start=False)

In [284]:
#cross model
cross_file = "RF_data/umassRF_training_cross1.rf"
X, y = load_RF_data(cross_file)
print(len(X))
# shuffle_data draws from NumPy's global RNG. Seed it first so the row order
# fed to the forest — and therefore the fitted model — is the same on every
# Restart & Run All (random_state alone does not fix the row order).
np.random.seed(13)
X_n, y_n  = shuffle_data(X, y)
#'param': 'n_estimators: 1250 max_features: 0.7 random_state: 13 min_samples_split: 2 criterion: 'entropy',
# n_jobs=-1 matches the within-sentence model and RF_cv: it only parallelises
# tree building across cores.
clf_cross = RandomForestClassifier(n_estimators=1000, n_jobs=-1, max_features=0.8, random_state=13, min_samples_split=2)
clf_cross.fit(X_n, y_n)

31406


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.8, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=13, verbose=0, warm_start=False)

## Prediction

In [249]:
def output_prediction_result(file, pred_label, pred_prob):
    """Write predictions to ``file`` as space-separated text.

    The first line is the fixed header ``labels 0 1 2 3 4 5 6 7``. Each later
    line holds one predicted label followed by that example's probability
    values, all separated by single spaces.

    Raises:
        AssertionError: if ``pred_label`` and ``pred_prob`` differ in length
            (checked before the file is opened).
    """
    assert len(pred_label) == len(pred_prob), "Number of prediction labels must be the same as number of prediction probabilities"
    with open(file, "w") as out:
        out.write("labels 0 1 2 3 4 5 6 7\n")
        rows = [
            "{} {}\n".format(label, " ".join(map(str, probs)))
            for label, probs in zip(pred_label, pred_prob)
        ]
        out.write("".join(rows))


def load_prediction_data(data_file, feature_file):
    """Read an unlabeled tab-separated feature file and encode its tokens as ids.

    Every non-empty line is read as two numeric columns followed by any
    number of token columns. Tokens are mapped through the dictionary
    unpickled from ``feature_file``; a token missing from it is encoded as 0.

    Args:
        data_file: Path of the tab-separated data file.
        feature_file: Path of the pickled token-to-id dictionary.

    Returns:
        A list with one ``[float, float, id, id, ...]`` list per line.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(feature_file, "rb") as f:
        t2i = pkl.load(f)

    measures = []
    with open(data_file, "r") as f:
        for line in f:
            # Strip only the line terminator. Slicing off the last character
            # unconditionally would truncate the final token of a file that
            # does not end with a newline.
            line = line.rstrip("\n")
            if not line:
                # Tolerate blank lines (typically a trailing one).
                continue
            info = line.split("\t")
            row = [float(info[0]), float(info[1])]
            row.extend(t2i.get(token, 0) for token in info[2:])
            measures.append(row)
    return measures

#### task2

In [264]:
cross_feature = "features_labels/umass18SVM_cross_sent.svm_features_dict.pkl"
within_feature = "features_labels/umass18SVM_within_sent_features_dict.pkl"
cross_file = "RF_data/umassRF_testing_cross.rf"
within_file = "RF_data/umassRF_testing_within.rf"
within_pred_file = "umass_predict_res/umassRF_testing_prediction_within_results.txt"
cross_pred_file = "umass_predict_res/umassRF_testing_prediction_cross_results.txt"

In [265]:
#within sentence
X_within = load_prediction_data(within_file, within_feature)
print(X_within[0])
len(X_within)

[37.0, 5.0, 0, 26521, 5, 108, 243, 6690, 0, 41, 44695, 0, 243, 45018, 0, 168, 22437, 0]


11381

In [266]:
within_pred_prob = clf_within.predict_proba(X_within)
within_pred_label = clf_within.predict(X_within)

In [267]:
output_prediction_result(within_pred_file, within_pred_label, within_pred_prob)

In [268]:
#cross sent
X_cross = load_prediction_data(cross_file, cross_feature)
print(X_cross[0])
len(X_cross)

[-76.0, -12.0, 323, 15083, 5, 42, 309, 9532, 0, 0, 0, 0, 227, 24720, 0, 171, 0, 27117]


4217

In [269]:
# X_cross was encoded with the cross-sentence feature dictionary, so it must
# be scored by the cross-sentence model; the within-sentence model was trained
# on ids from a different dictionary.
cross_pred_prob = clf_cross.predict_proba(X_cross)
cross_pred_label = clf_cross.predict(X_cross)

In [270]:
output_prediction_result(cross_pred_file, cross_pred_label, cross_pred_prob)

#### task 3 e2e

In [285]:
#load prediciton data
cross_file = "e2e/umassRF_testing_cross_e2e.rf"
within_file = "e2e/umassRF_testing_within_e2e.rf"
within_pred_file = "umass_predict_res/umassRF_testing_prediction_within_e2e_results.txt"
cross_pred_file = "umass_predict_res/umassRF_testing_prediction_cross_e2e_results.txt"
cross_feature = "features_labels/umass18SVM_cross_sent.svm_features_dict.pkl"
within_feature = "features_labels/umass18SVM_within_sent_features_dict.pkl"

X_cross = load_prediction_data(cross_file, cross_feature)
X_within = load_prediction_data(within_file, within_feature)

In [286]:
print(X_cross[0])
print(len(X_cross))

[-76.0, -12.0, 323, 15083, 5, 42, 309, 9532, 0, 0, 0, 0, 227, 24720, 0, 171, 0, 27117]
3525


In [287]:
print(X_within[0])
len(X_within)

[42.0, 6.0, 676, 26521, 5, 108, 0, 0, 0, 41, 44695, 0, 243, 45018, 0, 168, 22437, 0]


10426

In [288]:
within_pred_prob = clf_within.predict_proba(X_within)
within_pred_label = clf_within.predict(X_within)

In [289]:
output_prediction_result(within_pred_file, within_pred_label, within_pred_prob)

In [290]:
# X_cross was encoded with the cross-sentence feature dictionary, so it must
# be scored by the cross-sentence model; the within-sentence model was trained
# on ids from a different dictionary.
cross_pred_prob = clf_cross.predict_proba(X_cross)
cross_pred_label = clf_cross.predict(X_cross)

In [291]:
output_prediction_result(cross_pred_file, cross_pred_label, cross_pred_prob)