In [1]:
import numpy as np
import scipy as sp
import re
import pandas as pd
from collections import Counter
import sklearn 
from sklearn.metrics import classification_report 
# pip install word2number
from word2number import w2n

import matplotlib.pyplot as plt
%matplotlib inline

# pip install pandas_confusion
import pandas_confusion

from statistics import mode

In [2]:
# Crowd-worker extractions.
crowd_extractions = pd.read_csv(
    "crowd-extractions.csv",
)
# Gold extractions. The "-main-task" suffix in the file name implies that the
# questions used for testing/validation are excluded.
gold_extractions = pd.read_csv(
    "yalin-gold-main-task.csv",
    encoding='latin1',
)

In [3]:
#crowd_extractions.columns.values
#gold_extractions.columns.values
#crowd_extractions.head()
def my_accuracy_score(y_true, y_pred):
    """Return the fraction of positions at which y_true and y_pred are equal.

    Parameters
    ----------
    y_true : sequence
        Reference labels. Its length is the denominator of the score.
    y_pred : sequence
        Predicted labels, compared pairwise with ``y_true`` via ``zip``.

    Returns
    -------
    float
        Number of equal pairs divided by ``len(y_true)``; NaN when ``y_true``
        is empty (the accuracy of zero samples is undefined).
    """
    length = len(y_true)
    if length == 0:
        # Nothing to score: report "undefined" instead of dividing by zero.
        return float('nan')
    matches = sum(1 for a, b in zip(y_true, y_pred) if a == b)
    # float() keeps true division under Python 2 as well.
    return float(matches) / length

In [4]:
# Find the columns that are (almost) entirely missing, so that they can be
# skipped when comparing crowd_extractions with gold_extractions.
def Nan_columns(extraction, threshold=0.9):
    """Return the names of the columns of `extraction` that are mostly null.

    Parameters
    ----------
    extraction : pandas.DataFrame
        Table whose columns are inspected.
    threshold : float, optional
        A column is reported when its number of null cells is at least
        ``threshold * number_of_rows``. Defaults to 0.9 (at least 90% null).

    Returns
    -------
    list
        Column names, in the order they appear in ``extraction.columns``.
        For a frame with zero rows every column is reported, because
        ``0 >= 0`` holds.
    """
    sample_size = extraction.shape[0]
    min_nulls = sample_size * threshold
    return [column for column in extraction.columns.values
            if extraction[column].isnull().sum() >= min_nulls]
crowd_extractions_NANcolumns = Nan_columns(crowd_extractions)
gold_extractions_NANcolumns = Nan_columns(gold_extractions)
# Bring the gold column names to the native `str` type: Python 2 unicode names
# are encoded, names that already are `str` are left alone (encoding them on
# Python 3 would yield bytes that never compare equal to a column name).
gold_extractions_NANcolumns = [s if isinstance(s, str) else s.encode('utf-8')
                               for s in gold_extractions_NANcolumns]
# Stored under its own name: rebinding `Nan_columns` here would replace the
# function with a collection and make this cell fail when it is run again.
excluded_nan_columns = set(crowd_extractions_NANcolumns + gold_extractions_NANcolumns)

# Columns present in both tables that are not mostly-null in either of them.
gold_column_names = set(gold_extractions.columns.values)
common_columns = [x for x in crowd_extractions.columns.values
                  if x in gold_column_names and x not in excluded_nan_columns]
#missed_crowd_columns = [x for x in crowd_extractions.columns.values if (x not in excluded_nan_columns) and (x not in common_columns)]
#missed_gold_columns = [x for x in gold_extractions.columns.values if (x not in excluded_nan_columns) and (x not in common_columns)]
for field in common_columns:
    print(field)
    print("")

# crowd_gold_columns maps each crowd_extractions column to its
# gold_extractions counterpart.
# 'arm_num' corresponds to 'arm_num_gold'.
# The fields 'pmid' and 'abstract' are removed from the mapping.
crowd_gold_columns = {key: key for key in common_columns}
crowd_gold_columns['arm_num'] = 'arm_num_gold'
crowd_gold_columns.pop('abstract', None)
crowd_gold_columns.pop('pmid', None)
print(list(crowd_gold_columns.keys()))


pmid

group_1_dose

group_1_intervention_name

group_1_sample_size

group_1_schedule

group_2_dose

group_2_intervention_name

group_2_sample_size

group_3_intervention_name

share_duration

shared_duration_yn

shared_intervention_name

shared_intervention_name_yn

shared_intervention_type

shared_intervention_type_yn

shared_route

shared_route_yn

shared_sample_size

shared_sample_size_yn

shared_schedule

shared_schedule_yn

abstract

['group_1_sample_size', 'group_1_intervention_name', 'shared_schedule', 'shared_intervention_type', 'shared_route_yn', 'share_duration', 'shared_duration_yn', 'group_1_dose', 'shared_sample_size', 'shared_schedule_yn', 'group_1_schedule', 'group_2_intervention_name', 'shared_intervention_name_yn', 'shared_route', 'shared_intervention_type_yn', 'group_2_sample_size', 'shared_sample_size_yn', 'arm_num', 'group_3_intervention_name', 'shared_intervention_name', 'group_2_dose']


In [5]:
#gold_extractions.head()

In [6]:
# PMIDs of every gold row, as plain Python ints, in file order.
all_pmids = list(map(int, gold_extractions["pmid"].values))

In [7]:
# Adapted from Byron's original code: majority-vote the crowd's arm counts per
# PMID and parse the expert's arm count.
# The installed word2number release reports invalid input by returning the
# string "Error: Please enter a valid number word" instead of raising; other
# releases raise ValueError (see the package documentation). Both are treated
# as a parse failure here.
def _parse_arm_count(raw_value):
    """Return the int that word2number parses from raw_value, or None on failure."""
    try:
        parsed = w2n.word_to_num(raw_value)
    except ValueError:
        return None
    return parsed if isinstance(parsed, int) else None


crowd_num_arms, reference_num_arms = {}, {}
for pmid in all_pmids:
    #print("on PMID: %s" % pmid)
    pmid_crowd_num_arms = crowd_extractions[crowd_extractions["pmid"] == pmid]["arm_num"].values
    crowd_responses = []
    for n_arms in pmid_crowd_num_arms:
        parsed_arms = _parse_arm_count(n_arms)  # parse once, reuse below
        if parsed_arms is None:
            print("failed on %s for crowd! arm responses: %s" % (pmid, n_arms))
            print("")
        else:
            crowd_responses.append(parsed_arms)
    if crowd_responses:
        crowd_num_arms[pmid] = mode(crowd_responses)
    else:
        # mode() raises on an empty list; leave this PMID out of
        # crowd_num_arms rather than aborting the whole loop.
        print("no usable crowd arm responses for %s" % pmid)
        print("")

    cur_ref_val = gold_extractions[gold_extractions["pmid"] == pmid]["arm_num_gold"].values[0]
    parsed_ref = _parse_arm_count(cur_ref_val)
    if parsed_ref is None:
        print("failed on %s for expert! ref val: %s" % (pmid, cur_ref_val))
        print("")
    else:
        reference_num_arms[pmid] = parsed_ref

failed on 2619027 for expert! ref val: ???

failed on 7853047 for expert! ref val: ???

failed on 8018001 for crowd! arm responses: above_eight

failed on 8018001 for expert! ref val: more than 8

failed on 9549451 for crowd! arm responses: above_eight

failed on 12069289 for expert! ref val: one?

failed on 17855465 for expert! ref val: one?

failed on 18851769 for expert! ref val: ???



In [8]:
# Adapted from Byron's original code: compare crowd and expert arm counts on
# the PMIDs for which both could be parsed.
y = [reference_num_arms[pmid] for pmid in all_pmids
     if pmid in reference_num_arms and pmid in crowd_num_arms]
y_hat = [crowd_num_arms[pmid] for pmid in all_pmids
         if pmid in reference_num_arms and pmid in crowd_num_arms]

print(classification_report(y, y_hat))

             precision    recall  f1-score   support

          2       0.84      1.00      0.91        46
          3       0.86      0.75      0.80         8
          4       1.00      0.22      0.36         9
          6       1.00      0.50      0.67         2
          7       1.00      1.00      1.00         1

avg / total       0.87      0.85      0.82        66



In [9]:
# Replace difficulty 'Nan' with -1 
#crowd_extractions.difficulty.fillna(value=-1,inplace=True)
#crowd_extractions.arm_num.value_counts(dropna=False)
#gold_extractions.arm_num_gold.value_counts(dropna=False)

In [10]:
# Most frequent element of a list.
# Used as a fallback for `statistics.mode`, which can raise an error when more
# than one element is tied for most common.
def most_common(l):
    """Return the element of `l` with the highest count, as ranked by Counter.most_common."""
    tally = Counter(l)
    top_pairs = tally.most_common(1)
    winner = top_pairs[0]
    return winner[0]

In [11]:
# Convert a value containing a number to a float,
# e.g., "24 year" -> 24.0, "above_eight" -> 8.0
def Num_to_int(s):
    """Extract a number from a free-text answer.

    Parameters
    ----------
    s : str, int, float or NaN
        Raw cell value.

    Returns
    -------
    number
        * ``-1.0`` when `s` is NaN or no number can be extracted;
        * `s` itself, unchanged, when it already is numeric;
        * otherwise a float: the first run of digits in the text if there is
          one, else the value word2number derives from the number words.
    """
    try:
        if np.isnan(s):
            return -1.0
    except TypeError:
        # np.isnan rejects non-numeric input such as strings.
        pass
    if isinstance(s, (int, float, np.number)):
        return s
    if not isinstance(s, str):
        # Python 2 unicode -> str; Python 3 bytes -> str.
        s = s.encode('utf-8') if hasattr(s, 'encode') else s.decode('utf-8')

    s = re.sub('[^0-9a-zA-Z.]+', ' ', s).strip()
    l_num = re.findall(r"[-+]?\d*\.\d+|\d+", s)
    if len(l_num) >= 1:
        return float(l_num[0])
    if not s:
        return -1.0

    num_d = w2n.american_number_system
    # Build a new list instead of removing from the list being iterated:
    # in-place removal skips the token that follows every removed one.
    candidate_words = (word.strip('.') for word in s.lower().split(' '))
    number_words = [word for word in candidate_words if word in num_d]
    if not number_words:
        return -1.0
    try:
        parsed = w2n.word_to_num(' '.join(number_words))
    except ValueError:
        # Some word2number releases raise on invalid input instead of
        # returning an error string.
        return -1.0
    if isinstance(parsed, (int, float)):
        return float(parsed)
    return -1.0

In [12]:
# Spot-check Num_to_int on a few sample free-text answers.
for sample_text in ['above_eight', '???', 'one?', 'more than 8',
                    'smaller than 8', '8?']:
    print(Num_to_int(sample_text))

8.0
-1.0
1.0
8.0
8.0
8.0


In [13]:
# Calculate the accuracy score of a numerical field.
def confusion_crowd_gold_1(crowd_extractions, gold_extractions, crowd_field, gold_field, pmids=None):
    """Score the crowd's majority answer for a numeric field against gold.

    For every PMID that has crowd rows, each crowd answer is converted with
    ``Num_to_int`` and the most frequent converted value is kept; the gold
    value of the first matching gold row is converted the same way. Both
    label lists and the resulting accuracy are printed.

    Parameters
    ----------
    crowd_extractions, gold_extractions : pandas.DataFrame
        Tables with a ``pmid`` column plus the fields named below.
    crowd_field, gold_field : str
        Column to read from the crowd table / the gold table.
    pmids : iterable of int, optional
        PMIDs to evaluate, each expected to have a gold row. Defaults to the
        notebook-level ``all_pmids``.

    Returns
    -------
    float
        ``my_accuracy_score`` of the gold labels vs. the crowd labels.
    """
    if pmids is None:
        pmids = all_pmids
    # Built once: the original rebuilt and scanned this list for every PMID.
    crowd_pmids = set(crowd_extractions["pmid"].unique().tolist())

    crowd_num_arms, reference_num_arms = {}, {}
    for pmid in pmids:
        if pmid not in crowd_pmids:
            continue
        crowd_values = crowd_extractions[crowd_extractions["pmid"] == pmid][crowd_field].values
        crowd_responses = [Num_to_int(value) for value in crowd_values]
        if crowd_responses:
            # most_common picks a winner even when several values are tied,
            # so no exception handling around statistics.mode is needed.
            crowd_num_arms[pmid] = most_common(crowd_responses)

        cur_ref_val = gold_extractions[gold_extractions["pmid"] == pmid][gold_field].values[0]
        reference_num_arms[pmid] = Num_to_int(cur_ref_val)

    y_gold, y_crowd = [], []
    for pmid in pmids:
        if pmid in reference_num_arms and pmid in crowd_num_arms:
            y_gold.append(reference_num_arms[pmid])
            y_crowd.append(crowd_num_arms[pmid])

    print("crowd column is %s, gold column is %s " % (crowd_field, gold_field))
    print("gold extraction")
    print(y_gold)
    print("")
    print("crowd extraction")
    print(y_crowd)
    print("")
    accuracy = my_accuracy_score(y_gold, y_crowd)
    print("accuracy is %s" % accuracy)
    return accuracy

In [14]:
# view the content of crowd extractions

#all_value = []
#for key in crowd_extractions.columns.values:
#    if key in crowd_gold_columns:
#        print key,",", crowd_gold_columns[key]
#        print crowd_extractions[key].unique().tolist()
#        print gold_extractions[crowd_gold_columns[key]].unique().tolist()
#        print


In [15]:
# Accuracy of the crowd's majority answer on each numerical field.
numeric_field_pairs = [
    ('arm_num', 'arm_num_gold'),
    ('shared_sample_size', 'shared_sample_size'),
    ('group_1_sample_size', 'group_1_sample_size'),
    ('group_2_sample_size', 'group_2_sample_size'),
]
for crowd_col, gold_col in numeric_field_pairs:
    confusion_crowd_gold_1(crowd_extractions, gold_extractions, crowd_col, gold_col)
    print("---"*10)
print("")

crowd column is arm_num, gold column is arm_num_gold 
gold extraction
[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 4.0, -1.0, 2.0, 2.0, 3.0, 2.0, 2.0, 2.0, 3.0, 2.0, 2.0, 7.0, 4.0, 3.0, -1.0, 2.0, 8.0, 2.0, 2.0, 3.0, 2.0, 3.0, 2.0, 4.0, 2.0, 2.0, 2.0, 4.0, 4.0, 4.0, 2.0, 6.0, 4.0, 2.0, 2.0, 1.0, 4.0, 2.0, 4.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 2.0, 2.0, 2.0, 2.0, 2.0, 1.0, 2.0, 2.0, -1.0, 2.0, 3.0, 2.0, 2.0, 2.0, 2.0, 2.0, 6.0]

crowd extraction
[2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 2.0, 2.0, 2.0, 3.0, 2.0, 2.0, 7.0, 4.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 2.0, 3.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 4.0, 2.0, 3.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 2.0, 2.0, 2.0, 2.0, 2.0, 6.0]

accuracy is 0.777777777778
------------------------------
crowd column is shared_sample_size, gold column is shared_sample_size 
gold extraction
[60.0, -1.0, 96.0, 55.0, -1.0, -1.0, -1.0

In [16]:
# Convert a value containing a time duration to a number of days,
# e.g., "26 weeks" -> 182.0, "3-hour" -> 0.125
def Duration_to_int(s):
    """Convert a free-text duration to days.

    The first number in the text (digits first, number words otherwise) is
    multiplied by the day-equivalent of the first time unit found.

    Parameters
    ----------
    s : str, int, float or NaN
        Raw cell value.

    Returns
    -------
    number
        * ``-1.0`` when `s` is NaN or holds neither a number nor a unit;
        * `s` itself, unchanged, when it already is numeric;
        * ``number * unit`` when both are found, the unit alone when only a
          unit is found ("a week" -> 7.0), the number alone when only a
          number is found ("two periods" -> 2.0).
    """
    try:
        if np.isnan(s):
            return -1.0
    except TypeError:
        # np.isnan rejects non-numeric input such as strings.
        pass
    if isinstance(s, (int, float, np.number)):
        return s
    if not isinstance(s, str):
        # Python 2 unicode -> str; Python 3 bytes -> str.
        s = s.encode('utf-8') if hasattr(s, 'encode') else s.decode('utf-8')

    # Day-equivalent of each unit; plural spellings share the same factor.
    days_per_unit = {'year': 365, 'month': 30, 'week': 7, 'day': 1,
                     'hour': 1.0/24, 'minute': 1.0/(24*60),
                     'second': 1.0/(24*60*60)}
    unit_d = dict(days_per_unit)
    for singular, factor in days_per_unit.items():
        unit_d[singular + 's'] = factor

    s = re.sub('[^0-9a-zA-Z.]+', ' ', s).strip()
    # Lower-cased tokens with stray periods trimmed, so that "Weeks." still
    # matches the unit table.
    words = [word.strip('.') for word in s.lower().split(' ')]
    if not s:
        return -1.0

    number = None
    l_num = re.findall(r"[-+]?\d*\.\d+|\d+", s)
    if len(l_num) >= 1:
        number = float(l_num[0])
    else:
        num_d = w2n.american_number_system
        # Build a new list instead of removing from the list being iterated:
        # in-place removal skips the token that follows every removed one.
        number_words = [word for word in words if word in num_d]
        if number_words:
            try:
                parsed = w2n.word_to_num(' '.join(number_words))
            except ValueError:
                # Some word2number releases raise on invalid input instead
                # of returning an error string.
                parsed = None
            if isinstance(parsed, (int, float)):
                number = float(parsed)

    # First recognised unit wins.
    unit = next((unit_d[word] for word in words if word in unit_d), None)

    if number is not None and unit is not None:
        return float(number * unit)
    if unit is not None:
        return float(unit)
    if number is not None:
        return float(number)
    return -1.0

In [17]:
# Spot-check Duration_to_int on a few sample duration strings.
duration_samples = [
    'a week',
    '3-hour',
    '26 weeks',
    '6 hours',
    'one month',
    '5 days',
    '90 days',
    'five weeks',
    'two periods of 14 days',
    'two periods',
    '14 days',
]
for sample_text in duration_samples:
    print(Duration_to_int(sample_text))

7.0
0.125
182.0
0.25
30.0
5.0
90.0
35.0
14.0
2.0
14.0


In [18]:
# Calculate the accuracy score of frequency and time duration fields.
def _duration_product(value):
    """Convert one raw answer to a duration, multiplying across ' of ' parts.

    Non-text values, and text that ``Duration_to_int`` cannot parse as a
    whole, yield ``Duration_to_int(value)`` directly. Otherwise the text is
    split on ' of ' and the durations of the parts that parse are multiplied
    (e.g. a count of periods times the length of one period).
    """
    whole = Duration_to_int(value)
    if not hasattr(value, 'split') or whole == -1:
        return whole
    product = 1
    # The text is split as-is: encoding it to bytes first (as was done
    # before) can make Duration_to_int's own encoding step fail on
    # non-ASCII parts.
    for sentence_part in value.split(' of '):
        part_score = Duration_to_int(sentence_part)
        if part_score != -1:
            product = product * part_score
    return product


def confusion_crowd_gold_2(crowd_extractions, gold_extractions, crowd_field, gold_field, pmids=None):
    """Score the crowd's majority answer for a duration field against gold.

    For every PMID that has crowd rows, each crowd answer is converted to a
    duration (see ``_duration_product``) and the most frequent converted
    value is kept; the gold value of the first matching gold row is
    converted the same way. Both label lists and the resulting accuracy are
    printed.

    Parameters
    ----------
    crowd_extractions, gold_extractions : pandas.DataFrame
        Tables with a ``pmid`` column plus the fields named below.
    crowd_field, gold_field : str
        Column to read from the crowd table / the gold table.
    pmids : iterable of int, optional
        PMIDs to evaluate, each expected to have a gold row. Defaults to the
        notebook-level ``all_pmids``.

    Returns
    -------
    float
        ``my_accuracy_score`` of the gold labels vs. the crowd labels.
    """
    if pmids is None:
        pmids = all_pmids
    # Built once: the original rebuilt and scanned this list for every PMID.
    crowd_pmids = set(crowd_extractions["pmid"].unique().tolist())

    crowd_num_arms, reference_num_arms = {}, {}
    for pmid in pmids:
        if pmid not in crowd_pmids:
            continue
        crowd_values = crowd_extractions[crowd_extractions["pmid"] == pmid][crowd_field].values
        crowd_responses = [_duration_product(value) for value in crowd_values]
        if crowd_responses:
            # most_common picks a winner even when several values are tied,
            # so no exception handling around statistics.mode is needed.
            crowd_num_arms[pmid] = most_common(crowd_responses)

        cur_ref_val = gold_extractions[gold_extractions["pmid"] == pmid][gold_field].values[0]
        reference_num_arms[pmid] = _duration_product(cur_ref_val)

    y_gold, y_crowd = [], []
    for pmid in pmids:
        if pmid in reference_num_arms and pmid in crowd_num_arms:
            y_gold.append(reference_num_arms[pmid])
            y_crowd.append(crowd_num_arms[pmid])

    print("crowd column is %s, gold column is %s " % (crowd_field, gold_field))
    print("gold extraction")
    print(y_gold)
    print("")
    print("crowd extraction")
    print(y_crowd)
    print("")
    accuracy = my_accuracy_score(y_gold, y_crowd)
    print("accuracy is %s" % accuracy)
    return accuracy

In [19]:
# Accuracy of the crowd's majority answer on the time duration field.
duration_field = 'share_duration'
confusion_crowd_gold_2(crowd_extractions, gold_extractions, duration_field, duration_field)
print("---"*10)

crowd column is share_duration, gold column is share_duration 
gold extraction
[7.0, -1.0, -1.0, 0.125, 182.0, -1.0, 0.25, -1.0, -1.0, -1.0, -1.0, -1.0, 35.0, 84.0, -1.0, 120.0, 70.0, -1.0, 30.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 360.0, 84.0, 5.0, 105.0, 120.0, -1.0, 168.0, 28.0, 84.0, 14.0, 360.0, -1.0, 1825.0, 140.0, 378.0, -1.0, -1.0, -1.0, 42.0, -1.0, 1740.0, -1.0, -1.0, 84.0, 84.0, -1.0, 360.0, -1.0, -1.0, 168.0, -1.0, -1.0, 42.0, -1.0, -1.0, -1.0, -1.0, 42.0, -1.0, -1.0, 210.0, 90.0, 360.0, -1.0, -1.0, 1825.0, 720.0]

crowd extraction
[-1, -1, 1825.0, 1.0, 392.0, 15.0, -1, 297.0, -1, 960.0, 0.25, -1, 35.0, 84.0, -1, 120.0, 70.0, 730.0, 30.0, -1, 1898.0, 24.0, -1, -1, -1, 360.0, 84.0, 5.0, 105.0, 56.0, -1, 168.0, 14.0, 84.0, 14.0, 360.0, -1.0, 1825.0, 140.0, 378.0, -1, 365.0, -1, 42.0, 180.0, -1, 180.0, 730.0, 84.0, 84.0, -1, 360.0, 730.0, -1, 168.0, 0.006944444444444445, -1, 42.0, -1, 24.0, -1, -1, 42.0, -1, 63.0, 168.0, 90.0, 360.0, 24.0, 730.0, 1825.0, 720.0]

accuracy is 0.6