In [10]:
import time
import statistics
import numpy as np
import pandas
from scipy import stats



In [11]:
training_data = pandas.read_csv('dataset/Training_Data.csv')
testing_data = pandas.read_csv('dataset/Data.csv')

In [2]:
def parallel_execution():
    # Engine initialization
    c = Client()
    v = c[:]
    v.block = True
    
    # Load dataset
    testing_data = pandas.read_csv('datasets/InnoCentive_9933623_Data.csv')
    
    # Run brute-force parallelly
    # Parameters: Dataframe, Start Dataset, End Dataset, Rule length
    # Return: Subgroup, Best Rules
    params = []
    for i in range(8):
        params.append((testing_data, (i*150)+1, (i+1)*150, 1))
    return v.map(b.brute_force, params)

# Calculate the execution time
start_time = time.time()
results = parallel_execution()
print("\n--- {} seconds ---\n".format(time.time() - start_time))


--- 61027.33400917053 seconds ---



In [334]:
# Merge the results
def merge_results(results):    
    final_result = results[0][0]['group'].copy()
    for i in range(8):
        for j in range(288000):
            if results[i][0]['group'][j] == 1:
                final_result[j] = 1
    return final_result

# Export the result as csv
def save_to_csv(result):
    df = pandas.DataFrame([], columns=(['id'] + ['dataset_{}'.format(i) for i in range(1, 1201)]))
    c = 0
    for idx in range(1, 241):
        df.loc[idx-1, 'id'] = idx
        
        print(idx)

        for did in range(1, 1201):
            df.loc[idx-1, 'dataset_{}'.format(did)] = result[c]
            c += 1
    
    df.to_csv('result_bf_1.csv')

# save_to_csv(merge_results(results))
# save_to_csv(final_result)

In [25]:
# Configuration
P_VALUE_THRESHOLD = 0.05
TRT_EFFECT_THRESHOLD = -0.6

def evaluation(d):  
    # Subgroup
    subgroup = d[d['group'] == 1]
    new_trt_group = subgroup[subgroup['trt'] == 1]
    old_trt_group = subgroup[subgroup['trt'] == 0]
    
    if len(new_trt_group) == 0 or len(old_trt_group) == 0:
        return 0
    
    trt_effect = statistics.mean(new_trt_group['y']) - statistics.mean(old_trt_group['y'])
    number = len(new_trt_group) + len(old_trt_group)
    z_stat, p_val = stats.ranksums(new_trt_group['y'], old_trt_group['y'])
    
    # Nongroup
    nongroup = d[d['group'] == 0]
    new_trt_group = nongroup[nongroup['trt'] == 1]
    old_trt_group = nongroup[nongroup['trt'] == 0]
    
    if len(new_trt_group) == 0 or len(old_trt_group) == 0:
        return 0
    
    nongroup_mean = statistics.mean(new_trt_group['y']) - statistics.mean(old_trt_group['y'])
    minus = (trt_effect-nongroup_mean) if nongroup_mean != 0 else 0
    ratio = (trt_effect/nongroup_mean) if nongroup_mean != 0 else 0
    z_stat, non_p_val = stats.ranksums(new_trt_group['y'], old_trt_group['y'])
   
    # Filter
    if trt_effect > TRT_EFFECT_THRESHOLD: 
        return 0
    elif p_val > P_VALUE_THRESHOLD:
        return 0
    
    # Number of people restriction
    if number < 50 or number > 150:
        return 0
    
    return ratio, minus, trt_effect, nongroup_mean, number, p_val, non_p_val

def update_topN(score, rule, topN_score, case):
    
    if score == 0:    #why??
        return topN_score
    
    if case == 1:     # 1 = abs higher would be better
        s = abs(score)
    elif case == 2:   # 2 = close to 100 would be better
        s = abs(score - 100)
    else:             
        s = score
    topN_score.sort()  
    
    if len(topN_score)>=10 :
        
        key = []
        if (case == 1 | case == 4):    #higher would be better
            if s > topN_score[0][0]:    
                key = topN_score[0]
           
        else:       
            topN_score.reverse()    #lower score would be better
            if s < topN_score[0][0]:
                key = topN_score[0]
        
        if key in topN_score:
            topN_score.remove(key)
        else:
            return topN_score
        
    topN_score.append([s, score, rule])
    
    return topN_score
    

# Grouping method
def check_group(patient, rule):
    
    index, values, tag = rule
    
    if tag == 0:
        for v in values:
            if patient[index] == v:
                return 1
        return 0

    elif tag == 1:
        if patient[index] > values[0]:
            return 1
    else:
        if patient[index] < values[0]:
            return 1
    return 0
   

def quartile(d):
    Q1 = np.percentile(d, 25)
    Q2 = np.percentile(d, 50)
    Q3 = np.percentile(d, 75)
    return [(Q1), (Q2), (Q3)]


def possible_rules(d):
    con_items = range(4, 24)
    dis_items = range(24, 44)
    dis_value =[[0], [1], [2], [0, 1], [0, 2], [1, 2]]
    rules = []

    for item in con_items:
        for value in dis_value:
            rules.append([(item), value, 0])
            
    for item in dis_items:
        con_values = quartile(d['x'+str(item-3)])
        for value in con_values:
            rules.append([(item), [value], 1])
            rules.append([(item), [value], 2])
    rules.append([(32), [(51.3)], 2])

    return rules  

            
def return_key(item):
    if item[2] == 0:
        symbol = '='
        key = 'x{} {} {}'.format((item[0]-3), symbol, item[1][0])
            
    else:
        if item[2] == 1:
            symbol = '>'
        elif item[2] == 2:
            symbol = '<'
        key = 'x{} {} {:.3f}'.format((item[0]-3), symbol, item[1][0])
    return key

def rank_sum(dic, topN_score):
    topN_score.reverse()
    for i in range(0, len(topN_score)):
        key = return_key(topN_score[i][2])

        if key in dic:
            dic[key] = dic[key] + (i+1)
        else:
            dic[key] = (i+1)
    return dic

def print_all(save_all):
    print('rule \t\t ratio \t   minus    trt_effec  nongroup_mean   number      p_val  non_p_val')
    for i in save_all:
        ratio, minus, trt_effect,nongroup_mean, number, p_val, non_p_val = i[1]
        print('{:12s}: {:10.3f} {:10.3f}  {:10.3f} {:10.3f} {:10.3f} {:10.3f} {:10.3f}'.format(return_key(i[0]), ratio, minus, trt_effect,nongroup_mean, number, p_val, non_p_val))

def write(topN_score):
    for item in topN_score:
        key = return_key(item[2])
        print('{} : {:.5f}'.format(key, item[1]))
    
def analysis(t):
    df, start_dataset, end_dataset, rule_len = t
    
    # Set default group number to every patients
    if 'group' not in df:
        df['group'] = 0
    
    for did in range(start_dataset, end_dataset+1):
        
        #initial
        topN_Minus = []
        topN_Ratio = []
        topN_TreatEffect = []
        topN_Ungroup_TreatEffect = []
        topN_Number = []
        topN_P_value = []
        save_all = []
        r_dic = {}
        rank = []
        
        print('Dataset {}'.format(did))
        d = df[df['dataset'] == did].copy()

        for rule in possible_rules(d):
            d = df[df['dataset'] == did].copy()
            
            for i in range(1, 241):
                patient = d[d['id'] == i].iloc[0] 
                g = check_group(patient, rule)
                d.loc[d['id'] == i, 'group'] = g
            
            # Evaluate the treatment effect according each rule
            parts = evaluation(d)
            if type(parts) == int:
                continue

            # Caculate the socre
            ratio, minus, trt_effect,nongroup_mean, number, p_val, non_p_val = parts
            save_all.append([rule, parts])
            
            
            topN_Ratio = update_topN(ratio, rule, topN_Ratio, 1)
            topN_Ungroup_TreatEffect = update_topN(trt_effect, rule, topN_Ungroup_TreatEffect, 4)
            topN_Minus = update_topN(minus, rule, topN_Minus, 3)
            topN_TreatEffect = update_topN(trt_effect, rule, topN_TreatEffect, 3)  
            topN_P_value = update_topN(p_val, rule, topN_P_value, 3)
            #topN_Number = update_topN(number, rule, topN_Number, 2)
            
        topN_Minus.sort()
        topN_TreatEffect.sort()
        topN_Ratio.sort()
        topN_P_value.sort()
        topN_Ratio.reverse()   #bigger will be better
        topN_Ungroup_TreatEffect.sort()
        topN_Ungroup_TreatEffect.reverse()  
        #topN_Number.sort()
        
        #r_dic = rank_sum(r_dic, topN_Ratio.copy)
        r_dic = rank_sum(r_dic, topN_Minus.copy())
        r_dic = rank_sum(r_dic, topN_TreatEffect.copy())
        r_dic = rank_sum(r_dic, topN_Ungroup_TreatEffect.copy())
        
        for k, v in r_dic.items():
            rank.append((k, v))
        rank = sorted(rank, key=lambda a:a[1], reverse=True)
        
        
        #print('Top 10 Ratio 絕對值越高越好')   
        #write(topN_Ratio)  
        #print('\nTop 10 Minus 越低越好')  
        #write(topN_Minus)
        #print('\nTop 10 Treatment Effect 越低越好')      
        #write(topN_TreatEffect)
        #print('\nTop 10 P-Value 越低越好')      
        #write(topN_P_value)
        #print('\nTop 10 Number 越接近100越好')      
        #write(topN_Number) 
        
        
        print('\nRank Sum')
        for i in range(0, len(rank)):
            print(rank[i])
        print('\n')
        
        print_all(save_all)
    return save_all

In [26]:
params = (training_data, 2, 2, 1)

start_time = time.time()
save_all = analysis(params)
print("\n--- {} seconds ---\n".format(time.time() - start_time))
#跑一個dataset大概120s

Dataset 2
Top 10 Ratio 絕對值越高越好
x29 < 48.620 : -86.03349
x40 > 53.800 : -16.60850
x21 < 48.715 : 11.07797
x11 = 1 : 8.90867
x38 < 47.670 : 8.83044
x10 = 0 : 7.60372
x24 > 78.550 : 6.04959
x31 < 30.205 : 5.51797
x37 < 30.473 : 4.98382
x6 = 1 : 4.48499

Top 10 Minus 越低越好
x24 > 78.550 : -0.74822
x31 < 30.205 : -0.72604
x29 < 48.620 : -0.71476
x37 < 30.473 : -0.65575
x29 < 51.300 : -0.65388
x31 < 52.570 : -0.65291
x40 > 53.800 : -0.64251
x11 = 1 : -0.61240
x21 < 48.715 : -0.59444
x38 < 47.670 : -0.55261

Top 10 Treatment Effect 越低越好
x24 > 78.550 : -0.89640
x31 < 30.205 : -0.88674
x37 < 30.473 : -0.82036
x29 < 48.620 : -0.70655
x11 = 1 : -0.68984
x21 < 48.715 : -0.65342
x29 < 51.300 : -0.65292
x6 = 1 : -0.65084
x31 < 52.570 : -0.65036
x38 < 47.670 : -0.62318

Rank Sum
('x24 > 78.550', 21)
('x31 < 30.205', 20)
('x29 < 48.620', 19)
('x37 < 30.473', 18)
('x29 < 51.300', 17)
('x31 < 52.570', 16)
('x11 = 1', 14)
('x21 < 48.715', 13)
('x38 < 47.670', 12)
('x6 = 1', 11)
('x40 > 53.800', 4)


rule 	