In [195]:
import time
import statistics

import pandas
from scipy import stats
from ipyparallel import Client

import subgroup_analysis.brute_force as b

In [196]:
# Load the training CSV; the path is relative to the notebook's working directory.
# NOTE(review): the same file is read again into the same name in a later cell.
training_data = pandas.read_csv('datasets/InnoCentive_9933623_Training_Data.csv')

In [2]:
def parallel_execution():
    """Run the brute-force rule search on the ipyparallel engines.

    Connects to the cluster, loads the testing CSV and submits eight work
    items to ``b.brute_force`` through a blocking view over all engines.
    Work item ``k`` (0-based) covers dataset ids ``150*k + 1 .. 150*(k + 1)``
    with a rule length of 1.

    Returns:
        The value produced by the blocking ``map`` call over the work items.
    """
    # Engine initialization: blocking view over every engine the client sees.
    client = Client()
    view = client[:]
    view.block = True

    # Load dataset
    testing_data = pandas.read_csv('datasets/InnoCentive_9933623_Data.csv')

    # Run brute-force in parallel.
    # Work item layout: (dataframe, first dataset id, last dataset id, rule length)
    work_items = [
        (testing_data, (chunk * 150) + 1, (chunk + 1) * 150, 1)
        for chunk in range(8)
    ]
    return view.map(b.brute_force, work_items)

# Measure the wall-clock time of the whole parallel run.
# `results` holds what parallel_execution() returned; `merge_results` in a
# later cell takes an argument of that shape.
start_time = time.time()
results = parallel_execution()
print("\n--- {} seconds ---\n".format(time.time() - start_time))


--- 61027.33400917053 seconds ---



In [35]:
# coding: utf-8
import time
import math
import statistics
    
import pandas
import numpy as np
from scipy import stats

# Configuration
# Both thresholds are read by `evaluation` and `h0_filtering`.
# A comparison is rejected when its rank-sum p-value is above this value.
P_VALUE_THRESHOLD = 0.05
# A comparison is rejected when its treatment effect (mean y for trt == 1 minus
# mean y for trt == 0) is above this value, i.e. not negative enough.
TRT_EFFECT_THRESHOLD = -0.6

# Rule generation functions
def repeated_selections(items, n):
    """Yield every ordered selection of ``n`` elements of ``items``, repeats allowed.

    Each selection is a fresh list.  The first position varies slowest, so the
    output is in lexicographic order of element positions.  ``n == 0`` yields
    exactly one empty list.
    """
    if n == 0:
        yield []
        return

    for head in items:
        for tail in repeated_selections(items, n - 1):
            yield [head] + tail

def combinations(items, n):
    """Yield every ``n``-element combination of ``items`` as a list.

    Elements keep their relative order from ``items`` and each position is
    used at most once per combination.  ``n == 0`` yields exactly one empty
    list.  ``items`` must support slicing (lists and ranges both do).
    """
    if n == 0:
        yield []
        return

    for position, head in enumerate(items):
        # Only elements after ``head`` may follow it, which avoids duplicates.
        for tail in combinations(items[position + 1:], n - 1):
            yield [head] + tail

def possible_rules(max_rule_len):
    """Yield every candidate rule using between 1 and ``max_rule_len`` columns.

    A rule is a pair ``(columns, values)`` of equal-length lists, in the form
    ``check_group`` consumes.  Columns are positions 4..43; positions above 23
    are treated as continuous, the rest as discrete.  Because combinations are
    generated in increasing column order, the discrete values always come
    first in ``values``, followed by the continuous ones.
    """
    items = range(4, 44)
    # Possible discrete values.  ``(0)`` is the plain int 0, not a tuple: ints
    # are matched by equality and real tuples by membership in ``check_group``.
    dis_values = [(0), (1), (2), (0, 1), (1, 2)]
    # Possible continuous values: ``check_group`` reads a positive value as
    # "greater than it" and a negative value as "less than its magnitude".
    con_values = [(25), (-25), (50), (-50), (75), (-75)]

    for rule_len in range(1, max_rule_len + 1):
        for columns in combinations(items, rule_len):
            # Count the continuous columns; the remaining ones are discrete.
            con_param_num = sum(1 for column in columns if column > 23)
            dis_param_num = rule_len - con_param_num

            # Generate all possible rules for this column set.
            for dis_choice in repeated_selections(dis_values, dis_param_num):
                for con_choice in repeated_selections(con_values, con_param_num):
                    yield (columns, dis_choice + con_choice)

# Grouping method
def check_group(patient, rule):
    """Return 1 when ``patient`` satisfies every condition of ``rule``, else 0.

    Args:
        patient: Indexable record; ``patient[idx]`` is read for each rule index.
        rule: Pair ``(indexes, values)``.  ``values[k]`` is the condition on
            ``patient[indexes[k]]``:
              * a tuple        -> the patient value must be one of its members;
              * abs(value) < 3 -> the patient value must equal it;
              * value > 0      -> the patient value must be greater than it;
              * otherwise      -> the patient value must be less than ``-value``.

    An empty rule is satisfied by every patient.
    """
    indexes, values = rule

    def _holds(idx, val):
        # Discrete rule: set of admissible levels
        if type(val) is tuple:
            return patient[idx] in val
        # Discrete rule: single level
        if math.fabs(val) < 3:
            return patient[idx] == val
        # Continuous rule: the sign selects the comparison direction
        if val > 0:
            return patient[idx] > val
        return patient[idx] < -val

    # Every condition is evaluated; nothing short-circuits.
    achieve_num = sum(
        1
        for position, idx in enumerate(indexes)
        if _holds(idx, values[position])
    )

    return 1 if achieve_num == len(indexes) else 0

def evaluation(d):
    """Score the subgroup marked by ``d['group'] == 1`` against the rest.

    The treatment effect of a set of rows is the mean of ``y`` over rows with
    ``trt == 1`` minus the mean of ``y`` over rows with ``trt == 0``.

    Args:
        d: DataFrame with ``group``, ``trt`` and ``y`` columns.

    Returns:
        The int ``0`` when the split is unusable or filtered out: either side
        of the split lacks one of the two treatment arms, the subgroup effect
        is above ``TRT_EFFECT_THRESHOLD``, or the subgroup rank-sum p-value is
        above ``P_VALUE_THRESHOLD``.
        Otherwise the tuple ``(ratio, trt_effect, number, p_val)``, where
        ``ratio`` is the subgroup effect divided by the non-subgroup effect
        (0 when the latter is exactly 0) and ``number`` is the subgroup size.
    """
    def _arms(rows):
        # (new treatment rows, old treatment rows)
        return rows[rows['trt'] == 1], rows[rows['trt'] == 0]

    # Subgroup
    inside_new, inside_old = _arms(d[d['group'] == 1])
    if not (len(inside_new) and len(inside_old)):
        return 0

    trt_effect = statistics.mean(inside_new['y']) - statistics.mean(inside_old['y'])
    number = len(inside_new) + len(inside_old)
    _, p_val = stats.ranksums(inside_new['y'], inside_old['y'])

    # Nongroup
    outside_new, outside_old = _arms(d[d['group'] == 0])
    if not (len(outside_new) and len(outside_old)):
        return 0

    nongroup_mean = statistics.mean(outside_new['y']) - statistics.mean(outside_old['y'])

    # Filter: the effect must be negative enough and statistically significant.
    if trt_effect > TRT_EFFECT_THRESHOLD or p_val > P_VALUE_THRESHOLD:
        return 0

    if nongroup_mean != 0:
        r = trt_effect / nongroup_mean
    else:
        r = 0

    return r, trt_effect, number, p_val

def h0_filtering(d):
    """Tell whether the whole of ``d`` already passes both thresholds.

    Compares ``y`` between rows with ``trt == 1`` and rows with ``trt == 0``
    over all of ``d``, without any subgrouping.

    Returns:
        1 when the mean difference (trt 1 minus trt 0) is not above
        ``TRT_EFFECT_THRESHOLD`` and the rank-sum p-value is not above
        ``P_VALUE_THRESHOLD``; 0 otherwise.
    """
    new_y = d[d['trt'] == 1]['y']
    old_y = d[d['trt'] == 0]['y']

    trt_effect = statistics.mean(new_y) - statistics.mean(old_y)
    _, p_val = stats.ranksums(new_y, old_y)

    if trt_effect > TRT_EFFECT_THRESHOLD or p_val > P_VALUE_THRESHOLD:
        return 0
    return 1

def update_best_score(score, best_score):
    """Decide whether ``score`` should replace ``best_score``.

    Ordering implemented here:
      * a score of exactly 0 never wins;
      * a negative score wins when it is lower than the current best, so it
        also beats any non-negative best;
      * a positive score wins only while the current best is non-negative and
        the score is higher than it.

    Returns:
        ``True`` when ``score`` should become the new best, ``False``
        otherwise.  The result is always a real ``bool``; the function used to
        fall off the end and return ``None`` on the "no improvement" paths.
    """
    if score == 0:
        return False

    if score < 0:
        return bool(score < best_score)

    # Positive score (or NaN, for which every comparison below is False).
    return bool(best_score >= 0 and score > best_score)

def _rule_groups(rows, rule):
    """Return the 0/1 ``check_group`` label of every row in ``rows`` for ``rule``."""
    return [check_group(row, rule) for row in rows]


def brute_force(t):
    """Search, dataset by dataset, for the rule with the best subgroup score.

    Args:
        t: Tuple ``(df, start_dataset, end_dataset, rule_len)``.  ``df`` needs
            a ``dataset`` column plus the columns read by ``h0_filtering`` and
            ``evaluation`` (``trt``, ``y``).  Rule indexes are column positions
            in ``df`` (see ``possible_rules``).  Dataset ids from
            ``start_dataset`` to ``end_dataset`` inclusive are processed, and
            ``rule_len`` is the largest number of columns a rule may combine.

    Returns:
        ``(df, best_rules)``.  ``df`` is the SAME object that was passed in,
        modified in place: a ``group`` column is added when missing, and for
        every processed dataset it holds the 0/1 membership under that
        dataset's best rule (all 0 when no rule qualified).  ``best_rules``
        is a list of ``(dataset id, best score, best rule)`` tuples, with
        ``(did, 0, None)`` when nothing qualified.

    Datasets for which ``h0_filtering`` returns 1 are skipped entirely: they
    get no ``best_rules`` entry and their ``group`` values are left untouched.

    Rows are labelled in one pass over a positional row array.  The earlier
    implementation re-filtered the frame for each of the hard-coded patient
    ids 1..240, for every rule, and relied on integer keys falling back to
    positions on a label-indexed Series.  Labels are now computed per row, so
    datasets need not contain exactly the ids 1..240.
    """
    # Load parameters
    df, start_dataset, end_dataset, rule_len = t

    # Set the default group number for every patient
    if 'group' not in df:
        df['group'] = 0

    # Record the best rule for each dataset
    best_rules = []

    for did in range(start_dataset, end_dataset + 1):
        in_dataset = df['dataset'] == did
        d = df[in_dataset].copy()
        if h0_filtering(d) == 1:
            continue

        # Plain positional rows: check_group indexes them by column position.
        # Rule columns are unaffected by later writes to d['group'].
        rows = d.values

        # Iterate over every possible rule
        best_score = 0
        best_rule = None
        best_groups = None
        for rule in possible_rules(rule_len):
            groups = _rule_groups(rows, rule)
            d['group'] = groups

            # Evaluate the treatment effect under this rule
            parts = evaluation(d)
            if isinstance(parts, int):
                continue

            # Calculate the score
            ratio, trt_effect, number, p_val = parts
            score = ratio

            # Keep the rule when update_best_score ranks it above the current best
            if update_best_score(score, best_score):
                best_score = score
                best_rule = rule
                best_groups = groups

                print(['x{}'.format(tmp-3) for tmp in rule[0]], rule[1], ratio, trt_effect, number, p_val)

        # Record the best rule for this dataset; (did, 0, None) when none qualified
        best_rules.append((did, best_score, best_rule))

        # Divide the patients into groups according to the best rule.  The mask
        # selects the rows `d` was copied from, in the same order, so the labels
        # computed on `d` line up with them.
        if best_groups is None:
            df.loc[in_dataset, 'group'] = 0
        else:
            df.loc[in_dataset, 'group'] = best_groups

    return df, best_rules


In [3]:
# Load both CSVs; paths are relative to the notebook's working directory.
# `testing_data` is the frame passed to brute_force() in the next cell.
# NOTE(review): `training_data` is not referenced by any cell visible in this notebook.
training_data = pandas.read_csv('datasets/InnoCentive_9933623_Training_Data.csv')
testing_data = pandas.read_csv('datasets/InnoCentive_9933623_Data.csv')

In [339]:
# Single-process run on dataset 1 only, with rules of up to two columns.
# Tuple layout: (dataframe, first dataset id, last dataset id, rule length).
params = (testing_data, 1, 1, 2)

# Header for the per-improvement lines brute_force() prints while searching.
print('Parameters  Rules  Ratio  Effect  Number  P-value')

# brute_force() modifies `testing_data` in place (adds/overwrites 'group').
start_time = time.time()
result = brute_force(params)
print("\n--- {} seconds ---\n".format(time.time() - start_time))

Parameters  Rules  Ratio  Effect  Number  P-value
['x1'] [0] 1.50891093644 -0.8446 126 0.000646634419885
['x2'] [0] 1.67818398229 -0.903389304813 118 0.00111906558816
['x3'] [2] 2.13520665956 -1.35616666667 27 0.0396498797202
['x4'] [(0, 1)] 2.18650353195 -0.753670948204 205 7.19680454857e-05
['x5'] [(0, 1)] 11.9407096127 -0.789066584967 212 8.27927942098e-05
['x18'] [(0, 1)] -3.2644949679 -0.831466868324 220 1.63948793868e-05
['x1', 'x4'] [0, (0, 1)] -9.58129638733 -0.769894015862 222 3.78380694771e-05
['x1', 'x8'] [0, (1, 2)] -9.81863334113 -0.84319667026 190 1.46258966153e-05
['x1', 'x14'] [0, 0] -10.6909229144 -0.941172826087 179 5.95409478948e-06
['x1', 'x15'] [1, (0, 1)] -32.8388538462 -0.729752307692 228 7.24517813825e-05
['x1', 'x15'] [2, (0, 1)] -143.7962784 -0.762120275521 225 3.8122239236e-05


KeyboardInterrupt: 

In [334]:
# Merge the results
def merge_results(results):
    """Combine the ``group`` columns of several brute-force results.

    Args:
        results: Non-empty sequence of ``(dataframe, best_rules)`` pairs as
            returned by ``brute_force``.  Every dataframe must have a
            ``group`` column of the same length, in the same row order.

    Returns:
        A copy of the first result's ``group`` Series in which a row is set to
        1 whenever any of the results has 1 in that row.  The inputs are not
        modified.

    Works for any number of results and rows.  The previous version was
    hard-wired to 8 results of 288000 rows and looped over single elements.
    """
    final_result = results[0][0]['group'].copy()
    for part in results:
        # Positional boolean mask of the rows this chunk placed in a subgroup.
        selected = (part[0]['group'] == 1).values
        final_result[selected] = 1
    return final_result

# Export the result as csv
def save_to_csv(result):
    """Write ``result`` as a 240 x 1200 table to ``result_bf_1.csv``.

    The output has an ``id`` column (1..240) followed by ``dataset_1`` ..
    ``dataset_1200``.  ``result`` is consumed sequentially from key 0: the
    first 1200 values fill the row of id 1, the next 1200 the row of id 2,
    and so on.  The current id is printed as a progress indicator.

    NOTE(review): this layout assumes ``result`` is ordered patient-major
    (all datasets of patient 1, then patient 2, ...) - confirm against the
    row order of the source frame.
    """
    header = ['id'] + ['dataset_{}'.format(i) for i in range(1, 1201)]
    df = pandas.DataFrame([], columns=header)

    cursor = 0  # next key of `result` to consume
    for row, patient_id in enumerate(range(1, 241)):
        df.loc[row, 'id'] = patient_id

        print(patient_id)

        for column in header[1:]:
            df.loc[row, column] = result[cursor]
            cursor += 1

    df.to_csv('result_bf_1.csv')

# save_to_csv(merge_results(results))
# save_to_csv(final_result)

In [79]:
def generate_testing_result(did, rule):
    """Load the testing CSV and label the patients of one dataset with ``rule``.

    Args:
        did: Id of the dataset whose patients are labelled.
        rule: ``(indexes, values)`` rule understood by ``check_group``, or
            ``None`` to leave every patient in group 0.

    Returns:
        The full testing DataFrame with a ``group`` column.  It is 0
        everywhere except for patients 1..240 of dataset ``did``, which carry
        the ``check_group`` result.

    Raises:
        IndexError: If dataset ``did`` has no row for one of the ids 1..240.
    """
    df = pandas.read_csv('datasets/InnoCentive_9933623_Data.csv')
    df['group'] = 0

    # The dataset column never changes below, so this mask is computed once.
    in_dataset = df['dataset'] == did

    for patient_id in range(1, 241):
        selector = in_dataset & (df['id'] == patient_id)
        patient = df[selector].iloc[0]

        label = 0 if rule is None else check_group(patient, rule)
        df.loc[selector, 'group'] = label

    return df

def save_testing_csv(result, dataset, name='default'):
    """Lay out one dataset's group labels in the wide 240 x 1200 format.

    Builds a frame with an ``id`` column (1..240) and one ``dataset_<n>``
    column for n in 1..1200.  Every dataset column is all zeros except
    ``dataset_<dataset>``, which carries the ``group`` value that ``result``
    holds for each patient id of that dataset.

    Args:
        result: DataFrame with ``dataset``, ``id`` and ``group`` columns, such
            as the frame returned by ``generate_testing_result``.
        dataset: Id of the only dataset whose labels are copied over.
        name: Suffix for the output file name.  Unused while the export line
            at the end stays commented out.

    Returns:
        The assembled wide DataFrame.  Earlier versions built it and returned
        ``None``.

    Raises:
        IndexError: If ``dataset`` is within 1..1200 but ``result`` has no row
            for one of the patient ids 1..240 of that dataset.
    """
    patient_ids = list(range(1, 241))
    column_names = ['dataset_{}'.format(did) for did in range(1, 1201)]

    table = {'id': patient_ids}
    for did, column in enumerate(column_names, start=1):
        if dataset != did:
            table[column] = [0] * len(patient_ids)
            continue

        rows = result[result['dataset'] == did]
        # Take the first match by position.  Filtered rows keep their original
        # index labels, so the label lookup `.loc[0, 'group']` used before raised
        # KeyError for every row whose original label was not 0.
        table[column] = [
            rows.loc[rows['id'] == idx, 'group'].iloc[0]
            for idx in patient_ids
        ]

    df = pandas.DataFrame(table, columns=['id'] + column_names)

    # df.to_csv('result_bf_{}.csv'.format(name))
    return df
    