# Run the whole method on multiple repetitions of simulation data

In [1]:
import pandas as pd
import numpy as np
import os
import shutil

#from DL.knockoff.KnockoffGenerator import KnockoffGenerator;

# Run MGM to identify linear associations

In [2]:
from MGM.MGM import MGM
# Module-level MGM instance; runMGM below calls mgm.runMGM(...) on it.
mgm = MGM();

def runMGM(n,p,iters):
    """Run MGM on one simulated (X, y) replicate and print the output file path.

    Reads the tab-separated X and y files of the requested replicate, joins
    them column-wise (renaming the y column "V1" to "Y"), writes the combined
    table into the top-level simulation folder and passes that file to the
    module-level ``mgm`` instance.

    Args:
        n: Number of samples; selects the ``<n>samples`` data folder.
        p: Number of features; selects the ``<p>p`` data folder.
        iters: Index of the replicate (iteration) to process.
    """
    # Use the argument itself. The body previously read a global named
    # `iterIdx`, which only worked when the caller's loop variable happened
    # to carry that exact name.
    iterIdx = iters

    resultPath = "data/simulated_data/nonlinear/"
    dirPath = resultPath + str(n) + "samples/" + str(p) + "p/"

    replicateSuffix = "_n" + str(n) + "_p" + str(p) + "_iter" + str(iterIdx) + ".csv"
    XFileName = "X" + replicateSuffix
    YFileName = "y_si" + replicateSuffix

    XDataDF = pd.read_csv(dirPath + XFileName, sep='\t')
    YDataDF = pd.read_csv(dirPath + YFileName, sep='\t')
    YDataDF = YDataDF.rename(columns={"V1": "Y"})
    dataDF = pd.concat([XDataDF, YDataDF], axis=1)

    # MGM is handed a directory plus a file name, so the combined table is
    # written to disk first.
    XYFileName = "p" + str(p) + "_iter" + str(iterIdx) + "_XYData.txt"
    dataDF.to_csv(resultPath + XYFileName, index=False, sep="\t")

    # Keyword spellings ("lamda_...") are kept exactly as in the original call.
    mgm_output_file = mgm.runMGM(
        resultPath,
        XYFileName,
        lambda_continuous_continuous=0.3,
        lamda_continuous_discrete=0.3,
        lamda_discrete_discrete=0.3,
    )
    print("Please find MGM's output file as:")
    # os.path.join avoids the doubled separator that plain concatenation
    # produced because resultPath already ends with "/".
    mgm_output_file_path = os.path.join(resultPath, mgm_output_file[0])
    print(mgm_output_file_path)


/ihome/hpark/zhf16/causalDeepVASE/MGM/tetradLite_likelihood_for_all.jar


In [3]:
# Grid of simulation settings to run MGM on: sample sizes, feature counts and
# replicate indices. runMGM is called once per combination.
sampleNumbers = [10000];
pNumbers = [40];
# Full set of 20 replicates, currently disabled in favour of a single one:
#iterIdxs = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20];
iterIdxs = [5];
# NOTE(review): the loop variables below are module-level names. Code that
# reads `iterIdx` as a global would be affected by a rename -- verify before
# renaming.
for sampleNumber in sampleNumbers:
    for pNumber in pNumbers:
        for iterIdx in iterIdxs:
            runMGM(sampleNumber,pNumber,iterIdx);
#mgm.showDownJVM();
#dataDF

Please find MGM's output file as:
data/simulated_data/nonlinear//p40_iter5_XYData_MGM_associations.csv


# Run the DNN to identify nonlinear associations

In [None]:
import pandas as pd
import numpy as np
import os
import shutil

from DL.knockoff.KnockoffGenerator import KnockoffGenerator;

In [None]:
from DL.DNN.DNN import DNN;
from DL.FDR.FDR_control import FDR_control;

def runProcedure(n,p,iterIdx):
    """Generate knockoffs, train the DNN and save the selected associations.

    For one simulated replicate this:
      1. clears any previous ``DNN_result/`` folder,
      2. generates a knockoff file for X with ``CholLuKnockoff``,
      3. stacks original and knockoff features into an (n, p, 2) array and
         trains the DNN on it against Y,
      4. runs ``FDR_control.controlFilter`` (``offset=1``, ``q=0.05``) on the
         DNN results,
      5. writes the selected features as ``Feature1`` / ``Feature2`` ("Y") /
         ``Stat`` rows to a CSV in the top-level simulation folder,
      6. removes the temporary knockoff file.

    Args:
        n: Number of samples; selects the ``<n>samples`` data folder.
        p: Number of features; selects the ``<p>p`` data folder.
        iterIdx: Index of the replicate (iteration) to process.

    Raises:
        ValueError: If the knockoff file has an odd number of columns and so
            cannot be split into equal original/knockoff halves.
    """
    dirPath = "data/simulated_data/nonlinear/"
    dataDirPath = dirPath + str(n) + "samples/" + str(p) + "p/"
    result_dir = dirPath + "DNN_result/"
    # Start from a clean result folder so leftovers from an earlier replicate
    # are not picked up.
    if os.path.exists(result_dir):
        shutil.rmtree(result_dir)

    replicateSuffix = "_n" + str(n) + "_p" + str(p) + "_iter" + str(iterIdx) + ".csv"
    XFileName = "X" + replicateSuffix
    YFileName = "y_si" + replicateSuffix

    generator = KnockoffGenerator()
    # Other generators previously tried here:
    #   generator.DNN_knockoff(dataDirPath, XFileName, YFileName)
    #   generator.ISEEKnockoff(dataDirPath, XFileName)
    knockoff_file_path = generator.CholLuKnockoff(dataDirPath, XFileName)

    print("The newly generated knockoff file is named as:")
    print(knockoff_file_path)

    try:
        # After generating the knockoff data, run the DNN.
        X_knockoff_data = pd.read_csv(knockoff_file_path, sep='\t')
        print(X_knockoff_data.shape)

        # NOTE(review): Y is read with the default comma separator while the
        # knockoff file is read as tab-separated -- confirm this matches the
        # layout of the y file.
        original_data_Y = pd.read_csv(os.path.join(dataDirPath, YFileName))

        X_values = X_knockoff_data.values
        Y_values = original_data_Y.values

        nNum, totalColumns = X_values.shape[0], X_values.shape[1]
        if totalColumns % 2 != 0:
            raise ValueError(
                "Knockoff file must hold an even number of columns "
                "(original + knockoff), got " + str(totalColumns)
            )
        pNum = totalColumns // 2
        print(X_values.shape)
        print(Y_values.shape)
        print(pNum)

        # First half of the columns is the original X, second half its knockoffs.
        X_origin = X_values[:, :pNum]
        X_knockoff = X_values[:, pNum:]

        x3D_train = np.zeros((nNum, pNum, 2))
        x3D_train[:, :, 0] = X_origin
        x3D_train[:, :, 1] = X_knockoff
        label_train = Y_values

        coeff = 0.05 * np.sqrt(2.0 * np.log(pNum) / nNum)
        n_outputs = original_data_Y.shape[1]

        # The DNN output is saved to result_dir.
        os.makedirs(result_dir, exist_ok=True)

        dnn = DNN()
        model = dnn.build_DNN(pNum, n_outputs, coeff)
        callback = DNN.Job_finish_Callback(result_dir, pNum)
        dnn.train_DNN(model, x3D_train, label_train, callback)

        # Apply FDR control to the DNN result.
        control = FDR_control()
        selected_features = control.controlFilter(
            os.path.join(dataDirPath, XFileName), result_dir, offset=1, q=0.05
        )

        # Save the selected associations. Columns are fixed explicitly so an
        # empty selection still produces a CSV with the expected header.
        selected_associations = [
            {"Feature1": ele[0], "Feature2": "Y", "Stat": ele[1]}
            for ele in selected_features
        ]
        pd.DataFrame(
            selected_associations, columns=["Feature1", "Feature2", "Stat"]
        ).to_csv(dirPath + "DNN_selected_associations_" + str(p) + "p_" + str(iterIdx) + "iter.csv")
    finally:
        # The knockoff file is a temporary artefact that is regenerated on
        # every call; remove it even when training or filtering fails.
        if os.path.exists(knockoff_file_path):
            os.remove(knockoff_file_path)
#temp_df

In [None]:
# Settings grid for the DNN stage: every (samples, features, replicate)
# combination is passed to runProcedure, replicates 1 through 20.
sampleNumbers = [10000]
pNumbers = [40]
iterIdxs = list(range(1, 21))
#iterIdxs = [1];
for sampleNumber in sampleNumbers:
    for pNumber in pNumbers:
        for iterIdx in iterIdxs:
            runProcedure(sampleNumber, pNumber, iterIdx)

# Identify causal directions for all the identified associations

In [None]:
def removeCycles(causalGraph):
    """Break every directed cycle by removing its smallest-weight edge.

    The cycles are enumerated once with ``nx.simple_cycles``. For each cycle
    that is still intact, the edge with the smallest ``'weight'`` attribute is
    removed. The graph is modified in place.

    NOTE(review): the previous implementation tracked the *largest*-weight
    edge in unused variables and then always removed the closing edge
    (last node -> first node), while its comment said the smallest weight
    should be deleted. This version follows the comment; confirm that
    "smallest weight" is the intended rule.

    Args:
        causalGraph: Directed graph whose edges carry a ``'weight'`` attribute.

    Returns:
        The same graph object, with one edge removed per cycle that was still
        intact when it was visited.
    """
    # Snapshot the cycles first: the graph is mutated inside the loop.
    cycles = list(nx.simple_cycles(causalGraph))
    for cycle in cycles:
        # Consecutive node pairs, including the closing edge last -> first.
        # A self-loop [a] yields the single pair (a, a).
        cycle_edges = list(zip(cycle, cycle[1:] + cycle[:1]))

        # An edge shared with an earlier cycle may already be gone, in which
        # case this cycle is broken and looking up its edges would fail.
        if not all(causalGraph.has_edge(src, dst) for src, dst in cycle_edges):
            continue

        # min() keeps the first edge on ties.
        weakest_src, weakest_dst = min(
            cycle_edges,
            key=lambda edge: causalGraph.get_edge_data(edge[0], edge[1])['weight'],
        )
        causalGraph.remove_edge(weakest_src, weakest_dst)
    return causalGraph

In [None]:
# Collect the value returned by processResultFile for every setting into a
# nested dict: causalResultDictionary[sampleNumber][pNumber].
sampleNumbers = [200, 600, 1000]
pNumbers = [50, 200]
causalResultDictionary = {}
for sampleNumber in sampleNumbers:
    for pNumber in pNumbers:
        meanAccuracyRate = processResultFile(sampleNumber, pNumber)
        # setdefault creates the inner dict the first time a sample size is seen.
        causalResultDictionary.setdefault(sampleNumber, {})[pNumber] = meanAccuracyRate

In [None]:
# Show the results as a table: outer keys (sample sizes) become columns,
# inner keys (feature counts) become the row index.
pd.DataFrame(causalResultDictionary)