In [14]:
import pandas as pd
import numpy as np
import os

In [2]:
'''
Run MGM
Note: MGM was implemented in Java and the following Python APIs call the Java implementation.
Please restart the Python program after encountering a JVM problem.
The input data file should be ".txt" format and should also include the response variables.
'''
from MGM.MGM import MGM
mgm = MGM();
mgm_output_file = mgm.runMGM( "/ihome/hpark/zhf16/causalDeepVASE", "X_n1000_p50_rep20.txt",lambda_continuous_continuous = 0.3, lamda_continuous_discrete = 0.3, lamda_discrete_discrete = 0.3);
print("Please find MGM's output file as:");
print(mgm_output_file);

/ihome/hpark/zhf16/causalDeepVASE/MGM/tetradLite.jar
Please find MGM's output file as:
X_n1000_p50_rep20_MGM_associations.csv


In [7]:
'''
#Generate knockoff data using one of three methods: Omega, DNN, Cholesky_LU
#Recommended: Omega or Cholesky_LU
'''
from DL.knockoff.KnockoffGenerator import KnockoffGenerator;
generator = KnockoffGenerator();
file_path = generator.Chol_Lu_knockoff("/ihome/hpark/zhf16/causalDeepVASE", "X_n1000_p50_rep20.csv");
print("The newly generated knockoff file is named as:")
print(file_path);

Using TensorFlow backend.


__init__
(50, 50)
(1000, 50)
['K1', 'K2', 'K3', 'K4', 'K5', 'K6', 'K7', 'K8', 'K9', 'K10', 'K11', 'K12', 'K13', 'K14', 'K15', 'K16', 'K17', 'K18', 'K19', 'K20', 'K21', 'K22', 'K23', 'K24', 'K25', 'K26', 'K27', 'K28', 'K29', 'K30', 'K31', 'K32', 'K33', 'K34', 'K35', 'K36', 'K37', 'K38', 'K39', 'K40', 'K41', 'K42', 'K43', 'K44', 'K45', 'K46', 'K47', 'K48', 'K49', 'K50']
The newly generated knockoff file is named as:
/ihome/hpark/zhf16/causalDeepVASE/X_n1000_p50_rep20_chol_lu_knockoff.csv


In [8]:
''''''
# After generating the knockoff data, run DNN
file_name = 'X_n1000_p50_rep20_chol_lu_knockoff.csv';
X_knockoff_data = pd.read_csv(file_name);
# X_knockoff_data

#nutrient_data
original_data_Y = pd.read_csv('y_si_n1000_p50_rep20.csv');
# original_data_Y

X_values = X_knockoff_data.values;
Y_values = original_data_Y.values;
    
pVal = int(X_values.shape[1] / 2);
n = X_values.shape[0];
print(X_values.shape);
print(Y_values.shape);
print(pVal);
    
X_origin = X_values[:, 0:pVal];
X_knockoff = X_values[:, pVal:];

x3D_train = np.zeros((n, pVal, 2));
x3D_train[:, :, 0] = X_origin;
x3D_train[:, :, 1] = X_knockoff;
label_train = Y_values;
    
coeff = 0.05 * np.sqrt(2.0 * np.log(pVal) / n);

n_outputs = original_data_Y.shape[1];

#Save the DNN output to the following directory.
result_dir = 'result/';
if not os.path.exists(result_dir):
    os.makedirs(result_dir);
    
from DL.DNN.DNN import DNN;
dnn = DNN();
model = dnn.build_DNN(pVal, n_outputs, coeff);
callback = DNN.Job_finish_Callback(result_dir,pVal);
dnn.train_DNN(model, x3D_train, label_train,callback);

(1000, 100)
(1000, 1)
50
__init__parameters
[layer]: Input	[shape]: [None, 50, 2] 

[layer]: LocallyConnected1D	[shape]: [None, 50, 1] 

[layer]: LocallyConnected1D	[shape]: [None, 50, 1] 

[layer]: Flatten	[shape]: [None, None] 

[layer]: Dense	[shape]: [None, 50] 

[layer]: Dense	[shape]: [None, 50] 

[layer]: Dense	[shape]: [None, 1] 

Epoch 1/20
on_epoch_end
h_local1_weight = (50, 2, 1)
h_local2_weight = (50, 1, 1)
h0 = (50, 2)
h0_abs = (50, 2)
h1 = (50, 50)
h2 = (50, 50)
h3 = (50, 1)
W1 = (50, 50)
W2 = (50, 50)
W3 = (50, 1)
Epoch 2/20
on_epoch_end
h_local1_weight = (50, 2, 1)
h_local2_weight = (50, 1, 1)
h0 = (50, 2)
h0_abs = (50, 2)
h1 = (50, 50)
h2 = (50, 50)
h3 = (50, 1)
W1 = (50, 50)
W2 = (50, 50)
W3 = (50, 1)
Epoch 3/20
on_epoch_end
h_local1_weight = (50, 2, 1)
h_local2_weight = (50, 1, 1)
h0 = (50, 2)
h0_abs = (50, 2)
h1 = (50, 50)
h2 = (50, 50)
h3 = (50, 1)
W1 = (50, 50)
W2 = (50, 50)
W3 = (50, 1)
Epoch 4/20
on_epoch_end
h_local1_weight = (50, 2, 1)
h_local2_weight = (50, 1

In [9]:
#Apply FDR control to DNN result
from DL.FDR.FDR_control import FDR_control;
control = FDR_control();
selected_features = control.controlFilter("/ihome/hpark/zhf16/causalDeepVASE/X_n1000_p50_rep20.csv", "/ihome/hpark/zhf16/causalDeepVASE/result", offset=1, q=0.05);
#Save the selected associations
selected_associations = [];
for ele in selected_features:
    selected_associations.append({"Feature1":ele,"Feature2":"Y"});
pd.DataFrame(selected_associations).to_csv("DNN_selected_associations.csv")

__init__
100


In [11]:
#Run DG
#Load data
X_data = pd.read_csv("X_n1000_p50_rep20.csv");
# X_data
Y_data = pd.read_csv('y_si_n1000_p50_rep20.csv');
#Merge X and Y
dataset = pd.concat([X_data, Y_data], axis=1, join='inner');
print(dataset.shape);

#Calculate the covariance matrix
cov_mat = dataset.cov();
corr_inv = np.linalg.inv(cov_mat)
corr_inv = pd.DataFrame(data=corr_inv, index=cov_mat.index,columns=cov_mat.columns)
# corr_inv.head(2)

#Convert the columns to their numerical representations
col_map = {};
col_map_rev = {};
col_list = dataset.columns.to_list();
for index,ele in enumerate(col_list):
    col_map[ele] = index;
    col_map_rev[index] = ele;
print(dataset.shape);

#https://stats.stackexchange.com/questions/13810/threshold-for-correlation-coefficient-to-indicate-statistical-significance-of-a
# t = dataset.shape[0]**(1/2)

#The data may need to be normalized if neccessary.
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler();
# scaled_values = scaler.fit_transform(dataset);
# dataset.loc[:,:] = scaled_values;

#Initialize DG object
from causal.DegenerateGaussianScore import DegenerateGaussianScore
dg = DegenerateGaussianScore(dataset,discrete_threshold=0.2);

(1000, 51)
(1000, 51)


In [15]:
selected_associations_sum = [];
#Load both MGM-identified and DNN associations
MGM_associations = pd.read_csv("X_n1000_p50_rep20_MGM_associations.csv");
for index,row in MGM_associations.iterrows():
    if row["Feature1"]=="Y" or row["Feature2"]=="Y":
        print("Found.");
        selected_associations_sum.append({"Feature1":row["Feature1"],"Feature2":row["Feature2"]});
        
DNN_associations = pd.read_csv("DNN_selected_associations.csv");
for index,row in DNN_associations.iterrows():
    selected_associations_sum.append({"Feature1":row["Feature1"],"Feature2":row["Feature2"]});

Found.
Found.
Found.
Found.


In [18]:
for ele in selected_associations_sum:
    f1 = ele["Feature1"];
    f2 = ele["Feature2"];
    
    inv_val = abs(corr_inv[f1][f2]);
    if inv_val<0.0:
        continue;
    
    n1_idx = col_map[f1];
    n2_idx = col_map[f2];
    
    s1 = dg.localScore(n1_idx,{n2_idx});
    s2 = dg.localScore(n2_idx,{n1_idx});
    
    if s1<s2:
        print("Cause: "+f2+", Effect: "+f1);
    elif s1>s2:
        print("Cause: "+f1+", Effect: "+f2);
    else:
        print("Same score.");

Cause: F1, Effect: Y
Cause: F2, Effect: Y
Cause: F3, Effect: Y
Cause: F4, Effect: Y
Cause: F1, Effect: Y
Cause: F2, Effect: Y
Cause: F3, Effect: Y
Cause: F4, Effect: Y
Cause: F5, Effect: Y
Cause: F6, Effect: Y
Cause: F7, Effect: Y
Cause: F8, Effect: Y
Cause: F9, Effect: Y
Cause: F10, Effect: Y
Cause: F11, Effect: Y
Cause: F12, Effect: Y
Cause: F13, Effect: Y
Cause: F14, Effect: Y
Cause: F15, Effect: Y
Cause: F16, Effect: Y
Cause: F19, Effect: Y
Cause: F22, Effect: Y
Cause: F23, Effect: Y
Cause: F24, Effect: Y
Cause: F27, Effect: Y
Cause: F29, Effect: Y
Cause: F30, Effect: Y
Cause: F31, Effect: Y
Cause: F33, Effect: Y
Cause: F34, Effect: Y
Cause: F37, Effect: Y
Cause: F38, Effect: Y
Cause: F40, Effect: Y
Cause: F43, Effect: Y
Cause: F44, Effect: Y
Cause: F47, Effect: Y
Cause: F49, Effect: Y
Cause: F50, Effect: Y
