In [1]:
import dowhy
from dowhy import CausalModel
from causallearn.search.ConstraintBased.PC import pc
from causallearn.utils.GraphUtils import GraphUtils
from causallearn.utils.GraphUtils import GraphUtils
from causallearn.search.ScoreBased.GES import ges
from causallearn.search.FCMBased import lingam

from pathlib import Path
import os 

import numpy as np
import pandas as pd
import graphviz
import networkx as nx 

np.set_printoptions(precision=3, suppress=True)
np.random.seed(0)

  from .autonotebook import tqdm as notebook_tqdm


Convert matrix into DAG in dot format

In [None]:

def make_graph(adjacency_matrix, labels=None):
    idx = np.abs(adjacency_matrix) > 0.01
    dirs = np.where(idx)
    d = graphviz.Digraph(engine='dot')
    names = labels if labels else [f'x{i}' for i in range(len(adjacency_matrix))]
    for name in names:
        d.node(name)
    for to, from_, coef in zip(dirs[0], dirs[1], adjacency_matrix[idx]):
        d.edge(names[from_], names[to], label=str(coef))
    return d

def str_to_dot(string):
    '''
    Converts input string from graphviz library to valid DOT graph format.
    '''
    graph = string.strip().replace('\n', ';').replace('\t','')
    graph = graph[:9] + graph[10:-2] + graph[-1] # Removing unnecessary characters from string
    return graph

# DAG Generator

In [4]:
from IPython.display import display
from pandas import isna

def DAG_generator(path: str):
    """_summary_
        Input: path of csv: path/{NAME}.csv
        Output: 3 dot files: {NAME}_pc.dot, {NAME}_ges.dot, {NAME}_lingam.dot
    
    """
    ## Input check
    path = Path(path)
    assert os.path.isfile(path), f"{path} not a valid path"
    parent_dir = path.parent
    prefix, format_ = os.path.basename(path).split('.')
    assert format_ == 'csv', f"{prefix}.{format_} not a csv file" 

    df = pd.read_csv(path)
    df = df.drop(['Unnamed: 0'], axis=1, errors='ignore').to_numpy()
    cg = pc(df, indep_test='chisq')

    cg = pc(df, indep_test="mv_fisherz", mvpc=True)
    Record = ges(df, score_func="local_score_marginal_multi", )


    print(df[np.isnan(df.to_numpy()).any(axis=1)])

    # default parameters
    # # or customized parameters
    # cg = pc(df, alpha, indep_test, stable, uc_rule, uc_priority, mvpc, correction_name, background_knowledge, verbose, show_progress)

    # # visualization using pydot
    # cg.draw_pydot_graph()

    # # or save the graph

    # pyd = GraphUtils.to_pydot(cg.G)
    # pyd.write_png('simple_test.png')

DAG_generator("../../data/stackoverflow/so_countries_col_new.csv")

Depth=4, working on node 1:  10%|█         | 2/20 [00:00<00:00, 861.70it/s] 

KeyboardInterrupt: 

In [21]:
import matplotlib.pyplot as plt
path ="../../data/stackoverflow/so_countries_col_new.csv"
df = pd.read_csv(path)
df = df.drop(['Unnamed: 0'], axis=1, errors='ignore')
binned = pd.qcut(df['ConvertedSalary'], 4, labels=["lo", "md", "hi", "uh"])
df_ = df.__deepcopy__()
df_["ConvertedSalary"] = binned 

cg = pc(df_.to_numpy(), indep_test='chisq', stable=False)

Depth=10, working on node 19: 100%|██████████| 20/20 [00:04<00:00,  4.92it/s]


In [17]:
a=cg.to_nx_graph()
pyd = GraphUtils.to_pydot(cg.G)

In [18]:
import pygraphviz as pgv


In [45]:


path ="../../data/stackoverflow/so_countries_col_new.csv"
df = pd.read_csv(path)
df = df.drop(['Unnamed: 0'], axis=1, errors='ignore')


In [24]:
g = pgv.AGraph(directed=True)
g.layout(prog="dot")
attrs = df.columns
nodes=cg.G.get_nodes()
edges=cg.G.get_graph_edges()
def name_map(name):
    idx = str(name).split('X')[1]
    return attrs[int(idx)-1]
for j in nodes:
    g.add_node(name_map(j))
for i in edges:
    g.add_edge(name_map(i.get_node1()),name_map(i.get_node2()))
print(g.to_string())
g.write("file.dot")
g.draw("file.png")



strict digraph "" {
	graph [bb="0,0,0,0"];
	node [label="\N"];
	Country -> Continent;
	Country -> ConvertedSalary;
	Country -> HDI;
	Country -> GINI;
	Student -> Country;
	Student -> FormalEducation;
	Student -> Age;
	FormalEducation -> Country;
	FormalEducation -> YearsCoding;
	FormalEducation -> Age;
	FormalEducation -> ConvertedSalary;
	UndergradMajor -> FormalEducation;
	UndergradMajor -> DevType;
	UndergradMajor -> YearsCoding;
	UndergradMajor -> EducationParents;
	UndergradMajor -> Age;
	DevType -> Student;
	DevType -> FormalEducation;
	DevType -> YearsCoding;
	DevType -> HoursComputer;
	DevType -> Age;
	YearsCoding -> Country;
	YearsCoding -> Age;
	YearsCoding -> ConvertedSalary;
	HoursComputer -> Hobby;
	HoursComputer -> Country;
	HoursComputer -> Student;
	HoursComputer -> FormalEducation;
	HoursComputer -> YearsCoding;
	HoursComputer -> Age;
	HoursComputer -> ConvertedSalary;
	Exercise -> Country;
	Exercise -> HoursComputer;
	Exercise -> ConvertedSalary;
	Gender -> Hobby;
	Ge

In [81]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
def entry_with_col_name(col_name, entry):
    """Prefix an entry with it's column name, connected with '___'
    e.g:
        -------------------------
        | Age                   |
        | '18 - 24 years old'   | 
        -------------------------
        becomes 
        -----------------------------
        | Age                       |
        | 'Age___18 - 24 years old' | 
        -----------------------------
    
    """
    return f"{col_name}___{entry}"

df1 = df.copy(deep=True)
enc = OrdinalEncoder()
enc.set_output(transform = 'pandas')


cat_attr = df1.columns[(df1.applymap(type) == str).all(0)]

df_cat = df1[cat_attr] # SELECT df.attI from df
df_cat = enc.fit_transform(df_cat)
df1[cat_attr] = df_cat
print(df1)


  cat_attr = df1.columns[(df1.applymap(type) == str).all(0)]


       Hobby  Country  Student  FormalEducation  UndergradMajor  DevType  \
0        1.0     18.0      0.0              1.0             3.0      4.0   
1        1.0     18.0      0.0              1.0             6.0      0.0   
2        1.0     19.0      0.0              8.0             6.0      0.0   
3        1.0     19.0      0.0              8.0             7.0      0.0   
4        0.0     10.0      1.0              7.0             6.0      0.0   
...      ...      ...      ...              ...             ...      ...   
38085    1.0      2.0      1.0              8.0             8.0     18.0   
38086    1.0     18.0      1.0              1.0             6.0     18.0   
38087    1.0      7.0      1.0              1.0             6.0      0.0   
38088    1.0      6.0      1.0              7.0             6.0     18.0   
38089    1.0      7.0      1.0              1.0             9.0      0.0   

       YearsCoding  HoursComputer  Exercise  Gender  SexualOrientation  \
0            

In [82]:
cg1 = pc(df1.to_numpy())

  0%|          | 0/20 [00:00<?, ?it/s]

Depth=10, working on node 19: 100%|██████████| 20/20 [00:00<00:00, 781.54it/s] 


In [83]:
g1 = pgv.AGraph(directed=True)
g1.layout(prog="dot")
attrs = df.columns
nodes=cg1.G.get_nodes()
edges=cg1.G.get_graph_edges()
def name_map(name):
    idx = str(name).split('X')[1]
    return attrs[int(idx)-1]
for j in nodes:
    g1.add_node(name_map(j))
for i in edges:
    g1.add_edge(name_map(i.get_node1()),name_map(i.get_node2()))
print(g.to_string())
g1.write("file.dot")
g1.draw("file.png")



strict digraph "" {
	graph [bb="0,0,0,0"];
	node [label="\N"];
	Country -> Student;
	Country -> Continent;
	Country -> GINI;
	Student -> FormalEducation;
	FormalEducation -> Country;
	UndergradMajor -> FormalEducation;
	UndergradMajor -> DevType;
	UndergradMajor -> YearsCoding;
	UndergradMajor -> Gender;
	UndergradMajor -> EducationParents;
	UndergradMajor -> Age;
	DevType -> Student;
	DevType -> FormalEducation;
	DevType -> YearsCoding;
	DevType -> HoursComputer;
	DevType -> Gender;
	DevType -> Age;
	YearsCoding -> Country;
	YearsCoding -> Student;
	YearsCoding -> FormalEducation;
	YearsCoding -> Age;
	HoursComputer -> Hobby;
	HoursComputer -> Country;
	HoursComputer -> Student;
	HoursComputer -> FormalEducation;
	HoursComputer -> YearsCoding;
	HoursComputer -> Age;
	Exercise -> Country;
	Exercise -> FormalEducation;
	Exercise -> HoursComputer;
	Gender -> Hobby;
	SexualOrientation -> Gender;
	EducationParents -> Country;
	EducationParents -> FormalEducation;
	EducationParents -> Years

as required by the -n flag

