Large diffs are not rendered by default.

@@ -0,0 +1,74 @@
weight,BMI
height,gender
diabetes,endocrineNutritionalMetabolicImmunity
sBP,dBP
height,BMI
age,circulatory
diabetes,age
gender,weight
circulatory,sBP
digestive,other
respiratory,other
state,mentalDisorders
senseOrgans,respiratory
diabetes,circulatory
other,state
musculoskeletal,digestive
other,genitourinary
musculoskeletal,injuryPoisoning
digestive,respiratory
skinSubcutaneousTissue,infectiousParasitic
other,blood
mentalDisorders,nervous
endocrineNutritionalMetabolicImmunity,blood
musculoskeletal,nervous
age,musculoskeletal
BMI,diabetes
respiratory,infectiousParasitic
respiratory,skinSubcutaneousTissue
musculoskeletal,other
circulatory,state
endocrineNutritionalMetabolicImmunity,digestive
injuryPoisoning,senseOrgans
musculoskeletal,respiratory
pregnancyChildbirthPuerperium,age
gender,pregnancyChildbirthPuerperium
blood,genitourinary
other,skinSubcutaneousTissue
age,neoplasms
musculoskeletal,mentalDisorders
injuryPoisoning,skinSubcutaneousTissue
circulatory,blood
circulatory,digestive
gender,musculoskeletal
digestive,senseOrgans
pregnancyChildbirthPuerperium,genitourinary
pregnancyChildbirthPuerperium,state
injuryPoisoning,other
digestive,genitourinary
senseOrgans,infectiousParasitic
musculoskeletal,senseOrgans
digestive,blood
musculoskeletal,skinSubcutaneousTissue
endocrineNutritionalMetabolicImmunity,other
skinSubcutaneousTissue,genitourinary
musculoskeletal,congenitalAnomalies
circulatory,mentalDisorders
digestive,nervous
pregnancyChildbirthPuerperium,mentalDisorders
injuryPoisoning,respiratory
other,infectiousParasitic
neoplasms,digestive
perinatal,genitourinary
digestive,injuryPoisoning
age,endocrineNutritionalMetabolicImmunity
senseOrgans,skinSubcutaneousTissue
congenitalAnomalies,mentalDisorders
circulatory,endocrineNutritionalMetabolicImmunity
pregnancyChildbirthPuerperium,infectiousParasitic
perinatal,age
perinatal,senseOrgans
perinatal,musculoskeletal
pregnancyChildbirthPuerperium,other
perinatal,mentalDisorders
perinatal,endocrineNutritionalMetabolicImmunity
@@ -0,0 +1,42 @@
import sys
import re


def loadDictionary(filename):
    """Load a two-column, comma-separated mapping file into a dict.

    Each non-blank line is split on "," and both fields are stripped of
    surrounding whitespace; the first field becomes the key and the second
    the value. Later lines overwrite earlier ones that share a key.

    Args:
        filename: path of the dictionary file to read.

    Returns:
        dict mapping first-column strings to second-column strings.

    Raises:
        IndexError: if a non-blank line has fewer than two fields.
    """
    d = dict()
    # `with` guarantees the handle is closed even if a malformed line raises.
    with open(filename) as fo:
        for raw in fo:
            # Skip blank lines instead of treating the first one as end-of-file,
            # which silently dropped every entry after it.
            if not raw.strip():
                continue
            fields = [x.strip() for x in raw.split(",")]
            d[fields[0]] = fields[1]
    return d

# argv[1]: infile (file to replace words)
# argv[2]: outfile (w/e you want)
# argv[3]: dictionary
# USAGE: python translate.py ../output/train3.gph train3_translated.gph code_dictionary.csv
def main(argv):
    """Rewrite `codeNNN` tokens of a comma-separated file using a dictionary.

    Every non-blank input line is split on ",", each field is stripped, and
    any field starting with "code" followed by digits is replaced by the
    dictionary entry for those digits. All other fields are copied through.
    Output fields are joined with "," (no space) and newline-terminated.

    Args:
        argv: command-line style list; argv[1] is the input path, argv[2]
            the output path, argv[3] the dictionary path.

    Raises:
        KeyError: if a `code...` token's digits are not in the dictionary.
    """
    d = loadDictionary(argv[3])

    # Context managers close (and flush) both files even when a lookup fails.
    with open(argv[1]) as infile, open(argv[2], "w+") as outfile:
        for raw in infile:
            # Skip blank lines rather than stopping at the first one, which
            # used to truncate the output silently.
            if not raw.strip():
                continue
            outline = []
            for elem in [x.strip() for x in raw.split(",")]:
                m = re.match('(?:code)([0-9]*)', elem)
                if m:
                    outline.append(d[m.group(1)])
                else:
                    outline.append(elem)
            outfile.write(",".join(outline) + "\n")


if __name__ == "__main__":
    main(sys.argv)

Large diffs are not rendered by default.

@@ -0,0 +1,35 @@
using BayesNets
using DataFrames
using Graphs

# Score every structure file given on the command line against both the
# held-out test set and the training set, printing the log Bayesian score.
testfile = "test.csv"
trainfile = "train.csv"

testset = readtable(testfile)
trainset = readtable(trainfile)

for inname in ARGS
    # A fresh network per structure file: one over the test data, one over
    # the training data, each with domains taken from its own data set.
    bTest = BayesNet(names(testset))
    bTest.domains = [DiscreteDomain([x for x in unique(testset[label])])
                     for label in names(testset)]
    bTrain = BayesNet(names(trainset))
    bTrain.domains = [DiscreteDomain([x for x in unique(trainset[label])])
                      for label in names(trainset)]

    fin = open(inname, "r")
    lines = readlines(fin)
    close(fin)
    for line in lines
        # Ignore blank lines (e.g. a trailing empty line at end of file).
        isempty(strip(line)) && continue
        # Split on the bare comma and strip each field, so both "a, b" and
        # "a,b" parse. Stripping replaces the old `[1:end-1]` slice, which
        # cut a real character whenever the line had no trailing newline.
        nodes = split(line, ",")
        src = convert(Symbol, strip(nodes[1]))
        tgt = convert(Symbol, strip(nodes[2]))
        addEdge!(bTest, src, tgt)
        addEdge!(bTrain, src, tgt)
    end # for line

    @printf("\nThe logBayesScore for %s on %s is %.4f\n",
            inname, testfile, logBayesScore(bTest, testset))
    @printf("The logBayesScore for %s on %s is %.4f\n",
            inname, trainfile, logBayesScore(bTrain, trainset))
end # for arg
Binary file not shown.
@@ -0,0 +1,18 @@
#!/usr/bin/env python

# read in files and transform into formatted data we want to work with

# run naive bayes classifier for data

# run structure learning for Bayesian network (BN)
# define scoring function (read: reward function) for local search heuristic
# run search heuristics:
# local search (pure greedy)
# local search with tabu list
# beam search
# K2 search
# manually look at BN and change structure to fit intuition

# use inference techniques to get conditional probabilities

# check error rates, shazaam
@@ -0,0 +1,33 @@
['sBP', 'weight', 'height', 'code240279', 'code390459', 'code780799', 'dBP', 'BMI', 'state', 'code360389', 'code520579', 'code001139', 'code140239', 'code760779', 'code630679', 'code580629', 'code710739', 'code460519', 'diabetes', 'code280289', 'code290319', 'gender', 'age', 'code320359', 'code680709', 'code800999', 'code740759']
27 nodes loaded into graph
8954 rows loaded
Running Gibbs sampling for 50 samples
Sample number 1 took 48.430229187 sec
Sample number 2 took 41.3028028011 sec
Sample number 3 took 45.4854249954 sec
Sample number 4 took 42.2492589951 sec
Sample number 5 took 38.1048138142 sec
Sample number 6 took 35.6485199928 sec
Sample number 7 took 39.6012690067 sec
Sample number 8 took 31.045992136 sec
Sample number 9 took 32.5555639267 sec
Sample number 10 took 34.3150451183 sec
Sample number 11 took 36.6224348545 sec
Sample number 12 took 36.5677089691 sec
Sample number 13 took 39.989606142 sec
Sample number 14 took 40.1501059532 sec
Sample number 15 took 37.1472861767 sec
Sample number 16 took 39.0639262199 sec
Sample number 17 took 53.0718319416 sec
Sample number 18 took 45.1965010166 sec
Sample number 19 took 38.2876770496 sec
Sample number 20 took 55.4608819485 sec
Sample number 21 took 41.4158949852 sec
Sample number 22 took 42.9269750118 sec
Sample number 23 took 43.6528449059 sec
Sample number 24 took 33.5439560413 sec
Sample number 25 took 41.4569878578 sec
Sample number 26 took 42.01486516 sec
Sample number 27 took 37.0272641182 sec
Sample number 28 took 41.1727819443 sec
Sample number 29 took 40.1693019867 sec
@@ -0,0 +1,34 @@
########################
# comparative error rates of graphs
########################

Graph Error Rate FP FN
1 0.18 0.053 0.127
2 0.187 0 0.187
3 0.153 0.03 0.123
4 0.188 0 0.188
5 0.183 0.017 0.166
6 0.185 0.022 0.163
7 0.181 0.02 0.161
8 0.186 0.004 0.182

###########################################################################################
# Inference Results with 20 Iterations and Increasing number of Missing Data Fields
###########################################################################################

Iterations Missing Error rate FP Rate FN Rate cputime (sec) hours minutes avg sec
20 3 0.1662 0.0363 0.1299 3728.134606 1.035592946 2.135576769 3.754415515
20 5 0.1511 0.0302 0.1208 6392.107252 1.775585348 46.53512086 6.437167424
20 8 0.1339 0.0272 0.1067 9604.038301 2.667788417 40.06730502 9.671740484
20 10 0.1299 0.0252 0.1047 11775.68267 3.271022964 16.26137782 11.85869352

###########################################################################################
# Inference Results with 5 Missing Data Record Fields and Increasing Iterations
###########################################################################################
Iterations Missing Error Rate FP Rate FN Rate CPU time (sec)
20 5 0.1511 0.0302 0.1208 6392.1073
30 5 0.1591 0.0322 0.1269 10145.1417
40 5 0.1682 0.0373 0.1309 12123.9852
50 5 0.1571 0.0282 0.1289 16181.3449

20 adjNodes 0.04330312185 0.01007049345 0.0332326284 18502.43409
@@ -0,0 +1,46 @@
height, gender
code390459, code240279
BMI, weight
code290319, code320359
code240279, age
code390459, age
code710739, code520579
dBP, sBP
diabetes, code240279
code240279, code280289
code710739, code780799
code001139, code680709
code780799, code460519
code280289, code580629
code740759, code710739
code140239, code360389
code360389, code780799
code360389, code680709
code460519, code800999
code520579, code290319
code580629, code001139
code390459, code520579
code360389, code460519
code680709, code800999
sBP, code390459
code390459, diabetes
code320359, code001139
code780799, code520579
gender, code630679
code390459, code320359
code390459, code710739
code780799, code580629
code740759, code800999
code710739, code360389
code740759, code390459
code630679, code240279
code710739, code290319
diabetes, code580629
code630679, code001139
height, weight
code580629, code680709
code740759, code460519
code280289, code780799
code630679, code580629
code630679, code280289
code630679, code460519
Binary file not shown.
@@ -0,0 +1,69 @@
height, gender
dBP, sBP
state, code780799
code390459, code240279
age, code390459
BMI, weight
diabetes, age
code710739, code520579
age, code240279
state, code290319
height, weight
code780799, code710739
code460519, code360389
BMI, diabetes
code240279, code280289
code680709, code001139
state, code460519
diabetes, code390459
code710739, code800999
age, code710739
code780799, code520579
code710739, code320359
code630679, age
code780799, code580629
code780799, code680709
code290319, code320359
diabetes, code240279
code630679, state
code680709, code800999
code780799, code280289
code460519, code001139
age, code140239
code390459, state
code780799, code360389
code460519, code680709
code240279, code780799
gender, code630679
code280289, code580629
code280289, code520579
code240279, code520579
code800999, code360389
code520579, code290319
code390459, code280289
code460519, code710739
code460519, code520579
code780799, code460519
code780799, code800999
diabetes, gender
code710739, code680709
code780799, code001139
code630679, code290319
code360389, code001139
code630679, code580629
code680709, code580629
code390459, code780799
code740759, code290319
code710739, code740759
code140239, code580629
code390459, dBP
code460519, code320359
code680709, code360389
code630679, code001139
code240279, code460519
code760779, code360389
code390459, code290319
code760779, code240279
code760779, code580629
code760779, code710739
code760779, age
Binary file not shown.
@@ -0,0 +1,75 @@
weight, BMI
height, BMI
code390459, diabetes
height, gender
code390459, age
dBP, sBP
gender, weight
code240279, code390459
code520579, code780799
code240279, age
sBP, code390459
code460519, code780799
code710739, code520579
code780799, state
state, code290319
code710739, code800999
code360389, code460519
code240279, diabetes
code780799, code580629
age, code710739
code290319, code320359
code520579, code460519
code460519, code680709
code710739, code780799
age, code140239
BMI, code240279
code780799, code001139
code710739, code320359
code390459, state
code240279, code520579
code360389, code680709
age, code630679
code460519, code001139
code280289, code580629
gender, code710739
code710739, code290319
code280289, code520579
code710739, code360389
code360389, code001139
code280289, code780799
code710739, code680709
code710739, code460519
code280289, code360389
code630679, code580629
code630679, state
diabetes, age
code680709, code580629
code390459, code520579
code740759, code290319
code240279, code280289
code390459, code280289
code710739, code280289
code710739, code740759
code520579, code320359
code140239, code580629
code390459, code290319
code360389, code800999
code760779, code580629
code760779, code800999
code630679, code290319
code800999, code780799
gender, diabetes
code520579, code360389
code760779, code360389
code800999, code680709
code760779, code710739
code630679, code780799
code680709, code001139
code460519, code800999
code630679, code001139
code780799, code680709
code760779, code390459
code630679, code460519
code760779, code290319
code140239, code520579
Binary file not shown.
@@ -0,0 +1,74 @@
weight, BMI
height, gender
diabetes, code240279
sBP, dBP
height, BMI
age, code390459
diabetes, age
gender, weight
code390459, sBP
code520579, code780799
code460519, code780799
state, code290319
code360389, code460519
diabetes, code390459
code780799, state
code710739, code520579
code780799, code580629
code710739, code800999
code520579, code460519
code680709, code001139
code780799, code280289
code290319, code320359
code240279, code280289
code710739, code320359
age, code710739
BMI, diabetes
code460519, code001139
code460519, code680709
code710739, code780799
code390459, state
code240279, code520579
code800999, code360389
code710739, code460519
code630679, age
gender, code630679
code280289, code580629
code780799, code680709
age, code140239
code710739, code290319
code800999, code680709
code390459, code280289
code390459, code520579
gender, code710739
code520579, code360389
code630679, code580629
code630679, state
code800999, code780799
code520579, code580629
code360389, code001139
code710739, code360389
code520579, code280289
code710739, code680709
code240279, code780799
code680709, code580629
code710739, code740759
code390459, code290319
code520579, code320359
code630679, code290319
code800999, code460519
code780799, code001139
code140239, code520579
code760779, code580629
code520579, code800999
age, code240279
code360389, code680709
code740759, code290319
code390459, code240279
code630679, code001139
code760779, age
code760779, code360389
code760779, code710739
code630679, code780799
code760779, code290319
code760779, code240279
Binary file not shown.
@@ -0,0 +1,71 @@
dBP, sBP
BMI, weight
code240279, age
gender, height
code780799, code460519
code390459, age
code390459, code240279
code390459, diabetes
code710739, code520579
height, weight
sBP, code390459
code710739, code800999
code390459, state
code240279, diabetes
state, code290319
state, code780799
code460519, code360389
age, code710739
code780799, code580629
code290319, code320359
state, code460519
code780799, code280289
code460519, code001139
code290319, code710739
code460519, code680709
code390459, code280289
BMI, gender
code240279, code280289
code710739, code780799
age, code140239
code780799, code360389
code710739, code320359
code390459, code630679
state, code240279
code630679, state
code710739, code680709
code360389, code800999
code280289, code580629
code680709, code360389
code710739, code460519
code680709, code800999
code390459, code290319
code520579, code280289
gender, code630679
code520579, code580629
code780799, code001139
code390459, code520579
code630679, age
gender, code390459
gender, code710739
code001139, code360389
code630679, code580629
diabetes, age
code630679, code290319
code780799, code800999
code780799, code680709
code710739, code740759
code520579, code320359
code140239, code580629
code680709, code001139
code580629, code680709
code760779, code710739
code760779, code800999
code760779, code360389
code760779, code580629
code630679, code001139
code630679, code240279
code240279, code780799
code780799, code520579
code460519, code520579
gender, diabetes
Binary file not shown.
@@ -0,0 +1,71 @@
age, code390459
BMI, weight
height, weight
height, gender
code390459, diabetes
age, code240279
dBP, sBP
state, code780799
code780799, code460519
code240279, diabetes
code460519, code360389
code780799, code520579
state, code290319
code780799, code580629
state, code460519
code390459, code240279
code710739, age
code290319, code320359
code780799, code001139
code460519, code680709
code710739, code320359
code780799, code800999
code780799, code280289
code630679, state
code460519, code520579
code780799, code360389
age, code140239
code800999, code680709
code630679, age
code280289, code580629
code460519, code001139
gender, code630679
code390459, code280289
code710739, code290319
code710739, code680709
code360389, code001139
gender, age
diabetes, BMI
code520579, code280289
code460519, code800999
code390459, dBP
code520579, code320359
code630679, code710739
code390459, code290319
gender, code390459
code630679, code580629
code520579, code580629
code740759, code290319
code680709, code580629
code280289, code240279
code630679, code290319
code710739, code520579
code680709, code001139
code390459, code520579
code780799, code680709
state, code710739
code710739, code800999
code800999, code360389
code710739, code460519
code630679, code001139
code760779, code580629
code760779, code360389
code680709, code360389
gender, diabetes
code760779, age
code760779, code280289
code710739, code780799
code760779, code290319
code760779, code800999
code390459, code780799
code740759, code710739
Binary file not shown.
@@ -0,0 +1,73 @@
weight, BMI
dBP, sBP
age, code390459
code240279, age
height, BMI
gender, weight
diabetes, age
code520579, code710739
code780799, code520579
state, code780799
state, code290319
diabetes, code390459
code710739, code800999
code780799, code710739
code780799, code580629
code240279, code280289
code780799, code680709
sBP, diabetes
state, code460519
height, gender
code710739, code320359
code290319, code320359
code240279, code390459
code460519, code520579
code780799, code280289
code360389, code800999
code630679, state
age, code140239
code460519, code360389
code360389, code680709
code390459, code710739
code460519, code680709
code630679, age
code780799, code360389
code710739, code290319
code390459, code280289
code460519, code001139
code280289, code580629
code680709, code710739
code390459, gender
code780799, code800999
code360389, code001139
gender, code710739
code520579, code320359
code710739, code740759
code680709, code800999
code680709, code580629
code240279, code520579
code280289, code520579
code780799, code001139
code390459, code290319
state, code240279
code390459, code520579
code630679, code240279
code520579, code580629
code001139, code680709
code740759, code290319
code630679, code710739
code280289, code360389
code760779, code360389
code760779, code680709
code780799, code460519
code240279, code780799
code240279, code460519
code390459, code780799
code760779, code580629
diabetes, code240279
code630679, code580629
code760779, age
code630679, code290319
diabetes, code630679
code760779, code290319
code760779, code800999
Binary file not shown.
@@ -0,0 +1,68 @@
height, gender
BMI, weight
age, code240279
code390459, sBP
height, weight
sBP, dBP
code390459, diabetes
code780799, code460519
age, code390459
code710739, code520579
code710739, code800999
code460519, code360389
code680709, code001139
code320359, code290319
state, code290319
state, code460519
code240279, diabetes
code780799, code580629
code710739, code780799
code240279, code280289
code780799, code280289
code710739, code320359
code460519, code001139
code780799, code360389
gender, code630679
code140239, age
code630679, age
code710739, code680709
state, code320359
code390459, state
code390459, code520579
code520579, code280289
code360389, code001139
code280289, code580629
age, code710739
gender, code710739
code520579, code580629
code630679, state
code630679, code580629
state, code780799
code780799, code520579
code240279, code780799
diabetes, BMI
code780799, code680709
code460519, code680709
code780799, code800999
code710739, code740759
code390459, code280289
code460519, code520579
code390459, code240279
code780799, code001139
code710739, code460519
gender, code390459
code760779, age
code680709, code800999
code740759, code290319
gender, diabetes
code630679, code140239
code630679, code290319
code760779, code580629
code760779, code360389
code630679, code001139
code680709, code360389
code360389, code580629
code800999, code360389
code760779, code710739
code390459, code290319
code760779, code240279
Binary file not shown.
@@ -0,0 +1,73 @@
BMI, weight
dBP, sBP
height, weight
code390459, age
sBP, code390459
gender, height
code390459, diabetes
code390459, code240279
state, code780799
state, code290319
code680709, code001139
code240279, age
code460519, code520579
code710739, code800999
code780799, code580629
code780799, code710739
code390459, state
state, code460519
code460519, code680709
code390459, code280289
code460519, code780799
BMI, gender
code460519, code001139
age, code140239
code710739, code320359
code390459, code520579
code460519, code360389
code630679, state
code780799, code280289
code290319, code320359
code780799, code680709
gender, code630679
code780799, code360389
code390459, code710739
code280289, code580629
code680709, code800999
code390459, code630679
code800999, code360389
code520579, code240279
code360389, code001139
code780799, code800999
code630679, age
gender, code390459
gender, code710739
diabetes, age
code630679, code580629
code580629, code680709
code390459, code290319
code290319, code460519
code140239, code580629
code680709, code360389
code630679, code710739
code520579, code280289
diabetes, code580629
code520579, code320359
code780799, code001139
code780799, code520579
code390459, code780799
code630679, code290319
code240279, diabetes
code280289, code240279
code740759, code290319
code760779, code360389
code760779, code580629
code630679, code001139
code760779, code280289
gender, diabetes
code630679, code240279
code710739, code680709
code290319, code710739
code460519, code710739
code710739, code520579
code760779, code710739
Binary file not shown.
@@ -0,0 +1,77 @@
using BayesNets
using DataFrames
using Graphs

############################################################
# Functions
############################################################

function getParentsAndDomains(node::Int64, b::BayesNet)
    #=
    Scans every edge of b.dag and collects the sources of the edges
    whose target is `node`.

    Returns a 3-tuple:
      1. the parent indices, in edge order,
      2. the matching entries of b.domains, in the same order,
      3. true when no edge points at `node` (i.e. it is a source node).
    =#
    parentIds = Int64[]
    for e in b.dag.edges
        e.target == node || continue
        push!(parentIds, e.source)
    end # for e
    parentDomains = DiscreteDomain[b.domains[p] for p in parentIds]
    return parentIds, parentDomains, length(parentIds) == 0
end # function getParentsAndDomains

############################################################
# Main script
############################################################

# reconstruct Bayes net object
# ARGS[1] names the structure file; the data set is expected next to it
# under the same base name with a ".csv" extension.
inname = ARGS[1]
title = splitext(inname)[1]
dataset = readtable(title * ".csv")

b = BayesNet(names(dataset))
b.domains = [DiscreteDomain([x for x in unique(dataset[label])])
             for label in names(dataset)]

# Lookup tables in both directions between node names and node indices.
name2index = b.index
index2name = Dict{Int64,Symbol}()
for key in keys(b.index)
    index2name[b.index[key]] = key
end # for key

fin = open(title * ".gph", "r")
lines = readlines(fin)
close(fin)
for line in lines
    # Ignore blank lines (e.g. a trailing empty line at end of file).
    isempty(strip(line)) && continue
    # Split on the bare comma and strip each field, so both "a, b" and
    # "a,b" parse. Stripping replaces the old `[1:end-1]` slice, which
    # cut a real character whenever the line had no trailing newline.
    nodes = split(line, ",")
    src = convert(Symbol, strip(nodes[1]))
    tgt = convert(Symbol, strip(nodes[2]))
    addEdge!(b, src, tgt)
end # for line


# initialize counts (MLE w/ Laplace smoothing)
# NOTE(review): the loop body is still a placeholder; nothing is stored in
# `counts` yet.
counts = Dict()
for name in b.names
    node = name2index[name]
    # The helper defined in this file is `getParentsAndDomains`; the bare
    # name `getParents` is not defined here and raised at run time.
    parents, pDomains, isSource = getParentsAndDomains(node, b)

end # for name


# count all training samples
# NOTE(review): `array(dataset)` presumably turns the data frame into a plain
# matrix with one row per sample -- confirm against the DataFrames version used.
samples = array(dataset)
nSamples = size(samples, 1)

# NOTE(review): `distributions` is not defined anywhere in the visible code,
# so this loop cannot run as written. The nested body is an empty placeholder
# for the per-distribution, per-sample counting step.
for distribution in distributions
for sample = 1:nSamples

end # for sample
end # for distribution


# normalize over all cpds

@@ -0,0 +1,20 @@
class Node:
    """A graph vertex that keeps explicit lists of its parents and children.

    Edges are not mirrored automatically: linking two nodes takes one
    addChild call on the parent and one addParent call on the child.
    """

    def __init__(self, number):
        # `number` is stored as-is and serves as the node's identifier.
        self.id = number
        self.parents = list()
        self.children = list()

    def getID(self):
        """Return the identifier given at construction."""
        return self.id

    def addParent(self, node):
        """Append `node` to this node's parent list."""
        self.parents.append(node)

    def addChild(self, node):
        """Append `node` to this node's child list."""
        self.children.append(node)

    def getParents(self):
        """Return the (live) list of parent nodes."""
        return self.parents

    def getChildren(self):
        """Return the (live) list of child nodes."""
        return self.children
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,63 @@
import sys
from probabilityTable import *

class ParamLearn:
    """Estimates smoothed conditional probabilities from a table of records.

    Counts come from a PTable built over `countFile`; `self.domains` maps each
    variable name to its list (or range) of possible values, whose length is
    used as the smoothing denominator term.
    """

    # countFile = e.g. train2.csv
    def __init__(self, countFile):
        self.pTable = PTable(countFile)
        self.domains = {}
        self.domains["state"] = ["AK","AL","AR","AZ","CA","CO","CT","DE","FL","GA","HI","IA","ID","IL","IN","KS","KY","LA","MA","MD","ME","MI","MN","MO","MS","MT","NC","ND","NE","NH","NJ","NM","NV","NY","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VA","VT","WA","WI","WV","WY"]
        self.domains["gender"] = ["M","F"]
        self.domains["age"] = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95]
        self.domains["height"] = range(40,90)
        self.domains["weight"] = range(0,490,10)
        self.domains["BMI"] = range(0, 170)
        self.domains["sBP"] = range(10, 225, 5)
        self.domains["dBP"] = range(0, 150, 5)
        # Binary indicator variables; each gets its own [0, 1] list.
        for binaryVar in ("diabetes",
                          "code001139", "code140239", "code240279",
                          "code280289", "code290319", "code320359",
                          "code360389", "code390459", "code460519",
                          "code520579", "code580629", "code630679",
                          "code680709", "code710739", "code740759",
                          "code760779", "code780799", "code800999"):
            self.domains[binaryVar] = [0,1]

    def getParentJointProb(self, node_id, node_val, params):
        """Return the add-one smoothed estimate of P(node_id = node_val | params).

        Computed as (count(params and node_id=node_val) + 1) divided by
        (count(params) + size of node_id's domain).

        Args:
            node_id: name of the variable being queried.
            node_val: value of that variable.
            params: dict of {variable name: value} to condition on. It is
                not modified.

        Returns:
            The smoothed probability as a float.
        """
        parentCount = self.pTable.getCounts(params)
        # Work on a copy: writing node_id into the caller's dict leaked the
        # queried value into any later use of that dict.
        postParams = dict(params)
        postParams[node_id] = node_val
        postCount = self.pTable.getCounts(postParams)
        return (postCount+1.0) / (parentCount+len(self.domains[node_id]))

    def getParentChildJointProb(self, node_id, node_val, parentDict, childDict):
        """Return the smoothed estimate of node_id = node_val given both dicts.

        The two assignments are merged (entries of childDict win on duplicate
        keys) and passed to getParentJointProb. Neither input is modified.
        """
        # dict(a.items() + b.items()) only works on Python 2; copy-and-update
        # gives the same merge on both Python 2 and 3.
        jointParams = dict(parentDict)
        jointParams.update(childDict)
        return self.getParentJointProb(node_id, node_val, jointParams)

def main(argv):
    """Build a ParamLearn over the count file named by argv[1]."""
    pLearn = ParamLearn(argv[1])


if __name__ == "__main__":
    main(sys.argv)
Binary file not shown.
@@ -0,0 +1,82 @@
#from collections import Counter

import sqlite3
class PTable:
    """SQLite-backed table of records used to count matching rows.

    On construction the `counts` table is dropped and recreated, one index is
    created per column, and the CSV file is loaded into it.
    """

    def __init__(self, filename, dbPath='example.db'):
        """Create the table and load `filename` into it.

        Args:
            filename: CSV file whose first line holds column names; every
                name must be one of the columns of the `counts` table.
            dbPath: SQLite database path. Defaults to 'example.db'; pass
                ':memory:' to avoid touching the file system.
        """
        self.conn = sqlite3.connect(dbPath)
        c = self.conn.cursor()
        c.execute('''DROP TABLE if EXISTS counts''')
        c.execute('''CREATE TABLE counts
            ( state string,
            gender string,
            age string,
            height string,
            weight string,
            BMI string,
            sBP string,
            dBP string,
            diabetes string,
            code001139 string,
            code140239 string,
            code240279 string,
            code280289 string,
            code290319 string,
            code320359 string,
            code360389 string,
            code390459 string,
            code460519 string,
            code520579 string,
            code580629 string,
            code630679 string,
            code680709 string,
            code710739 string,
            code740759 string,
            code760779 string,
            code780799 string,
            code800999 string)''')

        # One index per column; names come from the schema just created.
        c.execute('''PRAGMA table_info(counts);''')
        headers = c.fetchall()
        for header in headers:
            c.execute('''CREATE INDEX %s_index ON counts (%s);''' %(header[1], header[1]))

        self.loadTable(filename)

    def loadTable(self, filename):
        """Insert every non-blank data row of `filename` into `counts`.

        The first line supplies the column names. Each following line is
        split on "," and inserted as-is. Prints the resulting row count.
        """
        c = self.conn.cursor()

        # `with` closes the file even if an insert fails.
        with open(filename) as fo:
            headers = fo.readline().rstrip()
            headers = headers.split(',')
            self.headers = ", ".join( ["["+str(h)+"]" for h in headers] )
            # One placeholder per header column instead of a fixed 27.
            placeholders = ", ".join(["?"] * len(headers))
            insert = "INSERT INTO counts (%s) VALUES (%s)" % (self.headers, placeholders)

            for raw in fo:
                line = raw.rstrip()
                # Skip blank lines rather than stopping at the first one.
                if not line:
                    continue
                c.execute(insert, line.split(','))

        # Persist the load and release the write lock on file databases.
        self.conn.commit()

        c.execute("SELECT count(*) FROM counts")
        print(str(c.fetchone()[0]) + " rows loaded")

    def getCounts(self, paramsDict):
        """Return the number of rows matching every column=value pair.

        Args:
            paramsDict: dict of {column name: value}. An empty dict counts
                all rows.

        Returns:
            The matching row count as an int.
        """
        colNames = []
        colVals = []
        for key, val in paramsDict.items():
            colNames.append(key)
            colVals.append(val)

        query = "SELECT count(*) from counts"
        # With no conditions a bare "WHERE" is invalid SQL, so omit it.
        if colNames:
            # Bracket-quote column names, matching loadTable; values are bound.
            query += " WHERE " + " AND ".join("[" + e + "]=?" for e in colNames)

        c = self.conn.cursor()
        return c.execute(query, colVals).fetchone()[0]







Binary file not shown.
@@ -0,0 +1,26 @@
import unittest
from paramLearn import *
from tree import *

class TestSubmission(unittest.TestCase):
    """Checks ParamLearn and Tree against the testData fixture files."""

    def test_paramLearn_getChildren(self):
        # Smoothed estimate of sBP = '1' given code280289 = '0' on testData.csv,
        # with both domains overridden to two values.
        pLearn = ParamLearn('testData.csv')
        pLearn.domains["sBP"] = [1,0]
        pLearn.domains["code280289"] = [1,0]
        node_id = 'sBP'
        node_val = '1'
        params = {'code280289': '0'}
        # assertAlmostEqual avoids exact float comparison.
        self.assertAlmostEqual(pLearn.getParentJointProb(node_id, node_val, params), (1.0/2.0))

    def test_nodeTree(self):
        graph = Tree('testData.gph')

        n = graph.getNode('sBP')
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed in Python 3.12.
        self.assertEqual(n.id, 'sBP')
        self.assertEqual([p.id for p in n.getChildren()], ['weight', 'dBP'])
        self.assertEqual([p.id for p in n.getParents()], ['code280289'])


if __name__ == '__main__':
    unittest.main()
@@ -0,0 +1,5 @@
code280289,sBP,code240279,weight,code780799,code360389,height,code680709,dBP,code320359,BMI,state,code740759,code001139,code460519,code520579,code710739,code760779,code800999,code390459,diabetes,code290319,gender,age,code630679,code580629,code140239
0,1,1,270,0,0,69,0,80,0,41,SD,0,0,0,0,0,0,0,0,0,0,M,60,0,0,1
1,1,1,140,1,0,64,0,60,0,24,VA,0,0,1,0,1,0,0,1,0,0,F,85,0,0,0
0,0,1,220,1,0,68,0,70,0,33,NV,0,0,0,0,1,0,1,0,0,0,M,45,0,0,0
1,0,1,200,0,0,72,0,80,0,27,CA,0,0,1,0,0,0,0,1,0,0,M,60,0,0,0
@@ -0,0 +1,3 @@
code280289, sBP
sBP, weight
sBP, dBP
@@ -0,0 +1,39 @@
from node import *
import sys

class Tree:
    """Directed graph of Node objects built from an edge-list file.

    `nodeDict` maps each node id (the string read from the file) to its Node.
    """

    # edgeFile = e.g. train3.gph
    def __init__(self, edgeFile):
        self.nodeDict = dict()
        self.loadNodes(edgeFile)

    def getNode(self, id_num):
        """Return the node stored under `id_num`; raises KeyError if absent."""
        return self.nodeDict[id_num]

    def retrieveOrCreateNode(self, id_num):
        """Return the node for `id_num`, creating and storing it if needed."""
        if id_num not in self.nodeDict:
            self.nodeDict[id_num] = Node(id_num)
        return self.nodeDict[id_num]

    # read in .gph file and populate nodes with data
    def loadNodes(self, edgeFile):
        """Read "source, target" lines and link the nodes both ways.

        For each non-blank line the first field is the parent and the second
        the child. Fields are split on "," and stripped, so both "a, b" and
        "a,b" are accepted.
        """
        # `with` closes the file even if a malformed line raises.
        with open(edgeFile) as fo:
            for raw in fo:
                # Skip blank lines rather than stopping at the first one,
                # which silently dropped every edge after it.
                if not raw.strip():
                    continue
                node_ids = [x.strip() for x in raw.split(',')]
                node1 = self.retrieveOrCreateNode(node_ids[0])
                node2 = self.retrieveOrCreateNode(node_ids[1])
                node1.addChild(node2)
                node2.addParent(node1)

def main(argv):
    """Load the edge file named by argv[1] and report how many nodes it has."""
    edgeFile = argv[1]
    tree = Tree(edgeFile)
    # Single-argument print(...) is valid on both Python 2 and Python 3; the
    # bare print statement is a syntax error on Python 3.
    print(str(len(tree.nodeDict)) + " nodes loaded to tree.")


if __name__ == "__main__":
    main(sys.argv)
Binary file not shown.
@@ -0,0 +1,10 @@
# Description
This directory contains all the source code for the project.

## Use of Julia files
1. ssh into one of the new corn machines on the farm servers (corn01, corn02, corn07, corn10 work best; otherwise use corn-new)
2. cd into the right directory, or copy over the files into any directory you want to work in
3. from the command line, type in
- e.g., `julia structLearn.jl train.csv`
- e.g., `julia evalBayesNet.jl train1.gph train2.gph train3.gph train4.gph train5.gph train6.gph train7.gph train8.gph`
- e.g., `julia vizBayesNet.jl train1.gph train2.gph train3.gph train4.gph train5.gph train6.gph train7.gph train8.gph`

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

@@ -0,0 +1,74 @@
weight,BMI
height,gender
diabetes,endocrineNutritionalMetabolicImmunity
sBP,dBP
height,BMI
age,circulatory
diabetes,age
gender,weight
circulatory,sBP
digestive,other
respiratory,other
state,mentalDisorders
senseOrgans,respiratory
diabetes,circulatory
other,state
musculoskeletal,digestive
other,genitourinary
musculoskeletal,injuryPoisoning
digestive,respiratory
skinSubcutaneousTissue,infectiousParasitic
other,blood
mentalDisorders,nervous
endocrineNutritionalMetabolicImmunity,blood
musculoskeletal,nervous
age,musculoskeletal
BMI,diabetes
respiratory,infectiousParasitic
respiratory,skinSubcutaneousTissue
musculoskeletal,other
circulatory,state
endocrineNutritionalMetabolicImmunity,digestive
injuryPoisoning,senseOrgans
musculoskeletal,respiratory
pregnancyChildbirthPuerperium,age
gender,pregnancyChildbirthPuerperium
blood,genitourinary
other,skinSubcutaneousTissue
age,neoplasms
musculoskeletal,mentalDisorders
injuryPoisoning,skinSubcutaneousTissue
circulatory,blood
circulatory,digestive
gender,musculoskeletal
digestive,senseOrgans
pregnancyChildbirthPuerperium,genitourinary
pregnancyChildbirthPuerperium,state
injuryPoisoning,other
digestive,genitourinary
senseOrgans,infectiousParasitic
musculoskeletal,senseOrgans
digestive,blood
musculoskeletal,skinSubcutaneousTissue
endocrineNutritionalMetabolicImmunity,other
skinSubcutaneousTissue,genitourinary
musculoskeletal,congenitalAnomalies
circulatory,mentalDisorders
digestive,nervous
pregnancyChildbirthPuerperium,mentalDisorders
injuryPoisoning,respiratory
other,infectiousParasitic
neoplasms,digestive
perinatal,genitourinary
digestive,injuryPoisoning
age,endocrineNutritionalMetabolicImmunity
senseOrgans,skinSubcutaneousTissue
congenitalAnomalies,mentalDisorders
circulatory,endocrineNutritionalMetabolicImmunity
pregnancyChildbirthPuerperium,infectiousParasitic
perinatal,age
perinatal,senseOrgans
perinatal,musculoskeletal
pregnancyChildbirthPuerperium,other
perinatal,mentalDisorders
perinatal,endocrineNutritionalMetabolicImmunity
Binary file not shown.
@@ -0,0 +1,35 @@
using BayesNets
using DataFrames
using Graphs
using TikzGraphs
using TikzPictures

# Draw each structure file given on the command line as a PDF named after it
# (same base name, ".pdf" extension). Node names and domains come from the
# translated training data.
dataset = readtable("train2_translated.csv")

for arg = 1:length(ARGS)
    # The loop index is used directly. A leftover `arg = 1` here made every
    # iteration re-render the first file only.
    inname = ARGS[arg]
    title = splitext(inname)[1]
    outname = title * ".pdf"

    b = BayesNet(names(dataset))
    b.domains = [DiscreteDomain([x for x in unique(dataset[label])])
                 for label in names(dataset)]

    fin = open(inname, "r")
    lines = readlines(fin)
    close(fin)

    for i = 1:length(lines)
        line = lines[i]
        # Ignore blank lines (e.g. a trailing empty line at end of file).
        isempty(strip(line)) && continue
        # Strip each field so both "a,b" and "a, b" parse. Stripping replaces
        # the old `[1:end-1]` slice, which cut a real character whenever the
        # line had no trailing newline.
        nodes = split(line, ",")
        src = convert(Symbol, strip(nodes[1]))
        tgt = convert(Symbol, strip(nodes[2]))
        addEdge!(b, src, tgt)
    end # for line

    # Local helper: plot the DAG with node names as labels and write a PDF.
    save(b::BayesNet, filename::String) =
        TikzPictures.save(PDF(filename),
                          TikzGraphs.plot(b.dag, ASCIIString[string(s) for s in b.names]))
    save(b, outname)
    @printf("Output saved to %s\n", outname)

end # for arg