| @@ -0,0 +1,74 @@ | ||
| weight,BMI | ||
| height,gender | ||
| diabetes,endocrineNutritionalMetabolicImmunity | ||
| sBP,dBP | ||
| height,BMI | ||
| age,circulatory | ||
| diabetes,age | ||
| gender,weight | ||
| circulatory,sBP | ||
| digestive,other | ||
| respiratory,other | ||
| state,mentalDisorders | ||
| senseOrgans,respiratory | ||
| diabetes,circulatory | ||
| other,state | ||
| musculoskeletal,digestive | ||
| other,genitourinary | ||
| musculoskeletal,injuryPoisoning | ||
| digestive,respiratory | ||
| skinSubcutaneousTissue,infectiousParasitic | ||
| other,blood | ||
| mentalDisorders,nervous | ||
| endocrineNutritionalMetabolicImmunity,blood | ||
| musculoskeletal,nervous | ||
| age,musculoskeletal | ||
| BMI,diabetes | ||
| respiratory,infectiousParasitic | ||
| respiratory,skinSubcutaneousTissue | ||
| musculoskeletal,other | ||
| circulatory,state | ||
| endocrineNutritionalMetabolicImmunity,digestive | ||
| injuryPoisoning,senseOrgans | ||
| musculoskeletal,respiratory | ||
| pregnancyChildbirthPuerperium,age | ||
| gender,pregnancyChildbirthPuerperium | ||
| blood,genitourinary | ||
| other,skinSubcutaneousTissue | ||
| age,neoplasms | ||
| musculoskeletal,mentalDisorders | ||
| injuryPoisoning,skinSubcutaneousTissue | ||
| circulatory,blood | ||
| circulatory,digestive | ||
| gender,musculoskeletal | ||
| digestive,senseOrgans | ||
| pregnancyChildbirthPuerperium,genitourinary | ||
| pregnancyChildbirthPuerperium,state | ||
| injuryPoisoning,other | ||
| digestive,genitourinary | ||
| senseOrgans,infectiousParasitic | ||
| musculoskeletal,senseOrgans | ||
| digestive,blood | ||
| musculoskeletal,skinSubcutaneousTissue | ||
| endocrineNutritionalMetabolicImmunity,other | ||
| skinSubcutaneousTissue,genitourinary | ||
| musculoskeletal,congenitalAnomalies | ||
| circulatory,mentalDisorders | ||
| digestive,nervous | ||
| pregnancyChildbirthPuerperium,mentalDisorders | ||
| injuryPoisoning,respiratory | ||
| other,infectiousParasitic | ||
| neoplasms,digestive | ||
| perinatal,genitourinary | ||
| digestive,injuryPoisoning | ||
| age,endocrineNutritionalMetabolicImmunity | ||
| senseOrgans,skinSubcutaneousTissue | ||
| congenitalAnomalies,mentalDisorders | ||
| circulatory,endocrineNutritionalMetabolicImmunity | ||
| pregnancyChildbirthPuerperium,infectiousParasitic | ||
| perinatal,age | ||
| perinatal,senseOrgans | ||
| perinatal,musculoskeletal | ||
| pregnancyChildbirthPuerperium,other | ||
| perinatal,mentalDisorders | ||
| perinatal,endocrineNutritionalMetabolicImmunity |
| @@ -0,0 +1,42 @@ | ||
| import sys | ||
| import re | ||
|
|
||
|
|
||
def loadDictionary(filename):
    """Load a comma-separated key/value file into a dictionary.

    Each non-blank line is split on commas; the first field becomes the key
    and the second field the value (both stripped of surrounding
    whitespace). Any further fields on the line are ignored. If a key
    appears more than once, the last occurrence wins.

    Blank lines are skipped rather than treated as end-of-file, so a stray
    empty line in the middle of the file no longer truncates the result.

    Args:
        filename: path of the file to read.

    Returns:
        dict mapping first-column strings to second-column strings.

    Raises:
        IndexError: if a non-blank line contains no comma.
    """
    d = {}
    # The context manager closes the file even if a malformed line raises.
    with open(filename) as fo:
        for raw in fo:
            if not raw.strip():
                continue
            fields = [x.strip() for x in raw.split(",")]
            d[fields[0]] = fields[1]
    return d
|
|
||
def main(argv):
    """Translate ``code<digits>`` fields of a comma-separated file.

    Every line of the input file is split on commas. A field that starts
    with ``code`` followed by digits is replaced by the dictionary entry
    whose key is that digit string; every other field is copied unchanged
    (stripped of surrounding whitespace). Fields are written back joined
    by a bare comma. Blank input lines are skipped.

    argv[1]: infile (file to replace words)
    argv[2]: outfile (w/e you want)
    argv[3]: dictionary
    USAGE: python translate.py ../output/train3.gph train3_translated.gph code_dictionary.csv

    Raises:
        KeyError: if a matched code has no entry in the dictionary.
    """
    d = loadDictionary(argv[3])
    # Compiled once; the original rebuilt the pattern for every field.
    code_pattern = re.compile('(?:code)([0-9]*)')

    # Both files are closed (and the output flushed) even on error.
    with open(argv[1]) as infile, open(argv[2], "w+") as outfile:
        for raw in infile:
            line = raw.rstrip()
            if not line:
                continue
            outline = []
            for elem in [x.strip() for x in line.split(",")]:
                m = code_pattern.match(elem)
                if m:
                    outline.append(d[m.group(1)])
                else:
                    outline.append(elem)
            outfile.write(",".join(outline) + "\n")


if __name__ == "__main__":
    main(sys.argv)
| @@ -0,0 +1,35 @@ | ||
| using BayesNets | ||
| using DataFrames | ||
| using Graphs | ||
|
|
||
# Data sets every graph is scored against.
testfile = "test.csv"
trainfile = "train.csv"

testset = readtable(testfile)
trainset = readtable(trainfile)

# Each command-line argument names a graph file with one "src, tgt" edge per
# line. For every graph, build one network over the test data and one over
# the training data, add the edges to both, and print both log Bayes scores.
for arg = 1:length(ARGS)
    inname = ARGS[arg]
    bTest = BayesNet(names(testset))
    bTest.domains = [DiscreteDomain([x for x in unique(testset[label])])
                     for label in names(testset)]
    bTrain = BayesNet(names(trainset))
    bTrain.domains = [DiscreteDomain([x for x in unique(trainset[label])])
                      for label in names(trainset)]

    fin = open(inname, "r")
    lines = readlines(fin)
    close(fin)
    for line in lines
        # Skip empty lines (e.g. a trailing blank line at end of file).
        isempty(strip(line)) && continue
        nodes = split(line, ", ")
        src = convert(Symbol, nodes[1])
        # strip() removes the line terminator when present; the previous
        # nodes[2][1:end-1] always dropped the last character and so
        # corrupted the name when the final line had no newline.
        tgt = convert(Symbol, strip(nodes[2]))
        addEdge!(bTest, src, tgt)
        addEdge!(bTrain, src, tgt)
    end # for line

    @printf("\nThe logBayesScore for %s on %s is %.4f\n",
            inname, testfile, logBayesScore(bTest, testset))
    @printf("The logBayesScore for %s on %s is %.4f\n",
            inname, trainfile, logBayesScore(bTrain, trainset))
end # for arg
| @@ -0,0 +1,18 @@ | ||
| #!/usr/bin/env python | ||
|
|
||
| # read in files and transform into formatted data we want to work with | ||
|
|
||
| # run naive bayes classifier for data | ||
|
|
||
| # run structure learning for Bayesian network (BN) | ||
| # define scoring function (read: reward function) for local search heuristic | ||
| # run search heuristics: | ||
| # local search (pure greedy) | ||
| # local search with tabu list | ||
| # beam search | ||
| # K2 search | ||
| # manually look at BN and change structure to fit intuition | ||
|
|
||
| # use inference techniques to get conditional probabilities | ||
|
|
||
| # check error rates, shazaam |
| @@ -0,0 +1,33 @@ | ||
| ['sBP', 'weight', 'height', 'code240279', 'code390459', 'code780799', 'dBP', 'BMI', 'state', 'code360389', 'code520579', 'code001139', 'code140239', 'code760779', 'code630679', 'code580629', 'code710739', 'code460519', 'diabetes', 'code280289', 'code290319', 'gender', 'age', 'code320359', 'code680709', 'code800999', 'code740759'] | ||
| 27 nodes loaded into graph | ||
| 8954 rows loaded | ||
| Running Gibbs sampling for 50 samples | ||
| Sample number 1 took 48.430229187 sec | ||
| Sample number 2 took 41.3028028011 sec | ||
| Sample number 3 took 45.4854249954 sec | ||
| Sample number 4 took 42.2492589951 sec | ||
| Sample number 5 took 38.1048138142 sec | ||
| Sample number 6 took 35.6485199928 sec | ||
| Sample number 7 took 39.6012690067 sec | ||
| Sample number 8 took 31.045992136 sec | ||
| Sample number 9 took 32.5555639267 sec | ||
| Sample number 10 took 34.3150451183 sec | ||
| Sample number 11 took 36.6224348545 sec | ||
| Sample number 12 took 36.5677089691 sec | ||
| Sample number 13 took 39.989606142 sec | ||
| Sample number 14 took 40.1501059532 sec | ||
| Sample number 15 took 37.1472861767 sec | ||
| Sample number 16 took 39.0639262199 sec | ||
| Sample number 17 took 53.0718319416 sec | ||
| Sample number 18 took 45.1965010166 sec | ||
| Sample number 19 took 38.2876770496 sec | ||
| Sample number 20 took 55.4608819485 sec | ||
| Sample number 21 took 41.4158949852 sec | ||
| Sample number 22 took 42.9269750118 sec | ||
| Sample number 23 took 43.6528449059 sec | ||
| Sample number 24 took 33.5439560413 sec | ||
| Sample number 25 took 41.4569878578 sec | ||
| Sample number 26 took 42.01486516 sec | ||
| Sample number 27 took 37.0272641182 sec | ||
| Sample number 28 took 41.1727819443 sec | ||
| Sample number 29 took 40.1693019867 sec |
| @@ -0,0 +1,34 @@ | ||
| ######################## | ||
| # comparative error rates of graphs | ||
| ######################## | ||
|
|
||
| Graph Error Rate FP FN | ||
| 1 0.18 0.053 0.127 | ||
| 2 0.187 0 0.187 | ||
| 3 0.153 0.03 0.123 | ||
| 4 0.188 0 0.188 | ||
| 5 0.183 0.017 0.166 | ||
| 6 0.185 0.022 0.163 | ||
| 7 0.181 0.02 0.161 | ||
| 8 0.186 0.004 0.182 | ||
|
|
||
| ########################################################################################### | ||
| # Inference Results with 20 Iterations and Increasing number of Missing Data Fields | ||
| ########################################################################################### | ||
|
|
||
| Iterations Missing Error rate FP Rate FN Rate cputime (sec) hours minutes avg sec | ||
| 20 3 0.1662 0.0363 0.1299 3728.134606 1.035592946 2.135576769 3.754415515 | ||
| 20 5 0.1511 0.0302 0.1208 6392.107252 1.775585348 46.53512086 6.437167424 | ||
| 20 8 0.1339 0.0272 0.1067 9604.038301 2.667788417 40.06730502 9.671740484 | ||
| 20 10 0.1299 0.0252 0.1047 11775.68267 3.271022964 16.26137782 11.85869352 | ||
|
|
||
| ########################################################################################### | ||
| # Inference Results with 5 Missing Data Record Fields and Increasing Iterations | ||
| ########################################################################################### | ||
| Iterations Missing Error Rate FP Rate FN Rate CPU time (sec) | ||
| 20 5 0.1511 0.0302 0.1208 6392.1073 | ||
| 30 5 0.1591 0.0322 0.1269 10145.1417 | ||
| 40 5 0.1682 0.0373 0.1309 12123.9852 | ||
| 50 5 0.1571 0.0282 0.1289 16181.3449 | ||
|
|
||
| 20 adjNodes 0.04330312185 0.01007049345 0.0332326284 18502.43409 |
| @@ -0,0 +1,46 @@ | ||
| height, gender | ||
| code390459, code240279 | ||
| BMI, weight | ||
| code290319, code320359 | ||
| code240279, age | ||
| code390459, age | ||
| code710739, code520579 | ||
| dBP, sBP | ||
| diabetes, code240279 | ||
| code240279, code280289 | ||
| code710739, code780799 | ||
| code001139, code680709 | ||
| code780799, code460519 | ||
| code280289, code580629 | ||
| code740759, code710739 | ||
| code140239, code360389 | ||
| code360389, code780799 | ||
| code360389, code680709 | ||
| code460519, code800999 | ||
| code520579, code290319 | ||
| code580629, code001139 | ||
| code390459, code520579 | ||
| code360389, code460519 | ||
| code680709, code800999 | ||
| sBP, code390459 | ||
| code390459, diabetes | ||
| code320359, code001139 | ||
| code780799, code520579 | ||
| gender, code630679 | ||
| code390459, code320359 | ||
| code390459, code710739 | ||
| code780799, code580629 | ||
| code740759, code800999 | ||
| code710739, code360389 | ||
| code740759, code390459 | ||
| code630679, code240279 | ||
| code710739, code290319 | ||
| diabetes, code580629 | ||
| code630679, code001139 | ||
| height, weight | ||
| code580629, code680709 | ||
| code740759, code460519 | ||
| code280289, code780799 | ||
| code630679, code580629 | ||
| code630679, code280289 | ||
| code630679, code460519 |
| @@ -0,0 +1,69 @@ | ||
| height, gender | ||
| dBP, sBP | ||
| state, code780799 | ||
| code390459, code240279 | ||
| age, code390459 | ||
| BMI, weight | ||
| diabetes, age | ||
| code710739, code520579 | ||
| age, code240279 | ||
| state, code290319 | ||
| height, weight | ||
| code780799, code710739 | ||
| code460519, code360389 | ||
| BMI, diabetes | ||
| code240279, code280289 | ||
| code680709, code001139 | ||
| state, code460519 | ||
| diabetes, code390459 | ||
| code710739, code800999 | ||
| age, code710739 | ||
| code780799, code520579 | ||
| code710739, code320359 | ||
| code630679, age | ||
| code780799, code580629 | ||
| code780799, code680709 | ||
| code290319, code320359 | ||
| diabetes, code240279 | ||
| code630679, state | ||
| code680709, code800999 | ||
| code780799, code280289 | ||
| code460519, code001139 | ||
| age, code140239 | ||
| code390459, state | ||
| code780799, code360389 | ||
| code460519, code680709 | ||
| code240279, code780799 | ||
| gender, code630679 | ||
| code280289, code580629 | ||
| code280289, code520579 | ||
| code240279, code520579 | ||
| code800999, code360389 | ||
| code520579, code290319 | ||
| code390459, code280289 | ||
| code460519, code710739 | ||
| code460519, code520579 | ||
| code780799, code460519 | ||
| code780799, code800999 | ||
| diabetes, gender | ||
| code710739, code680709 | ||
| code780799, code001139 | ||
| code630679, code290319 | ||
| code360389, code001139 | ||
| code630679, code580629 | ||
| code680709, code580629 | ||
| code390459, code780799 | ||
| code740759, code290319 | ||
| code710739, code740759 | ||
| code140239, code580629 | ||
| code390459, dBP | ||
| code460519, code320359 | ||
| code680709, code360389 | ||
| code630679, code001139 | ||
| code240279, code460519 | ||
| code760779, code360389 | ||
| code390459, code290319 | ||
| code760779, code240279 | ||
| code760779, code580629 | ||
| code760779, code710739 | ||
| code760779, age |
| @@ -0,0 +1,75 @@ | ||
| weight, BMI | ||
| height, BMI | ||
| code390459, diabetes | ||
| height, gender | ||
| code390459, age | ||
| dBP, sBP | ||
| gender, weight | ||
| code240279, code390459 | ||
| code520579, code780799 | ||
| code240279, age | ||
| sBP, code390459 | ||
| code460519, code780799 | ||
| code710739, code520579 | ||
| code780799, state | ||
| state, code290319 | ||
| code710739, code800999 | ||
| code360389, code460519 | ||
| code240279, diabetes | ||
| code780799, code580629 | ||
| age, code710739 | ||
| code290319, code320359 | ||
| code520579, code460519 | ||
| code460519, code680709 | ||
| code710739, code780799 | ||
| age, code140239 | ||
| BMI, code240279 | ||
| code780799, code001139 | ||
| code710739, code320359 | ||
| code390459, state | ||
| code240279, code520579 | ||
| code360389, code680709 | ||
| age, code630679 | ||
| code460519, code001139 | ||
| code280289, code580629 | ||
| gender, code710739 | ||
| code710739, code290319 | ||
| code280289, code520579 | ||
| code710739, code360389 | ||
| code360389, code001139 | ||
| code280289, code780799 | ||
| code710739, code680709 | ||
| code710739, code460519 | ||
| code280289, code360389 | ||
| code630679, code580629 | ||
| code630679, state | ||
| diabetes, age | ||
| code680709, code580629 | ||
| code390459, code520579 | ||
| code740759, code290319 | ||
| code240279, code280289 | ||
| code390459, code280289 | ||
| code710739, code280289 | ||
| code710739, code740759 | ||
| code520579, code320359 | ||
| code140239, code580629 | ||
| code390459, code290319 | ||
| code360389, code800999 | ||
| code760779, code580629 | ||
| code760779, code800999 | ||
| code630679, code290319 | ||
| code800999, code780799 | ||
| gender, diabetes | ||
| code520579, code360389 | ||
| code760779, code360389 | ||
| code800999, code680709 | ||
| code760779, code710739 | ||
| code630679, code780799 | ||
| code680709, code001139 | ||
| code460519, code800999 | ||
| code630679, code001139 | ||
| code780799, code680709 | ||
| code760779, code390459 | ||
| code630679, code460519 | ||
| code760779, code290319 | ||
| code140239, code520579 |
| @@ -0,0 +1,74 @@ | ||
| weight, BMI | ||
| height, gender | ||
| diabetes, code240279 | ||
| sBP, dBP | ||
| height, BMI | ||
| age, code390459 | ||
| diabetes, age | ||
| gender, weight | ||
| code390459, sBP | ||
| code520579, code780799 | ||
| code460519, code780799 | ||
| state, code290319 | ||
| code360389, code460519 | ||
| diabetes, code390459 | ||
| code780799, state | ||
| code710739, code520579 | ||
| code780799, code580629 | ||
| code710739, code800999 | ||
| code520579, code460519 | ||
| code680709, code001139 | ||
| code780799, code280289 | ||
| code290319, code320359 | ||
| code240279, code280289 | ||
| code710739, code320359 | ||
| age, code710739 | ||
| BMI, diabetes | ||
| code460519, code001139 | ||
| code460519, code680709 | ||
| code710739, code780799 | ||
| code390459, state | ||
| code240279, code520579 | ||
| code800999, code360389 | ||
| code710739, code460519 | ||
| code630679, age | ||
| gender, code630679 | ||
| code280289, code580629 | ||
| code780799, code680709 | ||
| age, code140239 | ||
| code710739, code290319 | ||
| code800999, code680709 | ||
| code390459, code280289 | ||
| code390459, code520579 | ||
| gender, code710739 | ||
| code520579, code360389 | ||
| code630679, code580629 | ||
| code630679, state | ||
| code800999, code780799 | ||
| code520579, code580629 | ||
| code360389, code001139 | ||
| code710739, code360389 | ||
| code520579, code280289 | ||
| code710739, code680709 | ||
| code240279, code780799 | ||
| code680709, code580629 | ||
| code710739, code740759 | ||
| code390459, code290319 | ||
| code520579, code320359 | ||
| code630679, code290319 | ||
| code800999, code460519 | ||
| code780799, code001139 | ||
| code140239, code520579 | ||
| code760779, code580629 | ||
| code520579, code800999 | ||
| age, code240279 | ||
| code360389, code680709 | ||
| code740759, code290319 | ||
| code390459, code240279 | ||
| code630679, code001139 | ||
| code760779, age | ||
| code760779, code360389 | ||
| code760779, code710739 | ||
| code630679, code780799 | ||
| code760779, code290319 | ||
| code760779, code240279 |
| @@ -0,0 +1,71 @@ | ||
| dBP, sBP | ||
| BMI, weight | ||
| code240279, age | ||
| gender, height | ||
| code780799, code460519 | ||
| code390459, age | ||
| code390459, code240279 | ||
| code390459, diabetes | ||
| code710739, code520579 | ||
| height, weight | ||
| sBP, code390459 | ||
| code710739, code800999 | ||
| code390459, state | ||
| code240279, diabetes | ||
| state, code290319 | ||
| state, code780799 | ||
| code460519, code360389 | ||
| age, code710739 | ||
| code780799, code580629 | ||
| code290319, code320359 | ||
| state, code460519 | ||
| code780799, code280289 | ||
| code460519, code001139 | ||
| code290319, code710739 | ||
| code460519, code680709 | ||
| code390459, code280289 | ||
| BMI, gender | ||
| code240279, code280289 | ||
| code710739, code780799 | ||
| age, code140239 | ||
| code780799, code360389 | ||
| code710739, code320359 | ||
| code390459, code630679 | ||
| state, code240279 | ||
| code630679, state | ||
| code710739, code680709 | ||
| code360389, code800999 | ||
| code280289, code580629 | ||
| code680709, code360389 | ||
| code710739, code460519 | ||
| code680709, code800999 | ||
| code390459, code290319 | ||
| code520579, code280289 | ||
| gender, code630679 | ||
| code520579, code580629 | ||
| code780799, code001139 | ||
| code390459, code520579 | ||
| code630679, age | ||
| gender, code390459 | ||
| gender, code710739 | ||
| code001139, code360389 | ||
| code630679, code580629 | ||
| diabetes, age | ||
| code630679, code290319 | ||
| code780799, code800999 | ||
| code780799, code680709 | ||
| code710739, code740759 | ||
| code520579, code320359 | ||
| code140239, code580629 | ||
| code680709, code001139 | ||
| code580629, code680709 | ||
| code760779, code710739 | ||
| code760779, code800999 | ||
| code760779, code360389 | ||
| code760779, code580629 | ||
| code630679, code001139 | ||
| code630679, code240279 | ||
| code240279, code780799 | ||
| code780799, code520579 | ||
| code460519, code520579 | ||
| gender, diabetes |
| @@ -0,0 +1,71 @@ | ||
| age, code390459 | ||
| BMI, weight | ||
| height, weight | ||
| height, gender | ||
| code390459, diabetes | ||
| age, code240279 | ||
| dBP, sBP | ||
| state, code780799 | ||
| code780799, code460519 | ||
| code240279, diabetes | ||
| code460519, code360389 | ||
| code780799, code520579 | ||
| state, code290319 | ||
| code780799, code580629 | ||
| state, code460519 | ||
| code390459, code240279 | ||
| code710739, age | ||
| code290319, code320359 | ||
| code780799, code001139 | ||
| code460519, code680709 | ||
| code710739, code320359 | ||
| code780799, code800999 | ||
| code780799, code280289 | ||
| code630679, state | ||
| code460519, code520579 | ||
| code780799, code360389 | ||
| age, code140239 | ||
| code800999, code680709 | ||
| code630679, age | ||
| code280289, code580629 | ||
| code460519, code001139 | ||
| gender, code630679 | ||
| code390459, code280289 | ||
| code710739, code290319 | ||
| code710739, code680709 | ||
| code360389, code001139 | ||
| gender, age | ||
| diabetes, BMI | ||
| code520579, code280289 | ||
| code460519, code800999 | ||
| code390459, dBP | ||
| code520579, code320359 | ||
| code630679, code710739 | ||
| code390459, code290319 | ||
| gender, code390459 | ||
| code630679, code580629 | ||
| code520579, code580629 | ||
| code740759, code290319 | ||
| code680709, code580629 | ||
| code280289, code240279 | ||
| code630679, code290319 | ||
| code710739, code520579 | ||
| code680709, code001139 | ||
| code390459, code520579 | ||
| code780799, code680709 | ||
| state, code710739 | ||
| code710739, code800999 | ||
| code800999, code360389 | ||
| code710739, code460519 | ||
| code630679, code001139 | ||
| code760779, code580629 | ||
| code760779, code360389 | ||
| code680709, code360389 | ||
| gender, diabetes | ||
| code760779, age | ||
| code760779, code280289 | ||
| code710739, code780799 | ||
| code760779, code290319 | ||
| code760779, code800999 | ||
| code390459, code780799 | ||
| code740759, code710739 |
| @@ -0,0 +1,73 @@ | ||
| weight, BMI | ||
| dBP, sBP | ||
| age, code390459 | ||
| code240279, age | ||
| height, BMI | ||
| gender, weight | ||
| diabetes, age | ||
| code520579, code710739 | ||
| code780799, code520579 | ||
| state, code780799 | ||
| state, code290319 | ||
| diabetes, code390459 | ||
| code710739, code800999 | ||
| code780799, code710739 | ||
| code780799, code580629 | ||
| code240279, code280289 | ||
| code780799, code680709 | ||
| sBP, diabetes | ||
| state, code460519 | ||
| height, gender | ||
| code710739, code320359 | ||
| code290319, code320359 | ||
| code240279, code390459 | ||
| code460519, code520579 | ||
| code780799, code280289 | ||
| code360389, code800999 | ||
| code630679, state | ||
| age, code140239 | ||
| code460519, code360389 | ||
| code360389, code680709 | ||
| code390459, code710739 | ||
| code460519, code680709 | ||
| code630679, age | ||
| code780799, code360389 | ||
| code710739, code290319 | ||
| code390459, code280289 | ||
| code460519, code001139 | ||
| code280289, code580629 | ||
| code680709, code710739 | ||
| code390459, gender | ||
| code780799, code800999 | ||
| code360389, code001139 | ||
| gender, code710739 | ||
| code520579, code320359 | ||
| code710739, code740759 | ||
| code680709, code800999 | ||
| code680709, code580629 | ||
| code240279, code520579 | ||
| code280289, code520579 | ||
| code780799, code001139 | ||
| code390459, code290319 | ||
| state, code240279 | ||
| code390459, code520579 | ||
| code630679, code240279 | ||
| code520579, code580629 | ||
| code001139, code680709 | ||
| code740759, code290319 | ||
| code630679, code710739 | ||
| code280289, code360389 | ||
| code760779, code360389 | ||
| code760779, code680709 | ||
| code780799, code460519 | ||
| code240279, code780799 | ||
| code240279, code460519 | ||
| code390459, code780799 | ||
| code760779, code580629 | ||
| diabetes, code240279 | ||
| code630679, code580629 | ||
| code760779, age | ||
| code630679, code290319 | ||
| diabetes, code630679 | ||
| code760779, code290319 | ||
| code760779, code800999 |
| @@ -0,0 +1,68 @@ | ||
| height, gender | ||
| BMI, weight | ||
| age, code240279 | ||
| code390459, sBP | ||
| height, weight | ||
| sBP, dBP | ||
| code390459, diabetes | ||
| code780799, code460519 | ||
| age, code390459 | ||
| code710739, code520579 | ||
| code710739, code800999 | ||
| code460519, code360389 | ||
| code680709, code001139 | ||
| code320359, code290319 | ||
| state, code290319 | ||
| state, code460519 | ||
| code240279, diabetes | ||
| code780799, code580629 | ||
| code710739, code780799 | ||
| code240279, code280289 | ||
| code780799, code280289 | ||
| code710739, code320359 | ||
| code460519, code001139 | ||
| code780799, code360389 | ||
| gender, code630679 | ||
| code140239, age | ||
| code630679, age | ||
| code710739, code680709 | ||
| state, code320359 | ||
| code390459, state | ||
| code390459, code520579 | ||
| code520579, code280289 | ||
| code360389, code001139 | ||
| code280289, code580629 | ||
| age, code710739 | ||
| gender, code710739 | ||
| code520579, code580629 | ||
| code630679, state | ||
| code630679, code580629 | ||
| state, code780799 | ||
| code780799, code520579 | ||
| code240279, code780799 | ||
| diabetes, BMI | ||
| code780799, code680709 | ||
| code460519, code680709 | ||
| code780799, code800999 | ||
| code710739, code740759 | ||
| code390459, code280289 | ||
| code460519, code520579 | ||
| code390459, code240279 | ||
| code780799, code001139 | ||
| code710739, code460519 | ||
| gender, code390459 | ||
| code760779, age | ||
| code680709, code800999 | ||
| code740759, code290319 | ||
| gender, diabetes | ||
| code630679, code140239 | ||
| code630679, code290319 | ||
| code760779, code580629 | ||
| code760779, code360389 | ||
| code630679, code001139 | ||
| code680709, code360389 | ||
| code360389, code580629 | ||
| code800999, code360389 | ||
| code760779, code710739 | ||
| code390459, code290319 | ||
| code760779, code240279 |
| @@ -0,0 +1,73 @@ | ||
| BMI, weight | ||
| dBP, sBP | ||
| height, weight | ||
| code390459, age | ||
| sBP, code390459 | ||
| gender, height | ||
| code390459, diabetes | ||
| code390459, code240279 | ||
| state, code780799 | ||
| state, code290319 | ||
| code680709, code001139 | ||
| code240279, age | ||
| code460519, code520579 | ||
| code710739, code800999 | ||
| code780799, code580629 | ||
| code780799, code710739 | ||
| code390459, state | ||
| state, code460519 | ||
| code460519, code680709 | ||
| code390459, code280289 | ||
| code460519, code780799 | ||
| BMI, gender | ||
| code460519, code001139 | ||
| age, code140239 | ||
| code710739, code320359 | ||
| code390459, code520579 | ||
| code460519, code360389 | ||
| code630679, state | ||
| code780799, code280289 | ||
| code290319, code320359 | ||
| code780799, code680709 | ||
| gender, code630679 | ||
| code780799, code360389 | ||
| code390459, code710739 | ||
| code280289, code580629 | ||
| code680709, code800999 | ||
| code390459, code630679 | ||
| code800999, code360389 | ||
| code520579, code240279 | ||
| code360389, code001139 | ||
| code780799, code800999 | ||
| code630679, age | ||
| gender, code390459 | ||
| gender, code710739 | ||
| diabetes, age | ||
| code630679, code580629 | ||
| code580629, code680709 | ||
| code390459, code290319 | ||
| code290319, code460519 | ||
| code140239, code580629 | ||
| code680709, code360389 | ||
| code630679, code710739 | ||
| code520579, code280289 | ||
| diabetes, code580629 | ||
| code520579, code320359 | ||
| code780799, code001139 | ||
| code780799, code520579 | ||
| code390459, code780799 | ||
| code630679, code290319 | ||
| code240279, diabetes | ||
| code280289, code240279 | ||
| code740759, code290319 | ||
| code760779, code360389 | ||
| code760779, code580629 | ||
| code630679, code001139 | ||
| code760779, code280289 | ||
| gender, diabetes | ||
| code630679, code240279 | ||
| code710739, code680709 | ||
| code290319, code710739 | ||
| code460519, code710739 | ||
| code710739, code520579 | ||
| code760779, code710739 |
| @@ -0,0 +1,77 @@ | ||
| using BayesNets | ||
| using DataFrames | ||
| using Graphs | ||
|
|
||
| ############################################################ | ||
| # Functions | ||
| ############################################################ | ||
|
|
||
function getParentsAndDomains(node::Int64, b::BayesNet)
    #=
    Scans every edge of b.dag and collects, for the edges whose target is
    `node`, the edge source and the domain stored for that source in
    b.domains.

    Returns a 3-tuple: the list of parents, the list of the parents'
    domains (same order), and a boolean that is true when no parent was
    found, i.e. the node is a source node.
    =#
    parents = Int64[]
    pDomains = DiscreteDomain[]
    for edge in b.dag.edges
        # Only edges pointing into `node` contribute a parent.
        edge.target == node || continue
        push!(parents, edge.source)
        push!(pDomains, b.domains[edge.source])
    end # for edge
    isSource = isempty(parents)
    return parents, pDomains, isSource
end # function getParentsAndDomains
|
|
||
| ############################################################ | ||
| # Main script | ||
| ############################################################ | ||
|
|
||
# reconstruct Bayes net object
# ARGS[1] supplies the base name: "<title>.csv" holds the data set and
# "<title>.gph" holds the graph, one "src, tgt" edge per line.
inname = ARGS[1]
title = splitext(inname)[1]
dataset = readtable(title * ".csv")

b = BayesNet(names(dataset))
b.domains = [DiscreteDomain([x for x in unique(dataset[label])])
             for label in names(dataset)]

# Lookup tables in both directions between node names and node indices.
name2index = b.index
index2name = Dict{Int64,Symbol}()
for key in keys(b.index)
    index2name[b.index[key]] = key
end # for key

fin = open(title * ".gph", "r")
lines = readlines(fin)
close(fin)
for line in lines
    # Skip empty lines (e.g. a trailing blank line at end of file).
    isempty(strip(line)) && continue
    nodes = split(line, ", ")
    src = convert(Symbol, nodes[1])
    # strip() removes the line terminator when present; the previous
    # nodes[2][1:end-1] always dropped the last character and so corrupted
    # the name when the final line had no newline.
    tgt = convert(Symbol, strip(nodes[2]))
    addEdge!(b, src, tgt)
end # for line


# initialize counts (MLE w/ Laplace smoothing)
counts = Dict()
for name in b.names
    node = name2index[name]
    # Was getParents(node, b), which is not defined in this file; the
    # three-value helper defined above is getParentsAndDomains.
    parents, pDomains, isSource = getParentsAndDomains(node, b)

end # for name


# count all training samples
samples = array(dataset)
nSamples = size(samples, 1)

# NOTE(review): `distributions` is not defined anywhere in this script as
# shown, so this loop will fail until it is; the loop body is still empty.
for distribution in distributions
    for sample = 1:nSamples

    end # for sample
end # for distribution


# normalize over all cpds
|
|
| @@ -0,0 +1,20 @@ | ||
class Node:
    """A graph node that tracks its parents, its children and an id."""

    def __init__(self, number):
        """Create a node identified by ``number`` with no parents or children."""
        self.id = number
        self.parents = []
        self.children = []

    def getID(self):
        """Return the identifier given at construction."""
        return self.id

    def addParent(self, node):
        """Append ``node`` to this node's parent list."""
        self.parents.append(node)

    def addChild(self, node):
        """Append ``node`` to this node's child list."""
        self.children.append(node)

    def getParents(self):
        """Return the parent list itself (not a copy)."""
        return self.parents

    def getChildren(self):
        """Return the child list itself (not a copy)."""
        return self.children
| @@ -0,0 +1,63 @@ | ||
| import sys | ||
| from probabilityTable import * | ||
|
|
||
class ParamLearn:
    """Smoothed conditional-probability estimates backed by a count table.

    ``self.pTable`` answers count queries through ``getCounts(dict)`` and
    ``self.domains`` maps every variable name to the collection of values
    that variable may take; the size of that collection is the smoothing
    term added to the denominator.
    """

    # countFile = e.g. train2.csv
    def __init__(self, countFile):
        """Build the count table from ``countFile`` and declare all domains."""
        self.pTable = PTable(countFile)
        self.domains = {}
        self.domains["state"] = ["AK","AL","AR","AZ","CA","CO","CT","DE","FL","GA","HI","IA","ID","IL","IN","KS","KY","LA","MA","MD","ME","MI","MN","MO","MS","MT","NC","ND","NE","NH","NJ","NM","NV","NY","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VA","VT","WA","WI","WV","WY"]
        self.domains["gender"] = ["M","F"]
        self.domains["age"] = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95]
        self.domains["height"] = range(40,90)
        self.domains["weight"] = range(0,490,10)
        self.domains["BMI"] = range(0, 170)
        self.domains["sBP"] = range(10, 225, 5)
        self.domains["dBP"] = range(0, 150, 5)
        # Every remaining variable is a 0/1 indicator; each key gets its
        # own list object, as before.
        binary_names = (
            "diabetes",
            "code001139", "code140239", "code240279", "code280289",
            "code290319", "code320359", "code360389", "code390459",
            "code460519", "code520579", "code580629", "code630679",
            "code680709", "code710739", "code740759", "code760779",
            "code780799", "code800999",
        )
        for name in binary_names:
            self.domains[name] = [0, 1]

    def getParentJointProb(self, node_id, node_val, params):
        """Return the add-one-smoothed estimate of P(node = val | params).

        Computes ``(count(params + node) + 1) / (count(params) + |domain|)``
        where the counts come from ``self.pTable.getCounts`` and
        ``|domain|`` is ``len(self.domains[node_id])``.

        Args:
            node_id: name of the queried variable (a key of self.domains).
            node_val: value of the queried variable.
            params: dict of conditioning assignments. It is no longer
                modified; the original inserted ``node_id`` into it.

        Returns:
            The estimate as a float.

        Raises:
            KeyError: if ``node_id`` has no declared domain.
        """
        #parentCount
        parentCount = self.pTable.getCounts(params)
        #posteriorCount
        # Work on a copy so the caller's dict is left untouched.
        withNode = dict(params)
        withNode[node_id] = node_val
        postCount = self.pTable.getCounts(withNode)
        return (postCount+1.0) / (parentCount+len(self.domains[node_id]))

    def getParentChildJointProb(self, node_id, node_val, parentDict, childDict):
        """Return the smoothed estimate conditioned on parents and children.

        Merges ``parentDict`` and ``childDict`` (child entries win on a key
        clash) and delegates to ``getParentJointProb``. Neither input dict
        is modified.
        """
        # copy + update works on Python 2 and 3; dict(a.items()+b.items())
        # only works where items() returns a list.
        jointParams = dict(parentDict)
        jointParams.update(childDict)
        return self.getParentJointProb(node_id, node_val, jointParams)
|
|
||
def main(argv):
    """Construct a ParamLearn from the count file named in ``argv[1]``.

    The resulting object is not used further here.
    """
    pLearn = ParamLearn(argv[1])


if __name__ == "__main__":
    main(sys.argv)
| @@ -0,0 +1,82 @@ | ||
| #from collections import Counter | ||
|
|
||
| import sqlite3 | ||
class PTable:
    """SQLite-backed table of observation rows with equality-count queries.

    On construction the ``counts`` table is dropped and recreated, one index
    per column is created, and the rows of a comma-separated file are
    inserted. ``getCounts`` then returns how many rows match a given
    column -> value assignment.
    """

    # Columns of the ``counts`` table, in schema order.
    COLUMNS = (
        "state", "gender", "age", "height", "weight", "BMI", "sBP", "dBP",
        "diabetes",
        "code001139", "code140239", "code240279", "code280289", "code290319",
        "code320359", "code360389", "code390459", "code460519", "code520579",
        "code580629", "code630679", "code680709", "code710739", "code740759",
        "code760779", "code780799", "code800999",
    )

    def __init__(self, filename, dbPath='example.db'):
        """Create the table and load ``filename`` into it.

        Args:
            filename: path of a comma-separated file whose first line names
                the columns (any order) and whose remaining lines are rows.
            dbPath: SQLite database to use; defaults to the on-disk
                ``example.db``. Pass ``":memory:"`` for a throwaway database.
        """
        self.conn = sqlite3.connect(dbPath)
        c = self.conn.cursor()
        c.execute('''DROP TABLE if EXISTS counts''')
        c.execute("CREATE TABLE counts (%s)"
                  % ", ".join("[%s] string" % col for col in self.COLUMNS))

        # One single-column index per column, since getCounts may filter on
        # any subset of them.
        for col in self.COLUMNS:
            c.execute("CREATE INDEX %s_index ON counts ([%s]);" % (col, col))

        self.loadTable(filename)

    def loadTable(self, filename):
        """Insert every non-blank data row of ``filename`` into ``counts``.

        The header line decides which column each field goes to. Also sets
        ``self.headers`` to the bracket-quoted, comma-joined header list and
        prints the resulting row count.
        """
        c = self.conn.cursor()

        # ``with`` guarantees the file is closed even if an insert fails.
        with open(filename) as fo:
            headers = fo.readline().rstrip().split(',')
            self.headers = ", ".join(["[" + str(h) + "]" for h in headers])

            # One placeholder per header column instead of a fixed count.
            placeholders = ", ".join(["?"] * len(headers))
            insert = ("INSERT INTO counts (%s) VALUES (%s)"
                      % (self.headers, placeholders))

            for line in fo:
                line = line.rstrip()
                if not line:
                    continue
                c.execute(insert, line.split(','))

        # End the implicit transaction opened by the inserts.
        self.conn.commit()

        c.execute("SELECT count(*) FROM counts")
        print(str(c.fetchone()[0]) + " rows loaded")

    def getCounts(self, paramsDict):
        """Return the number of rows matching every ``column = value`` pair.

        Args:
            paramsDict: dict mapping column name -> value. An empty dict
                matches every row.

        Returns:
            The matching row count as an int.
        """
        conditions = []
        colVals = []
        for key, val in paramsDict.items():
            # Column names are bracket-quoted, as in the INSERT statement;
            # values are always bound as parameters.
            conditions.append("[%s]=?" % key)
            colVals.append(val)

        query = "SELECT count(*) from counts"
        if conditions:
            # A bare "WHERE" with no predicate would be a syntax error.
            query += " WHERE " + " AND ".join(conditions)

        c = self.conn.cursor()
        return c.execute(query, colVals).fetchone()[0]
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
| @@ -0,0 +1,26 @@ | ||
| import unittest | ||
| from paramLearn import * | ||
| from tree import * | ||
|
|
||
class TestSubmission(unittest.TestCase):
    """Checks ParamLearn and Tree against the testData.csv / testData.gph
    fixtures, which must be in the working directory."""

    def test_paramLearn_getChildren(self):
        # Despite its name, this test exercises ParamLearn.getParentJointProb.
        pLearn = ParamLearn('testData.csv')
        pLearn.domains["sBP"] = [1,0]
        pLearn.domains["code280289"] = [1,0]
        node_id = 'sBP'
        node_val = '1'
        params = {'code280289': '0'}
        # testData.csv has 2 rows with code280289=0, one of which has sBP=1,
        # so with a 2-value domain the expected value is (1+1)/(2+2).
        self.assertEqual(pLearn.getParentJointProb(node_id, node_val, params), (1.0/2.0))

    def test_nodeTree(self):
        graph = Tree('testData.gph')

        n = graph.getNode('sBP')
        self.assertEqual(n.id, 'sBP')
        # testData.gph lists sBP -> weight, sBP -> dBP and code280289 -> sBP.
        self.assertEqual([p.id for p in n.getChildren()], ['weight', 'dBP'])
        self.assertEqual([p.id for p in n.getParents()], ['code280289'])


if __name__ == '__main__':
    unittest.main()
| @@ -0,0 +1,5 @@ | ||
| code280289,sBP,code240279,weight,code780799,code360389,height,code680709,dBP,code320359,BMI,state,code740759,code001139,code460519,code520579,code710739,code760779,code800999,code390459,diabetes,code290319,gender,age,code630679,code580629,code140239 | ||
| 0,1,1,270,0,0,69,0,80,0,41,SD,0,0,0,0,0,0,0,0,0,0,M,60,0,0,1 | ||
| 1,1,1,140,1,0,64,0,60,0,24,VA,0,0,1,0,1,0,0,1,0,0,F,85,0,0,0 | ||
| 0,0,1,220,1,0,68,0,70,0,33,NV,0,0,0,0,1,0,1,0,0,0,M,45,0,0,0 | ||
| 1,0,1,200,0,0,72,0,80,0,27,CA,0,0,1,0,0,0,0,1,0,0,M,60,0,0,0 |
| @@ -0,0 +1,3 @@ | ||
| code280289, sBP | ||
| sBP, weight | ||
| sBP, dBP |
| @@ -0,0 +1,39 @@ | ||
| from node import * | ||
| import sys | ||
|
|
||
class Tree:
    """Directed graph of Node objects, keyed by node id, built from an edge file.

    Each non-blank line of the edge file is ``parent,child``; whitespace
    around either id (e.g. ``parent, child``) is ignored.
    """

    # edgeFile = e.g. train3.gph
    def __init__(self, edgeFile):
        """Create an empty node dictionary and populate it from ``edgeFile``."""
        self.nodeDict = dict()
        self.loadNodes(edgeFile)

    def getNode(self, id_num):
        """Return the node stored under ``id_num``; raises KeyError if absent."""
        return self.nodeDict[id_num]

    def retrieveOrCreateNode(self, id_num):
        """Return the node for ``id_num``, creating and storing it on first use."""
        if id_num not in self.nodeDict:
            self.nodeDict[id_num] = Node(id_num)
        return self.nodeDict[id_num]

    #read in .gph file and populate nodes with data
    def loadNodes(self, edgeFile):
        """Read ``edgeFile`` and link parent/child nodes for every edge line.

        Raises:
            ValueError: if a non-blank line does not contain two
                comma-separated ids.
        """
        # ``with`` closes the file even if a malformed line raises.
        with open(edgeFile) as fo:
            for line in fo:
                line = line.strip()
                if not line:
                    continue

                # Split on the bare comma and strip each id so that both
                # "a,b" and "a, b" are accepted.
                node_ids = [part.strip() for part in line.split(',')]
                if len(node_ids) < 2:
                    raise ValueError("malformed edge line: %r" % line)

                node1 = self.retrieveOrCreateNode(node_ids[0])
                node2 = self.retrieveOrCreateNode(node_ids[1])
                node1.addChild(node2)
                node2.addParent(node1)
|
|
||
def main(argv):
    """Command-line entry point: load an edge file and report the node count.

    Args:
        argv: argument list in ``sys.argv`` form; ``argv[1]`` is the path of
            the edge file handed to ``Tree``.

    Exits with a usage message (non-zero status) when the path is missing.
    """
    if len(argv) < 2:
        prog = argv[0] if argv else "tree"
        sys.exit("usage: %s <edgeFile>" % prog)

    edgeFile = argv[1]
    tree = Tree(edgeFile)
    # Parenthesised single-string form prints identically on Python 2 and 3.
    print(str(len(tree.nodeDict)) + " nodes loaded to tree.")


if __name__ == "__main__":
    main(sys.argv)
| @@ -0,0 +1,10 @@ | ||
| # Description | ||
| This directory contains all the source code for the project. | ||
|
|
||
| ## Use of Julia files | ||
| 1. ssh into one of the new corn machines on the farm servers (corn01, corn02, corn07, corn10 work best; otherwise use corn-new) | ||
| 2. cd into the right directory, or copy over the files into any directory you want to work in | ||
| 3. from the command line, type in | ||
| - e.g., `julia structLearn.jl train.csv` | ||
| - e.g., `julia evalBayesNet.jl train1.gph train2.gph train3.gph train4.gph train5.gph train6.gph train7.gph train8.gph` | ||
| - e.g., `julia vizBayesNet.jl train1.gph train2.gph train3.gph train4.gph train5.gph train6.gph train7.gph train8.gph` |
| @@ -0,0 +1,74 @@ | ||
| weight,BMI | ||
| height,gender | ||
| diabetes,endocrineNutritionalMetabolicImmunity | ||
| sBP,dBP | ||
| height,BMI | ||
| age,circulatory | ||
| diabetes,age | ||
| gender,weight | ||
| circulatory,sBP | ||
| digestive,other | ||
| respiratory,other | ||
| state,mentalDisorders | ||
| senseOrgans,respiratory | ||
| diabetes,circulatory | ||
| other,state | ||
| musculoskeletal,digestive | ||
| other,genitourinary | ||
| musculoskeletal,injuryPoisoning | ||
| digestive,respiratory | ||
| skinSubcutaneousTissue,infectiousParasitic | ||
| other,blood | ||
| mentalDisorders,nervous | ||
| endocrineNutritionalMetabolicImmunity,blood | ||
| musculoskeletal,nervous | ||
| age,musculoskeletal | ||
| BMI,diabetes | ||
| respiratory,infectiousParasitic | ||
| respiratory,skinSubcutaneousTissue | ||
| musculoskeletal,other | ||
| circulatory,state | ||
| endocrineNutritionalMetabolicImmunity,digestive | ||
| injuryPoisoning,senseOrgans | ||
| musculoskeletal,respiratory | ||
| pregnancyChildbirthPuerperium,age | ||
| gender,pregnancyChildbirthPuerperium | ||
| blood,genitourinary | ||
| other,skinSubcutaneousTissue | ||
| age,neoplasms | ||
| musculoskeletal,mentalDisorders | ||
| injuryPoisoning,skinSubcutaneousTissue | ||
| circulatory,blood | ||
| circulatory,digestive | ||
| gender,musculoskeletal | ||
| digestive,senseOrgans | ||
| pregnancyChildbirthPuerperium,genitourinary | ||
| pregnancyChildbirthPuerperium,state | ||
| injuryPoisoning,other | ||
| digestive,genitourinary | ||
| senseOrgans,infectiousParasitic | ||
| musculoskeletal,senseOrgans | ||
| digestive,blood | ||
| musculoskeletal,skinSubcutaneousTissue | ||
| endocrineNutritionalMetabolicImmunity,other | ||
| skinSubcutaneousTissue,genitourinary | ||
| musculoskeletal,congenitalAnomalies | ||
| circulatory,mentalDisorders | ||
| digestive,nervous | ||
| pregnancyChildbirthPuerperium,mentalDisorders | ||
| injuryPoisoning,respiratory | ||
| other,infectiousParasitic | ||
| neoplasms,digestive | ||
| perinatal,genitourinary | ||
| digestive,injuryPoisoning | ||
| age,endocrineNutritionalMetabolicImmunity | ||
| senseOrgans,skinSubcutaneousTissue | ||
| congenitalAnomalies,mentalDisorders | ||
| circulatory,endocrineNutritionalMetabolicImmunity | ||
| pregnancyChildbirthPuerperium,infectiousParasitic | ||
| perinatal,age | ||
| perinatal,senseOrgans | ||
| perinatal,musculoskeletal | ||
| pregnancyChildbirthPuerperium,other | ||
| perinatal,mentalDisorders | ||
| perinatal,endocrineNutritionalMetabolicImmunity |
| @@ -0,0 +1,35 @@ | ||
| using BayesNets | ||
| using DataFrames | ||
| using Graphs | ||
| using TikzGraphs | ||
| using TikzPictures | ||
|
|
||
# Column names and value domains for every network are taken from this table.
dataset = readtable("train2_translated.csv")

# Render the DAG of `b` to a PDF at `filename`, labelling vertices with the
# node names. Defined once here (under a name that does not collide with
# TikzPictures.save) instead of being redefined on every loop iteration.
function saveBayesNetPdf(b::BayesNet, filename)
    TikzPictures.save(PDF(filename), TikzGraphs.plot(b.dag, ASCIIString[string(s) for s in b.names]))
end

# One PDF per edge file given on the command line: `foo.gph` -> `foo.pdf`.
# (The loop previously reset its index to 1 on every pass, so only the first
# file was ever rendered.)
for inname in ARGS
    title = splitext(inname)[1]
    outname = title * ".pdf"

    b = BayesNet(names(dataset))
    b.domains = [DiscreteDomain([x for x in unique(dataset[label])])
                 for label in names(dataset)]

    fin = open(inname, "r")
    lines = readlines(fin)
    close(fin)

    for line in lines
        # Strip instead of dropping the last character: the final line of a
        # file may have no trailing newline to drop.
        edge = strip(line)
        isempty(edge) && continue

        nodes = split(edge, ",")
        src = convert(Symbol, strip(nodes[1]))
        tgt = convert(Symbol, strip(nodes[2]))
        addEdge!(b, src, tgt)
    end # for line

    saveBayesNetPdf(b, outname)
    @printf("Output saved to %s\n", outname)

end # for inname