diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e4e5f6c --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*~ \ No newline at end of file diff --git a/README.md b/README.md index cc738b9..6887ba2 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,9 @@ This Julia software accompanies the following paper: -- [Simplicial closure and higher-order link prediction](https://arxiv.org/abs/1802.06916). +- [Simplicial closure and higher-order link prediction](http://www.pnas.org/content/early/2018/11/08/1800683115). Austin R. Benson, Rediet Abebe, Michael T. Schaub, Ali Jadbabaie, and Jon Kleinberg. - arXiv:1802.06916, 2018. + *Proceedings of the National Academy of Sciences*, 2018. This tutorial code is not the main software library for simplicial closure and higher-order link prediction, which is [ScHoLP.jl](https://github.com/arbenson/ScHoLP.jl). Instead, the tutorial has the following goals: @@ -16,9 +16,9 @@ This tutorial code is not the main software library for simplicial closure and h As discussed above, this tutorial shows how to use the ScHoLP.jl library for higher-order network analysis and reproduction of results. To get the ScHoLP.jl library and start using it in Julia: ```julia -Pkg.clone("https://github.com/arbenson/ScHoLP.jl.git") +import Pkg +Pkg.add("ScHoLP") Pkg.test("ScHoLP") -using ScHoLP ``` Note that ScHoLP.jl has thread-level parallelism available for many features (using Julia's Base.Threads). @@ -30,6 +30,21 @@ git clone https://github.com/arbenson/ScHoLP-Tutorial.git cd ScHoLP-Tutorial ``` +To run this entire tutorial, you will also need several Julia packages (not all packages are needed for each component; you can add them as necessary). + +```julia +import Pkg +Pkg.add("CSV") +Pkg.add("DataFrames") +Pkg.add("Distributions") +Pkg.add("FileIO") +Pkg.add("GLM") +Pkg.add("HypothesisTests") +Pkg.add("JLD2") +Pkg.add("PyCall") +Pkg.add("ScikitLearn") +``` + ### Data The package comes with a few example datasets. @@ -37,7 +52,7 @@ The package comes with a few example datasets. ```julia using ScHoLP ex = example_dataset("example1") # example from figure 1 of paper -typeof(ex) # should be ScHoLP.HONData +typeof(ex) # should be HONData ex.simplices, ex.nverts, ex.times, ex.name # components of the data structure chs = example_dataset("contact-high-school") # another dataset ``` @@ -52,7 +67,15 @@ ndc_classes = read_txt_data("NDC-classes") enron = read_txt_data("email-Enron") ``` -The collection of datasets from the paper are available from [this web site](http://www.cs.cornell.edu/~arb/data/). +The collection of datasets from the paper is available from [this web site](http://www.cs.cornell.edu/~arb/data/). You can also download them wholesale and use them as follows. + +```bash +cd ScHoLP-Tutorial/data +wget https://github.com/arbenson/ScHoLP-Data/archive/1.0.tar.gz +tar -xzvf 1.0.tar.gz +gunzip ScHoLP-Data-1.0/*/*.gz +mv ScHoLP-Data-1.0/* .
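+# Optional: ScHoLP.jl can use thread-level parallelism via Julia's
+# Base.Threads (noted above). A minimal sketch, assuming you want four
+# threads (an arbitrary choice): set the thread count before starting Julia.
+export JULIA_NUM_THREADS=4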
+``` ### Simplicial closures @@ -145,20 +168,14 @@ Now we can generate scores of the open triangles from the first 80% of the datas ```julia collect_local_scores(enron) # scores based on local structural features collect_walk_scores(enron) # scores based on random walks and paths -collect_Simplicial_PPR_combined_scores(enron) # scores based on Simplicial PPR -collect_logreg_supervised_scores(enron) # scores based on logistic regression supervised method -``` - -Since enron is a small dataset, we can afford to decompose the Simplicial PPR scores into the gradient, curl, and harmonic components: - -```julia -collect_Simplicial_PPR_decomposed_scores(enron) +collect_logreg_supervised_scores(enron) # scores based on logistic regression +collect_Simplicial_PPR_decomposed_scores(enron) # scores based on Simplicial PPR ``` We can evaluate how well these methods do compared to random guessing with respect to area under the precision-recall curve. This should reproduce the line for the email-Enron dataset in Table 2 of the paper. ```julia -evaluate(enron, ["harm_mean", "geom_mean", "arith_mean", "common", "jaccard", "adamic_adar", "proj_graph_PA", "simplex_PA", "UPKatz", "WPKatz", "UPPR", "WPPR", "SimpPPR_comb", "logreg_supervised", "SimpPPR_grad", "SimpPPR_harm", "SimpPPR_curl"]) +evaluate(enron, ["harm_mean", "geom_mean", "arith_mean", "common", "jaccard", "adamic_adar", "proj_graph_PA", "simplex_PA", "UPKatz", "WPKatz", "UPPR", "WPPR", "SimpPPR_comb", "SimpPPR_grad", "SimpPPR_harm", "SimpPPR_curl", "logreg_supervised"]) ``` We can also look at the top predictions made by the algorithms. @@ -170,21 +187,21 @@ top_predictions(enron, "UPPR", 12) This should produce the following output ``` -1 (0.304908; 0): joe.stepenovitch@enron.com; don.baughman@enron.com; larry.campbell@enron.com -2 (0.272448; 0): joe.stepenovitch@enron.com; don.baughman@enron.com; benjamin.rogers@enron.com -3 (0.253939; 0): larry.campbell@enron.com; don.baughman@enron.com; benjamin.rogers@enron.com -4 (0.189741; 0): joe.parks@enron.com; eric.bass@enron.com; dan.hyvl@enron.com -5 (0.181000; 1): lisa.gang@enron.com; kate.symes@enron.com; bill.williams@enron.com -6 (0.179424; 0): joe.quenet@enron.com; chris.dorland@enron.com; jeff.king@enron.com -7 (0.176207; 0): joe.quenet@enron.com; jeff.king@enron.com; fletcher.sturm@enron.com -8 (0.175591; 1): lisa.gang@enron.com; holden.salisbury@enron.com; kate.symes@enron.com -9 (0.173161; 1): lisa.gang@enron.com; holden.salisbury@enron.com; bill.williams@enron.com -10 (0.170872; 0): geir.solberg@enron.com; holden.salisbury@enron.com; kate.symes@enron.com +1 (0.304992; 0): joe.stepenovitch@enron.com; don.baughman@enron.com; larry.campbell@enron.com +2 (0.272495; 0): joe.stepenovitch@enron.com; don.baughman@enron.com; benjamin.rogers@enron.com +3 (0.253992; 0): larry.campbell@enron.com; don.baughman@enron.com; benjamin.rogers@enron.com +4 (0.189678; 0): joe.parks@enron.com; eric.bass@enron.com; dan.hyvl@enron.com +5 (0.181085; 1): lisa.gang@enron.com; kate.symes@enron.com; bill.williams@enron.com +6 (0.179377; 0): joe.quenet@enron.com; chris.dorland@enron.com; jeff.king@enron.com +7 (0.176236; 0): joe.quenet@enron.com; jeff.king@enron.com; fletcher.sturm@enron.com +8 (0.175624; 1): lisa.gang@enron.com; holden.salisbury@enron.com; kate.symes@enron.com +9 (0.173160; 1): lisa.gang@enron.com; holden.salisbury@enron.com; bill.williams@enron.com +10 (0.170947; 0): geir.solberg@enron.com; holden.salisbury@enron.com; kate.symes@enron.com 11 (0.164845; 0): geir.solberg@enron.com; 
holden.salisbury@enron.com; bill.williams@enron.com -12 (0.162414; 0): lisa.gang@enron.com; cara.semperger@enron.com; kate.symes@enron.com +12 (0.162391; 0): lisa.gang@enron.com; cara.semperger@enron.com; kate.symes@enron.com ``` -These are the top 12 predictions for the unweighted personalized PageRank scores. The tuple next to the ordered numbers, e.g., (0.304908; 0) in the first line, gives the score function value and a 0/1 indicator of whether or not the open triangle closed in the final 20% of the dataset (1 means that it closed). Here, we see that the triples of nodes with the 5th, 8th, and 9th highest scores simplicially closed. +These are the top 12 predictions for the unweighted personalized PageRank scores. The tuple next to each rank, e.g., (0.304992; 0) in the first line, gives the score function value and a 0/1 indicator of whether or not the open triangle closed in the final 20% of the dataset (1 means that it closed). Here, we see that the triples of nodes with the 5th, 8th, and 9th highest scores went through a simplicial closure event. ### Summary statistics There is some basic functionality for gathering summary statistics about the dat ```julia chs = example_dataset("contact-high-school") -basic_summary_statistics(chs) # prints basic summary statistics (same as Table 1 in paper) -summary_statistics(chs) # more advanced statistics --> contact-high-school-statistics.csv +# print basic summary statistics (same as Table 1 in paper) +basic_summary_statistics(chs) +# compute more advanced statistics --> contact-high-school-statistics.csv +summary_statistics(chs); ``` The last command writes several summary statistics to a csv file. For example, "meansimpsize" is the mean number of nodes in each simplex, "projdensity" is the edge density of the projected graph, and "nclosedtri" and "nopentri" are the number of closed and open triangles. The first line of the csv file lists the variable names, the next line gives the statistics for the full dataset, and the last line gives the statistics for the dataset restricted to only 3-node simplices. @@ -205,25 +224,25 @@ contact-high-school,327,172035,352718,2,5,32.644895,1.091537e-01,2370,31850,7937 contact-high-school-3-3,317,7475,22425,3,3,8.305556,5.390728e-02,2091,5721,2126,6378,3.000000,2.362222,1.132810,1.118476e-01,2094,18139 -### Reproducing results +### Reproducing results in the main text This section shows how to reproduce results from the paper. -##### Linear models for relationships in Figures 2D and 2F +##### Linear models for relationships in Figures 2D and 2E We create linear models for the fraction of open triangles in terms of the covariate log average degree (plus an intercept term). The following code snippet produces these models. ```julia # starting from the main directory of tutorial code include("statistical_tests_and_models.jl") -model_fig_2d, model_fig_2f = fracopen_logavedeg_linear_models(); -r2(model_fig_2d) # roughly 0.38 -r2(model_fig_2f) # roughly 0.85 +model_fig_2D, model_fig_2E = fracopen_logavedeg_linear_models(); +r2(model_fig_2D) # roughly 0.38 +r2(model_fig_2E) # roughly 0.85 ``` -##### Hypothesis tests for strong wedge vs. weak open triangle and strong flap vs. weak open wireframe +##### Hypothesis tests for fewer strong ties vs. more weak ties -Here we are testing hypotheses on whether stronger but fewer ties (strong wedge and flap) or weaker but more ties (weak open triangle and wireframe) are more indicative of simplicial closure.
+Here we are testing hypotheses on whether stronger but fewer ties or weaker but more ties are more indicative of simplicial closure. ```julia # starting from the main directory of tutorial code include("statistical_tests_and_models.jl") @@ -237,16 +256,35 @@ simplicial_closure_tests(1e-3) ``` We saw how to get these numbers in the summary statistics section above. The `basic_summary_statistics()` function produces the numbers. -##### Table 2 (Higher-order link prediction performance) +##### Table 2 (logistic regression for system domain classification) + +Egonet data was collected with the function call `collect_egonet_data(100, 20)` in the file `egonet_analysis.jl`. This takes some time, so we pre-computed the data output and stored it in the directory `output/egonets`. We can reproduce the performance of the logistic regression models with the following code snippet. + +```julia +include("egonet_analysis.jl") +egonet_predict([LOG_DENSITY, LOG_AVE_DEG, FRAC_OPEN]) +egonet_predict([LOG_AVE_DEG, FRAC_OPEN]) +egonet_predict([LOG_DENSITY, FRAC_OPEN]) +egonet_predict([LOG_DENSITY, LOG_AVE_DEG]) +``` + +##### Table 3 (Higher-order link prediction performance) The numbers in this table came from using the higher-order link prediction methods outlined above. Note that some of the score functions are computationally expensive. The necessary Julia functions are: - `collect_labeled_dataset()` to generate the labeled dataset based on an 80/20 split of the data - `collect_local_scores()` to generate scores based on local structural features - `collect_walk_scores()` to generate scores based on random walks and paths -- `collect_Simplicial_PPR_combined_scores()` to generate scores based on simplicial PPR - `collect_logreg_supervised_scores()` to generate scores from the supervised learning method +After collecting the data, we can reproduce results in the table with the following commands. + +```julia +include("open_triangle_prediction.jl") +enron = example_dataset("email-Enron") +evaluate(enron, ["harm_mean", "geom_mean", "arith_mean", "adamic_adar", "proj_graph_PA", "UPKatz", "UPPR", "logreg_supervised"]) +``` + ##### Figure 1 (small example of higher-order network) The example higher-order network in Figure 1 is one of the examples included with the library. Here we show how to list the simplices and compute the weighted projected graph. ```julia using ScHoLP ex_fig1 = example_dataset("example1") # Print out simplices -ind = 1 -for (nv, t) in zip(ex_fig1.nverts, ex_fig1.times) - simplex = ex_fig1.simplices[ind:(ind + nv - 1)] - ind += nv - println("$t $simplex") +function print_simplices() + ind = 1 + for (nv, t) in zip(ex_fig1.nverts, ex_fig1.times) + simplex = ex_fig1.simplices[ind:(ind + nv - 1)] + ind += nv + println("$t $simplex") + end end +print_simplices() # Get the weighted projected graph basic_matrices(ex_fig1)[3] ``` -##### Figure 2A—B (legend and simplex size distribution) +##### Figure 2A (simplex size distribution) Here is a sample code snippet for computing the simplex size distribution for the email-Enron dataset. ```julia using ScHoLP enron = read_txt_data("email-Enron") num_verts, fracs = simplex_size_dist(enron) for (nv, f) in zip(num_verts, fracs) println("$nv $f") end ``` @@ -284,88 +325,89 @@ -For reproducing the figure, we have pre-computed the distributions in the files `output/simplex-size-dists/*-simplex-size-dist.mat`. The following produces the simplex size distribution and saves the figure. +For reproducing the figure, we have pre-computed the distributions in the files `output/simplex-size-dists/*-simplex-size-dist.jld2`. The following produces the simplex size distribution and saves the figure.
```julia # starting from the main directory of tutorial code include("paper_plots.jl") -simplex_size_dist_plot() # produce figures 2AB --> simplex-size-dist.pdf +# produce figure 2A --> simplex-size-dist.pdf +simplex_size_dist_plot() ``` -##### Figure 2C—F (basic dataset structure) +##### Figure 2B—E (basic dataset structure) These figures rely on using the `summary_statistics()` function for all of the datasets. For some of the larger datasets, this can take a while. For this tutorial, the pre-computed statistics are included in the `output/summary-stats/` directory. The following code snippet reproduces the figure. ```julia # starting from the main directory of tutorial code include("paper_plots.jl") -dataset_structure_plots() # produce figures 2CDEF --> dataset-structure.pdf +# produce figures 2BCDE --> dataset-structure.pdf +dataset_structure_plots() +``` + +##### Figure 3 (logistic regression decision boundary) + +Plot the decision boundary for the logistic regression classifier. + +```julia +include("paper_plots.jl") +logreg_decision_boundary() ``` -##### Figure 2G—H (model simulation) +##### Figure 4 (model simulation) -These figures require running simulations. Since the simulations are random, the output may not be exactly the same. The following will re-run the simulations and write the results to `simulation.mat`. +This figure requires running simulations. Since the simulations are random, the output may not be exactly the same. The following will re-run the simulations and write the results to `simulation.jld2`. ```julia # starting from the main directory of tutorial code include("simulations.jl") -simulate() # run the simulations (takes several minutes) --> simulation.mat +# run the simulations (takes several minutes) +simulate() # --> stores in output/simulation/simulation.jld2 ``` -The simulation results used in the paper are stored in `output/simulation/simulation.mat` for convenience. The above code should produce something similar but not exactly the same (due to randomness in the simulation). The following code snippet reproduces figures 2GH. +The simulation results used in the paper are stored in `output/simulation/simulation.jld2` for convenience. The above code should produce something similar but not exactly the same (due to randomness in the simulation). The following code snippet reproduces Figure 4. ```julia # starting from the main directory of tutorial code include("paper_plots.jl") -simulation_plots() # reproduce figures 2GH --> simulation.pdf +simulation_plot() # reproduce Figure 4 --> simulation.pdf ``` -##### Figure 3 (lifecycles) +##### Figure 5 (lifecycles) -Producing results from Figure 3A uses the `process_lifecycles()` function from the ScHoLP.jl library. Figures 3B—D use the `lifecycle()` function in `lifecycle_analysis.jl`. +Producing results from Figure 5A uses the `process_lifecycles()` function from the ScHoLP.jl library. Figures 5B—D use the `lifecycle()` function in `lifecycle_analysis.jl`.
```julia -# starting from the main directory of tutorial code include("lifecycle_analysis.jl") -hist = read_txt_data("coauth-MAG-History") # read dataset from data/coauth-MAG-History directory -# Get data from Figure 3A +# read dataset from data/coauth-MAG-History directory +hist = read_txt_data("coauth-MAG-History") +# Get data for Figure 5A (this may take a couple minutes) closed_transition_counts, open_transition_counts = process_lifecycles(hist) # direct transitions to simplicial closure from each state closed_transition_counts[end, 1:(end-1)] - -ndc_classes = read_txt_data("NDC-classes") # read data from data/NDC-classes directory +# read data from data/NDC-classes directory +ndc_classes = read_txt_data("NDC-classes") node_labels = read_node_labels("NDC-classes") -node_labels[[44, 74, 76]] # nodes in Figure 3B +node_labels[[44, 74, 76]] # nodes in Figure 5B lifecycle(ndc_classes, 44, 74, 76) ``` The simplex labels in the last function call are NDC codes. For example, the first one is 67296-1236. This corresponds to Reyataz as produced by Redpharm Drug, Inc. in 2003, as recorded [here](https://ndclist.com/ndc/67296-1236/package/67296-1236-4). -##### Figure 4 (3-node configuration closure probabilities) +##### Figure 6 (3-node and 4-node configuration closure probabilities) This figure is constructed from the simplicial closure probabilities on 3-node and 4-node configurations. Above, we showed how to compute these. We have pre-computed the probabilities for each dataset in the directory `output/3-node-closures/`. ```julia # starting from the main directory of tutorial code include("paper_plots.jl") -closure_probs_heat_map(3) # Figure 4A --> closure-probs-scatter-3.pdf -three_node_scatter_plot() # Figures 4BCD +three_node_scatter_plot() # Figures 6ABC --> closure-prob-scatter-3.pdf +four_node_scatter_plot() # Figures 6DEF --> closure-prob-scatter-4.pdf ``` -##### Figure 5 (4-node configuration closure probabilities) +##### Figure 7 (generalized means) -Similar to Figure 4, this figure is constructed from simplicial closure probabilities on 4-node configurations. We showed above how to compute these. We have pre-computed the probabilities for each dataset in `output/*-4-node-closures.txt`. - -```julia -# starting from the main directory of tutorial code -include("paper_plots.jl") -closure_probs_heat_map(4) # Figure 5A --> closure-probs-4.pdf -four_node_scatter_plot() # Figures 5BCD --> closure-prob-scatter-4.pdf -``` - -##### Figure 6 (generalized means) - -We first show how to collect the data for generalized means. The following code snippet should produce an output file `prediction-output/email-Enron-open-tris-80-100-genmeans-perf.mat`. +We first show how to collect the data for generalized means. The following code snippet should produce an output file `prediction-output/email-Enron-open-tris-80-100-genmeans-perf.jld2`. ```julia # starting from the main directory of tutorial code @@ -380,10 +422,14 @@ We pre-computed the generalized mean scores for all of the datasets in the paper ```julia # starting from the main directory of tutorial code include("paper_plots.jl") -generalized_means_plot() # Figure 6 --> generalized-means-perf.pdf +generalized_means_plot() # --> generalized-means-perf.pdf ``` -##### Table 3 (temporal asynchroncity) + + +### Reproducing results in the supplementary material + +##### Table S1 (temporal asynchronicity) To measure temporal asynchronicity in the datasets, we look at the number of "active interval" overlaps in the open triangles.
The active interval of a pair of nodes is the span of time between the first and last simplices containing both nodes. @@ -400,7 +446,7 @@ dataset & # open triangles & 0 overlaps & 1 overlap & 2 overlaps & 3 overlaps email-Enron & 3317 & 0.008 & 0.130 & 0.151 & 0.711 ``` -##### Table 4 (dependence of tie strength and edge density at different points in time) +##### Table S2 (dependence of tie strength and edge density at different points in time) The results in this table use only the core ScHoLP.jl functionality and the same function we saw above for the simplicial closure probabilities. We provide an extra input parameter to the function `closure_type_counts3()` that pre-filters the dataset to the first X% of timestamped simplices. @@ -423,7 +469,7 @@ for X in [40, 60, 80, 100] end ``` -##### Table 5 (Simplicial closure probabilities at different points in time) +##### Table S3 (Simplicial closure probabilities at different points in time) In describing how to reproduce Table S2, we showed how to get the closure probabilities at different points in time. The following code snippet prints out some of the statistics for other datasets, which are pre-computed and stored in the `output/3-node-closures/` directory. @@ -445,23 +491,74 @@ closure_stats_over_time("DAWN") closure_stats_over_time("tags-stack-overflow") ``` -##### Table 6 (4-node configuration reference figures) +##### Table S5 (extra results from models) + +Here is an example to get all of the results from the email-Enron dataset. + +```julia +include("open_triangle_prediction.jl") +enron = read_txt_data("email-Enron") # read from data/email-Enron directory +collect_labeled_dataset(enron) +collect_local_scores(enron) # scores based on local structural features +collect_walk_scores(enron) # scores based on random walks and paths +collect_logreg_supervised_scores(enron) # scores based on logistic regression +collect_Simplicial_PPR_decomposed_scores(enron) # scores based on Simplicial PPR +evaluate(enron, ["harm_mean", "geom_mean", "arith_mean", "common", "jaccard", "adamic_adar", "proj_graph_PA", "simplex_PA", "UPKatz", "WPKatz", "UPPR", "WPPR", "SimpPPR_comb", "logreg_supervised"]) +``` + +##### Table S6 (extra results from the Hodge decomposition) -This table is just for illustration and does not present computational results. This table shows the results from using the Hodge decomposition to further decompose the simplicial personalized PageRank scores. Note that this software uses the newer normalization method described in the following paper: -##### Table 7 (extra results from the Hodge decomposition) +- [Random walks on simplicial complexes and the normalized Hodge Laplacian](https://arxiv.org/abs/1807.05044). Michael T. Schaub, Austin R. Benson, Paul Horn, Gabor Lippner, and Ali Jadbabaie. *arXiv:1807.05044*, 2018. -This table shows the results from using the Hodge decomposition to further decompose the simplicial personalized PageRank scores. Here is how one would reproduce the line for the NDC-classes dataset (numbers may be slightly different due to randomness). +Here is how one would reproduce the line for the NDC-classes dataset.
```julia # starting from the main directory of tutorial code include("open_triangle_prediction.jl") -ndc_classes = read_txt_data("NDC-classes") # read data from data/NDC-classes directory -collect_labeled_dataset(ndc_classes) # collect the data from the 80/20 split -collect_Simplicial_PPR_decomposed_scores(ndc_classes) # collect scores -evaluate(ndc_classes, ["SimpPPR_comb", "SimpPPR_grad", "SimpPPR_harm", "SimpPPR_curl"]) # print relative scores +# read data from data/NDC-classes directory +ndc_classes = read_txt_data("NDC-classes") +# collect the data from the 80/20 split +collect_labeled_dataset(ndc_classes) +# collect scores +collect_Simplicial_PPR_decomposed_scores(ndc_classes) +# print relative scores +evaluate(ndc_classes, ["SimpPPR_comb", "SimpPPR_grad", "SimpPPR_harm", "SimpPPR_curl"]) +``` + +##### Table S7 (output predictions) + +We showed how to look at the top predictions in the higher-order link prediction section above. Here is the specific command to reproduce Table S7. + +```julia +include("open_triangle_prediction.jl") +dawn = read_txt_data("DAWN") # need to download DAWN data to data/ directory +collect_labeled_dataset(dawn) +collect_local_scores(dawn) +top_predictions(dawn, "adamic_adar", 25) +``` + +##### Figure S1 (heat map of 3-node closures) + +```julia +include("paper_plots.jl") +closure_probs_heat_map(3) +``` + +##### Figure S2 (heat map of 4-node closures) + +```julia +include("paper_plots.jl") +closure_probs_heat_map(4) ``` -##### Table S8 (output predictions) +##### Figure S3 (heat map of 3-node closures at different points in time) + +```julia +include("paper_plots.jl") +for X in [40, 60, 80, 100] + closure_probs_heat_map(3, X) +end +``` -We showed how to look at the top predictions in the higher-order link prediction section above.
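+
+The pre-computed `.jld2` files shipped with this tutorial can also be inspected directly. A minimal sketch (the key names here are the ones read back by `simulation_plot()` in `paper_plots.jl`):
+
+```julia
+using FileIO, JLD2
+data = load("output/simulation/simulation.jld2")
+data["n"]  # values of n across the simulation runs; also try "b", "density", "frac_open"
+```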
diff --git a/common.jl b/common.jl index 8213ca5..62faee9 100644 --- a/common.jl +++ b/common.jl @@ -1,10 +1,28 @@ using Base.Threads using Combinatorics +using DelimitedFiles +using FileIO +using JLD2 +using Random using ScHoLP +using SparseArrays using StatsBase +const NUM_FEATS = 3 +const LOG_AVE_DEG = 1 +const LOG_DENSITY = 2 +const FRAC_OPEN = 3 + function read_txt_data(dataset::String) - read(filename::String) = convert(Vector{Int64}, readdlm(filename, Int64)[:, 1]) + function read(filename::String) + ret = Int64[] + open(filename) do f + for line in eachline(f) + push!(ret, parse(Int64, line)) + end + end + return ret + end return HONData(read("data/$(dataset)/$(dataset)-simplices.txt"), read("data/$(dataset)/$(dataset)-nverts.txt"), read("data/$(dataset)/$(dataset)-times.txt"), @@ -40,13 +58,39 @@ function read_closure_stats(dataset::String, simplex_size::Int64, initial_cutoff end for row_ind in 1:size(data, 1) row = convert(Vector{Int64}, data[row_ind, :]) - push!(keys, (row[1:simplex_size]...)) + push!(keys, tuple(row[1:simplex_size]...)) push!(nsamples, row[end - 1]) push!(nclosed, row[end]) end return (keys, nsamples, nclosed) end +function egonet_train_test_data(trial::Int64) + Random.seed!(444) # for reproducibility + data = load("output/egonets/egonet-data-$trial.jld2") + X = data["X"] + y = data["labels"] + yf = data["full_labels"] + inds = randperm(length(y)) + X = X[inds, :] + y = y[inds] + yf = yf[inds] + + train_inds = Int64[] + test_inds = Int64[] + for label in sort(unique(y)) + inds = findall(y .== label) + end_ind = convert(Int64, round(length(inds) * 0.8)) + append!(train_inds, inds[1:end_ind]) + append!(test_inds, inds[(end_ind + 1):end]) + end + + X_train, X_test = X[train_inds, :], X[test_inds, :] + y_train, y_test = y[train_inds], y[test_inds] + yf_train, yf_test = yf[train_inds], yf[test_inds] + return (X_train, X_test, y_train, y_test, yf_train, yf_test) +end + # This is just a convenient wrapper around all of the formatting parameters for # making plots. 
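+# (The first entry of each row, row[1], is the dataset name; the remaining
+# entries are per-dataset plotting parameters used by paper_plots.jl.)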
function all_datasets_params() @@ -75,3 +119,4 @@ function all_datasets_params() ] return plot_params end +; diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 0000000..d64b504 --- /dev/null +++ b/data/.gitignore @@ -0,0 +1,14 @@ +DAWN +NDC-substances +coauth-DBLP +coauth-MAG-Geology +congress-bills +contact-high-school +contact-primary-school +email-Eu +tags-ask-ubuntu +tags-math-sx +tags-stack-overflow +threads-ask-ubuntu +threads-math-sx +threads-stack-overflow diff --git a/egonet_analysis.jl b/egonet_analysis.jl new file mode 100644 index 0000000..7fb1885 --- /dev/null +++ b/egonet_analysis.jl @@ -0,0 +1,149 @@ +include("common.jl") + +using DataFrames +using GLM +using Printf +using Random +using SparseArrays +using Statistics + +using ScikitLearn +@sk_import linear_model: LogisticRegression + +# Construct HONData for a given ego +function egonet_dataset(dataset::HONData, ego::Int64, B::SpIntMat) + in_egonet = zeros(Bool, size(B, 1)) + in_egonet[ego] = true + in_egonet[findnz(B[:, ego])[1]] .= true + + node_map = Dict{Int64, Int64}() + function get_key(x::Int64) + if haskey(node_map, x); return node_map[x]; end + n = length(node_map) + 1 + node_map[x] = n + return n + end + ego_key = get_key(ego) + + new_simplices = Int64[] + new_nverts = Int64[] + new_times = Int64[] + curr_ind = 1 + for (nvert, time) in zip(dataset.nverts, dataset.times) + end_ind = curr_ind + nvert - 1 + simplex = dataset.simplices[curr_ind:end_ind] + curr_ind += nvert + simplex_in_egonet = [v for v in simplex if in_egonet[v]] + if length(simplex_in_egonet) > 0 + mapped_simplex = [get_key(v) for v in simplex_in_egonet] + append!(new_simplices, mapped_simplex) + push!(new_nverts, length(mapped_simplex)) + push!(new_times, time) + end + end + + return HONData(new_simplices, new_nverts, new_times, "egonet") +end + +function egonet_stats(dataset_name::String, num_egos::Int64) + # read data + dataset = read_txt_data(dataset_name) + A1, At1, B1 = basic_matrices(dataset.simplices, dataset.nverts) + + # Get eligible egos + n = size(B1, 1) + tri_order = proj_graph_degree_order(B1) + in_tri = zeros(Int64, n, Threads.nthreads()) + Threads.@threads for i = 1:n + for (j, k) in neighbor_pairs(B1, tri_order, i) + if B1[j, k] > 0 + tid = Threads.threadid() + in_tri[[i, j, k], tid] .= 1 + end + end + end + eligible_egos = findall(vec(sum(in_tri, dims=2)) .> 0) + num_eligible = length(eligible_egos) + println("$num_eligible eligible egos") + + # Sample from eligible egos + sampled_egos = + eligible_egos[StatsBase.sample(1:length(eligible_egos), + num_egos, replace=false)] + + # Collect statistics + X = zeros(Float64, NUM_FEATS, length(sampled_egos)) + for (j, ego) in enumerate(sampled_egos) + print(stdout, "$j \r") + flush(stdout) + egonet = egonet_dataset(dataset, ego, B1) + A, At, B = basic_matrices(egonet.simplices, egonet.nverts) + + num_nodes = sum(sum(At, dims=1) .> 0) + no, nc = num_open_closed_triangles(A, At, B) + + # log average degree + X[LOG_AVE_DEG, j] = log.(nnz(B) / num_nodes) + # log edge density + X[LOG_DENSITY, j] = log.(nnz(B) / (num_nodes^2 - num_nodes)) + # frac. 
open tris + X[FRAC_OPEN, j] = no / (no + nc) + end + + return convert(SpFltMat, X') +end + +function collect_egonet_data(num_egos::Int64, trial::Int64) + Random.seed!(1234 * trial) # reproducibility + dataset_names = [row[1] for row in all_datasets_params()] + ndatasets = length(dataset_names) + X = zeros(Float64, 0, NUM_FEATS) + labels = Int64[] + full_labels = Int64[] + for (ind, dname) in enumerate(dataset_names) + println("$dname...") + label = nothing + if (dname == "coauth-DBLP" || + dname == "coauth-MAG-Geology" || + dname == "coauth-MAG-History"); label = 0; + elseif (dname == "tags-stack-overflow" || + dname == "tags-math-sx" || + dname == "tags-ask-ubuntu"); label = 1; + elseif (dname == "threads-stack-overflow" || + dname == "threads-math-sx" || + dname == "threads-ask-ubuntu"); label = 2; + elseif (dname == "contact-high-school" || + dname == "contact-primary-school"); label = 3; + elseif (dname == "email-Eu" || + dname == "email-Enron"); label = 4; + end + if label !== nothing + X = [X; egonet_stats(dname, num_egos)] + append!(labels, ones(Int64, num_egos) * label) + # per-dataset label, needed by egonet_train_test_data() in common.jl + append!(full_labels, ones(Int64, num_egos) * ind) + end + end + save("output/egonets/egonet-data-$trial.jld2", + Dict("X" => X, "labels" => labels, "full_labels" => full_labels)) +end + +function egonet_predict(feat_cols::Vector{Int64}) + accs_mlr = Float64[] + accs_rnd = Float64[] + + for trial in 1:20 + (X_train, X_test, y_train, y_test) = egonet_train_test_data(trial)[1:4] + model = LogisticRegression(fit_intercept=true, multi_class="multinomial", + C=10, solver="newton-cg", max_iter=10000) + # restrict to the requested feature columns + ScikitLearn.fit!(model, X_train[:, feat_cols], y_train) + rand_prob = + sum([(sum(y_train .== l) / length(y_train))^2 for l in unique(y_train)]) + push!(accs_mlr, ScikitLearn.score(model, X_test[:, feat_cols], y_test)) + push!(accs_rnd, rand_prob) + end + + # model accuracy, then the random-guessing baseline (mean +/- std over trials) + @printf("%0.2f +/- %0.2f\n", mean(accs_mlr), std(accs_mlr)) + @printf("%0.2f +/- %0.2f\n", mean(accs_rnd), std(accs_rnd)) +end diff --git a/lifecycle_analysis.jl b/lifecycle_analysis.jl index 7f6a6d0..f8d83a6 100644 --- a/lifecycle_analysis.jl +++ b/lifecycle_analysis.jl @@ -87,3 +87,4 @@ function lifecycle(dataset::HONData, u::Int64, v::Int64, w::Int64) println("$simplex_name: $simplex_nodes") end end +; diff --git a/open_triangle_prediction.jl b/open_triangle_prediction.jl index 4b68ba2..bf33e16 100644 --- a/open_triangle_prediction.jl +++ b/open_triangle_prediction.jl @@ -2,9 +2,9 @@ include("common.jl") using ScHoLP -using Combinatorics -using MAT -using PyCall, JLD, PyCallJLD +using LinearAlgebra +using Printf +using PyCall using ScikitLearn @sk_import linear_model: LogisticRegression @@ -14,12 +14,12 @@ const OUTDIR = "prediction-output" basename_str(dataset::String) = "$(OUTDIR)/$dataset-open-tris-80-100" function read_data(dataset::HONData, prcntl1::Int64, prcntl2::Int64) - fname = "$(OUTDIR)/$(dataset.name)-open-tris-$prcntl1-$prcntl2.mat" - data = matread(fname)["data"] + fname = "$(OUTDIR)/$(dataset.name)-open-tris-$prcntl1-$prcntl2.jld2" + data = load(fname)["data"] dataT = data' ntri = size(dataT, 2) - triangles = Vector{NTuple{3,Int64}}(ntri) - labels = Vector{Int64}(ntri) + triangles = Vector{NTuple{3,Int64}}(undef, ntri) + labels = Vector{Int64}(undef, ntri) for i in 1:ntri triangles[i] = (dataT[1, i], dataT[2, i], dataT[3, i]) labels[i] = dataT[4, i] @@ -29,13 +29,12 @@ end function write_scores(dataset::HONData, score_type::String, scores::Vector{Float64}) basename = basename_str(dataset.name) - matwrite("$basename-scores-$score_type.mat", - Dict("scores" => scores)) + 
save("$basename-scores-$score_type.jld2", Dict("scores" => scores)) end function read_scores(dataset::HONData, score_type::String) basename = basename_str(dataset.name) - data = matread("$basename-scores-$score_type.mat") + data = load("$basename-scores-$score_type.jld2") return convert(Vector{Float64}, data["scores"]) end @@ -54,11 +53,11 @@ function collect_local_scores(dataset::HONData) println("geometric mean...") write_scores(dataset, "geom_mean", geometric_mean(triangles, B)) - degrees = vec(sum(spones(B), 1)) + degrees = vec(sum(make_sparse_ones(B), dims=1)) println("projected graph preferential attachment...") write_scores(dataset, "proj_graph_PA", pref_attach3(triangles, degrees)) - simp_degrees = vec(sum(At, 1)) + simp_degrees = vec(sum(At, dims=1)) println("simplex preferential attachment...") write_scores(dataset, "simplex_PA", pref_attach3(triangles, simp_degrees)) @@ -85,29 +84,29 @@ function collect_walk_scores(dataset::HONData) println("Unweighted personalized Katz...") scores, S = PKatz3(triangles, B, true, dense_solve) write_scores(dataset, "UPKatz", scores) - matwrite("$basename-UPKatz.mat", Dict("S" => S)) + save("$basename-UPKatz.jld2", Dict("S" => S)) println("Weighted personalized Katz...") scores, S = PKatz3(triangles, B, false, dense_solve) write_scores(dataset, "WPKatz", scores) - matwrite("$basename-WPKatz.mat", Dict("S" => S)) + save("$basename-WPKatz.jld2", Dict("S" => S)) println("Unweighted personalized PageRank...") scores, S = PPR3(triangles, B, true, dense_solve) write_scores(dataset, "UPPR", scores) - matwrite("$basename-UPPR.mat", Dict("S" => S)) + save("$basename-UPPR.jld2", Dict("S" => S)) println("Weighted personalized PageRank...") scores, S = PPR3(triangles, B, false, dense_solve) write_scores(dataset, "WPPR", scores) - matwrite("$basename-WPPR.mat", Dict("S" => S)) + save("$basename-WPPR.jld2", Dict("S" => S)) end function collect_logreg_supervised_scores(dataset::HONData) function feature_matrix(triangles::Vector{NTuple{3,Int64}}, At::SpIntMat, B::SpIntMat) - degrees = vec(sum(spones(B), 1)) - simp_degrees = vec(sum(At, 1)) + degrees = vec(sum(make_sparse_ones(B), dims=1)) + simp_degrees = vec(sum(At, dims=1)) common_nbrs = common_neighbors_map(B, triangles) ntriangles = length(triangles) X = zeros(Float64, 26, ntriangles) @@ -124,9 +123,9 @@ function collect_logreg_supervised_scores(dataset::HONData) X[12, ind] = length(common_jk) X[13, ind] = length(intersect(common_ij, common_ik, common_jk)) X[14:22, ind] = log.(X[1:9, ind]) - X[23:26, ind] = log.(X[10:13, ind] + 1.0) + X[23:26, ind] = log.(X[10:13, ind] .+ 1.0) end - return X' + return Matrix(X') end triangles = read_data(dataset, 80, 100)[1] @@ -141,9 +140,8 @@ function collect_logreg_supervised_scores(dataset::HONData) train_simplices, train_nverts = split_data(simplices, nverts, times, 60, 80)[1:2] At_train, B_train = basic_matrices(train_simplices, train_nverts)[2:3] X_train = feature_matrix(train_triangles, At_train, B_train) - model = LogisticRegression(fit_intercept=true) + model = LogisticRegression(fit_intercept=true, solver="liblinear") ScikitLearn.fit!(model, X_train, val_labels) - JLD.save("$basename-LR-model.jld", "model", model) X = feature_matrix(triangles, At, B) learned_scores = ScikitLearn.predict_proba(model, X)[:, 2] write_scores(dataset, "logreg_supervised", learned_scores) @@ -156,10 +154,10 @@ function collect_Simplicial_PPR_combined_scores(dataset::HONData) A = basic_matrices(old_simplices, old_nverts)[1] basename = basename_str(dataset.name) - (scores_comb, S_comb, 
edge_map) = Simplicial_PPR3_combined(triangles, A, 0.85) + (scores_comb, S_comb, edge_map) = Simplicial_PPR3_combined(triangles, A, true, 0.85) write_scores(dataset, "SimpPPR_comb", scores_comb) - matwrite("$basename-SimpPPR_comb.mat", - Dict("S" => S_comb, "edge_map" => edge_map)) + save("$basename-SimpPPR_comb.jld2", + Dict("S" => S_comb, "edge_map" => edge_map)) end function collect_Simplicial_PPR_decomposed_scores(dataset::HONData) @@ -171,19 +169,19 @@ function collect_Simplicial_PPR_decomposed_scores(dataset::HONData) (scores_comb, scores_curl, scores_grad, scores_harm, S_comb, S_curl, S_grad, S_harm, edge_map) = - Simplicial_PPR3_decomposed(triangles, A, false, 0.85) + Simplicial_PPR3_decomposed(triangles, A, true, 0.85) write_scores(dataset, "SimpPPR_comb", scores_comb) write_scores(dataset, "SimpPPR_grad", scores_grad) write_scores(dataset, "SimpPPR_curl", scores_curl) - write_scores(dataset, "SimpPPR_harm", scores_harm) - matwrite("$basename-SimpPPR_comb.mat", - Dict("S" => S_comb, "edge_map" => edge_map)) - matwrite("$basename-SimpPPR_grad.mat", - Dict("S" => S_grad, "edge_map" => edge_map)) - matwrite("$basename-SimpPPR_curl.mat", - Dict("S" => S_curl, "edge_map" => edge_map)) - matwrite("$basename-SimpPPR_harm.mat", - Dict("S" => S_harm, "edge_map" => edge_map)) + write_scores(dataset, "SimpPPR_harm", scores_harm) + save("$basename-SimpPPR_comb.jld2", + Dict("S" => S_comb, "edge_map" => edge_map)) + save("$basename-SimpPPR_grad.jld2", + Dict("S" => S_grad, "edge_map" => edge_map)) + save("$basename-SimpPPR_curl.jld2", + Dict("S" => S_curl, "edge_map" => edge_map)) + save("$basename-SimpPPR_harm.jld2", + Dict("S" => S_harm, "edge_map" => edge_map)) end function collect_generalized_means(dataset::HONData) @@ -205,21 +203,20 @@ function collect_generalized_means(dataset::HONData) push!(improvements, improvement) println("($p): $improvement") end - matwrite("$basename-genmeans-perf.mat", - Dict("improvements" => improvements, "ps" => ps)) + save("$basename-genmeans-perf.jld2", + Dict("improvements" => improvements, "ps" => ps)) return (ps, improvements) end function evaluate(dataset::HONData, score_types::Vector{String}) triangles, labels = read_data(dataset, 80, 100) rand_rate = sum(labels .== 1) / length(labels) - println(@sprintf("random: %0.2e", rand_rate)) + @printf("random: %0.2e\n", rand_rate) for score_type in score_types scores = read_scores(dataset, score_type) - assert(length(labels) == length(scores)) ave_prec = average_precision_score(labels, scores) improvement = ave_prec / rand_rate - println(@sprintf("%s: %0.2f", score_type, improvement)) + @printf("%s: %0.2f\n", score_type, improvement) end end @@ -238,14 +235,14 @@ Input parameters: """ function top_predictions(dataset::HONData, score_type::String, topk::Int64=10) triangles, labels = read_data(dataset, 80, 100) - scores = read_scores(dataset, score_type) + scores = read_scores(dataset, score_type) sp = sortperm(scores, alg=QuickSort, rev=true) node_labels = read_node_labels(dataset.name) for rank = 1:topk ind = sp[rank] i, j, k = triangles[ind] - println(@sprintf("%d (%f; %d): %s; %s; %s", rank, scores[ind], - labels[ind], node_labels[i], node_labels[j], node_labels[k])) + @printf("%d (%f; %d): %s; %s; %s\n", rank, scores[ind], + labels[ind], node_labels[i], node_labels[j], node_labels[k]) end end @@ -273,8 +270,8 @@ function collect_labeled_dataset(dataset::HONData) output_data[4, i] = (tri in new_closed_tris) end basename = basename_str(dataset.name) - matwrite("$(OUTDIR)/$(dataset.name)-$(output_name).mat", - 
Dict("data" => output_data')) + save("$(OUTDIR)/$(dataset.name)-$(output_name).jld2", + Dict("data" => output_data')) end old_simplices, old_nverts, new_simplices, new_nverts = diff --git a/output/generalized-means/DAWN-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/DAWN-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 0000000..db88256 Binary files /dev/null and b/output/generalized-means/DAWN-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/DAWN-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/DAWN-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index e70d8f8..0000000 Binary files a/output/generalized-means/DAWN-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/NDC-classes-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/NDC-classes-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 0000000..42b7cb8 Binary files /dev/null and b/output/generalized-means/NDC-classes-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/NDC-classes-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/NDC-classes-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index 8cfa7b3..0000000 Binary files a/output/generalized-means/NDC-classes-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/NDC-substances-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/NDC-substances-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 0000000..4a155e2 Binary files /dev/null and b/output/generalized-means/NDC-substances-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/NDC-substances-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/NDC-substances-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index b8d89c6..0000000 Binary files a/output/generalized-means/NDC-substances-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/coauth-DBLP-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/coauth-DBLP-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 0000000..2d1c5c0 Binary files /dev/null and b/output/generalized-means/coauth-DBLP-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/coauth-DBLP-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/coauth-DBLP-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index 27d61d4..0000000 Binary files a/output/generalized-means/coauth-DBLP-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/coauth-MAG-Geology-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/coauth-MAG-Geology-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 0000000..cb5a83d Binary files /dev/null and b/output/generalized-means/coauth-MAG-Geology-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/coauth-MAG-Geology-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/coauth-MAG-Geology-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index a0a6806..0000000 Binary files a/output/generalized-means/coauth-MAG-Geology-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/coauth-MAG-History-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/coauth-MAG-History-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 
0000000..c6d3b7d Binary files /dev/null and b/output/generalized-means/coauth-MAG-History-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/coauth-MAG-History-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/coauth-MAG-History-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index 2dd1acc..0000000 Binary files a/output/generalized-means/coauth-MAG-History-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/congress-bills-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/congress-bills-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 0000000..1e0be9f Binary files /dev/null and b/output/generalized-means/congress-bills-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/congress-bills-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/congress-bills-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index bbfa0cd..0000000 Binary files a/output/generalized-means/congress-bills-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/congress-committees-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/congress-committees-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index 2a51ca6..0000000 Binary files a/output/generalized-means/congress-committees-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/contact-high-school-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/contact-high-school-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 0000000..1fa89a2 Binary files /dev/null and b/output/generalized-means/contact-high-school-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/contact-high-school-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/contact-high-school-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index dbfaa58..0000000 Binary files a/output/generalized-means/contact-high-school-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/contact-primary-school-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/contact-primary-school-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 0000000..23ea57f Binary files /dev/null and b/output/generalized-means/contact-primary-school-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/contact-primary-school-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/contact-primary-school-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index 7a47ade..0000000 Binary files a/output/generalized-means/contact-primary-school-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/email-Enron-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/email-Enron-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 0000000..1bde10b Binary files /dev/null and b/output/generalized-means/email-Enron-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/email-Enron-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/email-Enron-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index 965e7c5..0000000 Binary files a/output/generalized-means/email-Enron-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/email-Eu-open-tris-80-100-genmeans-perf.jld2 
b/output/generalized-means/email-Eu-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 0000000..7d0e71e Binary files /dev/null and b/output/generalized-means/email-Eu-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/email-Eu-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/email-Eu-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index 02fb972..0000000 Binary files a/output/generalized-means/email-Eu-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/music-rap-genius-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/music-rap-genius-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index 5ab2b2b..0000000 Binary files a/output/generalized-means/music-rap-genius-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/tags-ask-ubuntu-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/tags-ask-ubuntu-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 0000000..6907699 Binary files /dev/null and b/output/generalized-means/tags-ask-ubuntu-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/tags-ask-ubuntu-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/tags-ask-ubuntu-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index dec4848..0000000 Binary files a/output/generalized-means/tags-ask-ubuntu-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/tags-math-sx-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/tags-math-sx-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 0000000..0c67299 Binary files /dev/null and b/output/generalized-means/tags-math-sx-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/tags-math-sx-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/tags-math-sx-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index cbb24fc..0000000 Binary files a/output/generalized-means/tags-math-sx-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/tags-stack-overflow-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/tags-stack-overflow-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 0000000..bf9a006 Binary files /dev/null and b/output/generalized-means/tags-stack-overflow-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/tags-stack-overflow-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/tags-stack-overflow-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index 973148d..0000000 Binary files a/output/generalized-means/tags-stack-overflow-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/threads-ask-ubuntu-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/threads-ask-ubuntu-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 0000000..4a3199f Binary files /dev/null and b/output/generalized-means/threads-ask-ubuntu-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/threads-ask-ubuntu-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/threads-ask-ubuntu-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index b00e28f..0000000 Binary files a/output/generalized-means/threads-ask-ubuntu-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git 
diff --git a/output/generalized-means/threads-math-sx-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/threads-math-sx-open-tris-80-100-genmeans-perf.jld2
new file mode 100644
index 0000000..cd6f47c
Binary files /dev/null and b/output/generalized-means/threads-math-sx-open-tris-80-100-genmeans-perf.jld2 differ
diff --git a/output/generalized-means/threads-math-sx-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/threads-math-sx-open-tris-80-100-genmeans-perf.mat
deleted file mode 100644
index 303ade6..0000000
Binary files a/output/generalized-means/threads-math-sx-open-tris-80-100-genmeans-perf.mat and /dev/null differ
diff --git a/output/generalized-means/threads-stack-overflow-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/threads-stack-overflow-open-tris-80-100-genmeans-perf.jld2
new file mode 100644
index 0000000..9aa1dbc
Binary files /dev/null and b/output/generalized-means/threads-stack-overflow-open-tris-80-100-genmeans-perf.jld2 differ
diff --git a/output/generalized-means/threads-stack-overflow-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/threads-stack-overflow-open-tris-80-100-genmeans-perf.mat
deleted file mode 100644
index 9fda3a9..0000000
Binary files a/output/generalized-means/threads-stack-overflow-open-tris-80-100-genmeans-perf.mat and /dev/null differ
diff --git a/output/simplex-size-dists/DAWN-simplex-size-dist.jld2 b/output/simplex-size-dists/DAWN-simplex-size-dist.jld2
new file mode 100644
index 0000000..bc3c3a2
Binary files /dev/null and b/output/simplex-size-dists/DAWN-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/DAWN-simplex-size-dist.mat b/output/simplex-size-dists/DAWN-simplex-size-dist.mat
deleted file mode 100644
index c20c1b6..0000000
Binary files a/output/simplex-size-dists/DAWN-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/NDC-classes-simplex-size-dist.jld2 b/output/simplex-size-dists/NDC-classes-simplex-size-dist.jld2
new file mode 100644
index 0000000..578a487
Binary files /dev/null and b/output/simplex-size-dists/NDC-classes-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/NDC-classes-simplex-size-dist.mat b/output/simplex-size-dists/NDC-classes-simplex-size-dist.mat
deleted file mode 100644
index 376a45b..0000000
Binary files a/output/simplex-size-dists/NDC-classes-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/NDC-substances-simplex-size-dist.jld2 b/output/simplex-size-dists/NDC-substances-simplex-size-dist.jld2
new file mode 100644
index 0000000..7d24aef
Binary files /dev/null and b/output/simplex-size-dists/NDC-substances-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/NDC-substances-simplex-size-dist.mat b/output/simplex-size-dists/NDC-substances-simplex-size-dist.mat
deleted file mode 100644
index 680a207..0000000
Binary files a/output/simplex-size-dists/NDC-substances-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/coauth-DBLP-simplex-size-dist.jld2 b/output/simplex-size-dists/coauth-DBLP-simplex-size-dist.jld2
new file mode 100644
index 0000000..140a0dc
Binary files /dev/null and b/output/simplex-size-dists/coauth-DBLP-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/coauth-DBLP-simplex-size-dist.mat b/output/simplex-size-dists/coauth-DBLP-simplex-size-dist.mat
deleted file mode 100644
index d3e0649..0000000
Binary files a/output/simplex-size-dists/coauth-DBLP-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/coauth-MAG-Geology-simplex-size-dist.jld2 b/output/simplex-size-dists/coauth-MAG-Geology-simplex-size-dist.jld2
new file mode 100644
index 0000000..9ada526
Binary files /dev/null and b/output/simplex-size-dists/coauth-MAG-Geology-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/coauth-MAG-Geology-simplex-size-dist.mat b/output/simplex-size-dists/coauth-MAG-Geology-simplex-size-dist.mat
deleted file mode 100644
index 303676e..0000000
Binary files a/output/simplex-size-dists/coauth-MAG-Geology-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/coauth-MAG-History-simplex-size-dist.jld2 b/output/simplex-size-dists/coauth-MAG-History-simplex-size-dist.jld2
new file mode 100644
index 0000000..7b3939b
Binary files /dev/null and b/output/simplex-size-dists/coauth-MAG-History-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/coauth-MAG-History-simplex-size-dist.mat b/output/simplex-size-dists/coauth-MAG-History-simplex-size-dist.mat
deleted file mode 100644
index 5c5ed6f..0000000
Binary files a/output/simplex-size-dists/coauth-MAG-History-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/congress-bills-simplex-size-dist.jld2 b/output/simplex-size-dists/congress-bills-simplex-size-dist.jld2
new file mode 100644
index 0000000..482c274
Binary files /dev/null and b/output/simplex-size-dists/congress-bills-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/congress-bills-simplex-size-dist.mat b/output/simplex-size-dists/congress-bills-simplex-size-dist.mat
deleted file mode 100644
index c3a8f48..0000000
Binary files a/output/simplex-size-dists/congress-bills-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/congress-committees-simplex-size-dist.mat b/output/simplex-size-dists/congress-committees-simplex-size-dist.mat
deleted file mode 100644
index f58cceb..0000000
Binary files a/output/simplex-size-dists/congress-committees-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/contact-high-school-simplex-size-dist.jld2 b/output/simplex-size-dists/contact-high-school-simplex-size-dist.jld2
new file mode 100644
index 0000000..6cac696
Binary files /dev/null and b/output/simplex-size-dists/contact-high-school-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/contact-high-school-simplex-size-dist.mat b/output/simplex-size-dists/contact-high-school-simplex-size-dist.mat
deleted file mode 100644
index 6638b3e..0000000
Binary files a/output/simplex-size-dists/contact-high-school-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/contact-primary-school-simplex-size-dist.jld2 b/output/simplex-size-dists/contact-primary-school-simplex-size-dist.jld2
new file mode 100644
index 0000000..c23ea93
Binary files /dev/null and b/output/simplex-size-dists/contact-primary-school-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/contact-primary-school-simplex-size-dist.mat b/output/simplex-size-dists/contact-primary-school-simplex-size-dist.mat
deleted file mode 100644
index 279c888..0000000
Binary files a/output/simplex-size-dists/contact-primary-school-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/email-Enron-simplex-size-dist.jld2 b/output/simplex-size-dists/email-Enron-simplex-size-dist.jld2
new file mode 100644
index 0000000..c8e8de4
Binary files /dev/null and b/output/simplex-size-dists/email-Enron-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/email-Enron-simplex-size-dist.mat b/output/simplex-size-dists/email-Enron-simplex-size-dist.mat
deleted file mode 100644
index b94f798..0000000
Binary files a/output/simplex-size-dists/email-Enron-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/email-Eu-simplex-size-dist.jld2 b/output/simplex-size-dists/email-Eu-simplex-size-dist.jld2
new file mode 100644
index 0000000..15736a1
Binary files /dev/null and b/output/simplex-size-dists/email-Eu-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/email-Eu-simplex-size-dist.mat b/output/simplex-size-dists/email-Eu-simplex-size-dist.mat
deleted file mode 100644
index b4554a8..0000000
Binary files a/output/simplex-size-dists/email-Eu-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/music-rap-genius-simplex-size-dist.mat b/output/simplex-size-dists/music-rap-genius-simplex-size-dist.mat
deleted file mode 100644
index 2f8730d..0000000
Binary files a/output/simplex-size-dists/music-rap-genius-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/tags-ask-ubuntu-simplex-size-dist.jld2 b/output/simplex-size-dists/tags-ask-ubuntu-simplex-size-dist.jld2
new file mode 100644
index 0000000..89b829f
Binary files /dev/null and b/output/simplex-size-dists/tags-ask-ubuntu-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/tags-ask-ubuntu-simplex-size-dist.mat b/output/simplex-size-dists/tags-ask-ubuntu-simplex-size-dist.mat
deleted file mode 100644
index b1a0d10..0000000
Binary files a/output/simplex-size-dists/tags-ask-ubuntu-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/tags-math-sx-simplex-size-dist.jld2 b/output/simplex-size-dists/tags-math-sx-simplex-size-dist.jld2
new file mode 100644
index 0000000..350a712
Binary files /dev/null and b/output/simplex-size-dists/tags-math-sx-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/tags-math-sx-simplex-size-dist.mat b/output/simplex-size-dists/tags-math-sx-simplex-size-dist.mat
deleted file mode 100644
index bfdeed8..0000000
Binary files a/output/simplex-size-dists/tags-math-sx-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/tags-stack-overflow-simplex-size-dist.jld2 b/output/simplex-size-dists/tags-stack-overflow-simplex-size-dist.jld2
new file mode 100644
index 0000000..5f09ab4
Binary files /dev/null and b/output/simplex-size-dists/tags-stack-overflow-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/tags-stack-overflow-simplex-size-dist.mat b/output/simplex-size-dists/tags-stack-overflow-simplex-size-dist.mat
deleted file mode 100644
index e30ccd2..0000000
Binary files a/output/simplex-size-dists/tags-stack-overflow-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/threads-ask-ubuntu-simplex-size-dist.jld2 b/output/simplex-size-dists/threads-ask-ubuntu-simplex-size-dist.jld2
new file mode 100644
index 0000000..1652187
Binary files /dev/null and b/output/simplex-size-dists/threads-ask-ubuntu-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/threads-ask-ubuntu-simplex-size-dist.mat b/output/simplex-size-dists/threads-ask-ubuntu-simplex-size-dist.mat
deleted file mode 100644
index 66cf5bd..0000000
Binary files a/output/simplex-size-dists/threads-ask-ubuntu-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/threads-math-sx-simplex-size-dist.jld2 b/output/simplex-size-dists/threads-math-sx-simplex-size-dist.jld2
new file mode 100644
index 0000000..20b6881
Binary files /dev/null and b/output/simplex-size-dists/threads-math-sx-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/threads-math-sx-simplex-size-dist.mat b/output/simplex-size-dists/threads-math-sx-simplex-size-dist.mat
deleted file mode 100644
index 80381f6..0000000
Binary files a/output/simplex-size-dists/threads-math-sx-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/threads-stack-overflow-simplex-size-dist.jld2 b/output/simplex-size-dists/threads-stack-overflow-simplex-size-dist.jld2
new file mode 100644
index 0000000..bfd0bbb
Binary files /dev/null and b/output/simplex-size-dists/threads-stack-overflow-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/threads-stack-overflow-simplex-size-dist.mat b/output/simplex-size-dists/threads-stack-overflow-simplex-size-dist.mat
deleted file mode 100644
index ce5b671..0000000
Binary files a/output/simplex-size-dists/threads-stack-overflow-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simulation/simulation.jld2 b/output/simulation/simulation.jld2
new file mode 100644
index 0000000..7ec3457
Binary files /dev/null and b/output/simulation/simulation.jld2 differ
diff --git a/output/simulation/simulation.mat b/output/simulation/simulation.mat
deleted file mode 100644
index ff2f5e6..0000000
Binary files a/output/simulation/simulation.mat and /dev/null differ
diff --git a/output/summary-stats/coauth-MAG-geology-statistics.csv b/output/summary-stats/coauth-MAG-Geology-statistics.csv
similarity index 100%
rename from output/summary-stats/coauth-MAG-geology-statistics.csv
rename to output/summary-stats/coauth-MAG-Geology-statistics.csv
diff --git a/output/summary-stats/coauth-MAG-history-statistics.csv b/output/summary-stats/coauth-MAG-History-statistics.csv
similarity index 100%
rename from output/summary-stats/coauth-MAG-history-statistics.csv
rename to output/summary-stats/coauth-MAG-History-statistics.csv
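The binary changes above replace the precomputed MATLAB-format `.mat` outputs with Julia-native `.jld2` files (the `congress-committees` and `music-rap-genius` outputs are dropped rather than converted). If you still have `.mat` outputs from a checkout that predates this change, a one-off conversion along the following lines should work. This is a sketch, not part of the tutorial: it assumes MAT.jl is installed, which the tutorial itself no longer depends on.

```julia
# Hypothetical one-off conversion of a legacy .mat output to .jld2.
using MAT            # reads the old MATLAB-format files (extra dependency)
using FileIO, JLD2   # `save` picks the JLD2 backend from the .jld2 extension

legacy = matread("output/simulation/simulation.mat")  # Dict{String,Any}
save("output/simulation/simulation.jld2", legacy)     # same keys, JLD2 container
```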
diff --git a/paper_plots.jl b/paper_plots.jl
index 07ad5d7..7ab136f 100644
--- a/paper_plots.jl
+++ b/paper_plots.jl
@@ -1,11 +1,14 @@
 include("common.jl")
 
+using CSV
 using DataFrames
-using MAT
 using PyPlot
 using PyCall
 @pyimport matplotlib.patches as patch
 
+using ScikitLearn
+@sk_import linear_model: LogisticRegression
+
 function dataset_structure_plots()
     plot_params = all_datasets_params()
     datasets = [row[1] for row in plot_params]
@@ -17,7 +20,7 @@ function dataset_structure_plots()
     density3 = Float64[]
     ave_deg3 = Float64[]
     for dataset in datasets
-        data = readtable("output/summary-stats/$dataset-statistics.csv")
+        data = CSV.read("output/summary-stats/$dataset-statistics.csv")
         no = data[1, :nopentri]
         nc = data[1, :nclosedtri]
         push!(frac_open, no / (no + nc))
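A version note on the `readtable` → `CSV.read` change above (not part of the original patch): `readtable` was removed from DataFrames.jl, and on CSV.jl releases newer than the one this tutorial was written against (roughly 0.7 and later), `CSV.read` additionally requires an explicit sink type. If the call in the hunk above errors for you, this is the modern spelling:

```julia
using CSV, DataFrames

# Newer CSV.jl needs the sink; older versions accept CSV.read(path)
# and return a DataFrame directly.
data = CSV.read("output/summary-stats/email-Enron-statistics.csv", DataFrame)
```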
@@ -88,8 +91,8 @@
     show()
 end
 
-function simulation_plots()
-    data = matread("output/simulation/simulation.mat")
+function simulation_plot()
+    data = load("output/simulation/simulation.jld2")
     all_n = data["n"]
     all_b = data["b"]
     all_density = data["density"]
@@ -97,61 +100,30 @@
     all_frac_open = data["frac_open"]
 
     close()
-
+    figure()
     # Edge density
-    subplot(221)
     for (n, cm, marker) in [(200, ColorMap("Purples"), "d"),
                             (100, ColorMap("Reds"), "<"),
                             (50, ColorMap("Greens"), "s"),
                             (25, ColorMap("Blues"), "o"),
                             ]
-        inds = find(all_n .== n)
+        inds = findall(all_n .== n)
         curr_b = all_b[inds]
         density = all_density[inds]
         frac_open = all_frac_open[inds]
-        scatter(density, frac_open, c=curr_b, marker=marker, label="$n", s=6,
-                vmin=minimum(curr_b) - 0.5, vmax=maximum(curr_b) + 0.5, cmap=cm,)
-    end
-    ax = gca()
-    ax[:set_xscale]("log")
-    fsz = 10
-    xlabel("Edge density in projected graph", fontsize=fsz)
-    ylabel("Fraction of triangles open", fontsize=fsz)
-    title("Exactly 3 nodes per simplex (simulated)", fontsize=fsz)
-
-    # Average degree
-    subplot(223)
-    for (n, cm, marker) in [(200, ColorMap("Purples"), "d"),
-                            (100, ColorMap("Reds"), "<"),
-                            (50, ColorMap("Greens"), "s"),
-                            (25, ColorMap("Blues"), "o"),
-                            ]
-        inds = find(all_n .== n)
-        curr_b = all_b[inds]
-        ave_deg = all_ave_deg[inds]
-        frac_open = all_frac_open[inds]
-        scatter(ave_deg, frac_open, c=curr_b, marker=marker, label="$n", s=6,
+        scatter(density, frac_open, c=curr_b, marker=marker, label="$n", s=14,
                 vmin=minimum(curr_b) - 0.5, vmax=maximum(curr_b) + 0.5, cmap=cm)
+    end
     ax = gca()
     ax[:set_xscale]("log")
-    fsz = 10
-    xlabel("Average degree in projected graph", fontsize=fsz)
+    fsz = 20
+    ax[:tick_params]("both", labelsize=fsz-5, length=5, width=1)
+    legend()
+    xlabel("Edge density in projected graph", fontsize=fsz)
     ylabel("Fraction of triangles open", fontsize=fsz)
-    title("Exactly 3 nodes per simplex (simulated)", fontsize=fsz)
-
-    # legend
-    subplot(224)
-    for (n, color, marker) in [(200, "purple", "d"),
-                               (100, "red", "<"),
-                               (50, "green", "s"),
-                               (25, "blue", "o"),
-                               ]
-        scatter([1], [1], marker=marker, color=color, label="n = $n")
-    end
-    legend()
     tight_layout()
-    savefig("simulation.pdf")
+    savefig("simulation.pdf", bbox_inches="tight")
     show()
 end
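Several changes in the hunk above are mechanical Julia 0.6 → 1.0 migrations rather than behavior changes. For instance, `find` applied to a broadcast comparison became `findall`. A minimal standalone illustration:

```julia
# Julia ≥ 1.0: `findall` returns the indices at which the mask is true.
all_n = [25, 50, 25, 100]
inds = findall(all_n .== 25)   # == [1, 3]; was `find(all_n .== 25)` in Julia 0.6
```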
@@ -165,14 +137,16 @@ function simplex_size_dist_plot()
     subplot(221)
     for i in 1:length(datasets)
         dataset = datasets[i]
-        data = matread("output/simplex-size-dists/$dataset-simplex-size-dist.mat")
-        nvert = data["nvert"]
-        counts = data["counts"]
-        tot = sum(counts)
-        fracs = [count / tot for count in counts]
-        ms = (length(dataset) > 8 && dataset[1:8] == "congress") ? 6 : 2
-        loglog(nvert, fracs, marker=markers[i], color=colors[i],
-               linewidth=0.5, markersize=ms)
+        if dataset != "congress-committees" && dataset != "music-rap-genius"
+            data = load("output/simplex-size-dists/$dataset-simplex-size-dist.jld2")
+            nvert = data["nvert"]
+            counts = data["counts"]
+            tot = sum(counts)
+            fracs = [count / tot for count in counts]
+            ms = 4
+            loglog(nvert, fracs, marker=markers[i], color=colors[i],
+                   linewidth=0.5, markersize=ms)
+        end
     end
     fsz = 10
     xlabel("Number of nodes in simplex", fontsize=fsz)
@@ -195,16 +169,16 @@ function min_max_val(probs1::Vector{Float64}, probs2::Vector{Float64})
     return (minimum([p for p in probs if p > 0]), maximum(probs))
 end
 
-function closure_probs_heat_map(simplex_size::Int64)
+function closure_probs_heat_map(simplex_size::Int64, initial_cutoff::Int64=100)
     plot_params = all_datasets_params()
     datasets = [param[1] for param in plot_params]
-    keys, nsamples, nclosed = read_closure_stats(datasets[1], simplex_size)
+    keys, nsamples, nclosed = read_closure_stats(datasets[1], simplex_size, initial_cutoff)
     probs = nclosed ./ nsamples
     P = zeros(length(datasets), length(keys))
     insufficient_sample_inds = []
     for (ind, dataset) in enumerate(datasets)
-        keys, nsamples, nclosed = read_closure_stats(dataset, simplex_size)
+        keys, nsamples, nclosed = read_closure_stats(dataset, simplex_size, initial_cutoff)
         P[ind, :] = nclosed ./ nsamples
         for (key_ind, (key, nsamp)) in enumerate(zip(keys, nsamples))
             if nsamp <= 20
@@ -217,7 +191,7 @@
 
     PyPlot.pygui(true)
     minval = max(1e-9, minimum([v for v in P[:] if v > 0]))
-    P[P[:] .== 0] = minval
+    P[P[:] .== 0] .= minval
     for (i, j) in insufficient_sample_inds; P[i, j] = 0; end
 
     cm = ColorMap("Blues")
@@ -234,16 +208,14 @@
                                  facecolor=gray))
     end
 
     ax[:set_yticks](0:(length(datasets)-1))
-    #ax[:set_yticklabels](datasets, rotation=10, fontsize=(simplex_size == 4 ? 4 : 5))
     ax[:set_yticklabels](datasets, rotation=10, fontsize=(simplex_size == 4 ? 4 : 7))
     ax[:set_xticks](0:(length(probs)-1))
     ax[:tick_params](axis="both", length=3)
     ax[:set_xticklabels](["" for _ in 0:(length(probs)-1)])
-    #cb = colorbar(orientation="horizontal")
     cb = colorbar()
-    cb[:ax][:tick_params](labelsize=(simplex_size == 4 ? 18 : 9))
+    cb[:ax][:tick_params](labelsize=9)
     tight_layout()
-    savefig("closure-probs-$(simplex_size).pdf")
+    savefig("closure-probs-$(simplex_size)-$(initial_cutoff).pdf")
     show()
 end
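The `P[P[:] .== 0] .= minval` change in the hunk above is another 1.0 migration: assigning a scalar into a logically indexed array now requires broadcast assignment. A standalone example:

```julia
# Julia ≥ 1.0 requires `.=` to fill masked entries with a scalar;
# the old `P[P[:] .== 0] = minval` spelling is an error.
P = [0.0 0.5; 0.2 0.0]
minval = 1e-9
P[P .== 0] .= minval   # P is now [1.0e-9 0.5; 0.2 1.0e-9]
```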
@@ -347,7 +319,7 @@ function four_node_scatter_plot()
     minval, maxval = min_max_val(probs0111, probs22)
     loglog([minval, maxval], [minval, maxval], "black", lw=0.5)
     for i in 1:length(datasets)
-        loglog(probs0111[i], probs22[i], markers[i], color=colors[i])
+        loglog(probs22[i], probs0111[i], markers[i], color=colors[i])
     end
-    xlabel("Closure probability (0111)", fontsize=fsz)
-    ylabel("Closure probability (22)", fontsize=fsz)
+    xlabel("Closure probability (22)", fontsize=fsz)
+    ylabel("Closure probability (0111)", fontsize=fsz)
@@ -368,7 +340,7 @@ function generalized_means_plot()
         dataset = param[1]
         if dataset in datasets
             basename = "output/generalized-means/$dataset-open-tris-80-100"
-            data = matread("$basename-genmeans-perf.mat")
+            data = load("$basename-genmeans-perf.jld2")
             ps = data["ps"]
             improvements = data["improvements"]
             plot(ps[2:end-1], improvements[2:end-1],
@@ -381,13 +353,13 @@ function generalized_means_plot()
         ax[:set_xticks](-4:1:4)
         ax[:tick_params](axis="both", length=3)
     end
-
+
     set1 = ["threads-stack-overflow", "threads-math-sx", "threads-ask-ubuntu"]
-    set2 = ["tags-stack-overflow", "tags-math-sx", "tags-ask-ubuntu", "music-rap-genius",
+    set2 = ["tags-stack-overflow", "tags-math-sx", "tags-ask-ubuntu",
            "contact-high-school", "contact-primary-school", "DAWN", "NDC-substances", "NDC-classes"]
     set3 = ["coauth-MAG-History", "coauth-MAG-Geology", "coauth-DBLP",
-           "email-Enron", "email-Eu", "congress-committees", "congress-bills"]
+           "email-Enron", "email-Eu", "congress-bills"]
     subplot(221)
     make_subplot(set1)
     legend(fontsize=fsz)
@@ -395,9 +367,72 @@ function generalized_means_plot()
     make_subplot(set2)
     subplot(223)
     make_subplot(set3)
-    tight_layout()
     savefig("generalized-means-perf.pdf")
     show()
 end
 
+function logreg_decision_boundary(trial::Int64=1)
+    (X, _, y, _, yf, _) = egonet_train_test_data(trial)
+    X = X[:, [LOG_AVE_DEG, FRAC_OPEN]]
+    model = LogisticRegression(fit_intercept=true, multi_class="multinomial", C=10,
+                               solver="newton-cg", max_iter=1000)
+    ScikitLearn.fit!(model, X, y)
+
+    dim = 500
+    minval1, maxval1 = minimum(X[:, 1]) - 0.5, maximum(X[:, 1]) * 1.02
+    minval2, maxval2 = minimum(X[:, 2]) - 0.01, maximum(X[:, 2]) + 0.05
+    grid_feats = zeros(Float64, 2, dim * dim)
+    grid_ind = 1
+    xx = [(i - 1) * (maxval1 - minval1) / dim + minval1 for i in 1:dim]
+    yy = [(j - 1) * (maxval2 - minval2) / dim + minval2 for j in 1:dim]
+    for x in xx, y in yy
+        grid_feats[1, grid_ind] = x
+        grid_feats[2, grid_ind] = y
+        grid_ind += 1
+    end
+
+    close()
+    figure()
+    Z = reshape(ScikitLearn.predict(model, Matrix(grid_feats')), dim, dim)
+    labels = Dict(0 => "coauthorship", 1 => "tags", 2 => "threads",
+                  3 => "contact", 4 => "email")
+    greys = ["#f7f7f7", "#d9d9d9", "#bdbdbd", "#969696", "#636363"]
+    contourf(exp.(xx), yy, Z, colors=greys)
+    params = all_datasets_params()
+    label2domain = Dict(0 => 0, 1 => 0, 2 => 0,
+                        3 => -1,
+                        4 => 1, 5 => 1, 6 => 1,
+                        7 => 2, 8 => 2, 9 => 2,
+                        10 => -1, 11 => -1, 12 => -1, 13 => -1, 14 => -1,
+                        15 => 3, 16 => 3,
+                        17 => 4, 18 => 4)
+    colors_full = ["#ed5e5f", "#e41a1c", "#9f1214",
+                   "no-op",
+                   "#69a3d2", "#377eb8", "#25567d",
+                   "#80c87d", "#4daf4a", "#357933",
+                   "no-op", "no-op", "no-op", "no-op", "no-op",
+                   "#984ea3", "#68356f",
+                   "#d37a48", "#a65628"]
+    for label in sort(unique(yf))
+        inds = findall(yf .== label)
+        scatter(exp.(X[inds, 1]), X[inds, 2],
+                color=colors_full[label],
+                marker="o",
+                label=params[label][1],
+                s=14)
+    end
+    fsz = 18
+    legend(fontsize=fsz-4)
+    ax = gca()
+    ax[:set_xscale]("log")
+    xlabel("Average degree", fontsize=fsz)
+    ylabel("Fraction of triangles open", fontsize=fsz)
+    title("Decision boundary", fontsize=fsz)
+    ax[:set_xlim](1.8, 400)
+    ax[:tick_params](axis="both", length=3, labelsize=14)
+    tight_layout()
+    savefig("decision.pdf")
+    show()
+end
+;
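With the paper_plots.jl changes applied, the renamed and new entry points can be driven from the REPL roughly as follows, assuming the corresponding precomputed results already exist under `output/`:

```julia
include("paper_plots.jl")

simulation_plot()          # reads output/simulation/simulation.jld2, writes simulation.pdf
closure_probs_heat_map(3)  # writes closure-probs-3-100.pdf (default initial_cutoff = 100)
logreg_decision_boundary() # fits the logistic regression and writes decision.pdf
```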
diff --git a/precompute_genmeans.jl b/precompute_genmeans.jl
new file mode 100644
index 0000000..a3c24cc
--- /dev/null
+++ b/precompute_genmeans.jl
@@ -0,0 +1,13 @@
+include("open_triangle_prediction.jl")
+
+function main()
+    datasets = [row[1] for row in all_datasets_params()]
+    for dataset in datasets
+        println("$dataset...")
+        if dataset != "congress-committees" && dataset != "music-rap-genius"
+            hd = read_txt_data(dataset)
+            collect_labeled_dataset(hd)
+            collect_generalized_means(hd)
+        end
+    end
+end
diff --git a/precompute_simplex_sizes.jl b/precompute_simplex_sizes.jl
new file mode 100644
index 0000000..8347c95
--- /dev/null
+++ b/precompute_simplex_sizes.jl
@@ -0,0 +1,21 @@
+include("common.jl")
+
+function main()
+    datasets = [row[1] for row in all_datasets_params()]
+    for dataset in datasets
+        println("$dataset...")
+        if dataset != "congress-committees" && dataset != "music-rap-genius"
+            hd = read_txt_data(dataset)
+            nv_counts = Dict{Int64,Int64}()
+            for v in hd.nverts
+                if !haskey(nv_counts, v); nv_counts[v] = 1
+                else nv_counts[v] += 1
+                end
+            end
+            nverts = [x[1] for x in nv_counts]
+            counts = [x[2] for x in nv_counts]
+            save("output/simplex-size-dists/$dataset-simplex-size-dist.jld2",
+                 Dict("nvert" => nverts, "counts" => counts))
+        end
+    end
+end
diff --git a/simulations.jl b/simulations.jl
index f610803..aea0d11 100644
--- a/simulations.jl
+++ b/simulations.jl
@@ -1,8 +1,7 @@
 include("common.jl")
 
-using Combinatorics
 using Distributions
-using MAT
+using SparseArrays
 
 function simulate_summary_stats(n::Int64, p::Float64)
     bin = Binomial(binomial(n, 3), p)
@@ -47,7 +46,7 @@ function simulate()
             end
         end
     end
-    matwrite("simulation.mat",
-             Dict("n" => all_n, "b" => all_b, "density" => all_density,
-                  "ave_deg" => all_ave_deg, "frac_open" => all_frac_open))
+    save("output/simulation/simulation.jld2",
+         Dict("n" => all_n, "b" => all_b, "density" => all_density,
+              "ave_deg" => all_ave_deg, "frac_open" => all_frac_open))
 end
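After the simulations.jl change, simulation results round-trip through JLD2 instead of MAT. A quick check of the new output, assuming `load` is available via FileIO/JLD2 (which common.jl is assumed to pull in, as paper_plots.jl relies on the same):

```julia
include("simulations.jl")

simulate()  # writes output/simulation/simulation.jld2
data = load("output/simulation/simulation.jld2")
keys(data)  # "n", "b", "density", "ave_deg", "frac_open"
```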
diff --git a/statistical_tests_and_models.jl b/statistical_tests_and_models.jl
index 55aaf5f..c22586b 100644
--- a/statistical_tests_and_models.jl
+++ b/statistical_tests_and_models.jl
@@ -1,6 +1,9 @@
 include("common.jl")
+
+using CSV
 using HypothesisTests
 using DataFrames
+using Printf
 using GLM
 
 function simplicial_closure_tests(significance::Float64=1e-5, X::Int64=100, only_3_node::Bool=false)
@@ -63,11 +66,11 @@ function simplicial_closure_tests(significance::Float64=1e-5, X::Int64=100, only
                 total += 1
             end
         end
-        println(@sprintf("%s (left): %d of %d tests significant at < %g",
-                         test_type, sig_count1, total, significance))
-        println(@sprintf("%s (right): %d of %d tests significant at < %g",
-                         test_type, sig_count2, total, significance))
-        println(@sprintf("%s (raw): %d of %d", test_type, raw, total))
+        @printf("%s (left): %d of %d tests significant at < %g\n",
+                test_type, sig_count1, total, significance)
+        @printf("%s (right): %d of %d tests significant at < %g\n",
+                test_type, sig_count2, total, significance)
+        @printf("%s (raw): %d of %d\n", test_type, raw, total)
     end
 end
 
@@ -78,7 +81,7 @@ function fracopen_logavedeg_linear_models()
     frac_open3 = Float64[]
     ave_deg3 = Float64[]
     for dataset in datasets
-        data = readtable("output/summary-stats/$dataset-statistics.csv")
+        data = CSV.read("output/summary-stats/$dataset-statistics.csv")
         no = data[1, :nopentri]
         nc = data[1, :nclosedtri]
        push!(frac_open, no / (no + nc))
@@ -102,3 +105,4 @@ function fracopen_logavedeg_linear_models()
     model3 = lm(@formula(Y ~ X), data3)
     return (model, model3)
 end
+;
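For completeness, the updated statistical_tests_and_models.jl can be exercised the same way; the defaults reproduce the paper's analyses, assuming the closure statistics and summary statistics under `output/` are in place:

```julia
include("statistical_tests_and_models.jl")

simplicial_closure_tests()  # hypothesis tests at the default significance 1e-5, X = 100
model, model3 = fracopen_logavedeg_linear_models()
```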