diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e4e5f6c --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*~ \ No newline at end of file diff --git a/README.md b/README.md index cc738b9..6887ba2 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,9 @@ This Julia software accompanies the following paper: -- [Simplicial closure and higher-order link prediction](https://arxiv.org/abs/1802.06916). +- [Simplicial closure and higher-order link prediction](http://www.pnas.org/content/early/2018/11/08/1800683115). Austin R. Benson, Rediet Abebe, Michael T. Schaub, Ali Jadbabaie, and Jon Kleinberg. - arXiv:1802.06916, 2018. + *Proceedings of the National Academy of Sciences*, 2018. This tutorial code is not the main software library for simplicial closure and higher-order link prediction, which is [ScHoLP.jl](https://github.com/arbenson/ScHoLP.jl). Instead, the tutorial has the following goals: @@ -16,9 +16,9 @@ This tutorial code is not the main software library for simplicial closure and h As discussed above, this tutorial shows how to use the ScHoLP.jl library for higher-order network analysis and reproduction of results. To get the ScHoLP.jl library and start using it in Julia: ```julia -Pkg.clone("https://github.com/arbenson/ScHoLP.jl.git") +import Pkg +Pkg.add("ScHoLP") Pkg.test("ScHoLP") -using ScHoLP ``` Note that ScHoLP.jl has thread-level parallelism available for many features (using Julia's Base.Threads). @@ -30,6 +30,21 @@ git clone https://github.com/arbenson/ScHoLP-Tutorial.git cd ScHoLP-Tutorial ``` +To run this entire tutorial, you will also need several Julia packages (not all packages are needed for each component; you can add them as necessary). + +```julia +import Pkg +Pkg.add("CSV") +Pkg.add("DataFrames") +Pkg.add("Distributions") +Pkg.add("FileIO") +Pkg.add("GLM") +Pkg.add("HypothesisTests") +Pkg.add("JLD2") +Pkg.add("PyCall") +Pkg.add("ScikitLearn") +``` + ### Data The package comes with a few example datasets. @@ -37,7 +52,7 @@ The package comes with a few example datasets. ```julia using ScHoLP ex = example_dataset("example1") # example from figure 1 of paper -typeof(ex) # should be ScHoLP.HONData +typeof(ex) # should be HONData ex.simplices, ex.nverts, ex.times, ex.name # components of the data structure chs = example_dataset("contact-high-school") # another dataset ``` @@ -52,7 +67,15 @@ ndc_classes = read_txt_data("NDC-classes") enron = read_txt_data("email-Enron") ``` -The collection of datasets from the paper are available from [this web site](http://www.cs.cornell.edu/~arb/data/). +The collection of datasets from the paper is available from [this web site](http://www.cs.cornell.edu/~arb/data/). You can also download them wholesale and use them as follows. + +```bash +cd ScHoLP-Tutorial/data +wget https://github.com/arbenson/ScHoLP-Data/archive/1.0.tar.gz +tar -xzvf 1.0.tar.gz +gunzip ScHoLP-Data-1.0/*/*.gz +mv ScHoLP-Data-1.0/* .
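+# Optional: ScHoLP.jl can use thread-level parallelism via Julia's
+# Base.Threads (noted above). A minimal sketch, assuming you want four
+# threads (an arbitrary choice): set the thread count before starting Julia.
+export JULIA_NUM_THREADS=4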
+``` ### Simplicial closures @@ -145,20 +168,14 @@ Now we can generate scores of the open triangles from the first 80% of the datas ```julia collect_local_scores(enron) # scores based on local structural features collect_walk_scores(enron) # scores based on random walks and paths -collect_Simplicial_PPR_combined_scores(enron) # scores based on Simplicial PPR -collect_logreg_supervised_scores(enron) # scores based on logistic regression supervised method -``` - -Since enron is a small dataset, we can afford to decompose the Simplicial PPR scores into the gradient, curl, and harmonic components: - -```julia -collect_Simplicial_PPR_decomposed_scores(enron) +collect_logreg_supervised_scores(enron) # scores based on logistic regression +collect_Simplicial_PPR_decomposed_scores(enron) # scores based on Simplicial PPR ``` We can evaluate how well these methods do compared to random guessing with respect to area under the precision-recall curve. This should reproduce the line for the email-Enron dataset in Table 2 of the paper. ```julia -evaluate(enron, ["harm_mean", "geom_mean", "arith_mean", "common", "jaccard", "adamic_adar", "proj_graph_PA", "simplex_PA", "UPKatz", "WPKatz", "UPPR", "WPPR", "SimpPPR_comb", "logreg_supervised", "SimpPPR_grad", "SimpPPR_harm", "SimpPPR_curl"]) +evaluate(enron, ["harm_mean", "geom_mean", "arith_mean", "common", "jaccard", "adamic_adar", "proj_graph_PA", "simplex_PA", "UPKatz", "WPKatz", "UPPR", "WPPR", "SimpPPR_comb", "SimpPPR_grad", "SimpPPR_harm", "SimpPPR_curl", "logreg_supervised"]) ``` We can also look at the top predictions made by the algorithms. @@ -170,21 +187,21 @@ top_predictions(enron, "UPPR", 12) This should produce the following output ``` -1 (0.304908; 0): joe.stepenovitch@enron.com; don.baughman@enron.com; larry.campbell@enron.com -2 (0.272448; 0): joe.stepenovitch@enron.com; don.baughman@enron.com; benjamin.rogers@enron.com -3 (0.253939; 0): larry.campbell@enron.com; don.baughman@enron.com; benjamin.rogers@enron.com -4 (0.189741; 0): joe.parks@enron.com; eric.bass@enron.com; dan.hyvl@enron.com -5 (0.181000; 1): lisa.gang@enron.com; kate.symes@enron.com; bill.williams@enron.com -6 (0.179424; 0): joe.quenet@enron.com; chris.dorland@enron.com; jeff.king@enron.com -7 (0.176207; 0): joe.quenet@enron.com; jeff.king@enron.com; fletcher.sturm@enron.com -8 (0.175591; 1): lisa.gang@enron.com; holden.salisbury@enron.com; kate.symes@enron.com -9 (0.173161; 1): lisa.gang@enron.com; holden.salisbury@enron.com; bill.williams@enron.com -10 (0.170872; 0): geir.solberg@enron.com; holden.salisbury@enron.com; kate.symes@enron.com +1 (0.304992; 0): joe.stepenovitch@enron.com; don.baughman@enron.com; larry.campbell@enron.com +2 (0.272495; 0): joe.stepenovitch@enron.com; don.baughman@enron.com; benjamin.rogers@enron.com +3 (0.253992; 0): larry.campbell@enron.com; don.baughman@enron.com; benjamin.rogers@enron.com +4 (0.189678; 0): joe.parks@enron.com; eric.bass@enron.com; dan.hyvl@enron.com +5 (0.181085; 1): lisa.gang@enron.com; kate.symes@enron.com; bill.williams@enron.com +6 (0.179377; 0): joe.quenet@enron.com; chris.dorland@enron.com; jeff.king@enron.com +7 (0.176236; 0): joe.quenet@enron.com; jeff.king@enron.com; fletcher.sturm@enron.com +8 (0.175624; 1): lisa.gang@enron.com; holden.salisbury@enron.com; kate.symes@enron.com +9 (0.173160; 1): lisa.gang@enron.com; holden.salisbury@enron.com; bill.williams@enron.com +10 (0.170947; 0): geir.solberg@enron.com; holden.salisbury@enron.com; kate.symes@enron.com 11 (0.164845; 0): geir.solberg@enron.com; 
holden.salisbury@enron.com; bill.williams@enron.com -12 (0.162414; 0): lisa.gang@enron.com; cara.semperger@enron.com; kate.symes@enron.com +12 (0.162391; 0): lisa.gang@enron.com; cara.semperger@enron.com; kate.symes@enron.com ``` -These are the top 12 predictions for the unweighted personalized PageRank scores. The tuple next to the ordered numbers, e.g., (0.304908; 0) in the first line, gives the score function value and a 0/1 indicator of whether or not the open triangle closed in the final 20% of the dataset (1 means that it closed). Here, we see that the triples of nodes with the 5th, 8th, and 9th highest scores simplicially closed. +These are the top 12 predictions for the unweighted personalized PageRank scores. The tuple next to each rank, e.g., (0.304992; 0) in the first line, gives the score function value and a 0/1 indicator of whether or not the open triangle closed in the final 20% of the dataset (1 means that it closed). Here, we see that the triples of nodes with the 5th, 8th, and 9th highest scores went through a simplicial closure event. ### Summary statistics There is some basic functionality for gathering summary statistics about the dat ```julia chs = example_dataset("contact-high-school") -basic_summary_statistics(chs) # prints basic summary statistics (same as Table 1 in paper) -summary_statistics(chs) # more advanced statistics --> contact-high-school-statistics.csv +# print basic summary statistics (same as Table 1 in paper) +basic_summary_statistics(chs) +# compute more advanced statistics --> contact-high-school-statistics.csv +summary_statistics(chs); ``` The last command writes several summary statistics to a csv file. For example, "meansimpsize" is the mean number of nodes in each simplex, "projdensity" is the edge density of the projected graph, and "nclosedtri" and "nopentri" are the number of closed and open triangles. The first line of the csv file lists the variable names, the next line gives the statistics for the full dataset, and the last line gives the statistics for the dataset restricted to only 3-node simplices. @@ -205,25 +224,25 @@ contact-high-school,327,172035,352718,2,5,32.644895,1.091537e-01,2370,31850,7937 contact-high-school-3-3,317,7475,22425,3,3,8.305556,5.390728e-02,2091,5721,2126,6378,3.000000,2.362222,1.132810,1.118476e-01,2094,18139 -### Reproducing results +### Reproducing results in the main text This section shows how to reproduce results from the paper. -##### Linear models for relationships in Figures 2D and 2F +##### Linear models for relationships in Figures 2D and 2E We create linear models for the fraction of open triangles in terms of the covariate log average degree (plus an intercept term). The following code snippet produces these models. ```julia # starting from the main directory of tutorial code include("statistical_tests_and_models.jl") -model_fig_2d, model_fig_2f = fracopen_logavedeg_linear_models(); -r2(model_fig_2d) # roughly 0.38 -r2(model_fig_2f) # roughly 0.85 +model_fig_2D, model_fig_2E = fracopen_logavedeg_linear_models(); +r2(model_fig_2D) # roughly 0.38 +r2(model_fig_2E) # roughly 0.85 ``` -##### Hypothesis tests for strong wedge vs. weak open triangle and strong flap vs. weak open wireframe +##### Hypothesis tests for fewer strong ties vs. more weak ties -Here we are testing hypotheses on whether stronger but fewer ties (strong wedge and flap) or weaker but more ties (weak open triangle and wireframe) are more indicative of simplicial closure.
+Here we are testing hypotheses on whether stronger but fewer ties or weaker but more ties are more indicative of simplicial closure. ```julia # starting from the main directory of tutorial code include("statistical_tests_and_models.jl") @@ -237,16 +256,35 @@ simplicial_closure_tests(1e-3) ``` We saw how to get these numbers in the summary statistics section above. The `basic_summary_statistics()` function produces the numbers. -##### Table 2 (Higher-order link prediction performance) +##### Table 2 (logistic regression for system domain classification) + +Egonet data was collected with the function call `collect_egonet_data(100, 20)` in the file `egonet_analysis.jl`. This takes some time, so we pre-computed the data output and stored it in the directory `output/egonets`. We can reproduce the performance of the logistic regression models with the following code snippet. + +```julia +include("egonet_analysis.jl") +egonet_predict([LOG_DENSITY, LOG_AVE_DEG, FRAC_OPEN]) +egonet_predict([LOG_AVE_DEG, FRAC_OPEN]) +egonet_predict([LOG_DENSITY, FRAC_OPEN]) +egonet_predict([LOG_DENSITY, LOG_AVE_DEG]) +``` + +##### Table 3 (Higher-order link prediction performance) The numbers in this table came from using the higher-order link prediction methods outlined above. Note that some of the score functions are computationally expensive. The necessary Julia functions are: - `collect_labeled_dataset()` to generate the labeled dataset based on an 80/20 split of the data - `collect_local_scores()` to generate scores based on local structural features - `collect_walk_scores()` to generate scores based on random walks and paths -- `collect_Simplicial_PPR_combined_scores()` to generate scores based on simplicial PPR - `collect_logreg_supervised_scores()` to generate scores from the supervised learning method +After collecting the data, we can reproduce results in the table with the following commands. + +```julia +include("open_triangle_prediction.jl") +enron = example_dataset("email-Enron") +evaluate(enron, ["harm_mean", "geom_mean", "arith_mean", "adamic_adar", "proj_graph_PA", "UPKatz", "UPPR", "logreg_supervised"]) +``` + ##### Figure 1 (small example of higher-order network) The example higher-order network in Figure 1 is one of the examples included with the library. Here we show how to list the simplices and compute the weighted projected graph. ```julia using ScHoLP ex_fig1 = example_dataset("example1") # Print out simplices -ind = 1 -for (nv, t) in zip(ex_fig1.nverts, ex_fig1.times) - simplex = ex_fig1.simplices[ind:(ind + nv - 1)] - ind += nv - println("$t $simplex") +function print_simplices() + ind = 1 + for (nv, t) in zip(ex_fig1.nverts, ex_fig1.times) + simplex = ex_fig1.simplices[ind:(ind + nv - 1)] + ind += nv + println("$t $simplex") + end end +print_simplices() # Get the weighted projected graph basic_matrices(ex_fig1)[3] ``` -##### Figure 2A—B (legend and simplex size distribution) +##### Figure 2A (simplex size distribution) Here is a sample code snippet for computing the simplex size distribution for the email-Enron dataset. ```julia using ScHoLP enron = read_txt_data("email-Enron") num_verts, fracs = simplex_size_dist(enron) for (nv, f) in zip(num_verts, fracs) println("$nv $f") end ``` @@ -284,88 +325,89 @@ -For reproducing the figure, we have pre-computed the distributions in the files `output/simplex-size-dists/*-simplex-size-dist.mat`. The following produces the simplex size distribution and saves the figure. +For reproducing the figure, we have pre-computed the distributions in the files `output/simplex-size-dists/*-simplex-size-dist.jld2`. The following produces the simplex size distribution and saves the figure.
```julia # starting from the main directory of tutorial code include("paper_plots.jl") -simplex_size_dist_plot() # produce figures 2AB --> simplex-size-dist.pdf +# produce figure 2A --> simplex-size-dist.pdf +simplex_size_dist_plot() ``` -##### Figure 2C—F (basic dataset structure) +##### Figure 2B—E (basic dataset structure) These figures rely on using the `summary_statistics()` function for all of the datasets. For some of the larger datasets, this can take a while. For this tutorial, the pre-computed statistics are included in the `output/summary-stats/` directory. The following code snippet reproduces the figure. ```julia # starting from the main directory of tutorial code include("paper_plots.jl") -dataset_structure_plots() # produce figures 2CDEF --> dataset-structure.pdf +# produce figures 2BCDE --> dataset-structure.pdf +dataset_structure_plots() +``` + +##### Figure 3 (logistic regression decision boundary) + +Plot the decision boundary for the logistic regression classifier. + +```julia +include("paper_plots.jl") +logreg_decision_boundary() ``` -##### Figure 2G—H (model simulation) +##### Figure 4 (model simulation) -These figures require running simulations. Since the simulations are random, the output may not be exactly the same. The following will re-run the simulations and write the results to `simulation.mat`. +This figure requires running simulations. Since the simulations are random, the output may not be exactly the same. The following will re-run the simulations and write the results to `simulation.jld2`. ```julia # starting from the main directory of tutorial code include("simulations.jl") -simulate() # run the simulations (takes several minutes) --> simulation.mat +# run the simulations (takes several minutes) +simulate() # --> stores in output/simulation/simulation.jld2 ``` -The simulation results used in the paper are stored in `output/simulation/simulation.mat` for convenience. The above code should produce something similar but not exactly the same (due to randomness in the simulation). The following code snippet reproduces figures 2GH. +The simulation results used in the paper are stored in `output/simulation/simulation.jld2` for convenience. The above code should produce something similar but not exactly the same (due to randomness in the simulation). The following code snippet reproduces Figure 4. ```julia # starting from the main directory of tutorial code include("paper_plots.jl") -simulation_plots() # reproduce figures 2GH --> simulation.pdf +simulation_plot() # reproduce Figure 4 --> simulation.pdf ``` -##### Figure 3 (lifecycles) +##### Figure 5 (lifecycles) -Producing results from Figure 3A uses the `process_lifecycles()` function from the ScHoLP.jl library. Figures 3B—D use the `lifecycle()` function in `lifecycle_analysis.jl`. +Producing results from Figure 5A uses the `process_lifecycles()` function from the ScHoLP.jl library. Figures 5B—D use the `lifecycle()` function in `lifecycle_analysis.jl`.
```julia -# starting from the main directory of tutorial code include("lifecycle_analysis.jl") -hist = read_txt_data("coauth-MAG-History") # read dataset from data/coauth-MAG-History directory -# Get data from Figure 3A +# read dataset from data/coauth-MAG-History directory +hist = read_txt_data("coauth-MAG-History") +# Get data for Figure 5A (this may take a couple minutes) closed_transition_counts, open_transition_counts = process_lifecycles(hist) # direct transitions to simplicial closure from each state closed_transition_counts[end, 1:(end-1)] - -ndc_classes = read_txt_data("NDC-classes") # read data from data/NDC-classes directory +# read data from data/NDC-classes directory +ndc_classes = read_txt_data("NDC-classes") node_labels = read_node_labels("NDC-classes") -node_labels[[44, 74, 76]] # nodes in Figure 3B +node_labels[[44, 74, 76]] # nodes in Figure 5B lifecycle(ndc_classes, 44, 74, 76) ``` The simplex labels in the last function call are NDC codes. For example, the first one is 67296-1236. This corresponds to Reyataz as produced by Redpharm Drug, Inc. in 2003, as recorded [here](https://ndclist.com/ndc/67296-1236/package/67296-1236-4). -##### Figure 4 (3-node configuration closure probabilities) +##### Figure 6 (3-node and 4-node configuration closure probabilities) This figure is constructed from the simplicial closure probabilities on 3-node and 4-node configurations. Above, we showed how to compute these. We have pre-computed the probabilities for each dataset in the directory `output/3-node-closures/`. ```julia # starting from the main directory of tutorial code include("paper_plots.jl") -closure_probs_heat_map(3) # Figure 4A --> closure-probs-scatter-3.pdf -three_node_scatter_plot() # Figures 4BCD +three_node_scatter_plot() # Figures 6ABC --> closure-prob-scatter-3.pdf +four_node_scatter_plot() # Figures 6DEF --> closure-prob-scatter-4.pdf ``` -##### Figure 5 (4-node configuration closure probabilities) +##### Figure 7 (generalized means) -Similar to Figure 4, this figure is constructed from simplicial closure probabilities on 4-node configurations. We showed above how to compute these. We have pre-computed the probabilities for each dataset in `output/*-4-node-closures.txt`. - -```julia -# starting from the main directory of tutorial code -include("paper_plots.jl") -closure_probs_heat_map(4) # Figure 5A --> closure-probs-4.pdf -four_node_scatter_plot() # Figures 5BCD --> closure-prob-scatter-4.pdf -``` - -##### Figure 6 (generalized means) - -We first show how to collect the data for generalized means. The following code snippet should produce an output file `prediction-output/email-Enron-open-tris-80-100-genmeans-perf.mat`. +We first show how to collect the data for generalized means. The following code snippet should produce an output file `prediction-output/email-Enron-open-tris-80-100-genmeans-perf.jld2`. ```julia # starting from the main directory of tutorial code @@ -380,10 +422,14 @@ We pre-computed the generalized mean scores for all of the datasets in the paper ```julia # starting from the main directory of tutorial code include("paper_plots.jl") -generalized_means_plot() # Figure 6 --> generalized-means-perf.pdf +generalized_means_plot() # --> generalized-means-perf.pdf ``` -##### Table 3 (temporal asynchroncity) + + +### Reproducing results in the supplementary material + +##### Table S1 (temporal asynchronicity) To measure temporal asynchronicity in the datasets, we look at the number of "active interval" overlaps in the open triangles.
The active interval of a pair of nodes is the span of time between the first and last simplices containing both nodes. @@ -400,7 +446,7 @@ dataset & # open triangles & 0 overlaps & 1 overlap & 2 overlaps & 3 overlaps email-Enron & 3317 & 0.008 & 0.130 & 0.151 & 0.711 ``` -##### Table 4 (dependence of tie strength and edge density at different points in time) +##### Table S2 (dependence of tie strength and edge density at different points in time) The results in this table use only the core ScHoLP.jl functionality and the same function we saw above for the simplicial closure probabilities. We provide an extra input parameter to the function `closure_type_counts3()` that pre-filters the dataset to the first X% of timestamped simplices. @@ -423,7 +469,7 @@ for X in [40, 60, 80, 100] end ``` -##### Table 5 (Simplicial closure probabilities at different points in time) +##### Table S3 (Simplicial closure probabilities at different points in time) In describing how to reproduce Table S2, we showed how to get the closure probabilities at different points in time. The following code snippet prints out some of the statistics for other datasets, which are pre-computed and stored in the `output/3-node-closures/` directory. @@ -445,23 +491,74 @@ closure_stats_over_time("DAWN") closure_stats_over_time("tags-stack-overflow") ``` -##### Table 6 (4-node configuration reference figures) +##### Table S5 (extra results from models) + +Here is an example to get all of the results from the email-Enron dataset. + +```julia +include("open_triangle_prediction.jl") +enron = read_txt_data("email-Enron") # read from data/email-Enron directory +collect_labeled_dataset(enron) +collect_local_scores(enron) # scores based on local structural features +collect_walk_scores(enron) # scores based on random walks and paths +collect_logreg_supervised_scores(enron) # scores based on logistic regression +collect_Simplicial_PPR_decomposed_scores(enron) # scores based on Simplicial PPR +evaluate(enron, ["harm_mean", "geom_mean", "arith_mean", "common", "jaccard", "adamic_adar", "proj_graph_PA", "simplex_PA", "UPKatz", "WPKatz", "UPPR", "WPPR", "SimpPPR_comb", "logreg_supervised"]) +``` + +##### Table S6 (extra results from the Hodge decomposition) -This table is just for illustration and does not present computational results. This table shows the results from using the Hodge decomposition to further decompose the simplicial personalized PageRank scores. Note that this software uses the newer normalization method described in the following paper: -##### Table 7 (extra results from the Hodge decomposition) +- [Random walks on simplicial complexes and the normalized Hodge Laplacian](https://arxiv.org/abs/1807.05044). Michael T. Schaub, Austin R. Benson, Paul Horn, Gabor Lippner, and Ali Jadbabaie. *arXiv:1807.05044*, 2018. -This table shows the results from using the Hodge decomposition to further decompose the simplicial personalized PageRank scores. Here is how one would reproduce the line for the NDC-classes dataset (numbers may be slightly different due to randomness). +Here is how one would reproduce the line for the NDC-classes dataset.
```julia # starting from the main directory of tutorial code include("open_triangle_prediction.jl") -ndc_classes = read_txt_data("NDC-classes") # read data from data/NDC-classes directory -collect_labeled_dataset(ndc_classes) # collect the data from the 80/20 split -collect_Simplicial_PPR_decomposed_scores(ndc_classes) # collect scores -evaluate(ndc_classes, ["SimpPPR_comb", "SimpPPR_grad", "SimpPPR_harm", "SimpPPR_curl"]) # print relative scores +# read data from data/NDC-classes directory +ndc_classes = read_txt_data("NDC-classes") +# collect the data from the 80/20 split +collect_labeled_dataset(ndc_classes) +# collect scores +collect_Simplicial_PPR_decomposed_scores(ndc_classes) +# print relative scores +evaluate(ndc_classes, ["SimpPPR_comb", "SimpPPR_grad", "SimpPPR_harm", "SimpPPR_curl"]) +``` + +##### Table S7 (output predictions) + +We showed how to look at the top predictions in the higher-order link prediction section above. Here is the specific command to reproduce Table S7. + +```julia +include("open_triangle_prediction.jl") +dawn = read_txt_data("DAWN") # need to download DAWN data to data/ directory +collect_labeled_dataset(dawn) +collect_local_scores(dawn) +top_predictions(dawn, "adamic_adar", 25) +``` + +##### Figure S1 (heat map of 3-node closures) + +```julia +include("paper_plots.jl") +closure_probs_heat_map(3) +``` + +##### Figure S2 (heat map of 4-node closures) + +```julia +include("paper_plots.jl") +closure_probs_heat_map(4) ``` -##### Table S8 (output predictions) +##### Figure S3 (heat map of 3-node closures at different points in time) + +```julia +include("paper_plots.jl") +for X in [40, 60, 80, 100] + closure_probs_heat_map(3, X) +end +``` -We showed how to look at the top predictions in the higher-order link prediction section above.
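+
+The pre-computed `.jld2` files shipped with this tutorial can also be inspected directly. A minimal sketch (the key names here are the ones read back by `simulation_plot()` in `paper_plots.jl`):
+
+```julia
+using FileIO, JLD2
+data = load("output/simulation/simulation.jld2")
+data["n"]  # values of n across the simulation runs; also try "b", "density", "frac_open"
+```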
diff --git a/common.jl b/common.jl index 8213ca5..62faee9 100644 --- a/common.jl +++ b/common.jl @@ -1,10 +1,28 @@ using Base.Threads using Combinatorics +using DelimitedFiles +using FileIO +using JLD2 +using Random using ScHoLP +using SparseArrays using StatsBase +const NUM_FEATS = 3 +const LOG_AVE_DEG = 1 +const LOG_DENSITY = 2 +const FRAC_OPEN = 3 + function read_txt_data(dataset::String) - read(filename::String) = convert(Vector{Int64}, readdlm(filename, Int64)[:, 1]) + function read(filename::String) + ret = Int64[] + open(filename) do f + for line in eachline(f) + push!(ret, parse(Int64, line)) + end + end + return ret + end return HONData(read("data/$(dataset)/$(dataset)-simplices.txt"), read("data/$(dataset)/$(dataset)-nverts.txt"), read("data/$(dataset)/$(dataset)-times.txt"), @@ -40,13 +58,39 @@ function read_closure_stats(dataset::String, simplex_size::Int64, initial_cutoff end for row_ind in 1:size(data, 1) row = convert(Vector{Int64}, data[row_ind, :]) - push!(keys, (row[1:simplex_size]...)) + push!(keys, tuple(row[1:simplex_size]...)) push!(nsamples, row[end - 1]) push!(nclosed, row[end]) end return (keys, nsamples, nclosed) end +function egonet_train_test_data(trial::Int64) + Random.seed!(444) # for reproducibility + data = load("output/egonets/egonet-data-$trial.jld2") + X = data["X"] + y = data["labels"] + yf = data["full_labels"] + inds = randperm(length(y)) + X = X[inds, :] + y = y[inds] + yf = yf[inds] + + train_inds = Int64[] + test_inds = Int64[] + for label in sort(unique(y)) + inds = findall(y .== label) + end_ind = convert(Int64, round(length(inds) * 0.8)) + append!(train_inds, inds[1:end_ind]) + append!(test_inds, inds[(end_ind + 1):end]) + end + + X_train, X_test = X[train_inds, :], X[test_inds, :] + y_train, y_test = y[train_inds], y[test_inds] + yf_train, yf_test = yf[train_inds], yf[test_inds] + return (X_train, X_test, y_train, y_test, yf_train, yf_test) +end + # This is just a convenient wrapper around all of the formatting parameters for # making plots. 
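+# (The first entry of each row, row[1], is the dataset name; the remaining
+# entries are per-dataset plotting parameters used by paper_plots.jl.)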
function all_datasets_params() @@ -75,3 +119,4 @@ function all_datasets_params() ] return plot_params end +; diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 0000000..d64b504 --- /dev/null +++ b/data/.gitignore @@ -0,0 +1,14 @@ +DAWN +NDC-substances +coauth-DBLP +coauth-MAG-Geology +congress-bills +contact-high-school +contact-primary-school +email-Eu +tags-ask-ubuntu +tags-math-sx +tags-stack-overflow +threads-ask-ubuntu +threads-math-sx +threads-stack-overflow diff --git a/egonet_analysis.jl b/egonet_analysis.jl new file mode 100644 index 0000000..7fb1885 --- /dev/null +++ b/egonet_analysis.jl @@ -0,0 +1,149 @@ +include("common.jl") + +using DataFrames +using GLM +using Printf +using Random +using SparseArrays +using Statistics + +using ScikitLearn +@sk_import linear_model: LogisticRegression + +# Construct HONData for a given ego +function egonet_dataset(dataset::HONData, ego::Int64, B::SpIntMat) + in_egonet = zeros(Bool, size(B, 1)) + in_egonet[ego] = true + in_egonet[findnz(B[:, ego])[1]] .= true + + node_map = Dict{Int64, Int64}() + function get_key(x::Int64) + if haskey(node_map, x); return node_map[x]; end + n = length(node_map) + 1 + node_map[x] = n + return n + end + ego_key = get_key(ego) + + new_simplices = Int64[] + new_nverts = Int64[] + new_times = Int64[] + curr_ind = 1 + for (nvert, time) in zip(dataset.nverts, dataset.times) + end_ind = curr_ind + nvert - 1 + simplex = dataset.simplices[curr_ind:end_ind] + curr_ind += nvert + simplex_in_egonet = [v for v in simplex if in_egonet[v]] + if length(simplex_in_egonet) > 0 + mapped_simplex = [get_key(v) for v in simplex_in_egonet] + append!(new_simplices, mapped_simplex) + push!(new_nverts, length(mapped_simplex)) + push!(new_times, time) + end + end + + return HONData(new_simplices, new_nverts, new_times, "egonet") +end + +function egonet_stats(dataset_name::String, num_egos::Int64) + # read data + dataset = read_txt_data(dataset_name) + A1, At1, B1 = basic_matrices(dataset.simplices, dataset.nverts) + + # Get eligible egos + n = size(B1, 1) + tri_order = proj_graph_degree_order(B1) + in_tri = zeros(Int64, n, Threads.nthreads()) + Threads.@threads for i = 1:n + for (j, k) in neighbor_pairs(B1, tri_order, i) + if B1[j, k] > 0 + tid = Threads.threadid() + in_tri[[i, j, k], tid] .= 1 + end + end + end + eligible_egos = findall(vec(sum(in_tri, dims=2)) .> 0) + num_eligible = length(eligible_egos) + println("$num_eligible eligible egos") + + # Sample from eligible egos + sampled_egos = + eligible_egos[StatsBase.sample(1:length(eligible_egos), + num_egos, replace=false)] + + # Collect statistics + X = zeros(Float64, NUM_FEATS, length(sampled_egos)) + for (j, ego) in enumerate(sampled_egos) + print(stdout, "$j \r") + flush(stdout) + egonet = egonet_dataset(dataset, ego, B1) + A, At, B = basic_matrices(egonet.simplices, egonet.nverts) + + num_nodes = sum(sum(At, dims=1) .> 0) + no, nc = num_open_closed_triangles(A, At, B) + + # log average degree + X[LOG_AVE_DEG, j] = log.(nnz(B) / num_nodes) + # log edge density + X[LOG_DENSITY, j] = log.(nnz(B) / (num_nodes^2 - num_nodes)) + # frac. 
open tris + X[FRAC_OPEN, j] = no / (no + nc) + end + + return convert(SpFltMat, X') +end + +function collect_egonet_data(num_egos::Int64, trial::Int64) + Random.seed!(1234 * trial) # reproducibility + dataset_names = [row[1] for row in all_datasets_params()] + ndatasets = length(dataset_names) + X = zeros(Float64, 0, NUM_FEATS) + labels = Int64[] + full_labels = Int64[] + for (ind, dname) in enumerate(dataset_names) + println("$dname...") + label = nothing + if (dname == "coauth-DBLP" || + dname == "coauth-MAG-Geology" || + dname == "coauth-MAG-History"); label = 0; + elseif (dname == "tags-stack-overflow" || + dname == "tags-math-sx" || + dname == "tags-ask-ubuntu"); label = 1; + elseif (dname == "threads-stack-overflow" || + dname == "threads-math-sx" || + dname == "threads-ask-ubuntu"); label = 2; + elseif (dname == "contact-high-school" || + dname == "contact-primary-school"); label = 3; + elseif (dname == "email-Eu" || + dname == "email-Enron"); label = 4; + end + if label !== nothing + X = [X; egonet_stats(dname, num_egos)] + append!(labels, ones(Int64, num_egos) * label) + # per-dataset label, needed by egonet_train_test_data() in common.jl + append!(full_labels, ones(Int64, num_egos) * ind) + end + end + save("output/egonets/egonet-data-$trial.jld2", + Dict("X" => X, "labels" => labels, "full_labels" => full_labels)) +end + +function egonet_predict(feat_cols::Vector{Int64}) + accs_mlr = Float64[] + accs_rnd = Float64[] + + for trial in 1:20 + (X_train, X_test, y_train, y_test) = egonet_train_test_data(trial)[1:4] + model = LogisticRegression(fit_intercept=true, multi_class="multinomial", + C=10, solver="newton-cg", max_iter=10000) + # restrict to the requested feature columns + ScikitLearn.fit!(model, X_train[:, feat_cols], y_train) + rand_prob = + sum([(sum(y_train .== l) / length(y_train))^2 for l in unique(y_train)]) + push!(accs_mlr, ScikitLearn.score(model, X_test[:, feat_cols], y_test)) + push!(accs_rnd, rand_prob) + end + + # model accuracy, then the random-guessing baseline (mean +/- std over trials) + @printf("%0.2f +/- %0.2f\n", mean(accs_mlr), std(accs_mlr)) + @printf("%0.2f +/- %0.2f\n", mean(accs_rnd), std(accs_rnd)) +end diff --git a/lifecycle_analysis.jl b/lifecycle_analysis.jl index 7f6a6d0..f8d83a6 100644 --- a/lifecycle_analysis.jl +++ b/lifecycle_analysis.jl @@ -87,3 +87,4 @@ function lifecycle(dataset::HONData, u::Int64, v::Int64, w::Int64) println("$simplex_name: $simplex_nodes") end end +; diff --git a/open_triangle_prediction.jl b/open_triangle_prediction.jl index 4b68ba2..bf33e16 100644 --- a/open_triangle_prediction.jl +++ b/open_triangle_prediction.jl @@ -2,9 +2,9 @@ include("common.jl") using ScHoLP -using Combinatorics -using MAT -using PyCall, JLD, PyCallJLD +using LinearAlgebra +using Printf +using PyCall using ScikitLearn @sk_import linear_model: LogisticRegression @@ -14,12 +14,12 @@ const OUTDIR = "prediction-output" basename_str(dataset::String) = "$(OUTDIR)/$dataset-open-tris-80-100" function read_data(dataset::HONData, prcntl1::Int64, prcntl2::Int64) - fname = "$(OUTDIR)/$(dataset.name)-open-tris-$prcntl1-$prcntl2.mat" - data = matread(fname)["data"] + fname = "$(OUTDIR)/$(dataset.name)-open-tris-$prcntl1-$prcntl2.jld2" + data = load(fname)["data"] dataT = data' ntri = size(dataT, 2) - triangles = Vector{NTuple{3,Int64}}(ntri) - labels = Vector{Int64}(ntri) + triangles = Vector{NTuple{3,Int64}}(undef, ntri) + labels = Vector{Int64}(undef, ntri) for i in 1:ntri triangles[i] = (dataT[1, i], dataT[2, i], dataT[3, i]) labels[i] = dataT[4, i] @@ -29,13 +29,12 @@ end function write_scores(dataset::HONData, score_type::String, scores::Vector{Float64}) basename = basename_str(dataset.name) - matwrite("$basename-scores-$score_type.mat", - Dict("scores" => scores)) + 
save("$basename-scores-$score_type.jld2", Dict("scores" => scores)) end function read_scores(dataset::HONData, score_type::String) basename = basename_str(dataset.name) - data = matread("$basename-scores-$score_type.mat") + data = load("$basename-scores-$score_type.jld2") return convert(Vector{Float64}, data["scores"]) end @@ -54,11 +53,11 @@ function collect_local_scores(dataset::HONData) println("geometric mean...") write_scores(dataset, "geom_mean", geometric_mean(triangles, B)) - degrees = vec(sum(spones(B), 1)) + degrees = vec(sum(make_sparse_ones(B), dims=1)) println("projected graph preferential attachment...") write_scores(dataset, "proj_graph_PA", pref_attach3(triangles, degrees)) - simp_degrees = vec(sum(At, 1)) + simp_degrees = vec(sum(At, dims=1)) println("simplex preferential attachment...") write_scores(dataset, "simplex_PA", pref_attach3(triangles, simp_degrees)) @@ -85,29 +84,29 @@ function collect_walk_scores(dataset::HONData) println("Unweighted personalized Katz...") scores, S = PKatz3(triangles, B, true, dense_solve) write_scores(dataset, "UPKatz", scores) - matwrite("$basename-UPKatz.mat", Dict("S" => S)) + save("$basename-UPKatz.jld2", Dict("S" => S)) println("Weighted personalized Katz...") scores, S = PKatz3(triangles, B, false, dense_solve) write_scores(dataset, "WPKatz", scores) - matwrite("$basename-WPKatz.mat", Dict("S" => S)) + save("$basename-WPKatz.jld2", Dict("S" => S)) println("Unweighted personalized PageRank...") scores, S = PPR3(triangles, B, true, dense_solve) write_scores(dataset, "UPPR", scores) - matwrite("$basename-UPPR.mat", Dict("S" => S)) + save("$basename-UPPR.jld2", Dict("S" => S)) println("Weighted personalized PageRank...") scores, S = PPR3(triangles, B, false, dense_solve) write_scores(dataset, "WPPR", scores) - matwrite("$basename-WPPR.mat", Dict("S" => S)) + save("$basename-WPPR.jld2", Dict("S" => S)) end function collect_logreg_supervised_scores(dataset::HONData) function feature_matrix(triangles::Vector{NTuple{3,Int64}}, At::SpIntMat, B::SpIntMat) - degrees = vec(sum(spones(B), 1)) - simp_degrees = vec(sum(At, 1)) + degrees = vec(sum(make_sparse_ones(B), dims=1)) + simp_degrees = vec(sum(At, dims=1)) common_nbrs = common_neighbors_map(B, triangles) ntriangles = length(triangles) X = zeros(Float64, 26, ntriangles) @@ -124,9 +123,9 @@ function collect_logreg_supervised_scores(dataset::HONData) X[12, ind] = length(common_jk) X[13, ind] = length(intersect(common_ij, common_ik, common_jk)) X[14:22, ind] = log.(X[1:9, ind]) - X[23:26, ind] = log.(X[10:13, ind] + 1.0) + X[23:26, ind] = log.(X[10:13, ind] .+ 1.0) end - return X' + return Matrix(X') end triangles = read_data(dataset, 80, 100)[1] @@ -141,9 +140,8 @@ function collect_logreg_supervised_scores(dataset::HONData) train_simplices, train_nverts = split_data(simplices, nverts, times, 60, 80)[1:2] At_train, B_train = basic_matrices(train_simplices, train_nverts)[2:3] X_train = feature_matrix(train_triangles, At_train, B_train) - model = LogisticRegression(fit_intercept=true) + model = LogisticRegression(fit_intercept=true, solver="liblinear") ScikitLearn.fit!(model, X_train, val_labels) - JLD.save("$basename-LR-model.jld", "model", model) X = feature_matrix(triangles, At, B) learned_scores = ScikitLearn.predict_proba(model, X)[:, 2] write_scores(dataset, "logreg_supervised", learned_scores) @@ -156,10 +154,10 @@ function collect_Simplicial_PPR_combined_scores(dataset::HONData) A = basic_matrices(old_simplices, old_nverts)[1] basename = basename_str(dataset.name) - (scores_comb, S_comb, 
edge_map) = Simplicial_PPR3_combined(triangles, A, 0.85) + (scores_comb, S_comb, edge_map) = Simplicial_PPR3_combined(triangles, A, true, 0.85) write_scores(dataset, "SimpPPR_comb", scores_comb) - matwrite("$basename-SimpPPR_comb.mat", - Dict("S" => S_comb, "edge_map" => edge_map)) + save("$basename-SimpPPR_comb.jld2", + Dict("S" => S_comb, "edge_map" => edge_map)) end function collect_Simplicial_PPR_decomposed_scores(dataset::HONData) @@ -171,19 +169,19 @@ function collect_Simplicial_PPR_decomposed_scores(dataset::HONData) (scores_comb, scores_curl, scores_grad, scores_harm, S_comb, S_curl, S_grad, S_harm, edge_map) = - Simplicial_PPR3_decomposed(triangles, A, false, 0.85) + Simplicial_PPR3_decomposed(triangles, A, true, 0.85) write_scores(dataset, "SimpPPR_comb", scores_comb) write_scores(dataset, "SimpPPR_grad", scores_grad) write_scores(dataset, "SimpPPR_curl", scores_curl) - write_scores(dataset, "SimpPPR_harm", scores_harm) - matwrite("$basename-SimpPPR_comb.mat", - Dict("S" => S_comb, "edge_map" => edge_map)) - matwrite("$basename-SimpPPR_grad.mat", - Dict("S" => S_grad, "edge_map" => edge_map)) - matwrite("$basename-SimpPPR_curl.mat", - Dict("S" => S_curl, "edge_map" => edge_map)) - matwrite("$basename-SimpPPR_harm.mat", - Dict("S" => S_harm, "edge_map" => edge_map)) + write_scores(dataset, "SimpPPR_harm", scores_harm) + save("$basename-SimpPPR_comb.jld2", + Dict("S" => S_comb, "edge_map" => edge_map)) + save("$basename-SimpPPR_grad.jld2", + Dict("S" => S_grad, "edge_map" => edge_map)) + save("$basename-SimpPPR_curl.jld2", + Dict("S" => S_curl, "edge_map" => edge_map)) + save("$basename-SimpPPR_harm.jld2", + Dict("S" => S_harm, "edge_map" => edge_map)) end function collect_generalized_means(dataset::HONData) @@ -205,21 +203,20 @@ function collect_generalized_means(dataset::HONData) push!(improvements, improvement) println("($p): $improvement") end - matwrite("$basename-genmeans-perf.mat", - Dict("improvements" => improvements, "ps" => ps)) + save("$basename-genmeans-perf.jld2", + Dict("improvements" => improvements, "ps" => ps)) return (ps, improvements) end function evaluate(dataset::HONData, score_types::Vector{String}) triangles, labels = read_data(dataset, 80, 100) rand_rate = sum(labels .== 1) / length(labels) - println(@sprintf("random: %0.2e", rand_rate)) + @printf("random: %0.2e\n", rand_rate) for score_type in score_types scores = read_scores(dataset, score_type) - assert(length(labels) == length(scores)) ave_prec = average_precision_score(labels, scores) improvement = ave_prec / rand_rate - println(@sprintf("%s: %0.2f", score_type, improvement)) + @printf("%s: %0.2f\n", score_type, improvement) end end @@ -238,14 +235,14 @@ Input parameters: """ function top_predictions(dataset::HONData, score_type::String, topk::Int64=10) triangles, labels = read_data(dataset, 80, 100) - scores = read_scores(dataset, score_type) + scores = read_scores(dataset, score_type) sp = sortperm(scores, alg=QuickSort, rev=true) node_labels = read_node_labels(dataset.name) for rank = 1:topk ind = sp[rank] i, j, k = triangles[ind] - println(@sprintf("%d (%f; %d): %s; %s; %s", rank, scores[ind], - labels[ind], node_labels[i], node_labels[j], node_labels[k])) + @printf("%d (%f; %d): %s; %s; %s\n", rank, scores[ind], + labels[ind], node_labels[i], node_labels[j], node_labels[k]) end end @@ -273,8 +270,8 @@ function collect_labeled_dataset(dataset::HONData) output_data[4, i] = (tri in new_closed_tris) end basename = basename_str(dataset.name) - matwrite("$(OUTDIR)/$(dataset.name)-$(output_name).mat", - 
Dict("data" => output_data')) + save("$(OUTDIR)/$(dataset.name)-$(output_name).jld2", + Dict("data" => output_data')) end old_simplices, old_nverts, new_simplices, new_nverts = diff --git a/output/generalized-means/DAWN-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/DAWN-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 0000000..db88256 Binary files /dev/null and b/output/generalized-means/DAWN-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/DAWN-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/DAWN-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index e70d8f8..0000000 Binary files a/output/generalized-means/DAWN-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/NDC-classes-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/NDC-classes-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 0000000..42b7cb8 Binary files /dev/null and b/output/generalized-means/NDC-classes-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/NDC-classes-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/NDC-classes-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index 8cfa7b3..0000000 Binary files a/output/generalized-means/NDC-classes-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/NDC-substances-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/NDC-substances-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 0000000..4a155e2 Binary files /dev/null and b/output/generalized-means/NDC-substances-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/NDC-substances-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/NDC-substances-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index b8d89c6..0000000 Binary files a/output/generalized-means/NDC-substances-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/coauth-DBLP-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/coauth-DBLP-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 0000000..2d1c5c0 Binary files /dev/null and b/output/generalized-means/coauth-DBLP-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/coauth-DBLP-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/coauth-DBLP-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index 27d61d4..0000000 Binary files a/output/generalized-means/coauth-DBLP-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/coauth-MAG-Geology-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/coauth-MAG-Geology-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 0000000..cb5a83d Binary files /dev/null and b/output/generalized-means/coauth-MAG-Geology-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/coauth-MAG-Geology-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/coauth-MAG-Geology-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index a0a6806..0000000 Binary files a/output/generalized-means/coauth-MAG-Geology-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/coauth-MAG-History-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/coauth-MAG-History-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 
0000000..c6d3b7d Binary files /dev/null and b/output/generalized-means/coauth-MAG-History-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/coauth-MAG-History-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/coauth-MAG-History-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index 2dd1acc..0000000 Binary files a/output/generalized-means/coauth-MAG-History-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/congress-bills-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/congress-bills-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 0000000..1e0be9f Binary files /dev/null and b/output/generalized-means/congress-bills-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/congress-bills-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/congress-bills-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index bbfa0cd..0000000 Binary files a/output/generalized-means/congress-bills-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/congress-committees-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/congress-committees-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index 2a51ca6..0000000 Binary files a/output/generalized-means/congress-committees-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/contact-high-school-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/contact-high-school-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 0000000..1fa89a2 Binary files /dev/null and b/output/generalized-means/contact-high-school-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/contact-high-school-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/contact-high-school-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index dbfaa58..0000000 Binary files a/output/generalized-means/contact-high-school-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/contact-primary-school-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/contact-primary-school-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 0000000..23ea57f Binary files /dev/null and b/output/generalized-means/contact-primary-school-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/contact-primary-school-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/contact-primary-school-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index 7a47ade..0000000 Binary files a/output/generalized-means/contact-primary-school-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/email-Enron-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/email-Enron-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 0000000..1bde10b Binary files /dev/null and b/output/generalized-means/email-Enron-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/email-Enron-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/email-Enron-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index 965e7c5..0000000 Binary files a/output/generalized-means/email-Enron-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/email-Eu-open-tris-80-100-genmeans-perf.jld2 
b/output/generalized-means/email-Eu-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 0000000..7d0e71e Binary files /dev/null and b/output/generalized-means/email-Eu-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/email-Eu-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/email-Eu-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index 02fb972..0000000 Binary files a/output/generalized-means/email-Eu-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/music-rap-genius-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/music-rap-genius-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index 5ab2b2b..0000000 Binary files a/output/generalized-means/music-rap-genius-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/tags-ask-ubuntu-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/tags-ask-ubuntu-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 0000000..6907699 Binary files /dev/null and b/output/generalized-means/tags-ask-ubuntu-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/tags-ask-ubuntu-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/tags-ask-ubuntu-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index dec4848..0000000 Binary files a/output/generalized-means/tags-ask-ubuntu-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/tags-math-sx-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/tags-math-sx-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 0000000..0c67299 Binary files /dev/null and b/output/generalized-means/tags-math-sx-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/tags-math-sx-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/tags-math-sx-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index cbb24fc..0000000 Binary files a/output/generalized-means/tags-math-sx-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/tags-stack-overflow-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/tags-stack-overflow-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 0000000..bf9a006 Binary files /dev/null and b/output/generalized-means/tags-stack-overflow-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/tags-stack-overflow-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/tags-stack-overflow-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index 973148d..0000000 Binary files a/output/generalized-means/tags-stack-overflow-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git a/output/generalized-means/threads-ask-ubuntu-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/threads-ask-ubuntu-open-tris-80-100-genmeans-perf.jld2 new file mode 100644 index 0000000..4a3199f Binary files /dev/null and b/output/generalized-means/threads-ask-ubuntu-open-tris-80-100-genmeans-perf.jld2 differ diff --git a/output/generalized-means/threads-ask-ubuntu-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/threads-ask-ubuntu-open-tris-80-100-genmeans-perf.mat deleted file mode 100644 index b00e28f..0000000 Binary files a/output/generalized-means/threads-ask-ubuntu-open-tris-80-100-genmeans-perf.mat and /dev/null differ diff --git 
diff --git a/output/generalized-means/threads-math-sx-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/threads-math-sx-open-tris-80-100-genmeans-perf.jld2
new file mode 100644
index 0000000..cd6f47c
Binary files /dev/null and b/output/generalized-means/threads-math-sx-open-tris-80-100-genmeans-perf.jld2 differ
diff --git a/output/generalized-means/threads-math-sx-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/threads-math-sx-open-tris-80-100-genmeans-perf.mat
deleted file mode 100644
index 303ade6..0000000
Binary files a/output/generalized-means/threads-math-sx-open-tris-80-100-genmeans-perf.mat and /dev/null differ
diff --git a/output/generalized-means/threads-stack-overflow-open-tris-80-100-genmeans-perf.jld2 b/output/generalized-means/threads-stack-overflow-open-tris-80-100-genmeans-perf.jld2
new file mode 100644
index 0000000..9aa1dbc
Binary files /dev/null and b/output/generalized-means/threads-stack-overflow-open-tris-80-100-genmeans-perf.jld2 differ
diff --git a/output/generalized-means/threads-stack-overflow-open-tris-80-100-genmeans-perf.mat b/output/generalized-means/threads-stack-overflow-open-tris-80-100-genmeans-perf.mat
deleted file mode 100644
index 9fda3a9..0000000
Binary files a/output/generalized-means/threads-stack-overflow-open-tris-80-100-genmeans-perf.mat and /dev/null differ
diff --git a/output/simplex-size-dists/DAWN-simplex-size-dist.jld2 b/output/simplex-size-dists/DAWN-simplex-size-dist.jld2
new file mode 100644
index 0000000..bc3c3a2
Binary files /dev/null and b/output/simplex-size-dists/DAWN-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/DAWN-simplex-size-dist.mat b/output/simplex-size-dists/DAWN-simplex-size-dist.mat
deleted file mode 100644
index c20c1b6..0000000
Binary files a/output/simplex-size-dists/DAWN-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/NDC-classes-simplex-size-dist.jld2 b/output/simplex-size-dists/NDC-classes-simplex-size-dist.jld2
new file mode 100644
index 0000000..578a487
Binary files /dev/null and b/output/simplex-size-dists/NDC-classes-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/NDC-classes-simplex-size-dist.mat b/output/simplex-size-dists/NDC-classes-simplex-size-dist.mat
deleted file mode 100644
index 376a45b..0000000
Binary files a/output/simplex-size-dists/NDC-classes-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/NDC-substances-simplex-size-dist.jld2 b/output/simplex-size-dists/NDC-substances-simplex-size-dist.jld2
new file mode 100644
index 0000000..7d24aef
Binary files /dev/null and b/output/simplex-size-dists/NDC-substances-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/NDC-substances-simplex-size-dist.mat b/output/simplex-size-dists/NDC-substances-simplex-size-dist.mat
deleted file mode 100644
index 680a207..0000000
Binary files a/output/simplex-size-dists/NDC-substances-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/coauth-DBLP-simplex-size-dist.jld2 b/output/simplex-size-dists/coauth-DBLP-simplex-size-dist.jld2
new file mode 100644
index 0000000..140a0dc
Binary files /dev/null and b/output/simplex-size-dists/coauth-DBLP-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/coauth-DBLP-simplex-size-dist.mat b/output/simplex-size-dists/coauth-DBLP-simplex-size-dist.mat
deleted file mode 100644
index d3e0649..0000000
Binary files a/output/simplex-size-dists/coauth-DBLP-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/coauth-MAG-Geology-simplex-size-dist.jld2 b/output/simplex-size-dists/coauth-MAG-Geology-simplex-size-dist.jld2
new file mode 100644
index 0000000..9ada526
Binary files /dev/null and b/output/simplex-size-dists/coauth-MAG-Geology-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/coauth-MAG-Geology-simplex-size-dist.mat b/output/simplex-size-dists/coauth-MAG-Geology-simplex-size-dist.mat
deleted file mode 100644
index 303676e..0000000
Binary files a/output/simplex-size-dists/coauth-MAG-Geology-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/coauth-MAG-History-simplex-size-dist.jld2 b/output/simplex-size-dists/coauth-MAG-History-simplex-size-dist.jld2
new file mode 100644
index 0000000..7b3939b
Binary files /dev/null and b/output/simplex-size-dists/coauth-MAG-History-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/coauth-MAG-History-simplex-size-dist.mat b/output/simplex-size-dists/coauth-MAG-History-simplex-size-dist.mat
deleted file mode 100644
index 5c5ed6f..0000000
Binary files a/output/simplex-size-dists/coauth-MAG-History-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/congress-bills-simplex-size-dist.jld2 b/output/simplex-size-dists/congress-bills-simplex-size-dist.jld2
new file mode 100644
index 0000000..482c274
Binary files /dev/null and b/output/simplex-size-dists/congress-bills-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/congress-bills-simplex-size-dist.mat b/output/simplex-size-dists/congress-bills-simplex-size-dist.mat
deleted file mode 100644
index c3a8f48..0000000
Binary files a/output/simplex-size-dists/congress-bills-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/congress-committees-simplex-size-dist.mat b/output/simplex-size-dists/congress-committees-simplex-size-dist.mat
deleted file mode 100644
index f58cceb..0000000
Binary files a/output/simplex-size-dists/congress-committees-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/contact-high-school-simplex-size-dist.jld2 b/output/simplex-size-dists/contact-high-school-simplex-size-dist.jld2
new file mode 100644
index 0000000..6cac696
Binary files /dev/null and b/output/simplex-size-dists/contact-high-school-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/contact-high-school-simplex-size-dist.mat b/output/simplex-size-dists/contact-high-school-simplex-size-dist.mat
deleted file mode 100644
index 6638b3e..0000000
Binary files a/output/simplex-size-dists/contact-high-school-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/contact-primary-school-simplex-size-dist.jld2 b/output/simplex-size-dists/contact-primary-school-simplex-size-dist.jld2
new file mode 100644
index 0000000..c23ea93
Binary files /dev/null and b/output/simplex-size-dists/contact-primary-school-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/contact-primary-school-simplex-size-dist.mat b/output/simplex-size-dists/contact-primary-school-simplex-size-dist.mat
deleted file mode 100644
index 279c888..0000000
Binary files a/output/simplex-size-dists/contact-primary-school-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/email-Enron-simplex-size-dist.jld2 b/output/simplex-size-dists/email-Enron-simplex-size-dist.jld2
new file mode 100644
index 0000000..c8e8de4
Binary files /dev/null and b/output/simplex-size-dists/email-Enron-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/email-Enron-simplex-size-dist.mat b/output/simplex-size-dists/email-Enron-simplex-size-dist.mat
deleted file mode 100644
index b94f798..0000000
Binary files a/output/simplex-size-dists/email-Enron-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/email-Eu-simplex-size-dist.jld2 b/output/simplex-size-dists/email-Eu-simplex-size-dist.jld2
new file mode 100644
index 0000000..15736a1
Binary files /dev/null and b/output/simplex-size-dists/email-Eu-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/email-Eu-simplex-size-dist.mat b/output/simplex-size-dists/email-Eu-simplex-size-dist.mat
deleted file mode 100644
index b4554a8..0000000
Binary files a/output/simplex-size-dists/email-Eu-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/music-rap-genius-simplex-size-dist.mat b/output/simplex-size-dists/music-rap-genius-simplex-size-dist.mat
deleted file mode 100644
index 2f8730d..0000000
Binary files a/output/simplex-size-dists/music-rap-genius-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/tags-ask-ubuntu-simplex-size-dist.jld2 b/output/simplex-size-dists/tags-ask-ubuntu-simplex-size-dist.jld2
new file mode 100644
index 0000000..89b829f
Binary files /dev/null and b/output/simplex-size-dists/tags-ask-ubuntu-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/tags-ask-ubuntu-simplex-size-dist.mat b/output/simplex-size-dists/tags-ask-ubuntu-simplex-size-dist.mat
deleted file mode 100644
index b1a0d10..0000000
Binary files a/output/simplex-size-dists/tags-ask-ubuntu-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/tags-math-sx-simplex-size-dist.jld2 b/output/simplex-size-dists/tags-math-sx-simplex-size-dist.jld2
new file mode 100644
index 0000000..350a712
Binary files /dev/null and b/output/simplex-size-dists/tags-math-sx-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/tags-math-sx-simplex-size-dist.mat b/output/simplex-size-dists/tags-math-sx-simplex-size-dist.mat
deleted file mode 100644
index bfdeed8..0000000
Binary files a/output/simplex-size-dists/tags-math-sx-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/tags-stack-overflow-simplex-size-dist.jld2 b/output/simplex-size-dists/tags-stack-overflow-simplex-size-dist.jld2
new file mode 100644
index 0000000..5f09ab4
Binary files /dev/null and b/output/simplex-size-dists/tags-stack-overflow-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/tags-stack-overflow-simplex-size-dist.mat b/output/simplex-size-dists/tags-stack-overflow-simplex-size-dist.mat
deleted file mode 100644
index e30ccd2..0000000
Binary files a/output/simplex-size-dists/tags-stack-overflow-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/threads-ask-ubuntu-simplex-size-dist.jld2 b/output/simplex-size-dists/threads-ask-ubuntu-simplex-size-dist.jld2
new file mode 100644
index 0000000..1652187
Binary files /dev/null and b/output/simplex-size-dists/threads-ask-ubuntu-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/threads-ask-ubuntu-simplex-size-dist.mat b/output/simplex-size-dists/threads-ask-ubuntu-simplex-size-dist.mat
deleted file mode 100644
index 66cf5bd..0000000
Binary files a/output/simplex-size-dists/threads-ask-ubuntu-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/threads-math-sx-simplex-size-dist.jld2 b/output/simplex-size-dists/threads-math-sx-simplex-size-dist.jld2
new file mode 100644
index 0000000..20b6881
Binary files /dev/null and b/output/simplex-size-dists/threads-math-sx-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/threads-math-sx-simplex-size-dist.mat b/output/simplex-size-dists/threads-math-sx-simplex-size-dist.mat
deleted file mode 100644
index 80381f6..0000000
Binary files a/output/simplex-size-dists/threads-math-sx-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simplex-size-dists/threads-stack-overflow-simplex-size-dist.jld2 b/output/simplex-size-dists/threads-stack-overflow-simplex-size-dist.jld2
new file mode 100644
index 0000000..bfd0bbb
Binary files /dev/null and b/output/simplex-size-dists/threads-stack-overflow-simplex-size-dist.jld2 differ
diff --git a/output/simplex-size-dists/threads-stack-overflow-simplex-size-dist.mat b/output/simplex-size-dists/threads-stack-overflow-simplex-size-dist.mat
deleted file mode 100644
index ce5b671..0000000
Binary files a/output/simplex-size-dists/threads-stack-overflow-simplex-size-dist.mat and /dev/null differ
diff --git a/output/simulation/simulation.jld2 b/output/simulation/simulation.jld2
new file mode 100644
index 0000000..7ec3457
Binary files /dev/null and b/output/simulation/simulation.jld2 differ
diff --git a/output/simulation/simulation.mat b/output/simulation/simulation.mat
deleted file mode 100644
index ff2f5e6..0000000
Binary files a/output/simulation/simulation.mat and /dev/null differ
diff --git a/output/summary-stats/coauth-MAG-geology-statistics.csv b/output/summary-stats/coauth-MAG-Geology-statistics.csv
similarity index 100%
rename from output/summary-stats/coauth-MAG-geology-statistics.csv
rename to output/summary-stats/coauth-MAG-Geology-statistics.csv
diff --git a/output/summary-stats/coauth-MAG-history-statistics.csv b/output/summary-stats/coauth-MAG-History-statistics.csv
similarity index 100%
rename from output/summary-stats/coauth-MAG-history-statistics.csv
rename to output/summary-stats/coauth-MAG-History-statistics.csv
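The binary changes above replace the precomputed MATLAB-format `.mat` outputs with Julia-native `.jld2` files (the `congress-committees` and `music-rap-genius` outputs are dropped rather than converted). If you still have `.mat` outputs from a checkout that predates this change, a one-off conversion along the following lines should work. This is a sketch, not part of the tutorial: it assumes MAT.jl is installed, which the tutorial itself no longer depends on.

```julia
# Hypothetical one-off conversion of a legacy .mat output to .jld2.
using MAT            # reads the old MATLAB-format files (extra dependency)
using FileIO, JLD2   # `save` picks the JLD2 backend from the .jld2 extension

legacy = matread("output/simulation/simulation.mat")  # Dict{String,Any}
save("output/simulation/simulation.jld2", legacy)     # same keys, JLD2 container
```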
diff --git a/paper_plots.jl b/paper_plots.jl
index 07ad5d7..7ab136f 100644
--- a/paper_plots.jl
+++ b/paper_plots.jl
@@ -1,11 +1,14 @@
 include("common.jl")
 
+using CSV
 using DataFrames
-using MAT
 using PyPlot
 using PyCall
 @pyimport matplotlib.patches as patch
 
+using ScikitLearn
+@sk_import linear_model: LogisticRegression
+
 function dataset_structure_plots()
     plot_params = all_datasets_params()
     datasets = [row[1] for row in plot_params]
@@ -17,7 +20,7 @@ function dataset_structure_plots()
     density3 = Float64[]
     ave_deg3 = Float64[]
     for dataset in datasets
-        data = readtable("output/summary-stats/$dataset-statistics.csv")
+        data = CSV.read("output/summary-stats/$dataset-statistics.csv")
         no = data[1, :nopentri]
         nc = data[1, :nclosedtri]
         push!(frac_open, no / (no + nc))
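A version note on the `readtable` → `CSV.read` change above (not part of the original patch): `readtable` was removed from DataFrames.jl, and on CSV.jl releases newer than the one this tutorial was written against (roughly 0.7 and later), `CSV.read` additionally requires an explicit sink type. If the call in the hunk above errors for you, this is the modern spelling:

```julia
using CSV, DataFrames

# Newer CSV.jl needs the sink; older versions accept CSV.read(path)
# and return a DataFrame directly.
data = CSV.read("output/summary-stats/email-Enron-statistics.csv", DataFrame)
```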
@@ -88,8 +91,8 @@
     show()
 end
 
-function simulation_plots()
-    data = matread("output/simulation/simulation.mat")
+function simulation_plot()
+    data = load("output/simulation/simulation.jld2")
     all_n = data["n"]
     all_b = data["b"]
     all_density = data["density"]
@@ -97,61 +100,30 @@
     all_frac_open = data["frac_open"]
 
     close()
-
+    figure()
     # Edge density
-    subplot(221)
     for (n, cm, marker) in [(200, ColorMap("Purples"), "d"),
                             (100, ColorMap("Reds"), "<"),
                             (50, ColorMap("Greens"), "s"),
                             (25, ColorMap("Blues"), "o"),
                             ]
-        inds = find(all_n .== n)
+        inds = findall(all_n .== n)
         curr_b = all_b[inds]
         density = all_density[inds]
         frac_open = all_frac_open[inds]
-        scatter(density, frac_open, c=curr_b, marker=marker, label="$n", s=6,
-                vmin=minimum(curr_b) - 0.5, vmax=maximum(curr_b) + 0.5, cmap=cm,)
-    end
-    ax = gca()
-    ax[:set_xscale]("log")
-    fsz = 10
-    xlabel("Edge density in projected graph", fontsize=fsz)
-    ylabel("Fraction of triangles open", fontsize=fsz)
-    title("Exactly 3 nodes per simplex (simulated)", fontsize=fsz)
-
-    # Average degree
-    subplot(223)
-    for (n, cm, marker) in [(200, ColorMap("Purples"), "d"),
-                            (100, ColorMap("Reds"), "<"),
-                            (50, ColorMap("Greens"), "s"),
-                            (25, ColorMap("Blues"), "o"),
-                            ]
-        inds = find(all_n .== n)
-        curr_b = all_b[inds]
-        ave_deg = all_ave_deg[inds]
-        frac_open = all_frac_open[inds]
-        scatter(ave_deg, frac_open, c=curr_b, marker=marker, label="$n", s=6,
+        scatter(density, frac_open, c=curr_b, marker=marker, label="$n", s=14,
                 vmin=minimum(curr_b) - 0.5, vmax=maximum(curr_b) + 0.5, cmap=cm)
+    end
     ax = gca()
     ax[:set_xscale]("log")
-    fsz = 10
-    xlabel("Average degree in projected graph", fontsize=fsz)
+    fsz = 20
+    ax[:tick_params]("both", labelsize=fsz-5, length=5, width=1)
+    legend()
+    xlabel("Edge density in projected graph", fontsize=fsz)
     ylabel("Fraction of triangles open", fontsize=fsz)
-    title("Exactly 3 nodes per simplex (simulated)", fontsize=fsz)
-
-    # legend
-    subplot(224)
-    for (n, color, marker) in [(200, "purple", "d"),
-                               (100, "red", "<"),
-                               (50, "green", "s"),
-                               (25, "blue", "o"),
-                               ]
-        scatter([1], [1], marker=marker, color=color, label="n = $n")
-    end
-    legend()
     tight_layout()
-    savefig("simulation.pdf")
+    savefig("simulation.pdf", bbox_inches="tight")
     show()
 end
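Several changes in the hunk above are mechanical Julia 0.6 → 1.0 migrations rather than behavior changes. For instance, `find` applied to a broadcast comparison became `findall`. A minimal standalone illustration:

```julia
# Julia ≥ 1.0: `findall` returns the indices at which the mask is true.
all_n = [25, 50, 25, 100]
inds = findall(all_n .== 25)   # == [1, 3]; was `find(all_n .== 25)` in Julia 0.6
```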
@@ -165,14 +137,16 @@ function simplex_size_dist_plot()
     subplot(221)
     for i in 1:length(datasets)
         dataset = datasets[i]
-        data = matread("output/simplex-size-dists/$dataset-simplex-size-dist.mat")
-        nvert = data["nvert"]
-        counts = data["counts"]
-        tot = sum(counts)
-        fracs = [count / tot for count in counts]
-        ms = (length(dataset) > 8 && dataset[1:8] == "congress") ? 6 : 2
-        loglog(nvert, fracs, marker=markers[i], color=colors[i],
-               linewidth=0.5, markersize=ms)
+        if dataset != "congress-committees" && dataset != "music-rap-genius"
+            data = load("output/simplex-size-dists/$dataset-simplex-size-dist.jld2")
+            nvert = data["nvert"]
+            counts = data["counts"]
+            tot = sum(counts)
+            fracs = [count / tot for count in counts]
+            ms = 4
+            loglog(nvert, fracs, marker=markers[i], color=colors[i],
+                   linewidth=0.5, markersize=ms)
+        end
     end
     fsz = 10
     xlabel("Number of nodes in simplex", fontsize=fsz)
@@ -195,16 +169,16 @@ function min_max_val(probs1::Vector{Float64}, probs2::Vector{Float64})
     return (minimum([p for p in probs if p > 0]), maximum(probs))
 end
 
-function closure_probs_heat_map(simplex_size::Int64)
+function closure_probs_heat_map(simplex_size::Int64, initial_cutoff::Int64=100)
     plot_params = all_datasets_params()
     datasets = [param[1] for param in plot_params]
-    keys, nsamples, nclosed = read_closure_stats(datasets[1], simplex_size)
+    keys, nsamples, nclosed = read_closure_stats(datasets[1], simplex_size, initial_cutoff)
     probs = nclosed ./ nsamples
     P = zeros(length(datasets), length(keys))
     insufficient_sample_inds = []
     for (ind, dataset) in enumerate(datasets)
-        keys, nsamples, nclosed = read_closure_stats(dataset, simplex_size)
+        keys, nsamples, nclosed = read_closure_stats(dataset, simplex_size, initial_cutoff)
         P[ind, :] = nclosed ./ nsamples
         for (key_ind, (key, nsamp)) in enumerate(zip(keys, nsamples))
             if nsamp <= 20
@@ -217,7 +191,7 @@
 
     PyPlot.pygui(true)
     minval = max(1e-9, minimum([v for v in P[:] if v > 0]))
-    P[P[:] .== 0] = minval
+    P[P[:] .== 0] .= minval
     for (i, j) in insufficient_sample_inds; P[i, j] = 0; end
 
     cm = ColorMap("Blues")
@@ -234,16 +208,14 @@
                                  facecolor=gray))
     end
 
     ax[:set_yticks](0:(length(datasets)-1))
-    #ax[:set_yticklabels](datasets, rotation=10, fontsize=(simplex_size == 4 ? 4 : 5))
     ax[:set_yticklabels](datasets, rotation=10, fontsize=(simplex_size == 4 ? 4 : 7))
     ax[:set_xticks](0:(length(probs)-1))
     ax[:tick_params](axis="both", length=3)
     ax[:set_xticklabels](["" for _ in 0:(length(probs)-1)])
-    #cb = colorbar(orientation="horizontal")
     cb = colorbar()
-    cb[:ax][:tick_params](labelsize=(simplex_size == 4 ? 18 : 9))
+    cb[:ax][:tick_params](labelsize=9)
     tight_layout()
-    savefig("closure-probs-$(simplex_size).pdf")
+    savefig("closure-probs-$(simplex_size)-$(initial_cutoff).pdf")
     show()
 end
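The `P[P[:] .== 0] .= minval` change in the hunk above is another 1.0 migration: assigning a scalar into a logically indexed array now requires broadcast assignment. A standalone example:

```julia
# Julia ≥ 1.0 requires `.=` to fill masked entries with a scalar;
# the old `P[P[:] .== 0] = minval` spelling is an error.
P = [0.0 0.5; 0.2 0.0]
minval = 1e-9
P[P .== 0] .= minval   # P is now [1.0e-9 0.5; 0.2 1.0e-9]
```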
@@ -347,7 +319,7 @@ function four_node_scatter_plot()
     minval, maxval = min_max_val(probs0111, probs22)
     loglog([minval, maxval], [minval, maxval], "black", lw=0.5)
     for i in 1:length(datasets)
-        loglog(probs0111[i], probs22[i], markers[i], color=colors[i])
+        loglog(probs22[i], probs0111[i], markers[i], color=colors[i])
     end
-    xlabel("Closure probability (0111)", fontsize=fsz)
-    ylabel("Closure probability (22)", fontsize=fsz)
+    xlabel("Closure probability (22)", fontsize=fsz)
+    ylabel("Closure probability (0111)", fontsize=fsz)
@@ -368,7 +340,7 @@ function generalized_means_plot()
         dataset = param[1]
         if dataset in datasets
             basename = "output/generalized-means/$dataset-open-tris-80-100"
-            data = matread("$basename-genmeans-perf.mat")
+            data = load("$basename-genmeans-perf.jld2")
             ps = data["ps"]
             improvements = data["improvements"]
             plot(ps[2:end-1], improvements[2:end-1],
@@ -381,13 +353,13 @@ function generalized_means_plot()
         ax[:set_xticks](-4:1:4)
         ax[:tick_params](axis="both", length=3)
     end
-
+
     set1 = ["threads-stack-overflow", "threads-math-sx", "threads-ask-ubuntu"]
-    set2 = ["tags-stack-overflow", "tags-math-sx", "tags-ask-ubuntu", "music-rap-genius",
+    set2 = ["tags-stack-overflow", "tags-math-sx", "tags-ask-ubuntu",
            "contact-high-school", "contact-primary-school", "DAWN", "NDC-substances", "NDC-classes"]
     set3 = ["coauth-MAG-History", "coauth-MAG-Geology", "coauth-DBLP",
-           "email-Enron", "email-Eu", "congress-committees", "congress-bills"]
+           "email-Enron", "email-Eu", "congress-bills"]
     subplot(221)
     make_subplot(set1)
     legend(fontsize=fsz)
@@ -395,9 +367,72 @@ function generalized_means_plot()
     make_subplot(set2)
     subplot(223)
     make_subplot(set3)
-    tight_layout()
     savefig("generalized-means-perf.pdf")
     show()
 end
 
+function logreg_decision_boundary(trial::Int64=1)
+    (X, _, y, _, yf, _) = egonet_train_test_data(trial)
+    X = X[:, [LOG_AVE_DEG, FRAC_OPEN]]
+    model = LogisticRegression(fit_intercept=true, multi_class="multinomial", C=10,
+                               solver="newton-cg", max_iter=1000)
+    ScikitLearn.fit!(model, X, y)
+
+    dim = 500
+    minval1, maxval1 = minimum(X[:, 1]) - 0.5, maximum(X[:, 1]) * 1.02
+    minval2, maxval2 = minimum(X[:, 2]) - 0.01, maximum(X[:, 2]) + 0.05
+    grid_feats = zeros(Float64, 2, dim * dim)
+    grid_ind = 1
+    xx = [(i - 1) * (maxval1 - minval1) / dim + minval1 for i in 1:dim]
+    yy = [(j - 1) * (maxval2 - minval2) / dim + minval2 for j in 1:dim]
+    for x in xx, y in yy
+        grid_feats[1, grid_ind] = x
+        grid_feats[2, grid_ind] = y
+        grid_ind += 1
+    end
+
+    close()
+    figure()
+    Z = reshape(ScikitLearn.predict(model, Matrix(grid_feats')), dim, dim)
+    labels = Dict(0 => "coauthorship", 1 => "tags", 2 => "threads",
+                  3 => "contact", 4 => "email")
+    greys = ["#f7f7f7", "#d9d9d9", "#bdbdbd", "#969696", "#636363"]
+    contourf(exp.(xx), yy, Z, colors=greys)
+    params = all_datasets_params()
+    label2domain = Dict(0 => 0, 1 => 0, 2 => 0,
+                        3 => -1,
+                        4 => 1, 5 => 1, 6 => 1,
+                        7 => 2, 8 => 2, 9 => 2,
+                        10 => -1, 11 => -1, 12 => -1, 13 => -1, 14 => -1,
+                        15 => 3, 16 => 3,
+                        17 => 4, 18 => 4)
+    colors_full = ["#ed5e5f", "#e41a1c", "#9f1214",
+                   "no-op",
+                   "#69a3d2", "#377eb8", "#25567d",
+                   "#80c87d", "#4daf4a", "#357933",
+                   "no-op", "no-op", "no-op", "no-op", "no-op",
+                   "#984ea3", "#68356f",
+                   "#d37a48", "#a65628"]
+    for label in sort(unique(yf))
+        inds = findall(yf .== label)
+        scatter(exp.(X[inds, 1]), X[inds, 2],
+                color=colors_full[label],
+                marker="o",
+                label=params[label][1],
+                s=14)
+    end
+    fsz = 18
+    legend(fontsize=fsz-4)
+    ax = gca()
+    ax[:set_xscale]("log")
+    xlabel("Average degree", fontsize=fsz)
+    ylabel("Fraction of triangles open", fontsize=fsz)
+    title("Decision boundary", fontsize=fsz)
+    ax[:set_xlim](1.8, 400)
+    ax[:tick_params](axis="both", length=3, labelsize=14)
+    tight_layout()
+    savefig("decision.pdf")
+    show()
+end
+;
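With the paper_plots.jl changes applied, the renamed and new entry points can be driven from the REPL roughly as follows, assuming the corresponding precomputed results already exist under `output/`:

```julia
include("paper_plots.jl")

simulation_plot()          # reads output/simulation/simulation.jld2, writes simulation.pdf
closure_probs_heat_map(3)  # writes closure-probs-3-100.pdf (default initial_cutoff = 100)
logreg_decision_boundary() # fits the logistic regression and writes decision.pdf
```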
diff --git a/precompute_genmeans.jl b/precompute_genmeans.jl
new file mode 100644
index 0000000..a3c24cc
--- /dev/null
+++ b/precompute_genmeans.jl
@@ -0,0 +1,13 @@
+include("open_triangle_prediction.jl")
+
+function main()
+    datasets = [row[1] for row in all_datasets_params()]
+    for dataset in datasets
+        println("$dataset...")
+        if dataset != "congress-committees" && dataset != "music-rap-genius"
+            hd = read_txt_data(dataset)
+            collect_labeled_dataset(hd)
+            collect_generalized_means(hd)
+        end
+    end
+end
diff --git a/precompute_simplex_sizes.jl b/precompute_simplex_sizes.jl
new file mode 100644
index 0000000..8347c95
--- /dev/null
+++ b/precompute_simplex_sizes.jl
@@ -0,0 +1,21 @@
+include("common.jl")
+
+function main()
+    datasets = [row[1] for row in all_datasets_params()]
+    for dataset in datasets
+        println("$dataset...")
+        if dataset != "congress-committees" && dataset != "music-rap-genius"
+            hd = read_txt_data(dataset)
+            nv_counts = Dict{Int64,Int64}()
+            for v in hd.nverts
+                if !haskey(nv_counts, v); nv_counts[v] = 1
+                else nv_counts[v] += 1
+                end
+            end
+            nverts = [x[1] for x in nv_counts]
+            counts = [x[2] for x in nv_counts]
+            save("output/simplex-size-dists/$dataset-simplex-size-dist.jld2",
+                 Dict("nvert" => nverts, "counts" => counts))
+        end
+    end
+end
diff --git a/simulations.jl b/simulations.jl
index f610803..aea0d11 100644
--- a/simulations.jl
+++ b/simulations.jl
@@ -1,8 +1,7 @@
 include("common.jl")
 
-using Combinatorics
 using Distributions
-using MAT
+using SparseArrays
 
 function simulate_summary_stats(n::Int64, p::Float64)
     bin = Binomial(binomial(n, 3), p)
@@ -47,7 +46,7 @@ function simulate()
             end
         end
     end
-    matwrite("simulation.mat",
-             Dict("n" => all_n, "b" => all_b, "density" => all_density,
-                  "ave_deg" => all_ave_deg, "frac_open" => all_frac_open))
+    save("output/simulation/simulation.jld2",
+         Dict("n" => all_n, "b" => all_b, "density" => all_density,
+              "ave_deg" => all_ave_deg, "frac_open" => all_frac_open))
 end
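After the simulations.jl change, simulation results round-trip through JLD2 instead of MAT. A quick check of the new output, assuming `load` is available via FileIO/JLD2 (which common.jl is assumed to pull in, as paper_plots.jl relies on the same):

```julia
include("simulations.jl")

simulate()  # writes output/simulation/simulation.jld2
data = load("output/simulation/simulation.jld2")
keys(data)  # "n", "b", "density", "ave_deg", "frac_open"
```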
diff --git a/statistical_tests_and_models.jl b/statistical_tests_and_models.jl
index 55aaf5f..c22586b 100644
--- a/statistical_tests_and_models.jl
+++ b/statistical_tests_and_models.jl
@@ -1,6 +1,9 @@
 include("common.jl")
+
+using CSV
 using HypothesisTests
 using DataFrames
+using Printf
 using GLM
 
 function simplicial_closure_tests(significance::Float64=1e-5, X::Int64=100, only_3_node::Bool=false)
@@ -63,11 +66,11 @@ function simplicial_closure_tests(significance::Float64=1e-5, X::Int64=100, only
                 total += 1
             end
         end
-        println(@sprintf("%s (left): %d of %d tests significant at < %g",
-                         test_type, sig_count1, total, significance))
-        println(@sprintf("%s (right): %d of %d tests significant at < %g",
-                         test_type, sig_count2, total, significance))
-        println(@sprintf("%s (raw): %d of %d", test_type, raw, total))
+        @printf("%s (left): %d of %d tests significant at < %g\n",
+                test_type, sig_count1, total, significance)
+        @printf("%s (right): %d of %d tests significant at < %g\n",
+                test_type, sig_count2, total, significance)
+        @printf("%s (raw): %d of %d\n", test_type, raw, total)
     end
 end
 
@@ -78,7 +81,7 @@ function fracopen_logavedeg_linear_models()
     frac_open3 = Float64[]
     ave_deg3 = Float64[]
     for dataset in datasets
-        data = readtable("output/summary-stats/$dataset-statistics.csv")
+        data = CSV.read("output/summary-stats/$dataset-statistics.csv")
         no = data[1, :nopentri]
         nc = data[1, :nclosedtri]
        push!(frac_open, no / (no + nc))
@@ -102,3 +105,4 @@ function fracopen_logavedeg_linear_models()
     model3 = lm(@formula(Y ~ X), data3)
     return (model, model3)
 end
+;
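For completeness, the updated statistical_tests_and_models.jl can be exercised the same way; the defaults reproduce the paper's analyses, assuming the closure statistics and summary statistics under `output/` are in place:

```julia
include("statistical_tests_and_models.jl")

simplicial_closure_tests()  # hypothesis tests at the default significance 1e-5, X = 100
model, model3 = fracopen_logavedeg_linear_models()
```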