Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

V1.0 #2

Merged
merged 38 commits into from
Nov 25, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
9b72ffd
Making updates for Julia 1.0
arbenson Nov 14, 2018
8b2158e
Updates for 1.0
arbenson Nov 15, 2018
4d6bec2
update
arbenson Nov 15, 2018
3097911
Update for v1.0
arbenson Nov 15, 2018
a1531fd
consistent case-naming of datasets
arbenson Nov 15, 2018
101e0c9
Update simplex size distribution files
arbenson Nov 15, 2018
26d6ce5
New simulation file
arbenson Nov 15, 2018
fc0b193
Use JLD2
arbenson Nov 15, 2018
45b6234
Suppress
arbenson Nov 15, 2018
77522ee
Update paper plots for v1.0
arbenson Nov 15, 2018
d7d12c8
Continuing to prepare for v1.0
arbenson Nov 16, 2018
9e33b77
More generalized means
arbenson Nov 16, 2018
53b7ce6
Move items to common.jl
arbenson Nov 16, 2018
c4743c8
Move items to common.jl
arbenson Nov 16, 2018
4d3fbc4
Move items to common.jl
arbenson Nov 16, 2018
9359e7d
Putting in function to plot the decision boundary of the logistic reg…
arbenson Nov 16, 2018
b93a667
Moving stuff to common.jl
arbenson Nov 16, 2018
55f27af
Egonet analysis code.
arbenson Nov 23, 2018
11a130c
Fix tuple formation.
arbenson Nov 24, 2018
9afdd59
Suppress output
arbenson Nov 24, 2018
47fc0a6
Minor tweaks
arbenson Nov 24, 2018
bc998a7
Continuing to update README
arbenson Nov 24, 2018
0ba956d
Generalized means for tags-stack-overflow
arbenson Nov 24, 2018
e4c9e4a
coauth-DBLP data
arbenson Nov 24, 2018
a371d45
Update heat maps
arbenson Nov 24, 2018
1e71021
Update README with SI figures
arbenson Nov 24, 2018
b54279e
Extra line for gitignore
arbenson Nov 24, 2018
fe12d59
Make dataset reading more efficient
arbenson Nov 24, 2018
2afd4aa
Merge branch 'v1.0' of https://github.com/arbenson/ScHoLP-Tutorial in…
arbenson Nov 24, 2018
7039149
Tweaks for 1.0
arbenson Nov 24, 2018
7b20f3a
Last generalized means dataset
arbenson Nov 24, 2018
94baec8
Update generalized mean plot
arbenson Nov 24, 2018
9dad0d7
A few more things for the readme
arbenson Nov 24, 2018
d8dd266
More 1.0 updates
arbenson Nov 24, 2018
86660c0
Update egonets
arbenson Nov 25, 2018
80fa2a7
Various 1.0 updates
arbenson Nov 25, 2018
d71edcb
Revert to dense solve.
arbenson Nov 25, 2018
61eda25
Get egonet experiments finished
arbenson Nov 25, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*~
279 changes: 188 additions & 91 deletions README.md

Large diffs are not rendered by default.

49 changes: 47 additions & 2 deletions common.jl
Original file line number Diff line number Diff line change
@@ -1,10 +1,28 @@
using Base.Threads
using Combinatorics
using DelimitedFiles
using FileIO
using JLD2
using Random
using ScHoLP
using SparseArrays
using StatsBase

const NUM_FEATS = 3
const LOG_AVE_DEG = 1
const LOG_DENSITY = 2
const FRAC_OPEN = 3

function read_txt_data(dataset::String)
read(filename::String) = convert(Vector{Int64}, readdlm(filename, Int64)[:, 1])
function read(filename::String)
ret = Int64[]
open(filename) do f
for line in eachline(f)
push!(ret, parse(Int64, line))
end
end
return ret
end
return HONData(read("data/$(dataset)/$(dataset)-simplices.txt"),
read("data/$(dataset)/$(dataset)-nverts.txt"),
read("data/$(dataset)/$(dataset)-times.txt"),
Expand Down Expand Up @@ -40,13 +58,39 @@ function read_closure_stats(dataset::String, simplex_size::Int64, initial_cutoff
end
for row_ind in 1:size(data, 1)
row = convert(Vector{Int64}, data[row_ind, :])
push!(keys, (row[1:simplex_size]...))
push!(keys, tuple(row[1:simplex_size]...))
push!(nsamples, row[end - 1])
push!(nclosed, row[end])
end
return (keys, nsamples, nclosed)
end

# Load the saved egonet feature matrix and labels for a given trial and
# produce a stratified 80/20 train/test split (per-label, after a fixed
# random shuffle so the split is reproducible).
function egonet_train_test_data(trial::Int64)
    Random.seed!(444)  # fixed seed for reproducibility
    saved = load("output/egonets/egonet-data-$trial.jld2")
    perm = randperm(length(saved["labels"]))
    X  = saved["X"][perm, :]
    y  = saved["labels"][perm]
    yf = saved["full_labels"][perm]

    # Take the first 80% of each label's (shuffled) members for training.
    train_inds, test_inds = Int64[], Int64[]
    for label in sort(unique(y))
        members = findall(y .== label)
        cutoff = convert(Int64, round(length(members) * 0.8))
        append!(train_inds, members[1:cutoff])
        append!(test_inds, members[(cutoff + 1):end])
    end

    return (X[train_inds, :], X[test_inds, :],
            y[train_inds],    y[test_inds],
            yf[train_inds],   yf[test_inds])
end

# This is just a convenient wrapper around all of the formatting parameters for
# making plots.
function all_datasets_params()
Expand Down Expand Up @@ -75,3 +119,4 @@ function all_datasets_params()
]
return plot_params
end
;
14 changes: 14 additions & 0 deletions data/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
DAWN
NDC-substances
coauth-DBLP
coauth-MAG-Geology
congress-bills
contact-high-school
contact-primary-school
email-Eu
tags-ask-ubuntu
tags-math-sx
tags-stack-overflow
threads-ask-ubuntu
threads-math-sx
threads-stack-overflow
149 changes: 149 additions & 0 deletions egonet_analysis.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
include("common.jl")

using DataFrames
using GLM
using Printf
using Random
using SparseArrays
using Statistics

using ScikitLearn
@sk_import linear_model: LogisticRegression

# Construct HONData for a given ego
# Restrict `dataset` to the egonet of `ego`: keep only the simplex vertices
# that are the ego itself or one of its neighbors in `B`, relabel them with
# contiguous integer IDs (the ego is always relabeled to 1), and return the
# result as a new HONData named "egonet". Simplices with no vertex in the
# egonet are dropped entirely.
function egonet_dataset(dataset::HONData, ego::Int64, B::SpIntMat)
    # Membership mask: the ego plus everything adjacent to it in B.
    member = zeros(Bool, size(B, 1))
    member[ego] = true
    member[findnz(B[:, ego])[1]] .= true

    # Relabel nodes with consecutive integers in order of first appearance.
    relabel = Dict{Int64, Int64}()
    function key_of(v::Int64)
        haskey(relabel, v) && return relabel[v]
        relabel[v] = length(relabel) + 1
        return relabel[v]
    end
    key_of(ego)  # ensure the ego is assigned ID 1 before any other node

    new_simplices, new_nverts, new_times = Int64[], Int64[], Int64[]
    ind = 1
    for (nvert, time) in zip(dataset.nverts, dataset.times)
        simplex = dataset.simplices[ind:(ind + nvert - 1)]
        ind += nvert
        kept = [v for v in simplex if member[v]]
        if !isempty(kept)
            append!(new_simplices, [key_of(v) for v in kept])
            push!(new_nverts, length(kept))
            push!(new_times, time)
        end
    end

    return HONData(new_simplices, new_nverts, new_times, "egonet")
end

# Compute summary statistics for `num_egos` egonets sampled uniformly (without
# replacement) from the eligible egos of dataset `dataset_name`.
#
# An ego is "eligible" if it appears in at least one triangle of the projected
# graph B1 (presumably the weighted projected adjacency from ScHoLP's
# basic_matrices — TODO confirm). For each sampled ego, three features are
# stored in the row positions given by the LOG_AVE_DEG / LOG_DENSITY /
# FRAC_OPEN constants from common.jl.
#
# Returns the feature matrix transposed and converted to SpFltMat, i.e. one
# row per sampled ego and NUM_FEATS columns.
function egonet_stats(dataset_name::String, num_egos::Int64)
    # read data
    dataset = read_txt_data(dataset_name)
    A1, At1, B1 = basic_matrices(dataset.simplices, dataset.nverts)

    # Get eligible egos
    n = size(B1, 1)
    tri_order = proj_graph_degree_order(B1)
    # One flag column per thread so threads never write to the same column;
    # a node is eligible if ANY thread marks it in ANY column.
    # NOTE(review): nodes i, j, k of a triangle may be marked by different
    # threads; the sum over columns below merges the per-thread flags.
    in_tri = zeros(Int64, n, Threads.nthreads())
    Threads.@threads for i = 1:n
        for (j, k) in neighbor_pairs(B1, tri_order, i)
            if B1[j, k] > 0
                tid = Threads.threadid()
                in_tri[[i, j, k], tid] .= 1
            end
        end
    end
    eligible_egos = findall(vec(sum(in_tri, dims=2)) .> 0)
    num_eligible = length(eligible_egos)
    println("$num_eligible eligible egos")

    # Sample from eligible egos (uniformly, without replacement)
    sampled_egos =
        eligible_egos[StatsBase.sample(1:length(eligible_egos),
                      num_egos, replace=false)]

    # Collect statistics, one feature column per sampled ego.
    X = zeros(Float64, NUM_FEATS, length(sampled_egos))
    for (j, ego) in enumerate(sampled_egos)
        print(stdout, "$j \r")  # in-place progress indicator
        flush(stdout)
        egonet = egonet_dataset(dataset, ego, B1)
        A, At, B = basic_matrices(egonet.simplices, egonet.nverts)

        # Count nodes that appear in at least one simplex of the egonet.
        num_nodes = sum(sum(At, dims=1) .> 0)
        no, nc = num_open_closed_triangles(A, At, B)

        # log average degree
        X[LOG_AVE_DEG, j] = log.(nnz(B) / num_nodes)
        # log edge density
        X[LOG_DENSITY, j] = log.(nnz(B) / (num_nodes^2 - num_nodes))
        # frac. open tris
        X[FRAC_OPEN, j] = no / (no + nc)
    end

    return convert(SpFltMat, X')
end

# Assemble the egonet training data for one trial: for every dataset with a
# known domain label, sample `num_egos` egonets, compute their feature
# statistics, and save the stacked feature matrix plus labels to
# output/egonets/egonet-data-$trial.jld2.
#
# NOTE(review): egonet_train_test_data also reads a "full_labels" key from
# this file, which is not written here — confirm another writer supplies it.
function collect_egonet_data(num_egos::Int64, trial::Int64)
    Random.seed!(1234 * trial)  # reproducibility

    # Domain label for each dataset family; datasets not listed are skipped.
    domain_label = Dict{String, Int64}(
        "coauth-DBLP"            => 0,
        "coauth-MAG-Geology"     => 0,
        "coauth-MAG-History"     => 0,
        "tags-stack-overflow"    => 1,
        "tags-math-sx"           => 1,
        "tags-ask-ubuntu"        => 1,
        "threads-stack-overflow" => 2,
        "threads-math-sx"        => 2,
        "threads-ask-ubuntu"     => 2,
        "contact-high-school"    => 3,
        "contact-primary-school" => 3,
        "email-Eu"               => 4,
        "email-Enron"            => 4,
    )

    dataset_names = [row[1] for row in all_datasets_params()]
    X = zeros(Float64, 0, NUM_FEATS)
    labels = Int64[]
    for dname in dataset_names
        println("$dname...")
        label = get(domain_label, dname, nothing)
        # Use identity comparison for `nothing` (was `label != nothing`).
        if label !== nothing
            X = [X; egonet_stats(dname, num_egos)]
            append!(labels, fill(label, num_egos))
        end
    end
    save("output/egonets/egonet-data-$trial.jld2",
         Dict("X" => X, "labels" => labels))
end

# Train a multinomial logistic regression on the egonet features restricted to
# the columns in `feat_cols`, and report mean +/- std test accuracy over
# `ntrials` train/test splits, together with the accuracy of a baseline that
# guesses each class with its empirical training frequency.
#
# Fixes: the original accepted `feat_cols` but never used it (all feature
# columns were passed to the model); leftover `@show typeof(...)` debug
# output is removed. `ntrials` is a new keyword with the original default.
function egonet_predict(feat_cols::Vector{Int64}; ntrials::Int64=20)
    accs_mlr = Float64[]  # model accuracy per trial
    accs_rnd = Float64[]  # random-guess baseline per trial

    for trial in 1:ntrials
        (X_train, X_test, y_train, y_test) = egonet_train_test_data(trial)[1:4]
        # Restrict to the requested feature columns (previously ignored).
        X_train = X_train[:, feat_cols]
        X_test = X_test[:, feat_cols]
        model = LogisticRegression(fit_intercept=true, multi_class="multinomial",
                                   C=10, solver="newton-cg", max_iter=10000)
        ScikitLearn.fit!(model, X_train, y_train)
        # Baseline: expected accuracy of guessing each class with its
        # empirical frequency = sum of squared class proportions.
        rand_prob =
            sum([(sum(y_train .== l) / length(y_train))^2 for l in unique(y_train)])
        push!(accs_mlr, ScikitLearn.score(model, X_test, y_test))
        push!(accs_rnd, rand_prob)
    end

    @printf("%0.2f +/- %0.2f\n", mean(accs_mlr), std(accs_mlr))
    @printf("%0.2f +/- %0.2f\n", mean(accs_rnd), std(accs_rnd))
end
1 change: 1 addition & 0 deletions lifecycle_analysis.jl
Original file line number Diff line number Diff line change
Expand Up @@ -87,3 +87,4 @@ function lifecycle(dataset::HONData, u::Int64, v::Int64, w::Int64)
println("$simplex_name: $simplex_nodes")
end
end
;
Loading