In [None]:
include("init.jl")

In [None]:
import HTTP, Plots, LightGraphs, JSON, GitHub, Base64, MetaGraphs, GraphPlot, FileIO, TranscodingStreams, CodecZlib
const lg = LightGraphs
const mg = MetaGraphs

# Todo:
# Better wrapper for requests, dealing with "please slow down requests" etc

## GitHub crawler

Not remotely finished.

### Todo

- Error handling (esp rate limiting)
- Store in package database alongside NPM (see further down)
- Store more info than currently
- Get package.json automatically

In [None]:
"""
    fetchtoprepos(pages=1:10)

Query the GitHub repository search API for JavaScript/TypeScript repositories
sorted by stars, one request per entry of `pages`.

Returns an n×2 `Matrix{Any}` with one row per hit: column 1 is `"owner/name"`,
column 2 is the repository's `stargazers_count`. No rate-limit or error handling
is done here; a failed request throws.
"""
function fetchtoprepos(pages=1:10)
    reponames = String[]
    starcounts = Int[]
    for page in pages # NB: Only allowed 10 requests a minute without a key. 30 if we use one.
        r = HTTP.request(
            "GET",
            "https://api.github.com/search/repositories?q=language:javascript+language:typescript&sort=stars&page=$page",
            # GitHub requires a User-Agent header, so we pass one explicitly
            # (see https://developer.github.com/v3/#user-agent-required; HTTP.jl also offers
            # https://juliaweb.github.io/HTTP.jl/stable/index.html#HTTP.MessageRequest.setuseragent!)
            Dict("User-Agent"=>"node-conservation-society"),
        )
        results = JSON.parse(r.body |> String)
        for hit in results["items"]
            push!(reponames, hit["owner"]["login"] * "/" * hit["name"])
            push!(starcounts, hit["stargazers_count"])
        end
    end
    # Assemble the matrix once at the end. Seeding it with a real (empty) container
    # instead of the type `Array{Any,2}` means there is no placeholder row to strip.
    return Any[reponames starcounts]
end

table = fetchtoprepos()

In [None]:
# Distribution of star counts over the crawled repositories (log-scaled bin counts).
# By the time `table` reaches this cell every row is a repository, so plot the whole
# column; slicing from row 2 would silently drop the first repository returned.
Plots.histogram(
    table[:, 2];
    legend=:none,
    yscale=:log10,
    ylabel="Count",
    xlabel="Stars"
)
# huh, this isn't a power law. weird.

In [None]:
#filter(k->occursin("star",k),results["items"][1]|>keys)

In [None]:
#toypackages = Dict(
#    "root" => Dict(
#        "devDependencies" => Dict(
#            "source" => "1.1",
#        ),
#    ),
#    "source" => getnpmdeets("fuse.js","3.3.0"),
#    #addnodeanddeps! takes a long time for fuse.
#)

In [None]:
# Read the gzipped package cache from disk so that we stop hammering NPM
toypackages = let gzipped = read("pkg_cache.json.gz")
    rawjson = TranscodingStreams.transcode(CodecZlib.GzipDecompressor, gzipped)
    JSON.parse(String(rawjson))
end

In [None]:
"""
    getpkgjson(user_slash_repo)

Fetch `package.json` from the GitHub repository `user_slash_repo` (`"owner/name"`)
and return it parsed as a `Dict`.

This is best effort: if anything goes wrong (lookup, decoding or parsing) the
failure is logged with `@warn` and an empty `Dict{String,Any}` is returned, so
callers cannot distinguish "no package.json" from "request failed" by the return
value alone.
"""
function getpkgjson(user_slash_repo)
    # Todo: catch 404s etc
    try
        package = GitHub.file(user_slash_repo, "package.json")
        # split/join removes all whitespace from the base64 payload before decoding
        encoded = join(split(package.content))
        return JSON.parse(String(Base64.base64decode(encoded)))
    catch e
        # Log instead of swallowing silently, so rate limiting and similar problems are visible
        @warn "Could not get package.json from GitHub; using an empty Dict" repo = user_slash_repo exception = e
        return Dict{String,Any}()
    end
end
# Store the package.json of `user_slash_repo` in `pkgs`, keyed by the bare repository
# name (everything up to and including the last "/" is removed). Returns the stored value.
function addghdeets!(user_slash_repo, pkgs=toypackages)
    reponame = replace(user_slash_repo, r".*/" => "")
    return pkgs[reponame] = getpkgjson(user_slash_repo)
end

In [None]:
map(i -> addghdeets!(table[i, 1]), 20:100) # triggers rate limit by the looks of things

In [None]:
#merge(
#    get(deps,"dependencies",Dict{String,Any}()),
#    get(deps,"devDependencies",Dict{String,Any}())
#)
# Need to deal with version strings via npm registry, github / git:// / gitlab?
# Probably lots of error checking
# https://blog.npmjs.org/post/164799520460/api-rate-limiting-rolling-out error code 429 is "please slow down"

## NPM crawler

### Todo:

- Error handling (we currently skip many errors)
- Weird version handling: URLs, stars, etc
    - Stars (`*`) as versions: Julia has nice `v"0.5.2"`-style version numbers that support `<` / `>` comparisons, so we can fetch all version numbers from NPM and take the maximum
    - URLs as versions: probably should use GitHub (and GitLab API) to get package.json, otherwise just give up
- Less silly names for things
- Get rid of debugging printlns
- Name collisions: a GitHub repo name can clash with an unrelated NPM package name (e.g., airbnb/javascript). Consider splitting their caches up.

In [None]:
"""
    getnpmdeets(pkgname, version="")

Request `https://registry.npmjs.org/<pkgname>/<version>` and return the parsed JSON
response.

This is best effort: any failure (request error, bad status, unparsable body) is
logged with `@warn` and an empty `Dict{String,Any}` is returned. Each lookup is
also logged at debug level; enable it with e.g. `ENV["JULIA_DEBUG"] = "all"`.
"""
function getnpmdeets(pkgname, version="")
    # NPM registry API: https://github.com/npm/registry/blob/master/docs/REGISTRY-API.md / https://registry.npmjs.org/
    @debug "Querying NPM registry" pkgname version
    try
        # /package/version - need to strip non-numeric / . characters from version?
        r = HTTP.request("GET", "https://registry.npmjs.org/$pkgname/$version")
        # Can use "maintainers" instead of GitHub contributors if we're feeling lazy
        return JSON.parse(String(r.body))
    catch e
        # need to deal with 404s etc; until then, at least make the failure visible
        @warn "NPM lookup failed; using an empty Dict" pkgname version exception = e
        return Dict{String,Any}()
    end
end

In [None]:
# Look `pkgname` (optionally at `version`) up on NPM and store the result in `pkgs`
# under `pkgname`; the stored value is returned.
# Julia convention puts the mutated argument first, but `pkgs` is last here so that
# it can be optional.
function addnpmdeets!(pkgname, version="", pkgs=toypackages)
    deets = getnpmdeets(pkgname, version)
    pkgs[pkgname] = deets
    return deets
end

In [None]:
# Render a version as "major.minor.patch"
ver2str(v) = join((v.major, v.minor, v.patch), ".")

# Reduce an NPM version spec to a plain "major.minor.patch" string by dropping every
# character that is not a digit or a dot and parsing the remainder as a VersionNumber.
# Anything that does not parse (e.g. "*") yields "".
# need to deal with * version string : (
# can probs just do maximum(VersionNumber.(pkgjson[thething]["versions"] |> keys))
function cleanver(x)
    try
        digitsanddots = replace(x, r"[^0-9.]" => "")
        parsed = VersionNumber(digitsanddots)
        return ver2str(parsed)
    catch
        return ""
    end
end

In [None]:
# Pseudo-code for (meta)digraph building:
# big dict of package name -> Dict(deps,devDeps,etc)
# g = lg.MetaGraph(lg.DiGraph())
# for each root package (popular starred one)
    # if it doesn't exist, make a node
    # for each dep
        # make a node if one doesn't exist
            # if it didn't exist, for each dep
                # make a node etc. etc.
                # link to that node
        # link to that node

# 
# addnodeanddeps!(g,pkgname) = begin
    # search metagraph for thisnode
    # if it doesn't exist, addedge(thisnode,addnodeanddeps!(g,pkgname))
    # return thisnode
# end

In [None]:
"""
    addnodeanddeps!(g, pkgname, version="", pkgs=toypackages)

Make sure `g` has a vertex named `pkgname` and return its index.

If `pkgname` is missing from `pkgs` it is first looked up via `addnpmdeets!` (using
`version`). When the vertex is new, every entry of the package's
`"devDependencies"` and `"dependencies"` is added recursively and linked with an
edge from this package to the dependency. A package that is already in the graph
is returned as-is, which also stops the recursion on dependency cycles.

`g` must have `:name` set as its indexing property.
"""
function addnodeanddeps!(g, pkgname, version="", pkgs=toypackages)
    # if pkgname not in pkgs, look it up on NPM, add to pkgs
    if !haskey(pkgs, pkgname)
        addnpmdeets!(pkgname, version, pkgs)
    end
    try
        # Already in the graph: nothing more to do
        return g[pkgname, :name]
    catch
        # the lookup throws when no vertex carries this name yet; fall through and add it
    end
    lg.add_vertex!(g)
    node = lg.nv(g)
    # Name the vertex before recursing so that cyclic dependencies find it
    mg.set_prop!(g, node, :name, pkgname)
    # we perhaps want the nested get to throw an error
    pkgjson = get(pkgs, pkgname, Dict{String,Any}())
    for depkind in ("devDependencies", "dependencies")
        for (depname, depversion) in get(pkgjson, depkind, Dict{String,Any}())
            # Operate on the graph and cache that were passed in, not on the globals
            depnode = addnodeanddeps!(g, depname, cleanver(depversion), pkgs)
            lg.add_edge!(g, node, depnode)
        end
    end
    return node
end

In [None]:
import ProgressMeter

In [None]:
# Build the dependency graph, starting from a single root package.
G = mg.MetaDiGraph()
# Index vertices by their :name property; addnodeanddeps! looks them up as g[name, :name]
mg.set_indexing_prop!(G,:name)
# Add "vscode" and, recursively, its dependencies and devDependencies
addnodeanddeps!(G,"vscode")
#ProgressMeter.@showprogress [addnodeanddeps!(G,table[i,1]) for i in 1:20]
# Show the resulting graph as the cell output
G

In [None]:
# Names ("owner/name") in the first 20 rows of the crawl table
map(i -> table[i, 1], 1:20)

In [None]:
# Show the graph again as the cell output
G

In [None]:
#GraphPlot.gplot(
#    G,
#   #nodelabel=(
#   #    mg.get_prop(G,i,:name) for i in 1:lg.nv(G)
#   #),
#)

In [None]:
# Lazy generator over the :name property of every vertex; nothing is evaluated
# until it is iterated (wrap it in collect(...) to see the names)
(mg.get_prop(G,i,:name) for i in 1:lg.nv(G))

In [None]:
# Insert an empty entry under the key "yes" into the package cache
# NOTE(review): looks like a scratch experiment; the entry is written to disk
# when the cache is saved at the end of the notebook. Confirm that is intended.
toypackages["yes"] = Dict{String,Any}()

In [None]:
# Is there a cache entry called "root"?
haskey(toypackages, "root")

In [None]:
# plan to deal with * version number: ask npm about all versions, convert all versions to v"", then just max. job done.

In [None]:
# Vertices in ascending order of PageRank, as (name, vertex, score) tuples
let scores = lg.pagerank(G)
    ranked = sort(collect(enumerate(scores)); by=last)
    [(mg.get_prop(G, v, :name), v, score) for (v, score) in ranked]
end

In [None]:
# Vertices in ascending order of betweenness centrality, as (name, vertex, score) tuples
let scores = lg.Parallel.betweenness_centrality(G)
    ranked = sort(collect(enumerate(scores)); by=last)
    [(mg.get_prop(G, v, :name), v, score) for (v, score) in ranked]
end

In [None]:
# Look up the cached "description" field of the `liftoff` package
toypackages["liftoff"]["description"]
# Liftoff is already a pretty good candidate.
# Very little activity since July 2016. 500 projects use it.

In [None]:
# Histogram of the PageRank scores, with log-scaled bin counts
lg.pagerank(G) |> scores -> Plots.histogram(scores; yscale=:log10)

In [None]:
# PageRank score of vertex 414
# NOTE(review): 414 is a hard-coded vertex index, presumably a package of interest
# from the rankings above; it is only meaningful for the graph built in that session.
lg.pagerank(G)[414]

In [None]:
import JLD2, FileIO

In [None]:
# Number of neighbours (in either direction) of vertex 414, queried directly
# instead of building the count for every vertex and then indexing into the result.
# NOTE(review): 414 is a hard-coded vertex index tied to the graph built in this session.
length(lg.all_neighbors(G, 414))

In [None]:
# Vertices in ascending order of in-neighbour count, as (name, vertex, count) tuples
let indegrees = [(v, length(lg.inneighbors(G, v))) for v in 1:lg.nv(G)]
    ranked = sort(indegrees; by=last)
    [(mg.get_prop(G, v, :name), v, deg) for (v, deg) in ranked]
end

# Network analysis

- Dependency importance:
    - PageRank (look for an algorithm that takes authority (i.e., stars) into account)
        - Think about why/how this makes sense: presumably because there is a time lag - the further away you are from an important package, the longer it takes you to infect it.
    - Betweenness centrality
        - Ditto
    - Number of in-neighbours
    - Stars of reachable package with most stars
    - Sum of all stars of all reachable packages
        - These two ignore time-lag
        
# Other stuff

- Pretty easy to repeat the exercise for Julia packages.
    - https://libraries.io/api has a nice API that lets us get dependencies, dependents, other metadata for lots of packages and lots of languages. Would be really neat to compare the ecosystems.
        - Rate limit seems mean, though - only allowed 60 requests a minute.
- Need to start storing metadata etc.

- Could be fun to look at collaboration network and see whether we can predict star success...


In [None]:
#Threads.@threads for i in 1:5
#    println(i)
#end # aka - how to crash Julia

In [None]:
# Save package data
#
# Serialise and compress *before* opening the file: opening with "w" truncates it
# straight away, so a failure while encoding must not be able to wipe the cache.
let compressed = TranscodingStreams.transcode(
        CodecZlib.GzipCompressor, Vector{UInt8}(JSON.json(toypackages, 4))
    )
    open("pkg_cache.json.gz", "w") do f
        write(f, compressed)
    end
end;