# GORC-Dask
This notebook uses Dask to load GORC and run a regex search for references to GitHub repositories.

In [1]:
import json
import re
import pickle
import pandas as pd
from fuzzywuzzy import process
import numpy as np

In [2]:
from dask.distributed import Client, LocalCluster
# Start a local Dask cluster with default settings and attach a client to it.
cluster = LocalCluster()
client = Client(cluster)

In [3]:
import dask.bag as db

In [4]:
# Build a Dask bag over every .jsonl file in the GORC papers directory; each
# bag element is one line of text. Nothing is read until the bag is computed.
papers = db.read_text("/projects/bdata/gorc/papers/*.jsonl")

In [5]:
GITHUB_REGEX = r"\b(?P<github>github\.com\S*(?:(?![\.\/\"\]&\)\'<>[\\])\S)+)\b"
search_regex = re.compile(GITHUB_REGEX)


def fetch_github(paper):
    """Return a paper's metadata dict extended with its GitHub references.

    ``paper`` is one raw JSON record as text. The regex is run over that raw
    text rather than a single field, so a match anywhere in the record is
    reported. The returned dict is the record's ``metadata`` object with
    ``paper_id``, ``github_refs`` (list of matched strings) and
    ``s2_pdf_hash`` added, in that order.
    """
    record = json.loads(paper)
    paper_meta = record["metadata"]
    paper_meta["paper_id"] = record["paper_id"]
    paper_meta["github_refs"] = search_regex.findall(paper)
    paper_meta["s2_pdf_hash"] = record["s2_pdf_hash"]
    return paper_meta
            
# Lazily apply fetch_github to every element of the papers bag.
github_links = papers.map(fetch_github)

In [None]:
# Execute the pipeline and collect the per-paper dicts.
result = github_links.compute()

In [None]:
# Build the frame from the already-computed `result`. Passing the lazy
# `github_links` bag here would, at best, make pandas iterate the bag and
# re-run the whole regex pass over GORC a second time.
gorc_github_refs = pd.DataFrame(result)

In [None]:
# A regex could do this as well; a plain split is enough for host/owner/repo.
def get_repo_name(github_url):
    """Return ``"owner/repo"`` for a matched GitHub reference, else NaN.

    ``github_url`` is expected to start at the host, e.g.
    ``"github.com/owner/repo/tree/master"``. NaN is returned when the
    reference has no owner or no repository segment, including empty
    segments such as ``"github.com/owner//x"`` or ``"github.com//repo"``.
    """
    # Segment 0 is the host; segments 1 and 2 are owner and repository.
    components = github_url.split("/")
    if len(components) > 2 and components[1] and components[2]:
        return components[1] + "/" + components[2]
    return np.nan
# For each paper, turn its list of matched references into a parallel list of
# repository names produced by get_repo_name.
gorc_github_refs["repo_names"] = gorc_github_refs["github_refs"].map(
    lambda refs: [get_repo_name(ref) for ref in refs]
)

In [None]:
# Flatten the per-paper lists of repository names into one flat list.
gorc_github_repo_names = [
    name
    for names in gorc_github_refs["repo_names"]
    for name in names
]

In [None]:
# Flatten the per-paper lists of matched references into one flat list.
expanded_gorc_github_refs = [
    ref
    for refs in gorc_github_refs["github_refs"]
    for ref in refs
]

In [None]:
def expand_df_list(df, lst_col):
    """Return a new frame with one row per element of ``df[lst_col]``.

    Every other column is repeated once per list element, so rows whose list
    is empty are dropped. Column order is preserved and the result has a
    fresh default index.

    The list elements are flattened in plain Python rather than with
    ``np.concatenate`` so that they keep their original objects:
    concatenating lists that mix strings and ``np.nan`` coerces NaN to the
    string ``"nan"``, which ``dropna()`` no longer recognises, and
    concatenating nothing (an empty frame) raises ``ValueError``.
    """
    list_values = df[lst_col]
    # Explicit integer dtype keeps np.repeat happy even when there are no rows.
    lengths = np.fromiter((len(items) for items in list_values), dtype=np.intp)
    expanded = {
        col: np.repeat(df[col].values, lengths)
        for col in df.columns
        if col != lst_col
    }
    expanded[lst_col] = [item for items in list_values for item in items]
    return pd.DataFrame(expanded, columns=df.columns)

# Expand to one row per (paper, repository) pair and give the column a
# singular name.
gorc_github_refs = expand_df_list(gorc_github_refs, "repo_names").rename(
    columns={"repo_names": "repo_name"}
)

In [None]:
# Persist the expanded table; with default arguments pandas also writes the
# index as the first column.
gorc_github_refs.to_csv("/projects/bdata/jupyter/gorc_github_refs.csv")

In [None]:
# Number of distinct repository names referenced; missing names are excluded.
gorc_github_refs["repo_name"].nunique()

In [None]:
# Number of distinct paper_id values remaining in the expanded table.
len(gorc_github_refs.drop_duplicates("paper_id"))

### Extract github repo metadata

In [None]:
# Load the notebook table; it is joined to repositories on repo_id further down.
notebook_info = pd.read_csv("/projects/bdata/jupyter/notebook_info/notebooks.csv")

In [None]:
# Preview the first rows of the notebook table.
notebook_info.head()

In [None]:
repos = pd.read_csv("/projects/bdata/jupyter/notebook_info/repositories.csv")
# Overwrite the bare repository name with "owner/repo" so it can be joined
# against the repo_name column derived from the paper references.
repos["repo_name"] = repos["owner_login"] + "/" + repos["repo_name"]

In [None]:
# Preview the first rows of the repository table.
repos.head()

In [None]:
# Write both tables to the current working directory, without the index column.
repos.to_csv("repos.csv",index = False)
gorc_github_refs.to_csv("gorc_github_refs.csv",index = False)

In [None]:
# Repository names present both in the repositories table and among the names
# extracted from GORC papers.
mutual_reference = set(repos["repo_name"]) & set(gorc_github_repo_names)
len(mutual_reference)

In [None]:
# Join paper references -> repositories (on repo_name) -> notebooks (on
# repo_id), keep one row per notebook, and retain only the nb_id column.
gorc_notebooks = (gorc_github_refs
     .merge(repos, on = "repo_name", how ="inner")
     .merge(notebook_info, on = "repo_id", how = "inner")
     .drop_duplicates("nb_id"))["nb_id"]
# Prefix each id with "nb_" before writing the list out.
gorc_notebooks = "nb_" + gorc_notebooks.astype(str)
gorc_notebooks.to_csv("academic_notebook_ids.csv")

In [None]:
# Row count of the paper/repository reference table.
len(gorc_github_refs)

In [None]:
# `result_df` is not defined anywhere in this notebook, so the original cell
# raised NameError. Write the references table instead, using the same
# arguments as the earlier export of this file.
gorc_github_refs.to_csv("gorc_github_refs.csv", index=False)

## What quick analyses can we do with these data?

In [None]:
# NOTE(review): pd.read_json() is called without a path/buffer argument, so
# this cell cannot run as written — supply the intended JSON source.
metadata = pd.read_json()

In [None]:
# Inspect the "items" field of the first row.
metadata.iloc[0]["items"]

In [None]:
# Build a Dask bag over the repository metadata JSON files; each element is
# one line of text.
repo_metadata_jsons = db.read_text("/projects/bdata/jupyter/repository_metadata/*.json")

In [None]:
def load_json_metadata(metadata_json):
    """Parse one JSON object and drop fields that hold API references.

    A field is dropped when the string form of its value contains
    ``"https://"`` anywhere, which also removes nested containers holding
    such a URL. The remaining fields keep their original order.
    """
    parsed = json.loads(metadata_json)
    return {
        field: value
        for field, value in parsed.items()
        if "https://" not in str(value)
    }

# Clean every metadata record and load the results into a DataFrame.
repo_metadata = pd.DataFrame(repo_metadata_jsons.map(load_json_metadata).compute())

In [None]:
# Persist the cleaned repository metadata without the index column.
repo_metadata.to_csv("/projects/bdata/jupyter/cleaned_repo_metadata.csv", index=False)

# Extract Metadata From GORC 
TODO: This could probably be folded into a single Dask computation.

In [None]:
def fetch_paper_metadata(paper):
    """Return a paper's ``metadata`` dict tagged with its identifiers.

    ``paper`` is one raw JSON record as text; ``paper_id`` and
    ``s2_pdf_hash`` are copied from the record's top level into the dict,
    in that order.
    """
    record = json.loads(paper)
    paper_meta = record["metadata"]
    for key in ("paper_id", "s2_pdf_hash"):
        paper_meta[key] = record[key]
    return paper_meta

In [None]:
# Extract the metadata dict of every paper and collect the results.
gorc_metadata = papers.map(fetch_paper_metadata).compute()