# RF5 GitHub Repo-Level Metrics

- num contributors
- num trusted_contributors
- num active_contributors_6_months
- num stars
- num trusted_stars
- num trust_weighted_stars
- num forks
- num trusted_forks
- num trust_weighted_forks
- age_of_repo
- license

In [1]:
from google.cloud import bigquery
import json
import os
import pandas as pd

In [2]:
# add GCP project and credentials here
# PROJECT is the BigQuery project prefix used in the table references below;
# the environment variable points the Google client library at the credentials file.
PROJECT = 'opensource-observer'
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS='../../../gcp_credentials.json')

# connect to OSO (this client runs every query in the notebook)
client = bigquery.Client()

# Load and process the applications

In [3]:
# Read the applications export. The context manager closes the file handle
# deterministically, and the explicit UTF-8 encoding keeps non-ASCII project
# names (e.g. emoji) intact regardless of the platform's default encoding.
with open("data/applications.json", encoding="utf-8") as applications_file:
    applications = json.load(applications_file)

In [4]:
# Flatten the applications into one record per (project, repo). Projects that
# list no repos still get a single record with empty repo fields so they are
# not lost from the table.

# Known-bad repo URLs (lower-cased) mapped to the repo that should be used instead.
REPO_URL_FIXES = {
    'https://github.com/protocolguild/membership': 'https://github.com/protocolguild/documentation',
}

projects_data = []

for app in applications:
    # `or {}` / `or []` also cover keys that are present but null in the JSON,
    # which `.get(key, default)` alone would pass through as None.
    project = app.get('project') or {}
    repos = project.get('repos') or []
    project_name = project.get('name')
    project_id = project.get('id')

    if not repos:
        projects_data.append({
            'project_name': project_name,
            'project_id': project_id,
            'repo_url': None,
            'repo_name': None
        })
        continue

    for repo in repos:
        repo_url = repo.get('url', None)

        # A repo entry may have no URL; only string URLs can be compared
        # against the fix list (calling .lower() on None would raise).
        if isinstance(repo_url, str):
            repo_url = REPO_URL_FIXES.get(repo_url.lower(), repo_url)

        projects_data.append({
            'project_name': project_name,
            'project_id': project_id,
            'repo_url': repo_url,
            'repo_name': repo.get('name', None)
        })

df_projects = pd.DataFrame(projects_data)

def extract_owner_and_repo(url):
    """Return the lower-cased ``owner/repo`` slug of a GitHub URL.

    The first two non-empty path segments after ``github.com/`` are taken as
    owner and repository. Deeper paths (``/tree/main``), query strings,
    fragments, surrounding whitespace and a trailing ``.git`` are ignored, and
    the URL scheme is optional.

    Parameters
    ----------
    url : Any
        Candidate repository URL; non-string or empty values are accepted.

    Returns
    -------
    str or None
        ``"owner/repo"`` in lower case, or ``None`` when ``url`` is not a
        string, is not a GitHub URL, or lacks an owner or repo segment.
    """
    if not url or not isinstance(url, str):
        return None

    normalized = url.strip().lower()
    marker = "github.com/"
    marker_pos = normalized.find(marker)
    if marker_pos == -1:
        return None

    # Keep only the path portion: drop any query string or fragment.
    path = normalized[marker_pos + len(marker):]
    for separator in ('?', '#'):
        path = path.split(separator, 1)[0]

    # Ignore empty segments so "owner/" or "owner//repo" cannot yield a blank name.
    segments = [segment for segment in path.split('/') if segment]
    if len(segments) < 2:
        return None

    owner, repo = segments[0], segments[1]
    # Clone URLs end in ".git", which is not part of the repository name.
    if repo.endswith('.git'):
        repo = repo[:-len('.git')]
    if not repo:
        return None

    return f"{owner}/{repo}"

def clean_repo_url(owner_and_name):
    """Build the canonical GitHub URL for an ``owner/repo`` slug.

    Returns ``None`` when the slug is missing or empty.
    """
    if not owner_and_name:
        return None
    return "https://github.com/{}".format(owner_and_name)

# Normalize every submitted repo URL to an "owner/repo" slug and rebuild a
# canonical lower-case URL from it (both stay None when the URL is unusable).
df_projects['repo_owner_and_name'] = df_projects['repo_url'].map(extract_owner_and_repo)
df_projects['clean_url'] = df_projects['repo_owner_and_name'].map(clean_repo_url)

# project_id -> project_name lookup, used later to label the metrics table
project_name_mappings = dict(zip(df_projects['project_id'], df_projects['project_name']))

df_projects

Unnamed: 0,project_name,project_id,repo_url,repo_name,repo_owner_and_name,clean_url
0,Stepan's test project,0xbdd994bf9b06072f6f8603591c8907ca5a09a21fa14d...,,,,
1,Jonas Project,0x83b46efce8ff1937a49883b323b22d3483d1843522f6...,https://github.com/JSeiferth/op-analytics,This is my repo,jseiferth/op-analytics,https://github.com/jseiferth/op-analytics
2,Test Project,0x52d53d44856f5a356053e55e3ad339d7207486b0210f...,,,,
3,MonstersOnBase,0x1480383a90f1ab95cfedee6ea130dca700e5236eceb7...,https://github.com/mali030303/MonstersOnBasee,,mali030303/monstersonbasee,https://github.com/mali030303/monstersonbasee
4,BASE BTC EARTH 🌍,0xcb0434aaf86a5dca68f96edefc22bf04a16bbfaa48b0...,https://github.com/mali030303/BASE-BTC-EARTH--,,mali030303/base-btc-earth--,https://github.com/mali030303/base-btc-earth--
...,...,...,...,...,...,...
198,NFT Starter,0xd8012752e512ed8c6a47e61293bbad8cd942c9d016de...,https://github.com/nonboring/nft-starter,NFT Starter,nonboring/nft-starter,https://github.com/nonboring/nft-starter
199,Hedgio Podcast,0xe88826f4dde8d4abaa647a68763a4826fbb446e977cc...,,,,
200,Blockchef | Blockchain; From Farm to Fork,0x679e661b67e85d937baace0bc56a6274dddfdffd8cce...,https://github.com/blockchef-io/OP-RPGF,OP-RPGF Repo,blockchef-io/op-rpgf,https://github.com/blockchef-io/op-rpgf
201,Rentality,0xf04c2c47624bc0191001f655e147986439541493ee06...,https://github.com/Rentality-xyz/Rentality-dem...,,rentality-xyz/rentality-demo-contracts,https://github.com/rentality-xyz/rentality-dem...


In [5]:
# distinct, non-null normalized repo URLs (first-seen order) to look up in OSO
repo_urls = df_projects['clean_url'].dropna().drop_duplicates().tolist()
len(repo_urls)

151

# Fetch data from OSO

In [6]:
# Get snapshot of repo metrics (taken 2024-09-16)

# The repo URLs originate from user-submitted applications, so they are passed
# to BigQuery as an array query parameter instead of being spliced into the SQL
# text. A URL containing a quote can then neither break nor alter the query.
# (The project prefix is still interpolated: identifiers cannot be parameterized.)
repos_query = f"""
    select
      abp.artifact_id,
      abp.artifact_namespace,
      abp.artifact_name,
      abp.artifact_url,
      abp.artifact_type,
      rm.is_fork,
      rm.fork_count,
      rm.star_count,
      rm.language,
      rm.license_spdx_id,
      abp.project_id as oso_project_id,
    from `{PROJECT}.oso.int_artifacts_in_ossd_by_project` as abp
    join `{PROJECT}.oso.int_repo_metrics_by_project` as rm
      on abp.artifact_id = rm.artifact_id
    where abp.artifact_url in unnest(@repo_urls)
"""
repos_job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ArrayQueryParameter("repo_urls", "STRING", repo_urls),
    ]
)
repos_query_result = client.query(repos_query, job_config=repos_job_config)
df_repos = repos_query_result.to_dataframe()
df_repos.tail(1)

Unnamed: 0,artifact_id,artifact_namespace,artifact_name,artifact_url,artifact_type,is_fork,fork_count,star_count,language,license_spdx_id,oso_project_id
136,aXbDRz3QGK6qa0CcrBNLAMi1qlw_eVaB-vq06PnT0O4=,kroma-network,kroma,https://github.com/kroma-network/kroma,REPOSITORY,False,80,158,Go,NOASSERTION,F8npEwagURJOf7hOCr27eOcUjo9m51wa4KlOH0ZsO9c=


In [7]:
# identify any repos in apps that do not have data
print("Ignored repos:")
valid_repo_urls = []
for repo in repo_urls:
    if repo not in df_repos['artifact_url'].unique():
        print(repo)
    else:
        valid_repo_urls.append(repo)

print()        
print("Indexed repos:",len(valid_repo_urls))        

Ignored repos:
https://github.com/jseiferth/op-analytics
https://github.com/mali030303/monstersonbasee
https://github.com/mali030303/base-btc-earth--
https://github.com/mali030303/dragons-on-op-stack--
https://github.com/blockpilabs/aggregator
https://github.com/richardgreg/execution-specs-contribution
https://github.com/richardgreg/op-docs-improvements
https://github.com/zeus199803/8-bit-cats--
https://github.com/zeus199803/opstack-for-cats-dream-
https://github.com/users/zeus199803
https://github.com/blockchaindevsh/optimism
https://github.com/jsvisa/retro5
https://github.com/nonboring/nft-starter
https://github.com/blockchef-io/op-rpgf

Indexed repos: 137


In [8]:
# number of application repos that OSO has indexed (sorting is not needed to count)
len(valid_repo_urls)

137

In [9]:
# Map each indexed repo URL to the application (project_id) that listed it.
# NOTE(review): if the same repo is listed by several applications, only one
# of them survives in the dict (the last occurrence) - confirm this is intended.
repo_app_mapping = (
    df_projects[df_projects.clean_url.isin(valid_repo_urls)]
    [['clean_url', 'project_id']]
    .drop_duplicates()
    .set_index('clean_url')['project_id']
    .to_dict()
)
df_repos['application_id'] = df_repos['artifact_url'].map(repo_app_mapping)

# Relabel the 'NOASSERTION' license id as 'Custom'. The result is assigned back
# to the column: calling `.replace(..., inplace=True)` on a column selection is
# chained in-place mutation, which emits a FutureWarning on recent pandas and
# does not modify `df_repos` at all once Copy-on-Write is enabled.
df_repos['license_spdx_id'] = df_repos['license_spdx_id'].replace({'NOASSERTION': 'Custom'})
df_repos.tail(10)

Unnamed: 0,artifact_id,artifact_namespace,artifact_name,artifact_url,artifact_type,is_fork,fork_count,star_count,language,license_spdx_id,oso_project_id,application_id
127,3jCFruW298HpaaxFgM7oXehQepEyqNA5FYxUG90VE6U=,libp2p,js-libp2p,https://github.com/libp2p/js-libp2p,REPOSITORY,False,438,2293,TypeScript,Custom,DIFwzDKrLDsrfbdzXMd0V-K852SAS7sR0loKq0bIlco=,0x5a7e7c7acb21521e99021d746740b368801cbfe53130...
128,jTjb4W8VNrbdytj2UndNOJZW2YbnLjDvJLOJB5HfW7w=,probe-lab,hermes,https://github.com/probe-lab/hermes,REPOSITORY,False,5,15,Go,Custom,N7NAEbsz1DwsTbATG73PWP0CeIk1uh2x4MLbXMaJkzs=,0x7504e494cb8d227193182e083128912173c14eaeecec...
129,RQjVPU9r1_204yjI-4JLaw7ntJBH58r37UYNOv3MN48=,status-im,nimbus-eth2,https://github.com/status-im/nimbus-eth2,REPOSITORY,False,223,519,Nim,Custom,TPWyJq_aZvIAi16Dn7k_r3FBdrMq_sQZIyAWGZGzHF8=,0xe346264e87202b47f1057eb0b0fcaa0ea7f83e14507c...
130,uYxN0IH7L8_gIMv06vFY4gGaXUntqedx8Yoml0S2TQU=,vyperlang,vyper,https://github.com/vyperlang/vyper,REPOSITORY,False,788,4839,Python,Custom,9FPohrEjU-h_5BTmL-HBlyLH7f9rLprx2zqBddNDtn0=,0x9ca1f7b0e0d10d3bd2619e51a54f2e4175e029c87a29...
131,K9mNvObyRSF085wsnLchYPEesFoyqjRMjL5q6p0OsRw=,kroma-network,tachyon,https://github.com/kroma-network/tachyon,REPOSITORY,False,227,7777,C++,MIT,F8npEwagURJOf7hOCr27eOcUjo9m51wa4KlOH0ZsO9c=,0x8c76c13d8d0e63a7de499d47b9da5a4495d1151c0b20...
132,CMQNxUSya5sOdelhRgeHXBcP_f8Dvv46jQCBBaXY134=,nethermindeth,sedge,https://github.com/nethermindeth/sedge,REPOSITORY,False,41,149,Go,Apache-2.0,gwH5b9-f9zIzeDdketoo-MiyF-XZONwn-rkLJzNnf-U=,0x8431c44e2f0903879f34134e68fba4c05833b02451cc...
133,l0eOSixCA_3ZFeg0dJhNNWW63p7JMWgvsSDvd9CX7z0=,cryptozombieshq,cryptozombie-lessons,https://github.com/cryptozombieshq/cryptozombi...,REPOSITORY,False,545,1057,TypeScript,Custom,9S2DoYN6WOL4sAAa7RwQqFeyQXNju78UfpLWAO5HiQU=,0xbd7efe4f47254e152bc1193b095dbad6711748577df2...
134,MEVv3odm6a8w83G5PB91ToGB_Om8A5WgqXJEb-aMx3E=,lexdao,lexcorpus,https://github.com/lexdao/lexcorpus,REPOSITORY,False,43,164,Solidity,GPL-3.0,k5ip2ng8cFp1NCCK9GmxeHKrGnciiE2JUd0uD8Fffb8=,0x193256f7753cf4350332ab5222990944b45d7ae280cd...
135,D94ciJAItLnioR-AZ4XPeOgMD3boW46nMQLaJslkTuM=,libp2p,jvm-libp2p,https://github.com/libp2p/jvm-libp2p,REPOSITORY,False,75,264,Kotlin,Apache-2.0,DIFwzDKrLDsrfbdzXMd0V-K852SAS7sR0loKq0bIlco=,0x0be3a0fa062180bdfbfdefa993b09acd9edcae93ba0d...
136,aXbDRz3QGK6qa0CcrBNLAMi1qlw_eVaB-vq06PnT0O4=,kroma-network,kroma,https://github.com/kroma-network/kroma,REPOSITORY,False,80,158,Go,Custom,F8npEwagURJOf7hOCr27eOcUjo9m51wa4KlOH0ZsO9c=,0x8c76c13d8d0e63a7de499d47b9da5a4495d1151c0b20...


In [10]:
# artifact_id -> application_id lookup, used to tag events with their application
artifact_app_mapping = dict(zip(df_repos['artifact_id'], df_repos['application_id']))
len(artifact_app_mapping)

137

# Fetch OSO event data from relevant repos

In [11]:
# Get all event data (cutoff date of 2024-08-01)

# artifact ids of the repos that were matched to an application
artifact_ids = list(artifact_app_mapping.keys())
# quote and comma-join the ids so they can be placed inside the SQL `in (...)` list
artifact_ids_str = "'" + "','".join(artifact_ids) + "'"

# events at or after this date are excluded (the query uses a strict `<`)
CUTOFF = '2024-08-01'

# Selects every event targeting one of the repos above.
# NOTE: this only builds the query text; it is executed solely by the
# commented-out code further down.
events_query = f"""
    select
        time,
        event_type,
        from_artifact_name as user,
        from_artifact_id,
        to_artifact_id 
    from `{PROJECT}.oso.int_events`
    where
        to_artifact_id in ({artifact_ids_str})
        and time < '{CUTOFF}'
"""

# uncomment everything below if you want live data, otherwise uses local backup

# events_query_results = client.query(events_query)
# df_events = events_query_results.to_dataframe()

# # add application ids
# df_events['application_id'] = df_events['to_artifact_id'].map(artifact_app_mapping)

# # filter bot activity
# bot_list = ['codecov-commenter', 'claassistant', 'googlebot', 'omahs']
# github_users = list(df_events['user'].unique())
# bots = [x for x in github_users if '[bot]' in x or x in bot_list]
# df_events = df_events[df_events['user'].isin(bots) == False]

# df_events.to_parquet("data/rf5_events.parquet")

In [12]:
# Load the local events backup and add two helper columns:
#   bucket_day - the event timestamp truncated to its calendar day
#   amount     - a constant 1 per event
df_events = (
    pd.read_parquet("data/rf5_events.parquet")
    .assign(
        bucket_day=lambda events: pd.to_datetime(events['time'].dt.date),
        amount=1,
    )
)

df_events.tail(1)

Unnamed: 0,time,event_type,user,from_artifact_id,to_artifact_id,application_id,bucket_day,amount
1052682,2023-07-31 13:25:23+00:00,PULL_REQUEST_REVIEW_COMMENT,thomaseizinger,xOfgF7_wYw1J5fCCwpUuFs53BTw1iXb1wenhuspVXXM=,dxsMNRXWzfg8lMvq0M4bY-NZ5961glN0Q-X64anZ8BI=,0xdf1bb03d08808e2d789f5eac8462bdc560f1bb5b0877...,2023-07-31,1


# Derive some metrics

In [13]:
# identify the top N users from openrank
# (takes the first N rows of the file - presumably it is already sorted by rank; TODO confirm)

N = 420
users = pd.read_csv('data/openrank/users.csv')
top_users = users['peer'].head(N).tolist()

In [14]:
# Event types that qualify a user as a contributor when counting
# contributors per application in the metrics cell below.
contributor_event_types = [
    'COMMIT_CODE',
    'PULL_REQUEST_OPENED',
    'PULL_REQUEST_REVIEW_COMMENT',
    'ISSUE_OPENED'
]

In [15]:
METRIC_INDEX = 'application_id'

# Reference dates are derived from the event cutoff so that the "last 6 months"
# window and the age calculation stay aligned with the data that was fetched.
cutoff_date = pd.to_datetime(CUTOFF)
active_since = cutoff_date - pd.DateOffset(months=6)
cutoff_year_fraction = cutoff_date.year + cutoff_date.month / 12.

# Boolean masks over df_events that several metrics share.
is_contribution = df_events.event_type.isin(contributor_event_types)
is_trusted_user = df_events.user.isin(top_users)


def count_unique_actors(event_mask, metric_name):
    """Count distinct ``from_artifact_id`` values per application.

    Parameters
    ----------
    event_mask : pandas.Series of bool
        Row filter over ``df_events`` selecting the events to consider.
    metric_name : str
        Name given to the resulting Series (becomes the column name).

    Returns
    -------
    pandas.Series
        Indexed by ``METRIC_INDEX``. Applications with no matching events are
        absent, so they show up as NaN after the metrics are concatenated.
    """
    return (
        df_events[event_mask]
        .groupby(METRIC_INDEX)['from_artifact_id']
        .nunique()
        .rename(metric_name)
    )


def join_licenses(licenses):
    """Join license ids into a comma-separated string, skipping null or empty values."""
    return ', '.join(
        license_id for license_id in licenses
        if isinstance(license_id, str) and license_id
    )


repos_by_app = df_repos.groupby(METRIC_INDEX)

metrics = [
    # all repo URLs belonging to the application
    repos_by_app['artifact_url'].unique().apply(', '.join).rename('repo(s)'),

    count_unique_actors(is_contribution, 'num_contributors'),
    count_unique_actors(is_contribution & is_trusted_user, 'num_trusted_contributors'),
    count_unique_actors(
        is_contribution & (df_events['bucket_day'] >= active_since),
        'num_contributors_last_6_months',
    ),

    # star / fork totals come from the repo snapshot, not from events
    repos_by_app['star_count'].sum().rename('num_stars'),
    count_unique_actors(
        (df_events.event_type == 'STARRED') & is_trusted_user,
        'num_trusted_stars',
    ),
    repos_by_app['fork_count'].sum().rename('num_forks'),
    count_unique_actors(
        (df_events.event_type == 'FORKED') & is_trusted_user,
        'num_trusted_forks',
    ),

    # age = years between the month of the earliest recorded event and the cutoff month
    (
        df_events
        .groupby(METRIC_INDEX)['bucket_day']
        .min()
        .apply(lambda first_day: cutoff_year_fraction - (first_day.year + first_day.month / 12.))
        .rename('age_of_project_years')
    ),

    # distinct licenses across the application's repos
    repos_by_app['license_spdx_id'].unique().apply(join_licenses).rename('license(s)'),
]

df_metrics = pd.concat(metrics, axis=1)
df_metrics['project_name'] = df_metrics.index.map(project_name_mappings)

df_metrics

Unnamed: 0_level_0,repo(s),num_contributors,num_trusted_contributors,num_contributors_last_6_months,num_stars,num_trusted_stars,num_forks,num_trusted_forks,age_of_project_years,license(s),project_name
application_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0x04b1cd5a7c59117474ce414b309fa48e985bdaab4b0dab72045f74d04ebd8cff,https://github.com/sigp/lighthouse,585.0,44.0,101.0,2882,44.0,724,41.0,6.083333,Apache-2.0,Lighthouse
0x09875cfe708f0c3acaca3d8af6b1ca2bd45b43c4a268a55c5d2b50ec9bdeef75,https://github.com/ethereum-optimism/developers,44.0,6.0,31.0,67,4.0,43,3.0,0.833333,CC0-1.0,TechNERDs Program
0x09b95c7697625da4915338750c5f78446817a3634cb38bc9155e26bbbc0c87f1,https://github.com/smartcontracts/simple-optim...,67.0,6.0,16.0,323,22.0,112,8.0,2.250000,MIT,Maintaining simple-optimism-node
0x0be3a0fa062180bdfbfdefa993b09acd9edcae93ba0d8d5829dd01c138268f40,https://github.com/libp2p/jvm-libp2p,49.0,10.0,4.0,264,4.0,75,9.0,5.250000,Apache-2.0,jvm-libp2p
0x10e9fc6dd7d01e09bd9440d507846432333a06f779287b29199010e2f50579cd,https://github.com/wakeuplabs/rfg1-optimism,4.0,,3.0,2,,1,,0.666667,MIT,Token Historical Balance
...,...,...,...,...,...,...,...,...,...,...,...
0xf6ab5ce1bb1cade3c885b0668addb79ea8997b6dd2b106e95ef1ada65d6f10b7,https://github.com/optimism-java/dispute-explo...,3.0,,3.0,0,,0,,0.166667,MIT,superproof
0xf839a585342327848d4541a6fcc315404e879537a60a1370f2cd45a94283a1ba,https://github.com/dappnode/dappnode,137.0,5.0,11.0,581,8.0,99,1.0,6.500000,GPL-3.0,Dappnode
0xfad78fad680d407f81e7fd46632b4b4936676d1776a5cbf02694b94698495746,"https://github.com/succinctlabs/sp1, https://g...",111.0,5.0,111.0,982,28.0,278,4.0,0.500000,Apache-2.0,OP Succinct: Full ZK Validity Proving of OP St...
0xfef0eaf3c745a7175550a75ecfc0056d0a3276eeb0ce8fbc45dd7254411985ec,https://github.com/rdubois-crypto/freshcryptol...,14.0,3.0,6.0,161,8.0,24,2.0,1.416667,MIT,Smoo.th Cryptolib


In [16]:
# Persist the per-application metrics table; the application_id index is
# written as the first column of the CSV.
df_metrics.to_csv('data/rf5_applicant_github_metrics.csv')