# Simulation of RF5 GitHub Repo-Level Metrics

Ideas for repo-level metrics to simulate:
1. stars_total
2. stars_from_top_devs
3. forks_total
4. forks_from_top_devs
5. age_of_project
6. avg_fulltime_developers
7. unique_contributors_last_6_months

In [1]:
from google.cloud import bigquery
import os
import pandas as pd

# Expose the service-account key file through the environment variable that the
# Google client libraries consult for credentials, then create the BigQuery client.
# The key path is relative to the notebook's working directory.
os.environ.update({'GOOGLE_APPLICATION_CREDENTIALS': '../../../gcp_credentials.json'})
client = bigquery.Client()

## Queries to OSO

In [2]:
# Select the projects that are in the RF3 collection ('op-rpgf3')
# but not in the RF4 collection ('op-retrofunding-4').

projects_query = """
    select
      project_id,
      project_name
    from `opensource-observer.oso.projects_by_collection_v1`
    where
        collection_name = 'op-rpgf3'
        and project_id not in (
            select project_id
            from `opensource-observer.oso.projects_by_collection_v1`
            where collection_name = 'op-retrofunding-4'
        )
"""
projects_query_results = client.query(projects_query)

# Build a {project_id: project_name} lookup from the query result
df_projects = projects_query_results.to_dataframe()
project_ids_names = dict(zip(df_projects['project_id'], df_projects['project_name']))
len(project_ids_names)

264

In [3]:
# warning: this query is expensive! Its result is cached locally as parquet and
# BigQuery is only hit when the cache file does not exist yet.

events_query = f"""
    select
      events.project_id,
      events.bucket_day,      
      events.from_artifact_id as user_id,
      users.artifact_name as user,
      events.to_artifact_id as repo_id,
      repos.artifact_namespace as repo_owner,
      repos.artifact_name as repo_name,
      events.event_type,      
      events.amount
    from `opensource-observer.oso.int_events_daily_to_project` as events
    join `opensource-observer.oso.artifacts_v1` as users
      on events.from_artifact_id = users.artifact_id
    join `opensource-observer.oso.artifacts_v1` as repos
      on events.to_artifact_id = repos.artifact_id
    where
        events.event_source = 'GITHUB'
        and events.project_id in (
            select project_id
            from ({projects_query})
        )
"""

EVENTS_CACHE_PATH = "data/rf5_events.parquet"
if os.path.exists(EVENTS_CACHE_PATH):
    # normal path: reuse the cached export
    df_events = pd.read_parquet(EVENTS_CACHE_PATH)
else:
    # cache miss: run the expensive query once and persist the result for later runs
    events_query_results = client.query(events_query)
    df_events = events_query_results.to_dataframe()
    df_events.to_parquet(EVENTS_CACHE_PATH)

# attach the human-readable project name (NaN for ids missing from project_ids_names)
df_events['project_name'] = df_events['project_id'].map(project_ids_names)
# "owner/name" key for each repo; vectorised string concatenation instead of a
# Python-level row-wise apply over every event row
df_events['artifact'] = df_events['repo_owner'] + '/' + df_events['repo_name']
artifact_list = df_events['artifact'].unique()
df_events.tail(1)

Unnamed: 0,project_id,bucket_day,user_id,user,repo_id,repo_owner,repo_name,event_type,amount,project_name,artifact
2186651,y9t7a2RCN_Cxpi-g-8Qmb5txYxaC0nARzZwmwSWinuI=,2022-03-09 00:00:00+00:00,ZQ53uNOIn9siBHNyFN836ij1XJf3R5I045bQGbTpZ_w=,emazurek,aQosZmfg_aQpa6Hpb5kbbERkVVpkIsfmCqvzz-rsgvE=,foundry-rs,book,PULL_REQUEST_OPENED,1.0,foundry,foundry-rs/book


In [4]:
# get code metrics for artifacts

# plain string literal: the query has no placeholders, so no f-string is needed
metrics_query = """
    select *
    from `opensource-observer.oso.code_metrics_by_artifact_v0`
"""

metrics_query_results = client.query(metrics_query)
df_metrics = metrics_query_results.to_dataframe()
# build the same "owner/name" key that df_events uses (vectorised, no row-wise apply)
df_metrics['artifact'] = df_metrics['artifact_namespace'] + '/' + df_metrics['artifact_name']
# keep only the repos that also appear in the event data
df_metrics = df_metrics[df_metrics['artifact'].isin(artifact_list)]
df_metrics.tail(1)

Unnamed: 0,artifact_id,artifact_namespace,artifact_name,event_source,first_commit_date,last_commit_date,star_count,fork_count,contributor_count,contributor_count_6_months,new_contributor_count_6_months,fulltime_developer_average_6_months,active_developer_count_6_months,commit_count_6_months,opened_pull_request_count_6_months,merged_pull_request_count_6_months,opened_issue_count_6_months,closed_issue_count_6_months,artifact
46479,oq9zx8kNuZh4oAXf-0JFXmvNiXO93qdTfSqX07ZlDeM=,web3,web3.js,GITHUB,2021-04-29 23:41:15+00:00,2024-08-06 12:43:24+00:00,19066,4894,2303.0,104.0,86.0,0.0,6.0,120.0,188.0,121.0,148.0,172.0,web3/web3.js


## Derive some metrics

In [5]:
# Take the first N rows of the OpenRank user export as the "top developers".
# NOTE(review): this assumes users.csv is already ordered best-first — confirm.

N = 420
users = pd.read_csv('data/openrank/users.csv')
top_users = users['peer'].head(N).tolist()

In [6]:
# Rank every repo in the OpenRank export, then keep the best-ranked repo per repo owner.
# NOTE(review): repos are de-duplicated on `repo_owner`, not on project — confirm that
# one owner is meant to stand in for one project here.
repos = pd.read_csv('data/openrank/repos.csv')
repos['rank'] = repos['a=0.5'].rank(ascending=False)  # largest 'a=0.5' value gets rank 1
repo_rank = repos.set_index('peer')['rank'].to_dict()

# one row per distinct repo seen in the events, keyed as "owner/name"
df_repos = df_events[['repo_owner', 'repo_name']].drop_duplicates()
df_repos['artifact'] = [
    '/'.join(owner_and_name)
    for owner_and_name in zip(df_repos['repo_owner'], df_repos['repo_name'])
]
# repos without an OpenRank entry get no rank and are dropped further down
df_repos['repo_rank'] = df_repos['artifact'].map(lambda name: repo_rank.get(name))

# missing ranks sort last, so the first row kept per owner is its best-ranked repo
best_per_owner = (
    df_repos
    .sort_values(['repo_owner', 'repo_rank'])
    .drop_duplicates('repo_owner', keep='first')
)
top_repo_rank = best_per_owner.set_index('artifact')['repo_rank'].dropna().sort_values()
top_repo_rank.head()

artifact
testinprod-io/op-erigon     2.0
paradigmxyz/reth           11.0
foundry-rs/foundry         18.0
ethereum/go-ethereum       23.0
optimism-java/hildr        28.0
Name: repo_rank, dtype: float64

In [7]:
# derive the other metrics from the OSO event data

def metric_factory(metric, user_filter=None, events=None):
    """
    Sum the `amount` of one event type per artifact.

    Parameters
    ----------
    metric : str
        Value of `event_type` to aggregate (e.g. 'STARRED'). Its lower-cased
        form becomes the name of the returned series.
    user_filter : collection of str, optional
        When given (even if empty), only events whose `user` is in this
        collection are counted and '_by_top_devs' is appended to the series
        name. `None` (the default) disables user filtering.
    events : pandas.DataFrame, optional
        Event table with `event_type`, `user`, `artifact` and `amount`
        columns. Defaults to the notebook-level `df_events`.

    Returns
    -------
    pandas.Series
        Summed `amount` indexed by `artifact`, sorted in descending order.
    """
    if events is None:
        events = df_events

    metric_name = metric.lower()
    dff = events[events['event_type'] == metric]

    # Compare against None rather than relying on truthiness: an empty filter must
    # yield an empty result (not the unfiltered totals), and array-like filters
    # have no unambiguous truth value.
    if user_filter is not None:
        metric_name += '_by_top_devs'
        dff = dff[dff['user'].isin(user_filter)]

    series = (
        dff
        .groupby('artifact')['amount']
        .sum()
        .sort_values(ascending=False)
    )
    series.name = metric_name
    return series

# preview: 'STARRED' amounts restricted to users in top_users, largest totals first
metric_factory('STARRED', user_filter=top_users).head()

artifact
ethereum/go-ethereum    218.0
ethereum/eips           126.0
ethereum/solidity       114.0
foundry-rs/foundry       95.0
paradigmxyz/reth         76.0
Name: starred_by_top_devs, dtype: float64

In [8]:
# Age of each repo in years, measured from its earliest event in df_events.
# Dates are expressed as fractional years (year + month/12), so the result has
# month-level granularity; TODAY is August 2024 in the same form.

TODAY = 2024. + 8/12
first_event_day = df_events.groupby('artifact')['bucket_day'].min()
age_of_project = (
    first_event_day
    .apply(lambda day: TODAY - (day.year + day.month/12))
    .rename('age_of_project_years')
)

In [9]:
# inspect the code metrics after restricting them to artifacts present in the event data
df_metrics

Unnamed: 0,artifact_id,artifact_namespace,artifact_name,event_source,first_commit_date,last_commit_date,star_count,fork_count,contributor_count,contributor_count_6_months,new_contributor_count_6_months,fulltime_developer_average_6_months,active_developer_count_6_months,commit_count_6_months,opened_pull_request_count_6_months,merged_pull_request_count_6_months,opened_issue_count_6_months,closed_issue_count_6_months,artifact
11,P1MEbreq9MK5pnr7GehyoJKOwhHwM6WvxXwFoNSbAgE=,otterscan,go-otterscan,GITHUB,NaT,NaT,1,0,,,,,,,,,,,otterscan/go-otterscan
34,-n3iNAgT2earAsOtMvgOkRUnPRd1B-pZa3e6RMhSIfg=,boardroom-inc,subgraph-moloch,GITHUB,NaT,NaT,1,0,,,,,,,,,,,boardroom-inc/subgraph-moloch
41,FAJnhumGCXneOf6dA1IuJnWjl_hxCnXcyVjp2tXkUZM=,gelatodigital,range-orders-lib,GITHUB,NaT,NaT,0,0,,,,,,,,,,,gelatodigital/range-orders-lib
151,iQwCOYv0GooMWo1yP3ZZopX11qkG3ePVFXdkM5O0FEs=,boardroom-inc,snapshot-hub,GITHUB,NaT,NaT,1,0,,,,,,,,,,,boardroom-inc/snapshot-hub
169,jE83t1iRt1r-fEPRafbxaLfPhtc8jM-b4TEjPZZoNm4=,filosofiacodigo,web3example,GITHUB,NaT,NaT,1,1,,,,,,,,,,,filosofiacodigo/web3example
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46461,EZzhO7pzHe-oighbVcjMPHZt9nCPOLWMMBiJUAkBEZg=,crustio,free-storage,GITHUB,2021-09-10 06:46:05+00:00,2021-12-24 06:36:03+00:00,37,2,286.0,21.0,21.0,0.000000,0.0,0.0,0.0,0.0,28.0,6.0,crustio/free-storage
46463,sUZ875Tw1HOIDmwjwWXLbX23sfSXhE2mVsO7yGOQwC0=,blocknative,web3-onboard,GITHUB,2019-10-03 19:07:17+00:00,2024-08-07 19:13:25+00:00,820,483,446.0,60.0,44.0,0.203297,2.0,84.0,142.0,112.0,41.0,20.0,blocknative/web3-onboard
46464,y7E_pBqUub6t4UDmk2MmeO_4UD8hAJLieYhD3ls5sbk=,ipfs,ipfs-desktop,GITHUB,2021-04-30 14:21:48+00:00,2024-06-13 22:31:11+00:00,5896,853,796.0,53.0,44.0,0.000000,2.0,31.0,18.0,17.0,68.0,59.0,ipfs/ipfs-desktop
46469,OJsaun1W8kBAAICsz1IDjnt_cjDtOzR0vHuhHJg4zpg=,ethereum,remix-ide,GITHUB,2016-05-17 23:26:40+00:00,2024-07-21 10:40:31+00:00,2243,1104,495.0,10.0,7.0,0.000000,3.0,19.0,21.0,18.0,0.0,0.0,ethereum/remix-ide


## Consolidate and export the data

In [10]:
# event-derived series, all indexed by the "owner/name" artifact key
metrics = [
    top_repo_rank,
    metric_factory('STARRED'),
    metric_factory('STARRED', user_filter=top_users),
    metric_factory('FORKED'),
    metric_factory('FORKED', user_filter=top_users),
    age_of_project
]

# Built as a single chain so no step mutates a filtered slice in place
# (avoids SettingWithCopy issues and keeps the cell idempotent on re-run).
df = (
    df_metrics
    .set_index('artifact')
    [['star_count', 'fork_count', 'fulltime_developer_average_6_months', 'new_contributor_count_6_months']]
    .join(pd.concat(metrics, axis=1))
    .dropna(subset=['repo_rank'])   # keep only repos that made it into top_repo_rank
    .fillna(0)                      # remaining gaps mean "no such events / no metric"
    .sort_values(by='repo_rank')
)
df.head(20)

Unnamed: 0_level_0,star_count,fork_count,fulltime_developer_average_6_months,new_contributor_count_6_months,repo_rank,starred,starred_by_top_devs,forked,forked_by_top_devs,age_of_project_years
artifact,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
testinprod-io/op-erigon,85,14,0.038462,15.0,2.0,104.0,23.0,17.0,2.0,1.5
paradigmxyz/reth,3634,966,4.961538,207.0,11.0,4182.0,76.0,1336.0,83.0,1.666667
foundry-rs/foundry,7969,1619,2.302198,327.0,18.0,9904.0,95.0,2104.0,94.0,2.916667
ethereum/go-ethereum,47234,20139,1.631868,338.0,23.0,118696.0,218.0,52818.0,346.0,9.583333
optimism-java/hildr,39,11,0.0,2.0,28.0,49.0,13.0,10.0,3.0,1.25
bluealloy/revm,1521,492,1.0,54.0,29.0,1818.0,55.0,585.0,48.0,2.916667
alloy-rs/alloy,492,166,2.43956,125.0,32.0,578.0,18.0,207.0,26.0,0.75
statechannels/go-nitro,37,20,0.0,1.0,34.0,50.0,14.0,17.0,1.0,2.833333
sigp/lighthouse,2854,713,0.0,59.0,47.0,3639.0,57.0,910.0,50.0,6.083333
nicenode/nice-node,181,27,0.368132,5.0,52.0,211.0,8.0,34.0,1.0,2.333333


In [11]:
# write the consolidated per-repo metrics table to CSV
df.to_csv("data/rf5_repo_metrics_simulation.csv")