# BuidlBox Analysis

This notebook looks at historic hackathon submissions and analyzes the following:

- does the project have a valid GitHub namespace (`owner/repo` or `owner`) that can be extracted?
- is the project already on Open Source Observer?
- if not on OSO:
  - is the project's owner a User or Organization?
  - if a User:
      - is the hackathon repo still active?
      - has the user contributed to other projects on OSO?
  - if an Organization:
      - is the hackathon repo still active?
      - are there other active repos in its namespace?

In [1]:
from google.cloud import bigquery
import json
import os
import pandas as pd
import sys
import time
from urllib.parse import urlparse

sys.path.append(os.path.abspath("../../scripts/"))
from github import validate_github_artifact, get_owner_type

# Query latest OSO project data

In [2]:
# Tell the Google client libraries where the local credentials file lives,
# then open the BigQuery client used by the queries below.
os.environ.update({'GOOGLE_APPLICATION_CREDENTIALS': '../../gcp_credentials.json'})
client = bigquery.Client()

In [3]:
# Select the distinct (project_id, project_slug, artifact_name) rows whose
# artifact namespace is 'GITHUB' from OSO's artifacts_by_project table.
query = """
SELECT DISTINCT project_id, project_slug, artifact_name 
FROM `opensource-observer.oso.artifacts_by_project`
WHERE artifact_namespace = 'GITHUB'
"""

# Run the query through the BigQuery client and load the result into a DataFrame.
result = client.query(query)
project_artifacts = result.to_dataframe()
# Preview the last three rows.
project_artifacts.tail(3)

Unnamed: 0,project_id,project_slug,artifact_name
34983,3Ff_uAlA9as0h6fCxaj-HlzTHlSoc8ObP7lPnUZhHjs=,understanding-optimism-codebase-joohhnnn,joohhnnn/understanding-optimism-codebase-cn
34984,3Ff_uAlA9as0h6fCxaj-HlzTHlSoc8ObP7lPnUZhHjs=,understanding-optimism-codebase-joohhnnn,joohhnnn/understanding-optimism-codebase
34985,YDP3svOj26WLxY0ieeWS_HhrnM8r8mNIWc14n1497Mo=,text-to-text-transfer-transformer-google-research,google-research/text-to-text-transfer-transformer


In [4]:
# Distinct artifact names returned by the OSO artifact query, in first-seen order.
existing_repos = list(project_artifacts['artifact_name'].drop_duplicates())
len(existing_repos)

34959

In [5]:
# The owner is whatever precedes the first '/' in an artifact name.
existing_orgs = list({repo.partition('/')[0] for repo in existing_repos})
len(existing_orgs)

1576

# Query latest OSO user data

In [6]:
# For every (project_id, lower-cased from_name) pair among events whose
# to_namespace is 'GITHUB', keep the most recent event time.
# NOTE(review): from_name is presumably the contributor's GitHub login - confirm.
query = """
SELECT 
  project_id,
  LOWER(from_name) AS user_name,
  MAX(time) AS date_last_activity
FROM `opensource-observer.oso.int_events_to_project`
WHERE to_namespace = 'GITHUB'
GROUP BY 1, 2
"""

# Run the query through the BigQuery client and load the result into a DataFrame.
result = client.query(query)
project_users = result.to_dataframe()
# Preview the last three rows.
project_users.tail(3)

Unnamed: 0,project_id,user_name,date_last_activity
1806584,y9t7a2RCN_Cxpi-g-8Qmb5txYxaC0nARzZwmwSWinuI=,thomasheim11,2023-07-08 16:45:31+00:00
1806585,y9t7a2RCN_Cxpi-g-8Qmb5txYxaC0nARzZwmwSWinuI=,seoyong1,2023-04-23 11:35:25+00:00
1806586,y9t7a2RCN_Cxpi-g-8Qmb5txYxaC0nARzZwmwSWinuI=,ftfdfrancoeur65,2021-12-16 15:16:38+00:00


In [7]:
# Every distinct (lower-cased) user name seen in the OSO event data.
existing_users = pd.unique(project_users['user_name'])
len(existing_users)

958869

# Process BuidlBox data

In [8]:
# helper functions for parsing GitHub urls

def parse_url(url):
    """Extract a lower-cased GitHub namespace (``owner`` or ``owner/repo``) from a URL.

    Args:
        url: the submitted repository URL. Non-string values (e.g. NaN from a
            missing CSV cell) are treated as "no URL".

    Returns:
        The namespace as ``"owner"`` or ``"owner/repo"``, or ``None`` when the
        value is not a string, cannot be parsed, has more than two path
        segments (e.g. ``owner/repo/tree/main``), or is shorter than two
        characters.

    Note:
        The host is not inspected here; only the URL path is used.
    """
    if not isinstance(url, str):
        return None
    try:
        parsed_url = urlparse(url.strip())
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. a broken IPv6 literal).
        return None

    # Drop surrounding slashes *and* whitespace so "owner/ " does not turn
    # into a bogus two-segment namespace with a blank repo name.
    path = parsed_url.path.strip("/ \t\r\n").split("/")
    if len(path) > 2:
        return None

    namespace = "/".join(path).lower()
    # Only strip a trailing ".git" (clone URLs); removing it anywhere would
    # mangle names such as "user/user.github.io".
    if namespace.endswith(".git"):
        namespace = namespace[:-len(".git")]
    if len(namespace) < 2:
        return None
    return namespace

def splitter(x, pos):
    """Return the ``pos``-th '/'-separated piece of a namespace string.

    A string without a '/' is treated as owner-only: it is returned for
    ``pos == 0`` and yields ``None`` for any other position. Non-string
    input yields ``None``.
    """
    if not isinstance(x, str):
        return None
    if '/' not in x:
        return x if pos == 0 else None
    pieces = x.split('/')
    return pieces[pos]

In [9]:
# Load the hackathon submissions to analyze.
bb = pd.read_csv('data/gitcoin+buidlbox_data.csv')

# parse GitHub urls
bb['github_namespace'] = bb['repo_url'].apply(parse_url)
bb['github_owner'] = bb['github_namespace'].apply(lambda x: splitter(x, 0))
bb['github_repo'] = bb['github_namespace'].apply(lambda x: splitter(x, 1))

# analyze repos
# regex=False: match the literal substring (an unescaped '.' is a regex wildcard);
# na=False: a missing URL counts as invalid instead of propagating NaN.
bb['is_valid_github_url'] = bb['repo_url'].str.contains("github.com/", regex=False, na=False)
bb['is_valid_github_namespace'] = bb['github_namespace'].apply(lambda x: isinstance(x, str))
# .isin does a hashed lookup; `x in some_list` inside .apply rescans the whole
# list for every row.
bb['does_owner_have_project_on_oso'] = bb['github_owner'].isin(existing_orgs)
bb['is_repo_on_oso'] = bb['github_namespace'].isin(existing_repos)

bb.head(3)

Unnamed: 0,project_name,repo_url,created_date,hackathon_name,bounty_title,org,gitcoin/buidlbox,github_namespace,github_owner,github_repo,is_valid_github_url,is_valid_github_namespace,does_owner_have_project_on_oso,is_repo_on_oso
0,aaveboard,https://github.com/merwane/aaveboard,2019-11-12 14:03:18+00,Web3 World,Gitcoin x Aave Hackathon Challenge : Build a m...,aave,gitcoin,merwane/aaveboard,merwane,aaveboard,True,True,False,False
1,Monitoring DApp,https://github.com/thesachinmittal/Aave-Monito...,2019-11-12 14:05:27+00,Web3 World,Gitcoin x Aave Hackathon Challenge : Build a m...,aave,gitcoin,thesachinmittal/aave-monitoring-dapp,thesachinmittal,aave-monitoring-dapp,True,True,False,False
2,Monitoring DApp,https://github.com/Ljtron/Aave-Monitoring-Dapp...,2019-11-12 14:06:42+00,Web3 World,Gitcoin x Aave Hackathon Challenge : Build a m...,aave,gitcoin,ljtron/aave-monitoring-dapp-for-data-lovers,ljtron,aave-monitoring-dapp-for-data-lovers,True,True,False,False


In [10]:
# Column means of the four right-most columns of bb.
bb.iloc[:, -4:].mean()

is_valid_github_url               0.993522
is_valid_github_namespace         0.775522
does_owner_have_project_on_oso    0.018464
is_repo_on_oso                    0.010797
dtype: float64

# Look up whether each project's owner is a User or an Organization

In [11]:
# Distinct, non-null owners extracted from the submissions.
bb_owners = list(bb['github_owner'].dropna().drop_duplicates())
len(bb_owners)

4780

In [12]:
def process_owners(list_of_owners, json_path, sleep=1):
    """Look up each owner with ``get_owner_type``, caching results in a JSON file.

    Entries already present in the cache at ``json_path`` are reused, so the
    function can be re-run (or resumed after an interruption) without
    repeating lookups. The cache is rewritten after every new lookup.

    Args:
        list_of_owners: iterable of owner names to look up.
        json_path: path of the JSON cache file; created on the first new lookup.
        sleep: seconds to pause after each new lookup.

    Returns:
        dict mapping owner -> the value ``get_owner_type`` returned for it,
        including entries loaded from the cache.
    """
    if os.path.exists(json_path):
        with open(json_path, 'r') as f:
            results = json.load(f)
    else:
        results = {}

    tmp_path = f"{json_path}.tmp"
    for owner in list_of_owners:
        if owner in results:
            continue
        results[owner] = get_owner_type(owner)
        # Checkpoint atomically: write a sibling temp file and swap it in, so
        # an interrupt mid-write cannot leave a truncated, unloadable cache.
        with open(tmp_path, 'w') as f:
            json.dump(results, f, indent=2)
        os.replace(tmp_path, json_path)
        time.sleep(sleep)

    return results
    
# Resolve every owner, reusing the on-disk cache; no pause between lookups.
github_checks_path = 'data/github_owners.json'
github_owners = process_owners(
    list_of_owners=bb_owners,
    json_path=github_checks_path,
    sleep=0,
)

In [13]:
# Translate each owner into the value stored for it in github_owners;
# owners missing from the mapping come out as NaN.
owner_types = bb['github_owner'].map(github_owners)
bb['github_owner_type'] = owner_types
bb.tail(3)

Unnamed: 0,project_name,repo_url,created_date,hackathon_name,bounty_title,org,gitcoin/buidlbox,github_namespace,github_owner,github_repo,is_valid_github_url,is_valid_github_namespace,does_owner_have_project_on_oso,is_repo_on_oso,github_owner_type
18520,PledgePost,https://github.com/PledgePost/v2Interface,2023-12-26 11:37:31.15+00,Allo on Arbitrum Hackathon,Best review and evaluation tool,ArbitrumDAO,buidlbox,pledgepost/v2interface,pledgepost,v2interface,True,True,False,False,Organization
18521,Monetizado,https://github.com/monetizado/Contracts,2024-03-08 18:24:23.109+00,ALT+D(enver),Virtual Vibes & Blockchain Bytes,buidlbox,buidlbox,monetizado/contracts,monetizado,contracts,True,True,False,False,Organization
18522,PledgePost,https://github.com/PledgePost/v2Interface,2023-12-26 11:37:31.15+00,Allo on Arbitrum Hackathon,Best Novel Funding Distribution Tool,ArbitrumDAO,buidlbox,pledgepost/v2interface,pledgepost,v2interface,True,True,False,False,Organization


# Check whether user is contributing to other projects on OSO

In [14]:
# Owners typed 'User' that also appear among the OSO user names.
bb_users_on_oso = (
    set(bb.loc[bb['github_owner_type'] == 'User', 'github_owner'].unique())
    & set(existing_users)
)
len(bb_users_on_oso)

2079

In [15]:
# Per matched user: latest activity timestamp and number of distinct projects.
# The aggregation is named with the string 'max' rather than the Python
# builtin, which pandas has deprecated as an agg argument (FutureWarning).
project_users_on_bb = (
    project_users[project_users['user_name'].isin(bb_users_on_oso)]
    .groupby('user_name')
    .agg(
        {
            'date_last_activity': 'max',
            'project_id': 'nunique'
        }
    )
)
project_users_on_bb

Unnamed: 0_level_0,date_last_activity,project_id
user_name,Unnamed: 1_level_1,Unnamed: 2_level_1
00x-dx,2024-01-30 16:20:29+00:00,2
0101coding,2022-08-08 18:37:03+00:00,1
0xandrew,2022-04-11 18:54:35+00:00,1
0xb0x,2022-06-07 19:41:50+00:00,2
0xbeedao,2022-03-28 23:58:38+00:00,4
...,...,...
zpencerguy,2023-07-14 23:01:06+00:00,8
zvezdochetag,2023-11-17 15:21:45+00:00,11
zyfrank,2021-06-18 00:40:07+00:00,16
zyra-zia,2020-06-20 19:10:27+00:00,1


In [16]:
# Users whose latest recorded activity is on or after 2024-01-01.
recently_active = project_users_on_bb['date_last_activity'] >= '2024-01-01'
bb_recent_users_on_oso = project_users_on_bb.loc[recently_active].index.tolist()
len(bb_recent_users_on_oso)

512

In [17]:
# Flag rows whose owner is among the matched users / the recently active ones.
bb['is_user_visible_on_oso'] = bb['github_owner'].isin(bb_users_on_oso)
bb['is_user_active_recently_on_oso'] = bb['github_owner'].isin(bb_recent_users_on_oso)

bb.tail(3)

Unnamed: 0,project_name,repo_url,created_date,hackathon_name,bounty_title,org,gitcoin/buidlbox,github_namespace,github_owner,github_repo,is_valid_github_url,is_valid_github_namespace,does_owner_have_project_on_oso,is_repo_on_oso,github_owner_type,is_user_visible_on_oso,is_user_active_recently_on_oso
18520,PledgePost,https://github.com/PledgePost/v2Interface,2023-12-26 11:37:31.15+00,Allo on Arbitrum Hackathon,Best review and evaluation tool,ArbitrumDAO,buidlbox,pledgepost/v2interface,pledgepost,v2interface,True,True,False,False,Organization,False,False
18521,Monetizado,https://github.com/monetizado/Contracts,2024-03-08 18:24:23.109+00,ALT+D(enver),Virtual Vibes & Blockchain Bytes,buidlbox,buidlbox,monetizado/contracts,monetizado,contracts,True,True,False,False,Organization,False,False
18522,PledgePost,https://github.com/PledgePost/v2Interface,2023-12-26 11:37:31.15+00,Allo on Arbitrum Hackathon,Best Novel Funding Distribution Tool,ArbitrumDAO,buidlbox,pledgepost/v2interface,pledgepost,v2interface,True,True,False,False,Organization,False,False


# Get GitHub info for projects not on OSO

In [18]:
# Distinct, non-null namespaces on rows where the owner flag is False.
owner_not_on_oso = bb['does_owner_have_project_on_oso'].eq(False)
bb_potential_artifacts = list(
    bb.loc[owner_not_on_oso, 'github_namespace'].dropna().unique()
)
len(bb_potential_artifacts)

8843

In [19]:
def process_github(list_of_artifacts, json_path, sleep=1):
    """Check each artifact with ``validate_github_artifact``, caching results in a JSON file.

    Entries already present in the cache at ``json_path`` are reused, so the
    function can be re-run (or resumed after an interruption) without
    repeating checks. The cache is rewritten after every new check.

    Args:
        list_of_artifacts: iterable of GitHub namespaces to check.
        json_path: path of the JSON cache file; created on the first new check.
        sleep: seconds to pause after each new check.

    Returns:
        dict mapping artifact -> the value ``validate_github_artifact``
        returned for it, including entries loaded from the cache.
    """
    if os.path.exists(json_path):
        with open(json_path, 'r') as f:
            results = json.load(f)
    else:
        results = {}

    tmp_path = f"{json_path}.tmp"
    for artifact in list_of_artifacts:
        if artifact in results:
            continue
        results[artifact] = validate_github_artifact(artifact)
        # Checkpoint atomically: write a sibling temp file and swap it in, so
        # an interrupt mid-write cannot leave a truncated, unloadable cache.
        with open(tmp_path, 'w') as f:
            json.dump(results, f, indent=2)
        os.replace(tmp_path, json_path)
        time.sleep(sleep)

    return results
    
# Check every candidate namespace, reusing the on-disk cache; no pause between checks.
github_checks_path = 'data/github_checks.json'
github_results = process_github(
    list_of_artifacts=bb_potential_artifacts,
    json_path=github_checks_path,
    sleep=0,
)

In [20]:
# Namespaces with no entry in github_results fall back to "not approved, no reason".
_unchecked = {'Approved': False, 'Reason': None}

bb['is_valid_new_project'] = bb['github_namespace'].map(
    lambda ns: github_results.get(ns, _unchecked)['Approved'])

bb['project_status_notes'] = bb['github_namespace'].map(
    lambda ns: github_results.get(ns, _unchecked)['Reason'])

In [21]:
# Available = a namespace was extracted AND the status note is anything other
# than the "no longer available" message (rows with no note count as available).
has_namespace = bb['is_valid_github_namespace'].eq(True)
not_removed = bb['project_status_notes'].ne('Repo no longer available.')
bb['is_repo_still_available'] = has_namespace & not_removed

In [22]:
# Flag columns are those whose name, up to the first underscore, is 'is' or 'does'.
test_cols = [col for col in bb.columns if col.partition('_')[0] in ('is', 'does')]
bb[test_cols].mean()

is_valid_github_url               0.993522
is_valid_github_namespace         0.775522
does_owner_have_project_on_oso    0.018464
is_repo_on_oso                    0.010797
is_user_visible_on_oso            0.418777
is_user_active_recently_on_oso    0.115316
is_valid_new_project              0.007504
is_repo_still_available           0.574313
dtype: float64

# Export snapshot of data

In [23]:
# Write the annotated frame (index included) to disk.
bb.to_csv(path_or_buf='data/oso_bb_analysis.csv')

# Print projects to add to OSO

In [24]:
# Alphabetical list of distinct namespaces flagged as valid new projects.
approved_namespaces = bb.loc[bb['is_valid_new_project'].eq(True), 'github_namespace']
potential_projects = sorted(approved_namespaces.unique())
for p in potential_projects:
    print(f"- https://github.com/{p}")

- https://github.com/0xchijioke/pooldive
- https://github.com/agoraspacedao/guild.xyz
- https://github.com/akord-com
- https://github.com/alexandr-masl/web3-crowdfunding-on-allo-v2
- https://github.com/alexandre-abrioux/golem-node
- https://github.com/algorand/go-algorand-sdk
- https://github.com/alloliance
- https://github.com/anudit/convo
- https://github.com/bitcoin
- https://github.com/castvell/sperax-auditory-report
- https://github.com/ceramicnetwork/ceramic
- https://github.com/cheqlabs/denota
- https://github.com/cryptonerdcn/wasm-cairo
- https://github.com/dapotatoman/tez-it-all
- https://github.com/defitrack/defitrack-core
- https://github.com/dimensiondev/maskbook
- https://github.com/ebridgecrosschain
- https://github.com/elimu-ai
- https://github.com/ensuro/ensuro-binancehackathon
- https://github.com/farque65/evmaddresstracker
- https://github.com/flowstake
- https://github.com/g-r-ay/arbitrum-qa-dashboard
- https://github.com/harmony-one
- https://github.com/harmony-one/