In [1]:
from google.cloud import bigquery
import os
import pandas as pd
import plotly.express as px
import sys

sys.path.append('../../../visualizations/')
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../../../gcp_credentials.json'

from venn_diagram import venn3_diagram

client = bigquery.Client()

# Get projects by cohort

In [13]:
query = """
    select
      project_id,
      project_name,
      collection_name
    from `opensource-observer.oso.projects_by_collection_v1`
"""
projects_by_collection = client.query(query).to_dataframe()

In [14]:
collections = {}
for _,row in projects_by_collection.iterrows():
    c = row['collection_name'] 
    if c not in collections:
        collections.update({c:[]})
    collections[c].append(row['project_id'])
collections = {x:set(y) for x,y in collections.items()}

In [15]:
projects = (
    projects_by_collection[['project_id', 'project_name']]
    .drop_duplicates()
    .set_index('project_id')['project_name']
    .to_dict()
)
project_list = list(projects.keys())
project_list_str = "'" + "','".join(project_list) + "'"
len(project_list)

1507

In [16]:
for pid,pname in projects.items():
    if pname in ['layer3xyz', 'multicaller-vectorized']:
        collections['op-rpgf3'].add(pid)

# Get project events

In [17]:
query = f"""
    with metrics as (
      select * from `oso.int_code_metric__bus_factor`
      union all
      select * from `oso.int_code_metric__commits_prs_issues`
      union all
      select * from `oso.int_code_metric__active_developers`
      union all
      select * from `oso.int_code_metric__contributors`
      union all
      select * from `oso.int_code_metric__new_contributors`
      union all
      select * from `oso.int_code_metric__fulltime_developers_average`
    )

    select * from metrics
"""
#metrics = client.query(query)

In [18]:
metrics = pd.read_csv('data/code_metrics_by_project.csv')
metrics.tail(1)

Unnamed: 0,project_id,event_source,time_interval,metric,amount
62151,2Ml1RLRnwPl8cfHkrEmKa4eBEsV5eJZUEtW0LsU9bqM=,GITHUB,6 MONTHS,issue_closed_count,1.0


# Analyze cohort performance

In [19]:
df = metrics.copy()
df['project_name'] = df['project_id'].map(projects)
df['in_rf4'] = df['project_id'].apply(lambda x: x in collections['op-retrofunding-4'])

df.head()

Unnamed: 0,project_id,event_source,time_interval,metric,amount,project_name,in_rf4
0,KEEBnsnsYBPgW5YvPDLHWm08v6YnPGXVnSJIxblxT6M=,GITHUB,ALL,bus_factor,1.0,pickle-finance,False
1,Q3efMLCb0mVeIGbjTSD4n0pQQWBAjUPMzndBrrDS2Q0=,GITHUB,ALL,bus_factor,1.0,tickerdao,False
2,h4xIFWYVPRWL8fHuVz0qy5biZxNBtDwxihPsLN8Hxes=,GITHUB,ALL,bus_factor,1.0,rentality-xyz,False
3,EMZ0Oshl_8gwlIC9MX2pUAAXqaR5ju8W8vmgL7PewFs=,GITHUB,ALL,bus_factor,1.0,vmex-finance,False
4,JGCJIFl8QtH4BDZmXHDbQwrkyPgfvx3rdwzJr77X0ZU=,GITHUB,ALL,bus_factor,1.0,,False


In [23]:
def assign_cohort(project_id):
    round_list = []
    for c,lst in collections.items():
        if c not in ('op-rpgf2', 'op-rpgf3', 'op-retrofunding-4'):
            continue
        if project_id in lst:
            rnd = c[-1]
            round_list.append(rnd)
    if round_list:
        return 'RF'+'/'.join(sorted(round_list))
    return None

df['cohort'] = df['project_id'].apply(assign_cohort)
df['cohort_label'] = df['cohort'].map({
    'RF4': 'First time participants in Retro Funding',
    'RF3/4': 'Returning participants',
    'RF2/3/4': 'Returning participants',
    'RF2/4': 'Returning participants',    
    'RF3': 'Past RFs only',
    'RF2/3': 'Past RFs only',
    'RF2': 'Past RFs only'
})

df.head(1)

Unnamed: 0,project_id,event_source,time_interval,metric,amount,project_name,in_rf4,cohort,cohort_label
0,KEEBnsnsYBPgW5YvPDLHWm08v6YnPGXVnSJIxblxT6M=,GITHUB,ALL,bus_factor,1.0,pickle-finance,False,,


In [44]:
df['metric'].unique()

array(['bus_factor', 'issue_closed_count', 'issue_opened_count',
       'commit_code_count', 'pull_request_opened_count',
       'pull_request_merged_count', 'active_developer_count',
       'new_contributor_count', 'contributor_count',
       'fulltime_developer_average'], dtype=object)

In [78]:
#df.groupby(['cohort_label']

INTERVALS = ['1 YEAR', '6 MONTHS']
#COLLECTION = 'protocol-labs-network'
COLLECTION = 'op-rpgf3'

#m = 'new_contributor_count'
#m = 'pull_request_merged_count'
METRIC = 'active_developer_count'

dff = 
dfm = (
 df[
     (df['project_id'].isin(collections[COLLECTION])) 
     & (df['time_interval'].isin(INTERVALS))
     & (df['metric']==METRIC)
 ]
 .query("metric == @m")
 .pivot_table(
     index='project_name',
     columns='time_interval', 
     values='amount', 
     aggfunc='sum',
     fill_value=0     
 )
.rename(
    columns={
        '1 YEAR': 'pre-RF',
        '6 MONTHS': 'post-RF'
        }
    )
.reset_index()       
)

dfm['pre-RF'] = dfm['pre-RF']/2
dfm['delta'] = dfm['post-RF'] / dfm['pre-RF']

print(dfm['post-RF'].sum() / dfm['pre-RF'].sum())

fig = px.scatter(
    data_frame=dfm,
    x='pre-RF',
    y='post-RF',
    labels='project_name',
    color='delta',
    color_continuous_scale='RdBu_r',
    log_x=True,
    log_y=True,   
    trendline='ols',
    custom_data=['project_name']

)
fig.update_layout(
    plot_bgcolor='white',
    paper_bgcolor='white')
fig.update_traces(
    line=dict(width=2, color='black'),
    hovertemplate='<b>Project Name:</b> %{customdata[0]}<br>' +
              '<b>Pre-RF:</b> %{x}<br>' +
              '<b>Post-RF:</b> %{y}<br>' +
              '<b>Delta:</b> %{marker.color}<extra></extra>'
)

SyntaxError: invalid syntax (1172328868.py, line 11)

In [69]:
? px.scatter

In [24]:
fig = px.treemap(
    data_frame=df[
        (df['event_type'] == 'Gas Fees (ETH)')
         & (df['time_interval'] == 'RF4 window')
         & (df['in_rf4']==True)
    ].dropna(),
    path=['cohort_label', 'project_name'],
    values='amount',
    color_discrete_sequence=['#FF0420', '#DDD'],
    
)
fig.data[0].textinfo = 'label+text+value'
fig

In [14]:
txns = (
    df[df['event_type'] == 'Transactions (M)']
    .groupby(['bucket_month','cohort_label'])
    ['amount'].sum()
    .reset_index()
)
txns.head(1)

Unnamed: 0,bucket_month,cohort_label,amount
0,2023-02-01 00:00:00+00:00,Multiple RFs,0.657073


In [15]:
fig = px.area(
    data_frame=txns,
    x='bucket_month',
    y='amount',
    color='cohort_label',
    color_discrete_sequence=['purple', 'cyan', '#FF0420'],
)
fig.update_layout(
    xaxis=dict(showgrid=True, title=''),
    yaxis=dict(showgrid=True, title='Superchain Transactions (M)'),
    legend=dict(title='Cohort'),
    plot_bgcolor='white',
    paper_bgcolor='white'
)
fig.update_traces(line=dict(width=1))

In [16]:
(
    df[
        (df['time_interval'].isin(['RF4 window', 'RF3 window']))
        & (df['project_id'].isin(onchain_projects))
      ]
    .pivot_table(
        index='cohort_label',
        columns=['event_type', 'time_interval'],
        values='amount',
        aggfunc='sum')
    .applymap(round)
    .reindex(['RF4 only', 'Multiple RFs', 'Past RFs only'])
)

event_type,Code Commits (K),Code Commits (K),Gas Fees (ETH),Gas Fees (ETH),Transactions (M),Transactions (M)
time_interval,RF3 window,RF4 window,RF3 window,RF4 window,RF3 window,RF4 window
cohort_label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
RF4 only,58,59,701,1230,33,52
Multiple RFs,47,49,355,1137,12,31
Past RFs only,24,21,119,109,10,4


In [17]:
pvt = (
    df[
        (df['time_interval'].isin(['RF4 window', 'RF3 window']))
        & (df['project_id'].isin(onchain_projects))
      ]
    .pivot_table(
        index=['cohort_label', 'project_name'],
        columns=['event_type', 'time_interval'],
        values='amount',
        aggfunc='sum', 
        fill_value=0
    )
)

In [18]:
pvt

Unnamed: 0_level_0,event_type,Code Commits (K),Code Commits (K),Gas Fees (ETH),Gas Fees (ETH),Transactions (M),Transactions (M)
Unnamed: 0_level_1,time_interval,RF3 window,RF4 window,RF3 window,RF4 window,RF3 window,RF4 window
cohort_label,project_name,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Multiple RFs,0x-splits,0.399,0.296,0.006573,0.197686,0.000766,0.011147
Multiple RFs,across,0.796,1.125,3.063905,26.091997,0.124029,1.036695
Multiple RFs,aerodrome-finance,0.001,0.041,7.888162,118.941636,0.302434,3.958043
Multiple RFs,buidlguidl,0.709,1.483,0.000071,0.407910,0.000027,0.024952
Multiple RFs,coordinape,0.485,0.553,0.432166,0.245044,0.042617,0.038769
...,...,...,...,...,...,...,...
RF4 only,zerion,0.034,0.185,0.571975,3.962016,0.058307,0.121741
RF4 only,zerius-io,0.000,0.030,0.000000,1.110249,0.000000,0.139076
RF4 only,zkbob,0.612,0.136,0.224327,0.377364,0.006254,0.018175
RF4 only,zkp2p,0.239,0.559,0.000000,0.031382,0.000000,0.001761
