In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load aggregate_results_env.csv (first CSV column becomes the index) and keep
# only the key columns plus the per-file 'success' flag.
df1 = pd.read_csv("data/aggregate_results_env.csv", index_col=0).loc[:, ['doi', 'file', 'success']]

In [3]:
# Number of rows, then number of distinct DOIs, separated by a space.
print(len(df1), len(df1.doi.unique()))

7414 2085


In [4]:
df1.success.value_counts(dropna=False)

NaN    3719
0.0    2223
1.0    1472
Name: success, dtype: int64

In [5]:
# Load aggregate_results_no_env.csv (first CSV column becomes the index) and keep
# only the key columns plus the per-file 'success' flag.
df2 = pd.read_csv("data/aggregate_results_no_env.csv", index_col=0).loc[:, ['doi', 'file', 'success']]

In [6]:
# Number of rows, then number of distinct DOIs, separated by a space.
print(len(df2), len(df2.doi.unique()))

7659 2071


In [7]:
df2.success.value_counts(dropna=False)

NaN    3829
0.0    2878
1.0     952
Name: success, dtype: int64

In [8]:
# Outer-join the two result tables on (doi, file) so files present in only one
# of them are kept. The shared 'success' column is disambiguated by pandas'
# default suffixes: success_x comes from df1, success_y from df2.
dfd = pd.merge(df1, df2, on=['doi', 'file'], how='outer')

In [9]:
# Number of rows, then number of distinct DOIs, separated by a space.
print(len(dfd), len(dfd.doi.unique()))

8609 2109


In [10]:
def get_result(r):
    """Combine the two outcomes of one merged row into a single verdict.

    Parameters
    ----------
    r : row-like object
        Must expose ``success_x`` and ``success_y`` attributes.

    Returns
    -------
    float
        1.0 if either outcome equals 1.0; NaN if neither equals 1.0 and at
        least one of them is missing; 0.0 otherwise.
    """
    first, second = r.success_x, r.success_y
    # One success is enough, even when the other outcome is missing.
    if 1.0 in (first, second):
        return 1.0
    # Only report a failure when both outcomes are actually present.
    if pd.notnull(first) and pd.notnull(second):
        return 0.0
    return np.nan

# Evaluate get_result on every row (axis=1) and store the combined verdict
# in a new 'result' column.
dfd['result'] = dfd.apply(get_result, axis=1)

In [11]:
dfd.result.value_counts(dropna=False)

NaN    5790
1.0    1581
0.0    1238
Name: result, dtype: int64

In [12]:
# Read the tab-separated per-file statistics; column names are supplied here
# via `names`. Only the (doi, file) key columns are kept.
df = pd.read_csv(
    "data/rfile_stats.csv",
    delimiter='\t',
    names=[
        'doi', 'file', 'comments_no', 'dep_no', 'func_no',
        'test_no', 'class_no', 'encoding', 'total_lines',
    ],
)

df = df.loc[:, ['doi', 'file']]

In [13]:
df.head()

Unnamed: 0,doi,file
0,doi:10.7910/DVN/XFQZI2,Condemnation.R
1,doi:10.7910/DVN/WGPDBS,Replication_of_Figures.R
2,doi:10.7910/DVN/BPON3K,fig_10_effect_of_winning_on_gov.R
3,doi:10.7910/DVN/BPON3K,fig_11_rd_placebo.R
4,doi:10.7910/DVN/BPON3K,fig_12_historical_trend.R


# All success files with code cleaning

In [14]:
# Row counts of the file-stats table and of the df1 results table.
print(len(df), len(df1))

8178 7414


In [15]:
# Exclude DOIs that had a download error: keep only the rows of df whose DOI
# also appears in df1.
good_dois = df1['doi'].unique()
df = df.loc[df['doi'].isin(good_dois)]

In [16]:
len(df)

7928

In [17]:
# Outer-join the file list with the df1 results on (doi, file); rows of df
# without a match in df1 get NaN in 'success'.
dfm = pd.merge(df, df1, on=['doi', 'file'], how='outer')

In [18]:
len(dfm)

8875

In [19]:
len(dfm.doi.unique())

2085

In [20]:
dfm.head()

Unnamed: 0,doi,file,success
0,doi:10.7910/DVN/XFQZI2,Condemnation.R,
1,doi:10.7910/DVN/WGPDBS,Replication_of_Figures.R,1.0
2,doi:10.7910/DVN/BPON3K,fig_10_effect_of_winning_on_gov.R,0.0
3,doi:10.7910/DVN/BPON3K,fig_11_rd_placebo.R,1.0
4,doi:10.7910/DVN/BPON3K,fig_12_historical_trend.R,1.0


In [21]:
dfm.success.value_counts(dropna=False)

NaN    5180
0.0    2223
1.0    1472
Name: success, dtype: int64

In [22]:
len(dfm.doi.unique())

2085

In [23]:
# Per DOI, count how many files have success == NaN / 0.0 / 1.0
# (dropna=False keeps the NaN bucket). unstack() turns the outcome values into
# columns, and fillna(0) fills outcome buckets that never occur for a DOI.
dfs = (
    dfm.groupby(['doi'])['success']
    .value_counts(dropna=False)
    .unstack()
    .fillna(0)
)

In [24]:
len(dfs)

2085

In [25]:
dfs.head()

success,nan,0.0,1.0
doi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
doi:10.7910/DVN/03CDTK,2.0,0.0,0.0
doi:10.7910/DVN/05BSPP,17.0,3.0,0.0
doi:10.7910/DVN/0BFF0K,0.0,1.0,1.0
doi:10.7910/DVN/0BPVCH,3.0,0.0,0.0
doi:10.7910/DVN/0DE35E,2.0,0.0,1.0


In [26]:
# Move 'doi' out of the index into a regular column.
dfs = dfs.reset_index()

In [27]:
dfs.head()

success,doi,nan,0.0,1.0
0,doi:10.7910/DVN/03CDTK,2.0,0.0,0.0
1,doi:10.7910/DVN/05BSPP,17.0,3.0,0.0
2,doi:10.7910/DVN/0BFF0K,0.0,1.0,1.0
3,doi:10.7910/DVN/0BPVCH,3.0,0.0,0.0
4,doi:10.7910/DVN/0DE35E,2.0,0.0,1.0


In [28]:
# Name each count column after the outcome value it holds instead of relying
# on column position: a positional rename silently mislabels the counts if
# unstack() lays out the NaN / 0.0 / 1.0 columns in a different order.
#   NaN -> 'n' (no result), 0.0 -> 'f' (failed), 1.0 -> 's' (succeeded)
# An unexpected outcome value raises KeyError rather than being mislabelled.
dfs.columns = ['doi'] + [
    'n' if pd.isnull(outcome) else {0.0: 'f', 1.0: 's'}[outcome]
    for outcome in dfs.columns[1:]
]
# Keep a fixed column order regardless of how unstack() ordered the outcomes.
dfs = dfs[['doi', 'n', 'f', 's']]
dfs.head()

Unnamed: 0,doi,n,f,s
0,doi:10.7910/DVN/03CDTK,2.0,0.0,0.0
1,doi:10.7910/DVN/05BSPP,17.0,3.0,0.0
2,doi:10.7910/DVN/0BFF0K,0.0,1.0,1.0
3,doi:10.7910/DVN/0BPVCH,3.0,0.0,0.0
4,doi:10.7910/DVN/0DE35E,2.0,0.0,1.0


In [29]:
def final(n, s, f):
    """Encode which of three counts are positive as a 3-character flag string.

    Parameters
    ----------
    n, s, f : numbers
        Counts to test; each contributes one character, in the order n, s, f.

    Returns
    -------
    str
        A string such as ``'010'``: position 0 is '1' when ``n > 0``,
        position 1 when ``s > 0``, position 2 when ``f > 0``; otherwise '0'.
    """
    return ''.join('1' if count > 0 else '0' for count in (n, s, f))

In [30]:
# Tag every DOI with the three-character outcome pattern produced by final().
dfs['final'] = dfs.apply(lambda row: final(row['n'], row['s'], row['f']), axis=1)

In [31]:
dfs.final.value_counts()

001    655
100    638
110    235
011    168
101    144
010    129
111    116
Name: final, dtype: int64

In [32]:
dfs.n.sum()

5180.0

In [33]:
dfs.s.sum()

1472.0

In [34]:
dfs.f.sum()

2223.0

In [35]:
dfs.head()

Unnamed: 0,doi,n,f,s,final
0,doi:10.7910/DVN/03CDTK,2.0,0.0,0.0,100
1,doi:10.7910/DVN/05BSPP,17.0,3.0,0.0,101
2,doi:10.7910/DVN/0BFF0K,0.0,1.0,1.0,11
3,doi:10.7910/DVN/0BPVCH,3.0,0.0,0.0,100
4,doi:10.7910/DVN/0DE35E,2.0,0.0,1.0,110


In [36]:
# Total number of files per DOI across the three outcome buckets.
dfs['count'] = dfs['n'] + dfs['f'] + dfs['s']

In [37]:
# DOIs with pattern '010' (only the s count is positive) and exactly three files in total.
dfs.loc[(dfs['final'] == '010') & (dfs['count'] == 3.0)].tail()

Unnamed: 0,doi,n,f,s,final,count
679,doi:10.7910/DVN/FMJDCD,0.0,0.0,3.0,10,3.0
1132,doi:10.7910/DVN/M7FYU8,0.0,0.0,3.0,10,3.0
1607,doi:10.7910/DVN/SWV9GJ,0.0,0.0,3.0,10,3.0
1781,doi:10.7910/DVN/VJTPJK,0.0,0.0,3.0,10,3.0
1794,doi:10.7910/DVN/VSIAGW,0.0,0.0,3.0,10,3.0


# Get a sample for RQ 10

In [304]:
# Draw three DOIs with pattern '010' using a fixed seed, so that the sample is
# reproducible on a fresh Restart & Run All.
# NOTE(review): the DOIs inspected in the following cells came from an
# earlier, unseeded draw and will not necessarily match this seeded sample.
dfs[(dfs.final=='010')].sample(n=3, random_state=0)

Unnamed: 0,doi,n,f,s,final,count
2023,doi:10.7910/DVN/Z02C8Y,0.0,0.0,1.0,10,1.0
787,doi:10.7910/DVN/H11ITR,0.0,0.0,2.0,10,2.0
2010,doi:10.7910/DVN/YT45AO,0.0,0.0,2.0,10,2.0


In [38]:
# Per-file results recorded in df1 for this sampled DOI.
df1.loc[df1['doi'] == 'doi:10.7910/DVN/Z02C8Y']

Unnamed: 0,doi,file,success
1989,doi:10.7910/DVN/Z02C8Y,produce correl_plot.R,1.0


In [39]:
# Per-file results recorded in df1 for this sampled DOI.
df1.loc[df1['doi'] == 'doi:10.7910/DVN/H11ITR']

Unnamed: 0,doi,file,success
3527,doi:10.7910/DVN/H11ITR,Modsel.R,1.0
3528,doi:10.7910/DVN/H11ITR,Vuong_R_Replication.r,1.0


In [311]:
# Per-file results recorded in df1 for this sampled DOI.
# NOTE(review): the saved output of this cell lists r32/r36/r40/result columns,
# which df1 does not have after the column selection in cell 2 -- the output
# is stale; re-run the cell to refresh it.
df1.loc[df1['doi'] == 'doi:10.7910/DVN/YT45AO']

Unnamed: 0,doi,file,r32,r36,r40,result,success
808,doi:10.7910/DVN/YT45AO,bound_fn.R,success,success,success,success,1.0
809,doi:10.7910/DVN/YT45AO,bound_main.R,"Error in eval(expr, envir, enclos) : could not...",success,success,success,1.0


In [317]:
# Per-file results recorded in df1 for this DOI.
# NOTE(review): the saved output of this cell lists r32/r36/r40/result columns,
# which df1 does not have after the column selection in cell 2 -- the output
# is stale; re-run the cell to refresh it.
df1.loc[df1['doi'] == 'doi:10.7910/DVN/SWV9GJ']


Unnamed: 0,doi,file,r32,r36,r40,result,success
1077,doi:10.7910/DVN/SWV9GJ,Add Study 1 - analysis code.R,"Error in file(file, 'rt') : cannot open the co...","Error in file(file, 'rt') : cannot open the co...",success,success,1.0
1078,doi:10.7910/DVN/SWV9GJ,Add Study 2 - analysis code.R,"Error in file(file, 'rt') : cannot open the co...","Error in file(file, 'rt') : cannot open the co...",success,success,1.0
5601,doi:10.7910/DVN/SWV9GJ,TESS_analysis_code.R,,Error in alpha(sjt) : could not find function ...,success,success,1.0
