# Sustainability of Research Software Repositories
The lifespan of a repository is the number of days between its first and its last commit. If the last commit occurred within the past 12 months, the repository is considered live; otherwise, it is considered dormant.

In [None]:
import yaml
import numpy as np
import matplotlib.pyplot as plt
import pandas as pandas
import collections
import modules.database as db

## Connect to Database Collection

In [None]:
# Shared handle to the repository collection; every query in this notebook
# (get_number_of_entries / get_entries) goes through this object.
rs_repo_table = db.RsRepoCollection()

## Set Basic Parameters for Analysis

In [None]:
# Global matplotlib settings applied to every figure created below:
# the 'fivethirtyeight' style sheet and a base font size of 14.
plt.style.use('fivethirtyeight')
plt.rcParams.update({'font.size': 14})

## Auxiliary Functions

The **get_characteristic_values** function receives a sample name (github, acm, arxiv) and prints the characteristic values regarding the lifespan of the repositories in that sample.    
The **plot_lifespan** function receives a sample name (github, acm, arxiv) and plots a bar chart for the lifespan of the active and dormant repositories.

In [None]:
def percentage(part, whole):
    """Return ``part`` as a percentage of ``whole``, rounded to two decimals.

    Args:
        part: Size of the subset (any value accepted by ``float``).
        whole: Size of the full set (any value accepted by ``float``).

    Returns:
        ``100 * part / whole`` rounded to two decimal places, or ``0.0`` when
        ``whole`` is zero.
    """
    whole = float(whole)
    if whole == 0:
        # An empty sample has no meaningful share; report 0.0 instead of
        # aborting the whole report with a ZeroDivisionError.
        return 0.0
    return round(100 * float(part) / whole, 2)

def _lifespan_stats(query):
    """Return ``(median, std)`` of the ``lifespan`` of all entries matching ``query``.

    The standard deviation is the sample standard deviation (``ddof=1``).
    """
    days = [repo['lifespan'] for repo in rs_repo_table.get_entries(query)]
    return np.median(days), np.std(days, ddof=1)


def get_characteristic_values(sample_name=None):
    """Print the characteristic lifespan values of one repository sample.

    Queries ``rs_repo_table`` for the entries whose ``group`` matches
    ``sample_name`` and prints:

    * the number of repositories in the sample,
    * count and percentage of active (``live`` is True) repositories,
    * count and percentage of dormant (``live`` is False) repositories,
    * count and percentage of repositories with a ``lifespan`` of 0
      (reported as "one day"),
    * median and standard deviation of ``lifespan`` for the whole sample,
      for the live repositories, and for the dormant repositories.

    All three standard deviations use the sample estimator (``ddof=1``) so
    that the sub-sample figures are comparable with the overall one.

    Args:
        sample_name: Name of the sample to report on; the notebook calls this
            with 'github', 'acm' and 'arxiv'.

    Returns:
        None. All results are written to standard output.
    """
    in_sample = {'group': {'$in': [sample_name]}}
    live_query = {'$and': [in_sample, {'live': True}]}
    dormant_query = {'$and': [in_sample, {'live': False}]}
    one_day_query = {'$and': [in_sample, {'lifespan': 0}]}

    total = rs_repo_table.get_number_of_entries(in_sample)
    active = rs_repo_table.get_number_of_entries(live_query)
    dormant = rs_repo_table.get_number_of_entries(dormant_query)
    one_day = rs_repo_table.get_number_of_entries(one_day_query)
    print('Characteristic values for the', sample_name, 'sample: ')
    print('number of repositories: ', total)
    print('active: ', active, percentage(active, total))
    print('dormant: ', dormant, percentage(dormant, total))
    print('one day: ', one_day, percentage(one_day, total))

    median, std = _lifespan_stats(in_sample)
    print('median: ', median, 'std: ', std)

    median, std = _lifespan_stats(live_query)
    print('median for live repositories in', sample_name, ': ', median, 'std: ', std)

    median, std = _lifespan_stats(dormant_query)
    print('median for dormant repositories in', sample_name, ': ', median, 'std: ', std)


def plot_lifespan(sample):
    """Plot the lifespan distribution of active and dormant repositories.

    The ``lifespan`` of every repository in ``sample`` is converted to whole
    years (``lifespan // 365``) and counted separately for active
    (``live`` is True) and dormant (``live`` is False) repositories. Only
    lifespans below 9 years (buckets 0-8) are drawn, as a grouped bar chart
    with one dormant and one active bar per year.

    Bars are positioned at the actual year value, so a year that occurs in
    neither group leaves a gap instead of shifting later bars onto a wrong
    x-axis label.

    Args:
        sample: Name of the sample to plot; the notebook calls this with
            'github', 'acm' and 'arxiv'.

    Returns:
        None. A caption is printed and the figure is shown via ``plt.show()``.
    """
    def count_years(live):
        # Number of repositories per whole-year lifespan for one live state.
        query = {'$and': [{'live': live}, {'group': {'$in': [sample]}}]}
        return collections.Counter(
            repo['lifespan'] // 365 for repo in rs_repo_table.get_entries(query)
        )

    active_counts = count_years(True)
    dormant_counts = count_years(False)

    # Union of all observed year buckets, limited to lifespans below 9 years.
    years = sorted(
        year for year in set(active_counts) | set(dormant_counts) if year < 9
    )

    ind = np.array(years)  # the x locations for the groups (actual years)
    width = 0.27           # the width of the bars

    fig = plt.figure()
    ax = fig.add_subplot(111)

    # A Counter yields 0 for a year that only occurs in the other group.
    rects1 = ax.bar(ind, [dormant_counts[year] for year in years], width)
    rects2 = ax.bar(ind + width, [active_counts[year] for year in years], width)

    ax.set_ylabel('Repositories')
    ax.set_xlabel('Years')
    if years:
        # Without any bars there is no artist to attach the legend to.
        ax.legend((rects1[0], rects2[0]), ('dormant', 'active'))

    # plt.savefig("lifespan.pdf", bbox_inches = "tight")
    print(sample, 'group: Lifetime, limited to 8 years, of live and dormant repositories: ')
    plt.show()

## Replicated Sustainability Analysis for the GitHub, ACM, and arXiv group

In [None]:
# Print counts, shares, and lifespan median/std for the 'github' sample.
get_characteristic_values('github')

In [None]:
# Print counts, shares, and lifespan median/std for the 'acm' sample.
get_characteristic_values('acm')

In [None]:
# Print counts, shares, and lifespan median/std for the 'arxiv' sample.
get_characteristic_values('arxiv')

In [None]:
# Bar chart of lifespans (in years) of dormant vs. active 'github' repositories.
plot_lifespan('github')

In [None]:
# Bar chart of lifespans (in years) of dormant vs. active 'acm' repositories.
plot_lifespan('acm')

In [None]:
# Bar chart of lifespans (in years) of dormant vs. active 'arxiv' repositories.
plot_lifespan('arxiv')

## Additional Sustainability Analysis for Computer Science and Computational Science
By means of the related publications, research subjects and research fields are assigned to each repository. Hence, the repositories may also be grouped by computer science and computational science. The computer science sample comprises all repositories that are assigned only to the research field computer science. The remaining repositories that are assigned to exactly one research field, i.e., a field other than computer science, form the computational science sample. In addition, repositories with multiple research fields including computer
science are grouped into one sample, as are repositories with multiple research fields not including computer science.     
In three of these four samples, the live and dormant repositories have nearly the same share. The exception is the sample of repositories with multiple research fields that include computer science, in which the dormant repositories are somewhat more strongly represented. Also, the proportion of repositories with a lifespan of one day is almost the same in all samples: it ranges from 13.04% to 17.29%.      
Differences between the four samples are observed in the distribution of the repository lifespans. The lifespan of the repositories in the computational science sample is distributed with a median of 168 days. The repositories with multiple research fields, not including computer science, have a lifespan distribution with a median of 209 days, similar to the repositories from the computer science sample (217 days). The highest median lifespan is observed for the repositories with multiple research fields that include computer science.

### Computational Science

In [None]:
# Computational science sample: repositories with exactly one research field,
# and that field is not computer science.
not_cs = {'subject': {'$nin': ['Computer Science']}}
one_field = {'subject': {'$size': 1}}

# Counts for the whole sample and for its live / dormant / zero-lifespan parts.
total = rs_repo_table.get_number_of_entries({'$and': [not_cs, one_field]})
active = rs_repo_table.get_number_of_entries({'$and': [{'live': True}, not_cs, one_field]})
dormant = rs_repo_table.get_number_of_entries({'$and': [{'live': False}, not_cs, one_field]})
one_day = rs_repo_table.get_number_of_entries({'$and': [{'lifespan': 0}, not_cs, one_field]})

print("Number of repositories: ", total)
print('active: ', active, percentage(active, total))
print('dormant: ', dormant, percentage(dormant, total))
print('one day: ', one_day, percentage(one_day, total))

# Lifespan distribution of the sample: median and sample standard deviation.
repos = rs_repo_table.get_entries({'$and': [not_cs, one_field]})
days = [entry['lifespan'] for entry in repos]
print('median: ', np.median(days), 'std: ', np.std(days, ddof=1))

### Computer Science

In [None]:
# Computer science sample: repositories with exactly one research field,
# and that field is computer science.
is_cs = {'subject': {'$in': ['Computer Science']}}
one_field = {'subject': {'$size': 1}}

# Counts for the whole sample and for its live / dormant / zero-lifespan parts.
total = rs_repo_table.get_number_of_entries({'$and': [is_cs, one_field]})
active = rs_repo_table.get_number_of_entries({'$and': [{'live': True}, is_cs, one_field]})
dormant = rs_repo_table.get_number_of_entries({'$and': [{'live': False}, is_cs, one_field]})
zero = rs_repo_table.get_number_of_entries({'$and': [{'lifespan': 0}, is_cs, one_field]})

print("Number of repositories: ", total)
print('active: ', active, percentage(active, total))
print('dormant: ', dormant, percentage(dormant, total))
print('one day: ', zero, percentage(zero, total))

# Lifespan distribution of the sample: median and sample standard deviation.
repos = rs_repo_table.get_entries({'$and': [is_cs, one_field]})
days = [entry['lifespan'] for entry in repos]
print('median: ', np.median(days), 'std: ', np.std(days, ddof=1))

### Multidisciplinary, Computer Science included

In [None]:
# Multidisciplinary sample including computer science: the subject list
# contains 'Computer Science' and has a second element (index 1 exists).
has_cs = {'subject': {'$in': ['Computer Science']}}
several_fields = {'subject.1': {'$exists': True}}

# Counts for the whole sample and for its live / dormant / zero-lifespan parts.
total = rs_repo_table.get_number_of_entries({'$and': [has_cs, several_fields]})
active = rs_repo_table.get_number_of_entries({'$and': [has_cs, {'live': True}, several_fields]})
dormant = rs_repo_table.get_number_of_entries({'$and': [has_cs, {'live': False}, several_fields]})
zero = rs_repo_table.get_number_of_entries({'$and': [has_cs, {'lifespan': 0}, several_fields]})

print("Number of repositories: ", total)
print('active: ', active, percentage(active, total))
print('dormant: ', dormant, percentage(dormant, total))
print('one day: ', zero, percentage(zero, total))

# Lifespan distribution of the sample: median and sample standard deviation.
repos = rs_repo_table.get_entries({'$and': [has_cs, several_fields]})
days = [entry['lifespan'] for entry in repos]
print('median: ', np.median(days), 'std: ', np.std(days, ddof=1))

### Multidisciplinary, Computer Science excluded

In [None]:
# Multidisciplinary sample excluding computer science: the subject list does
# not contain 'Computer Science' and has a second element (index 1 exists).
no_cs = {'subject': {'$nin': ['Computer Science']}}
several_fields = {'subject.1': {'$exists': True}}

# Counts for the whole sample and for its live / dormant / zero-lifespan parts.
total = rs_repo_table.get_number_of_entries({'$and': [no_cs, several_fields]})
active = rs_repo_table.get_number_of_entries({'$and': [no_cs, {'live': True}, several_fields]})
dormant = rs_repo_table.get_number_of_entries({'$and': [no_cs, {'live': False}, several_fields]})
zero = rs_repo_table.get_number_of_entries({'$and': [no_cs, {'lifespan': 0}, several_fields]})

print("Number of repositories: ", total)
print('active: ', active, percentage(active, total))
print('dormant: ', dormant, percentage(dormant, total))
print('one day: ', zero, percentage(zero, total))

# Lifespan distribution of the sample: median and sample standard deviation.
repos = rs_repo_table.get_entries({'$and': [no_cs, several_fields]})
days = [entry['lifespan'] for entry in repos]
print('median: ', np.median(days), 'std: ', np.std(days, ddof=1))