# Sustainability of Research Software Repositories
The lifespan of a repository is computed from its first and its last commit. If the last commit occurred within the past 12 months, the repository is considered live (active); otherwise it is considered dormant.

In [None]:
import yaml
import numpy as np
import matplotlib.pyplot as plt
import pandas as pandas
import collections
import modules.database as db

## Connect to Database Collection

In [None]:
# Handle to the research-software repository collection; the analysis
# functions in this notebook issue all of their queries through it.
rs_repo_table = db.RsRepoCollection()

## Set Basic Parameters for Analysis

In [None]:
# Apply matplotlib's 'fivethirtyeight' style sheet to every figure, then
# override its base font size.
plt.style.use('fivethirtyeight')
plt.rcParams.update({'font.size': 14})

## Auxiliary Functions

In [None]:
def percentage(part, whole):
    """Return ``part`` as a percentage of ``whole``, rounded to two decimals.

    Args:
        part: Numerator; any value accepted by ``float()``.
        whole: Denominator; any value accepted by ``float()``.

    Returns:
        ``100 * part / whole`` rounded to two decimal places, or ``0.0`` when
        ``whole`` is zero, so that an empty sample yields a printable value
        instead of raising ``ZeroDivisionError``.
    """
    denominator = float(whole)
    if denominator == 0:
        return 0.0
    return round(100 * float(part) / denominator, 2)

def _lifespan_values(query):
    """Return the ``'lifespan'`` field of every entry matching ``query``."""
    return [repo['lifespan'] for repo in rs_repo_table.get_entries(query)]


def get_characteristic_values(sample_name=None):
    """Print characteristic lifespan values for one sample group.

    Prints, in this order: the number of entries in the group, the number and
    percentage of entries with ``live`` set to ``True`` ("active"), with
    ``live`` set to ``False`` ("dormant") and with ``lifespan`` equal to 0
    ("one day"), followed by the median and standard deviation of the
    lifespan for the whole group, for the live entries and for the dormant
    entries.

    All three standard deviations use ``ddof=1`` so that the figures are
    comparable with each other.

    Args:
        sample_name: Value matched against the ``group`` field of the
            entries (the notebook uses 'github', 'acm' and 'arxiv').

    Returns:
        None. The values are written to standard output.
    """
    def in_group(condition=None):
        # Build a fresh filter per call so no dict is shared between queries.
        # NOTE(review): the filters look like MongoDB query documents --
        # confirm against modules.database.
        group_filter = {'group': {'$in': [sample_name]}}
        if condition is None:
            return group_filter
        return {'$and': [group_filter, condition]}

    total = rs_repo_table.get_number_of_entries(in_group())
    active = rs_repo_table.get_number_of_entries(in_group({'live': True}))
    dormant = rs_repo_table.get_number_of_entries(in_group({'live': False}))
    zero = rs_repo_table.get_number_of_entries(in_group({'lifespan': 0}))

    print('Characteristic values for the', sample_name, 'sample: ')
    print('overall: ', total)
    print('active: ', active, percentage(active, total))
    print('dormant: ', dormant, percentage(dormant, total))
    # A lifespan of 0 is reported as "one day"; presumably the lifespan is
    # stored in days -- TODO confirm.
    print('one day: ', zero, percentage(zero, total))

    days = _lifespan_values(in_group())
    print('median: ', np.median(days), 'std: ', np.std(days, ddof=1))

    days = _lifespan_values(in_group({'live': True}))
    print('median for live repositories in', sample_name, ': ',
          np.median(days), 'std: ', np.std(days, ddof=1))

    days = _lifespan_values(in_group({'live': False}))
    print('median for dormant repositories in', sample_name, ': ',
          np.median(days), 'std: ', np.std(days, ddof=1))

def plot_lifespan(sample):
    """Show a grouped bar chart of repository lifespans for one group.

    The ``lifespan`` of every entry in the group is floor-divided by 365 and
    the entries are counted per resulting bucket, separately for entries with
    ``live`` set to ``True`` ("active") and ``False`` ("dormant"). Only
    buckets below 9 are drawn. Each x tick is labelled with the bucket it
    represents, so a bucket that is missing from both groups cannot shift the
    remaining bars under the wrong year.

    Args:
        sample: Value matched against the ``group`` field of the entries.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    def year_buckets(live):
        # presumably ``lifespan`` is in days, making the bucket a count of
        # whole years -- TODO confirm against the database schema.
        query = {'$and': [{'live': live}, {'group': {'$in': [sample]}}]}
        return [repo['lifespan'] // 365
                for repo in rs_repo_table.get_entries(query)]

    # Counter returns 0 for a missing key, so buckets that occur in only one
    # of the two groups need no explicit zero-filling.
    active_counts = collections.Counter(year_buckets(True))
    dormant_counts = collections.Counter(year_buckets(False))

    # NOTE(review): the cut-off keeps buckets 0..8 while the message printed
    # below says "limited to 10 years" -- confirm which one is intended.
    years = sorted(
        year for year in set(active_counts) | set(dormant_counts) if year < 9
    )

    ind = np.arange(len(years))  # the x locations for the groups
    width = 0.27                 # the width of the bars

    fig = plt.figure()
    ax = fig.add_subplot(111)

    dormant_bars = ax.bar(ind, [dormant_counts[year] for year in years], width)
    active_bars = ax.bar(ind + width,
                         [active_counts[year] for year in years], width)

    # Centre each tick between the two bars of a group and label it with the
    # actual bucket instead of the positional index.
    ax.set_xticks(ind + width / 2)
    ax.set_xticklabels([str(int(year)) for year in years])

    ax.set_ylabel('Repositories')
    ax.set_xlabel('Years')
    if years:
        # Without any bar there is no patch to use as a legend handle.
        ax.legend((dormant_bars[0], active_bars[0]), ('dormant', 'active'))

    print(sample, 'group: Lifetime, limited to 10 years, of live and dormant repositories: ')
    plt.show()

## Sustainability Analysis for the GitHub, ACM, and arXiv groups

In [None]:
# Print the lifespan summary statistics for the 'github' group.
get_characteristic_values('github')

In [None]:
# Print the lifespan summary statistics for the 'acm' group.
get_characteristic_values('acm')

In [None]:
# Print the lifespan summary statistics for the 'arxiv' group.
get_characteristic_values('arxiv')

In [None]:
# Show the active/dormant lifespan bar chart for the 'github' group.
plot_lifespan('github')

In [None]:
# Show the active/dormant lifespan bar chart for the 'acm' group.
plot_lifespan('acm')

In [None]:
# Show the active/dormant lifespan bar chart for the 'arxiv' group.
plot_lifespan('arxiv')