***Research Software Analyzer***

Prerequisites:

    MongoDB
    Jupyter Notebook
    Packages (see requirements_analyzer.txt)
    Configuration File (in the same folder)

Getting started:

    Create a virtual environment
    Install required packages (pip install -r requirements_analyzer.txt)
    Specify the parameters in the configuration file
    Run this Notebook

In [None]:
import yaml
import numpy as np
import matplotlib.pyplot as plt
import pandas as pandas
import collections
import modules.database as db

In [None]:
# Read the analysis parameters from config.yaml (expected in the same folder).
with open("config.yaml") as config_file:
    params = yaml.safe_load(config_file)

# Shared appearance of every figure drawn below.
plt.style.use('fivethirtyeight')
#plt.rcParams['axes.facecolor'] = 'white'
plt.rcParams['font.size'] = 18

# Handles for the collections that the following cells query.
repo_table = db.RepoCollection()
publication_table = db.Collection('publications')
rs_repo_table = db.RsRepoCollection()
rs_artifact_table = db.RsPublicationCollection()

In [None]:
def percentage(part, whole):
    """Return ``part`` as a percentage of ``whole``, rounded to two decimals.

    Both arguments are converted with ``float``. A base of zero yields ``0.0``
    instead of raising ``ZeroDivisionError``, so the overview cells still run
    when a collection (or a filtered subsample) is empty.
    """
    whole = float(whole)
    if whole == 0:
        return 0.0
    return round(100 * float(part) / whole, 2)

def cursor_to_dict_mixed(cursor, limit=None):
    """Collapse aggregated subject counts into a label -> count mapping.

    Every element of ``cursor`` must provide a ``'counts'`` list whose items
    carry ``'k'`` (a list of subject names) and ``'v'`` (a count). Each item is
    classified by the first rule that matches:

    * falsy ``'k'``: ignored;
    * ``limit`` given and ``'v'`` below it: added to ``'Others'``;
    * ``'k'`` equal to ``['']``: added to ``'Multidisciplinary'``;
    * more than one subject: added to ``'Mixed'``;
    * otherwise: stored (overwriting) under the single subject name.

    ``'Others'`` is only pre-seeded when ``limit`` is truthy.
    """
    result = {}
    if limit:
        result['Others'] = 0

    for document in cursor:
        for entry in document['counts']:
            subjects = entry['k']
            if not subjects:
                continue
            count = entry['v']

            if limit and count < limit:
                bucket = 'Others'
            elif subjects == ['']:
                bucket = 'Multidisciplinary'
            elif len(subjects) > 1:
                bucket = 'Mixed'
            else:
                # A single named subject keeps its own slice.
                result[subjects[0]] = count
                continue

            result[bucket] = result.get(bucket, 0) + count

    return result

def cursor_to_dict(cursor, limit=None):
    """Turn aggregated subject counts into a per-subject label -> count mapping.

    Every element of ``cursor`` must provide a ``'counts'`` list whose items
    carry ``'k'`` (a list of subject names) and ``'v'`` (a count).

    * Items with a falsy ``'k'`` are ignored.
    * If ``limit`` is given, items whose count is below it are pooled in
      ``'Others'`` (pre-seeded with 0 only when ``limit`` is truthy).
    * Otherwise the count is credited to every subject named in ``'k'``;
      an empty subject name is credited to ``'Multidisciplinary'``.

    Counts are always accumulated, so the result does not depend on the order
    in which single-subject and multi-subject items arrive.
    """
    areas = {'Others': 0} if limit else {}
    # Iterate the argument itself, not a notebook-level variable.
    for document in cursor:
        for entry in document['counts']:
            subjects = entry['k']
            if not subjects:
                continue
            count = entry['v']
            if limit and count < limit:
                areas['Others'] += count
                continue
            for subject in subjects:
                label = subject if subject else 'Multidisciplinary'
                areas[label] = areas.get(label, 0) + count
    return areas

***Description of the Repositories Set***

In [None]:
# Overview of the harvested repository set: absolute counts and their shares.
print('Repositories Set Overview')
total = repo_table.get_number_of_entries({})
print(total, 'harvested repositories')

# Repositories whose stored readme is anything other than the placeholder 'empty readme'.
part = repo_table.get_number_of_entries({'$and': [{'readme':{'$ne':'empty readme'}},{'readme':{'$exists':True}}]})
print(part, '(', percentage(part, total), '%) repositories with a Readme file')

# NOTE(review): total_github uses the same filter as the "with a Readme" count
# above, so the next share is relative to repositories *with* a Readme rather
# than to the whole set -- confirm that this base is intended.
total_github = repo_table.get_number_of_entries({'$and': [{'readme':{'$ne':'empty readme'}},{'readme':{'$exists':True}}]})
part = repo_table.get_number_of_entries({'$and': [{'readme':{'$eq':'empty readme'}},{'readme':{'$exists':True}}]})
print(part, '(', percentage(part, total_github), '%) repositories without a Readme file')

part = repo_table.get_number_of_entries({'readme':{'$exists':False}})
print(part, '(', percentage(part, total), '%) repositories without requested Readme file')

part = repo_table.get_number_of_entries({'license':{'$ne':None}})
print(part, '(', percentage(part, total), '%) repositories with a license')

part = repo_table.get_number_of_entries({'archived':True})
print(part, '(', percentage(part, total), '%) archived repositories')

part = repo_table.get_number_of_entries({'fork':True})
print(part, '(', percentage(part, total), '%) forks')

part = repo_table.get_number_of_entries({'forks':{'$gt':0}})
print(part, '(', percentage(part, total), '%) forked repositories')

# The filter selects repositories whose language is set, so the label says "with".
part = repo_table.get_number_of_entries({'language':{'$ne':None}})
print(part, '(', percentage(part, total), '%) repositories with an assigned primary language \n')

***Description of the Publications Set***

In [None]:
# Headline numbers for the harvested publications, one printed line per statistic.
print('Publications Set Overview')
count_publications = publication_table.get_number_of_entries
print('Total number of harvested publications: ', count_publications({}))
for message, source in (('Number of publications harvested from arXiv', 'arxiv'),
                        ('Number of publications harvested from ACM', 'acm')):
    print(message, count_publications({'source': source}))
print('Number of publications without a title: ', count_publications({'title': None}))
print('Number of publications without a DOI: ', count_publications({'doi': ''}), '\n')

In [None]:
# Size of the research software artifact set and how its entries are identified.
print('Research Software Artifacts Set Overview')
total = rs_artifact_table.get_number_of_entries({})
print(total, 'research software artifacts')

for mode, label in (
        ('doi', ') research software artifacts are identified by DOI'),
        ('arxiv_id', ') research software artifacts are identified by arXiv ID \n')):
    part = rs_artifact_table.get_number_of_entries({'identifier.mode': mode})
    print(percentage(part, total), '% (', part, label)

In [None]:
# DOI subsample: availability of Crossref metadata and of the 'type' field.
# Every share is printed twice: relative to the DOI subsample and to the overall set.
print('Subsample: Artifacts identified by DOI')
print('Harvested DOI metadata:')
total = rs_artifact_table.get_number_of_entries({})
total_doi = rs_artifact_table.get_number_of_entries({'identifier.mode':'doi'})

part = rs_artifact_table.get_number_of_entries({'source':'Crossref'})
print(percentage(part, total_doi), '% (',
      percentage(part, total), '% in overall set, in total',
      part, ') research software artifacts with DOI metadata\n')

print('Artifacts with type specification')
for type_filter, label in (
        ({"type": None}, ') artifacts without type specification'),
        ({"type": {'$ne': None}}, ') artifacts with type specification'),
        ({'type': 'journal-article'}, ') journal articles'),
        ({'type': 'proceedings-article'}, ') proceedings article')):
    part = rs_artifact_table.get_number_of_entries(type_filter)
    print(percentage(part, total_doi), '% (',
          percentage(part, total), '% in overall set, in total',
          part, label)

In [None]:
# Pie chart of the artifact types; entries without a type get their own
# 'Not set' slice and types counted fewer than 1000 times are pooled in 'Others'.
overview = rs_artifact_table.compose_type()
areas = {'Others': 0}
for document in overview:
    for entry in document['counts']:
        kind, count = entry['k'], entry['v']
        if not kind:
            areas['Not set'] = count
        elif count < 1000:
            areas['Others'] += count
        else:
            areas[kind] = count

slice_sizes = list(map(float, areas.values()))
plt.pie(slice_sizes, labels=list(areas), autopct='%1.1f%%',
        startangle=15, pctdistance=0.8)

plt.axis('equal')
plt.savefig("eval/doiTypes.pdf", bbox_inches="tight")
print('Artifact types, as stated in the DOI metadata:')
plt.show()

In [None]:
# Same pie chart as before, but entries without a type are left out entirely;
# types counted fewer than 1000 times are still pooled in 'Others'.
overview = rs_artifact_table.compose_type()
areas = {'Others': 0}
for document in overview:
    for entry in document['counts']:
        kind = entry['k']
        if not kind:
            continue
        count = entry['v']
        if count < 1000:
            areas['Others'] += count
        else:
            areas[kind] = count

slice_sizes = list(map(float, areas.values()))
plt.pie(slice_sizes, labels=list(areas), autopct='%1.1f%%',
        startangle=0, pctdistance=0.8)

plt.axis('equal')
plt.savefig("eval/doiTypesN.pdf", bbox_inches="tight")
#print('Artifact types excluding empty fields, as stated in the DOI metadata:')
plt.show()

In [None]:
# Plain listing of every artifact type together with its count.
overview = rs_artifact_table.compose_type()
print('All types:')
for document in overview:
    for entry in document['counts']:
        kind, count = entry['k'], entry['v']
        print(kind, ': ', count)

In [None]:
# DOI subsample: how many artifacts carry an ISSN / ISBN and an assigned main
# subject, reported against several bases and summarised in a bar chart.
print('Subsample: Artifacts identified by DOI')
print('Assigned subjects:')
total = rs_artifact_table.get_number_of_entries({})
total_doi = rs_artifact_table.get_number_of_entries({'identifier.mode':'doi'})
total_crossref = rs_artifact_table.get_number_of_entries({'source':'Crossref'})
total_issn = rs_artifact_table.get_number_of_entries({'$or': [{'ISSN':{'$exists':True}}, {'ISBN':{'$exists':True}}]})
# Each of these counts is reported twice below, so query it only once.
issn_count = rs_artifact_table.get_number_of_entries({'ISSN':{'$exists':True}})
isbn_count = rs_artifact_table.get_number_of_entries({'ISBN':{'$exists':True}})
groups = {'DOI': total_doi}
groups['metadata'] = total_crossref

# Shares relative to the DOI subsample and to the overall set.
part = issn_count
groups['ISSN'] = part
print(percentage(part, total_doi), '% (', 
      percentage(part, total) ,'% of overall set, in total', 
      part, ') research software artifacts with ISSN')
part = isbn_count
groups['ISBN'] = part
print(percentage(part, total_doi), '% (', 
      percentage(part, total) ,'% of the entire research artifacts set, in total', 
      part, ') research software artifacts with ISBN \n')

# Shares relative to the artifacts whose source is Crossref.
part = total_issn  # same filter: ISSN or ISBN present
print(percentage(part, total_crossref), '% (', 
      percentage(part, total_doi) ,'% of DOI set,', 
      percentage(part, total) ,'% of overall set, in total', 
      part, ') research software artifacts with ISSN or ISBN')
part = rs_artifact_table.get_number_of_entries({'$and': [{'ISSN':{'$exists':True}}, {'ISBN':{'$exists':True}}]})
print(percentage(part, total_crossref), '% (', 
      percentage(part, total_doi) ,'% of DOI set, in total', 
      part, ') research software artifacts with ISSN and ISBN')
part = issn_count
print(percentage(part, total_crossref), '% (', 
      percentage(part, total_doi) ,'% of DOI set, in total', 
      part, ') research software artifacts with ISSN')
part = isbn_count
print(percentage(part, total_crossref), '% (', 
      percentage(part, total_doi) ,'% of DOI set, in total', 
      part, ') research software artifacts with ISBN')

# Shares relative to the artifacts with an ISSN or ISBN.
part = rs_artifact_table.get_number_of_entries(
    {'$and': [{'identifier.mode':'doi'}, {'main_subject':{'$exists':True}}]})
groups['Subject'] = part
# Raw numbers behind the following percentages.
print(part, total)
print(percentage(part, total_issn), '% (', 
      percentage(part, total_doi) ,'% of DOI set,', 
      percentage(part, total) ,'% of overall set, in total', 
      part, ') research software artifacts have an assigned main subject')
part = rs_artifact_table.get_number_of_entries(
    {'$and':[{"type" : "journal-article"},{'main_subject':{'$exists':False}}]})
# The second share is computed against total_doi, so it is labelled as such.
print(percentage(part, total_issn), '% (', 
      percentage(part, total_doi) ,'% of DOI set, in total', 
      part, ') journal articles (identified by DOI metadata) have no assigned subject (rs_identifier)')


plt.bar(range(len(groups)), list(groups.values()), align='center', tick_label=list(groups.values()))
plt.ylabel('Number of artifacts')
plt.xlabel('Artifacts with ...')
plt.xticks(range(len(groups)), list(groups.keys()))
#plt.savefig("eval/issnDoi.pdf", bbox_inches = "tight")
print('ISSN, ISBN in DOI sub sample:')
plt.show()

In [None]:
# DOI subsample: artifacts that bring subject information in their DOI
# metadata, and how many of those also have an assigned main subject.
print('Subsample: Artifacts identified by DOI')
print('Assigned subjects:')
total = rs_artifact_table.get_number_of_entries({})
total_doi = rs_artifact_table.get_number_of_entries({'identifier.mode':'doi'})
total_crossref = rs_artifact_table.get_number_of_entries({'source':'Crossref'})

doi_with_subject = [{'identifier.mode': 'doi'}, {'subject': {'$exists': True}}]

part = rs_artifact_table.get_number_of_entries({'$and': doi_with_subject})
print(percentage(part, total_doi), '% (',
      percentage(part, total), '% of the entire research artifacts set, in total',
      part, ') research software artifacts (DOI) have subject infos in the DOI metadata')

part = rs_artifact_table.get_number_of_entries(
    {'$and': doi_with_subject + [{'main_subject': {'$exists': True}}]})
print(percentage(part, total_doi), '% (',
      percentage(part, total), '% of the entire research artifacts set, in total',
      part, ') research software artifacts (DOI) with DOI metadata subject and assigned subject \n')
#part = rs_artifact_table.get_number_of_entries(
#    {'$and':[{"type" : "journal-article"},{'main_subject':{'$exists':False}},{'subject':{'$exists':False}}]})
#print(percentage(part, total_crossref), '% (', 
#      percentage(part, total) ,'% of the entire research artifacts set, in total', 
#      part, ') journal articles (identified by DOI metadata) have no assigned subject \n')

In [None]:
# Compare, per artifact, the 'subject' list with the 'sub_subject' list
# (order-insensitive) and count one specific mismatch separately.
# NOTE(review): the query filters on 'main_subject' while the loop reads
# 'sub_subject' -- presumably both fields are always stored together; confirm.
artifacts = rs_artifact_table.get_entries(
    {'$and': [{'identifier.mode':'doi'}, {'subject':{'$exists':True}}, {'main_subject':{'$exists':True}}]})
equal = 0
not_equal = 0
bioMulti = 0

for artifact in artifacts:
    doi_subjects = artifact['subject']
    assigned_subjects = artifact['sub_subject']
    if set(doi_subjects) == set(assigned_subjects):
        equal += 1
        continue
    not_equal += 1
    is_general_bio = doi_subjects == ['General Biochemistry, Genetics and Molecular Biology', 'General Agricultural and Biological Sciences', 'General Medicine']
    if is_general_bio and assigned_subjects == ['Multidisciplinary']:
        bioMulti += 1

print('Number of equal DOI subject and rs_identifier subject: ', equal)
print('Number of unequal DOI subject and rs_identifier subject: ', not_equal)
print('General Biochemistry - Multidisciplinary:', bioMulti, '\n')

In [None]:
# Size of the research software repository set (the empty filter matches every entry).
print('Description of the research software repository set')
match_all = {}
total = rs_repo_table.get_number_of_entries(match_all)
print(total, 'research software repositories')

In [None]:
# description of the subsample - repositories with their number of associated groups
repos = rs_repo_table.compose_group(False)
containedIn = [repo['containedIn'] for repo in repos]

# Tally in a single pass (the former list.count() per element was quadratic);
# dict(...) keeps the plain-dict output of the print below.
distribution = dict(collections.Counter(containedIn))
print(distribution)
# Sort by the number of groups so the bars appear in ascending order.
D = collections.OrderedDict(sorted(distribution.items()))
plt.bar(range(len(D)), list(D.values()), align='center', tick_label=list(D.values()))
plt.xticks(range(len(D)), list(D.keys()))
plt.xlabel('Number of containing analysis groups')
plt.ylabel('Number of repositories')
#plt.savefig("eval/containedIn.pdf", bbox_inches = "tight")
plt.show()

In [None]:
# Count the repositories that belong to both groups of each pair and plot the counts.
print('Repositories belonging to two analysis groups')
total = rs_repo_table.get_number_of_entries({})
groups = {}
for label, pair, message in (
        ('Github ACM', ['github', 'acm'],
         ') number of repositories in the github and acm group'),
        ('GitHub arXiv', ['github', 'arxiv'],
         ') number of repositories in the github and arxiv group'),
        ('arXiv ACM', ['arxiv', 'acm'],
         ') number of repositories in the arxiv and acm group')):
    part = rs_repo_table.get_number_of_entries({'group': {'$all': pair}})
    groups[label] = part
    print(percentage(part, total), '% (', part, message)

print(groups)
bar_positions = range(len(groups))
plt.bar(bar_positions, list(groups.values()), align='center', tick_label=list(groups.values()))
plt.xticks(bar_positions, list(groups.keys()))
plt.xlabel('Name of the two analysis groups')
plt.ylabel('Number of rs repositories')
#plt.savefig("eval/group2.pdf", bbox_inches = "tight")

plt.show()

In [None]:
# Number of repositories per analysis group, printed as shares of the whole
# research software repository set and drawn as a bar chart.
print('Analysis groups:')
total = rs_repo_table.get_number_of_entries({})
category = ['github', 'acm', 'arxiv']
groups = {name: rs_repo_table.get_number_of_entries({"group": {'$in': [name]}})
          for name in category}

for name, part in groups.items():
    print(percentage(part, total), '% (in total',
          part, ') number of repositories in the', name, 'set')

print(groups)
bar_positions = range(len(groups))
plt.bar(bar_positions, list(groups.values()), align='center', tick_label=list(groups.values()))
plt.xticks(bar_positions, list(groups.keys()))
plt.xlabel('Analysis group')
plt.ylabel('No of repositories')
#plt.savefig("eval/group.pdf", bbox_inches = "tight")

plt.show()

In [None]:
# description of the subsample - repositories with their number of associated groups
# NOTE(review): compose_group(True) presumably restricts the set to repositories
# with a subject (cf. the "S" suffix of the output file name) -- confirm.
repos = rs_repo_table.compose_group(True)
containedIn = [repo['containedIn'] for repo in repos]

# Tally in a single pass (the former list.count() per element was quadratic);
# dict(...) keeps the plain-dict output of the print below.
distribution = dict(collections.Counter(containedIn))
print('Repositories with their number of associated analysis groups')
print(distribution)
# Sort by the number of groups so the bars appear in ascending order.
D = collections.OrderedDict(sorted(distribution.items()))
plt.bar(range(len(D)), list(D.values()), align='center')
plt.xticks(range(len(D)), list(D.keys()))
#plt.savefig("eval/containedInS.pdf", bbox_inches = "tight")

plt.show()

In [None]:
# Number of repositories with a main subject per analysis group, as shares of
# all repositories that have a main subject.
print('Analysis groups:')
total = rs_repo_table.get_number_of_entries({'main_subject':{'$exists':True}})
groups = {}
category = ['github', 'acm', 'arxiv']

for elem in category:
    part = rs_repo_table.get_number_of_entries({'$and': [{'main_subject':{'$exists':True}},{ "group": { '$in': [elem] } }]})
    groups[elem] = part
    # Name the group that was actually counted instead of always "GitHub".
    print(percentage(part, total), '% (in total',
          part, ') number of repositories in the', elem, 'set')

print(groups)
plt.bar(range(len(groups)), list(groups.values()), align='center', tick_label=list(groups.values()))
plt.xticks(range(len(groups)), list(groups.keys()))
plt.xlabel('Analysis group')
plt.ylabel('No of repositories')
#plt.savefig("eval/groupS.pdf", bbox_inches = "tight")

plt.show()

In [None]:
# Among repositories with a main subject: count those that belong to both
# groups of each pair and plot the counts.
print('Repositories with subject belonging to two analysis groups')
has_main_subject = {'main_subject': {'$exists': True}}
total = rs_repo_table.get_number_of_entries({'main_subject':{'$exists':True}})
groups = {}
for label, pair, message in (
        ('GitHub ACM', ['github', 'acm'],
         ') number of repositories in the github and acm group'),
        ('GitHub arXiv', ['github', 'arxiv'],
         ') number of repositories in the github and arxiv group'),
        ('arXiv ACM', ['arxiv', 'acm'],
         ') number of repositories in the arxiv and acm group')):
    part = rs_repo_table.get_number_of_entries(
        {'$and': [has_main_subject, {'group': {'$all': pair}}]})
    groups[label] = part
    print(percentage(part, total), '% (', part, message)

print(groups)
bar_positions = range(len(groups))
plt.bar(bar_positions, list(groups.values()), align='center', tick_label=list(groups.values()))
plt.xticks(bar_positions, list(groups.keys()))
plt.xlabel('Name of the two analysis groups')
plt.ylabel('Number of rs repositories')
#plt.savefig("eval/group2S.pdf", bbox_inches = "tight")

plt.show()

***RQ1: Research Subjects***

In [None]:
# Pie chart: main subject of all repositories; multi-subject entries form one 'Mixed' slice.
overview = rs_repo_table.compose_subjects('main_subject')
areas = cursor_to_dict_mixed(overview)

slice_sizes = list(map(float, areas.values()))
plt.pie(slice_sizes, labels=list(areas), autopct='%1.1f%%',
        startangle=-50, pctdistance=0.8)
plt.axis('equal')
#plt.savefig("eval/allSubjectMainMixed.pdf", bbox_inches = "tight")
print('Main research subjects of all research software repositories, with Mixed category:')
plt.show()

In [None]:
# Pie chart: main subject of all repositories; multi-subject entries count for each subject.
overview = rs_repo_table.compose_subjects('main_subject')
areas = cursor_to_dict(overview)

slice_sizes = list(map(float, areas.values()))
plt.pie(slice_sizes, labels=list(areas), autopct='%1.1f%%',
        startangle=0, pctdistance=0.8)
plt.axis('equal')
#plt.savefig("eval/allSubjectMain.pdf", bbox_inches = "tight")
print('Main research subjects of all research software repositories:')
plt.show()

In [None]:
# Pie chart: subject of all repositories with a 'Mixed' slice; counts below 1200 go to 'Others'.
overview = rs_repo_table.compose_subjects('subject')
areas = cursor_to_dict_mixed(overview, 1200)

slice_sizes = list(map(float, areas.values()))
plt.pie(slice_sizes, labels=list(areas), autopct='%1.1f%%',
        startangle=75, pctdistance=0.8)
plt.axis('equal')
#plt.savefig("eval/allSubjectMixed.pdf", bbox_inches = "tight")
print('Research subjects of all research software repositories, with Mixed category:')
plt.show()

In [None]:
# Pie chart: subject of all repositories without a 'Mixed' slice; counts below 1200 go to 'Others'.
overview = rs_repo_table.compose_subjects('subject')
areas = cursor_to_dict(overview, 1200)

slice_sizes = list(map(float, areas.values()))
plt.pie(slice_sizes, labels=list(areas), autopct='%1.1f%%',
        startangle=0, pctdistance=0.8)
plt.axis('equal')
#plt.savefig("eval/allSubject.pdf", bbox_inches = "tight")
print('Research subject of all research software repositories:')
plt.show()

In [None]:
# Pie chart: main subject within the github group, with a 'Mixed' slice.
overview = rs_repo_table.compose_subjects('main_subject', 'github')
areas = cursor_to_dict_mixed(overview)

slice_sizes = list(map(float, areas.values()))
plt.pie(slice_sizes, labels=list(areas), autopct='%1.1f%%',
        startangle=0, pctdistance=0.8)
plt.axis('equal')
#plt.savefig("eval/githubSubjectMainMixed.pdf", bbox_inches = "tight")
print('GitHub group: main research subjects of research software repositories, with Mixed category:')
plt.show()

In [None]:
# Pie chart: main subject within the github group, counted per subject.
overview = rs_repo_table.compose_subjects('main_subject','github')
areas = cursor_to_dict(overview)

slice_sizes = list(map(float, areas.values()))
plt.pie(slice_sizes, labels=list(areas), autopct='%1.1f%%',
        startangle=0, pctdistance=0.8)
plt.axis('equal')
#plt.savefig("eval/githubSubjectMain.pdf", bbox_inches = "tight")
print('GitHub group: main research subjects of research software repositories:')
plt.show()

In [None]:
# Pie chart: main subject within the acm group, with a 'Mixed' slice.
overview = rs_repo_table.compose_subjects('main_subject','acm')
areas = cursor_to_dict_mixed(overview)

slice_sizes = list(map(float, areas.values()))
plt.pie(slice_sizes, labels=list(areas), autopct='%1.1f%%',
        startangle=45, pctdistance=0.8)
plt.axis('equal')
#plt.savefig("eval/acmSubjectMainMixed.pdf", bbox_inches = "tight")
print('ACM group: main research subjects of research software repositories, with Mixed category:')
plt.show()

In [None]:
# Pie chart: main subject within the acm group, counted per subject.
overview = rs_repo_table.compose_subjects('main_subject','acm')
areas = cursor_to_dict(overview)

slice_sizes = list(map(float, areas.values()))
plt.pie(slice_sizes, labels=list(areas), autopct='%1.1f%%',
        startangle=45, pctdistance=0.8)
plt.axis('equal')
#plt.savefig("eval/acmSubjectMain.pdf", bbox_inches = "tight")
print('ACM group: main research subjects of research software repositories:')
plt.show()

In [None]:
# Pie chart: main subject within the arxiv group, with a 'Mixed' slice.
overview = rs_repo_table.compose_subjects('main_subject', 'arxiv')
areas = cursor_to_dict_mixed(overview)

slice_sizes = list(map(float, areas.values()))
plt.pie(slice_sizes, labels=list(areas), autopct='%1.1f%%',
        startangle=0, pctdistance=0.8)
plt.axis('equal')
#plt.savefig("eval/arxivSubjectMainMixed.pdf", bbox_inches = "tight")
print('arXiv group: main research subjects of research software repositories, with Mixed category:')
plt.show()

In [None]:
# Pie chart: main subject within the arxiv group, counted per subject.
overview = rs_repo_table.compose_subjects('main_subject', 'arxiv')
areas = cursor_to_dict(overview)

slice_sizes = list(map(float, areas.values()))
plt.pie(slice_sizes, labels=list(areas), autopct='%1.1f%%',
        startangle=0, pctdistance=0.8)
plt.axis('equal')
#plt.savefig("eval/arxivSubjectMain.pdf", bbox_inches = "tight")
print('arXiv group: main research subjects of research software repositories:')
plt.show()

In [None]:
# Pie chart: subject within the github group with a 'Mixed' slice; counts below 500 go to 'Others'.
overview = rs_repo_table.compose_subjects('subject', 'github')
areas = cursor_to_dict_mixed(overview, 500)

slice_sizes = list(map(float, areas.values()))
plt.pie(slice_sizes, labels=list(areas), autopct='%1.1f%%',
        startangle=0, pctdistance=0.8)
plt.axis('equal')
#plt.savefig("eval/githubSubjectMixed.pdf", bbox_inches = "tight")
print('GitHub group: research subjects of research software repositories, with Mixed category:')
plt.show()

In [None]:
# Pie chart: subject within the github group, counted per subject; counts below 700 go to 'Others'.
overview = rs_repo_table.compose_subjects('subject', 'github')
areas = cursor_to_dict(overview, 700)

slice_sizes = list(map(float, areas.values()))
plt.pie(slice_sizes, labels=list(areas), autopct='%1.1f%%',
        startangle=0, pctdistance=0.8)
plt.axis('equal')
#plt.savefig("eval/githubSubject.pdf", bbox_inches = "tight")
print('GitHub group: research subjects of research software repositories:')
plt.show()

In [None]:
# Pie chart: subject within the acm group with a 'Mixed' slice; counts below 200 go to 'Others'.
overview = rs_repo_table.compose_subjects('subject', 'acm')
areas = cursor_to_dict_mixed(overview, 200)

slice_sizes = list(map(float, areas.values()))
plt.pie(slice_sizes, labels=list(areas), autopct='%1.1f%%',
        startangle=0, pctdistance=0.8)
plt.axis('equal')
#plt.savefig("eval/acmSubjectMixed.pdf", bbox_inches = "tight")
print('ACM group: research subjects of research software repositories, with Mixed category:')
plt.show()

In [None]:
# Pie chart: subject within the acm group, counted per subject; counts below 400 go to 'Others'.
overview = rs_repo_table.compose_subjects('subject', 'acm')
areas = cursor_to_dict(overview, 400)

slice_sizes = list(map(float, areas.values()))
plt.pie(slice_sizes, labels=list(areas), autopct='%1.1f%%',
        startangle=0, pctdistance=0.8)
plt.axis('equal')
#plt.savefig("eval/acmSubject.pdf", bbox_inches = "tight")
print('ACM group: research subjects of research software repositories:')
plt.show()

In [None]:
# Pie chart: subject within the arxiv group, counted per subject; counts below 200 go to 'Others'.
overview = rs_repo_table.compose_subjects('subject', 'arxiv')
areas = cursor_to_dict(overview, 200)

slice_sizes = list(map(float, areas.values()))
plt.pie(slice_sizes, labels=list(areas), autopct='%1.1f%%',
        startangle=0, pctdistance=0.8)
plt.axis('equal')
#plt.savefig("eval/arxivSubject.pdf", bbox_inches = "tight")
print('arXiv group: research subjects of research software repositories:')
plt.show()

In [None]:
# Pie chart: result of compose_arxiv_cs(), counted per key; counts below 70 go to 'Others'.
overview = rs_repo_table.compose_arxiv_cs()
areas = cursor_to_dict(overview, 70)

slice_sizes = list(map(float, areas.values()))
plt.pie(slice_sizes, labels=list(areas), autopct='%1.1f%%',
        startangle=0, pctdistance=0.8)
plt.axis('equal')
#plt.savefig("eval/arxivCS.pdf", bbox_inches = "tight")
print('arXiv group: computer science of research software repositories:')
plt.show()

***RQ2: Repository Lifetime***

In [None]:
# github group
# description of the subsample - repository lifetime, split into live and dormant
# NOTE(review): assumes 'lifespan' is stored in days (it is floor-divided by 365
# and plotted on an axis labelled 'Years') -- confirm.
repos = rs_repo_table.get_entries({'$and': [{'live':True}, {'group': {'$in': ['github']}}]})
active = [repo['lifespan']//365 for repo in repos]
repos = rs_repo_table.get_entries({'$and': [{'live':False}, {'group': {'$in': ['github']}}]})
dormant = [repo['lifespan']//365 for repo in repos]

# Number of repositories per whole year of lifetime.
activeDict = dict(collections.Counter(active))
dormantDict = dict(collections.Counter(dormant))

# Give both tallies the same set of years so the bars can be paired.
for key in activeDict:
    dormantDict.setdefault(key, 0)
for key in dormantDict:
    activeDict.setdefault(key, 0)

sortedActive = collections.OrderedDict(sorted(activeDict.items()))
sortedDormant = collections.OrderedDict(sorted(dormantDict.items()))

N = len(sortedActive)
ind = np.arange(N)  # the x locations for the groups
width = 0.27       # the width of the bars

fig = plt.figure()
ax = fig.add_subplot(111)

yvals = [int(v) for v in sortedDormant.values()]
rects1 = ax.bar(ind, yvals, width)
zvals = [int(v) for v in sortedActive.values()]
rects2 = ax.bar(ind+width, zvals, width)

ax.set_ylabel('Repositories')
ax.set_xlabel('Years')
# Label every pair of bars with its lifetime in years; without this the ticks
# show bar positions, which differ from the years when the keys do not start at 0.
ax.set_xticks(ind+width)
ax.set_xticklabels(list(sortedActive))
ax.legend( (rects1[0], rects2[0]), ('dormant', 'active') )

#plt.savefig("eval/githubLifetimeAll.pdf", bbox_inches = "tight")
print('GitHub group: Lifetime of live and dormant repositories')
plt.show()

In [None]:
# github group
# description of the subsample - repository lifetime, split into live and dormant, limited to 10 years
# NOTE(review): assumes 'lifespan' is stored in days (it is floor-divided by 365
# and plotted on an axis labelled 'Years') -- confirm.
repos = rs_repo_table.get_entries({'$and': [{'live':True}, {'group': {'$in': ['github']}}]})
active = [repo['lifespan']//365 for repo in repos]
repos = rs_repo_table.get_entries({'$and': [{'live':False}, {'group': {'$in': ['github']}}]})
dormant = [repo['lifespan']//365 for repo in repos]

# Number of repositories per whole year of lifetime.
activeDict = dict(collections.Counter(active))
dormantDict = dict(collections.Counter(dormant))

# Give both tallies the same set of years so the bars can be paired.
for key in activeDict:
    dormantDict.setdefault(key, 0)
for key in dormantDict:
    activeDict.setdefault(key, 0)

sortedActive = collections.OrderedDict(sorted(activeDict.items()))
sortedDormant = collections.OrderedDict(sorted(dormantDict.items()))

# Keep only lifetimes of at most ten years.
sortedAct = {k: v for k, v in sortedActive.items() if k < 11}
sortedDorm = {k: v for k, v in sortedDormant.items() if k < 11}

N = len(sortedAct)
ind = np.arange(N)  # the x locations for the groups
width = 0.27       # the width of the bars

fig = plt.figure()
ax = fig.add_subplot(111)

yvals = [int(v) for v in sortedDorm.values()]
rects1 = ax.bar(ind, yvals, width)
zvals = [int(v) for v in sortedAct.values()]
rects2 = ax.bar(ind+width, zvals, width)

ax.set_ylabel('Repositories')
ax.set_xlabel('Years')
# Label every pair of bars with its lifetime in years; without this the ticks
# show bar positions, which differ from the years when the keys do not start at 0.
ax.set_xticks(ind+width)
ax.set_xticklabels(list(sortedAct))
ax.legend( (rects1[0], rects2[0]), ('dormant', 'active') )

#plt.savefig("eval/githubLifetimeAll10.pdf", bbox_inches = "tight")
print('GitHub group: Lifetime, limited to 10 years, of live and dormant repositories')
plt.show()

In [None]:
# github group, repositories with a main subject
# Grouped bar chart of the repository lifetime (lifespan floor-divided by 365),
# with one bar for dormant and one for live repositories per value.
github_with_subject = [{'group': {'$in': ['github']}}, {'main_subject': {'$exists': True}}]

repos = rs_repo_table.get_entries({'$and': [{'live': True}] + github_with_subject})
active = [repo['lifespan'] // 365 for repo in repos]
repos = rs_repo_table.get_entries({'$and': [{'live': False}] + github_with_subject})
dormant = [repo['lifespan'] // 365 for repo in repos]

# Frequency of every lifetime value in each of the two sets.
activeDict = dict(collections.Counter(active))
dormantDict = dict(collections.Counter(dormant))

# A value seen in only one set gets a zero count in the other one.
for year in activeDict:
    dormantDict.setdefault(year, 0)
for year in dormantDict:
    activeDict.setdefault(year, 0)

sortedActive = collections.OrderedDict(sorted(activeDict.items()))
sortedDormant = collections.OrderedDict(sorted(dormantDict.items()))

N = len(sortedActive)
ind = np.arange(N)  # left edge position of each bar pair
width = 0.27        # bar width

fig = plt.figure()
ax = fig.add_subplot(111)

yvals = list(map(int, sortedDormant.values()))
zvals = list(map(int, sortedActive.values()))
rects1 = ax.bar(ind, yvals, width)
rects2 = ax.bar(ind + width, zvals, width)

ax.set_ylabel('Repositories')
ax.set_xlabel('Years')
ax.set_xticks(ind + width)
ax.set_xticklabels(list(sortedActive))
ax.legend((rects1[0], rects2[0]), ('dormant', 'active'))

#plt.savefig("eval/githubLifetimeSubject.pdf", bbox_inches = "tight")
print('GitHub group: Lifetime of the repositories with a main subject:')
plt.show()

In [None]:
# github group
# Description of the subsample: repository lifetime in whole years, split into
# live and dormant repositories (only repositories that have a main subject).
# The plot is restricted to the year buckets below 11.
live_query = {'$and': [{'live':True}, {'group': {'$in': ['github']}}, {'main_subject':{'$exists':True}}]}
dormant_query = {'$and': [{'live':False}, {'group': {'$in': ['github']}}, {'main_subject':{'$exists':True}}]}

# Bucket every lifespan into years (presumably 'lifespan' is stored in days -- TODO confirm).
# Floor division maps a negative lifespan to the bucket -1.
active = [repo['lifespan']//365 for repo in rs_repo_table.get_entries(live_query)]
dormant = [repo['lifespan']//365 for repo in rs_repo_table.get_entries(dormant_query)]

# Number of repositories per year bucket.
activeDict = dict(collections.Counter(active))
dormantDict = dict(collections.Counter(dormant))

# Both series need the same year buckets so that the bars pair up.
for key in activeDict.keys() | dormantDict.keys():
    activeDict.setdefault(key, 0)
    dormantDict.setdefault(key, 0)

sortedActive = collections.OrderedDict(sorted(activeDict.items()))
sortedDormant = collections.OrderedDict(sorted(dormantDict.items()))

# Keep only the buckets below 11 years; the sorted order is preserved.
sortedAct = {k: v for k, v in sortedActive.items() if k < 11}
sortedDorm = {k: v for k, v in sortedDormant.items() if k < 11}

N = len(sortedAct)
ind = np.arange(N)  # the x locations for the groups
width = 0.27        # the width of the bars

fig = plt.figure()
ax = fig.add_subplot(111)

yvals = list(sortedDorm.values())
rects1 = ax.bar(ind, yvals, width)
zvals = list(sortedAct.values())
rects2 = ax.bar(ind+width, zvals, width)

ax.set_ylabel('Repositories')
ax.set_xlabel('Years')
# Bars are centre-aligned at ind and ind+width, so each pair is centred at ind + width/2.
ax.set_xticks(ind + width/2)
ax.set_xticklabels(list(sortedAct.keys()))
ax.legend( (rects1[0], rects2[0]), ('dormant', 'active') )

#plt.savefig("eval/githubLifetimeSubject10.pdf", bbox_inches = "tight")
print('GitHub group: Lifetime, limited to 10 year, of the repositories with a main subject:')
plt.show()

In [None]:
# acm group
# Description of the subsample: repository lifetime in whole years, split into
# live and dormant repositories.
live_query = {'$and': [{'live':True}, {'group': {'$in': ['acm']}}]}
dormant_query = {'$and': [{'live':False}, {'group': {'$in': ['acm']}}]}

# Bucket every lifespan into years (presumably 'lifespan' is stored in days -- TODO confirm).
# Floor division maps a negative lifespan to the bucket -1.
active = [repo['lifespan']//365 for repo in rs_repo_table.get_entries(live_query)]
dormant = [repo['lifespan']//365 for repo in rs_repo_table.get_entries(dormant_query)]

# Number of repositories per year bucket.
activeDict = dict(collections.Counter(active))
dormantDict = dict(collections.Counter(dormant))

# Both series need the same year buckets so that the bars pair up.
for key in activeDict.keys() | dormantDict.keys():
    activeDict.setdefault(key, 0)
    dormantDict.setdefault(key, 0)

sortedActive = collections.OrderedDict(sorted(activeDict.items()))
sortedDormant = collections.OrderedDict(sorted(dormantDict.items()))

N = len(sortedActive)
ind = np.arange(N)  # the x locations for the groups
width = 0.27        # the width of the bars

fig = plt.figure()
ax = fig.add_subplot(111)

yvals = list(sortedDormant.values())
rects1 = ax.bar(ind, yvals, width)
zvals = list(sortedActive.values())
rects2 = ax.bar(ind+width, zvals, width)

ax.set_ylabel('Repositories')
ax.set_xlabel('Years')
# Label every bar pair with its year bucket; without explicit labels the axis
# would show bar indices, which differ from the years whenever the buckets do
# not start at 0 or contain gaps.
# Bars are centre-aligned at ind and ind+width, so each pair is centred at ind + width/2.
ax.set_xticks(ind + width/2)
ax.set_xticklabels(list(sortedActive.keys()))
ax.legend( (rects1[0], rects2[0]), ('dormant', 'active') )

#plt.savefig("eval/acmLifetimeAll.pdf", bbox_inches = "tight")
print('ACM group: lifetime of the research software repositories:')
plt.show()

In [None]:
# acm group
# Description of the subsample: repository lifetime in whole years, split into
# live and dormant repositories. The plot is restricted to the year buckets
# below 11.
live_query = {'$and': [{'live':True}, {'group': {'$in': ['acm']}}]}
dormant_query = {'$and': [{'live':False}, {'group': {'$in': ['acm']}}]}

# Bucket every lifespan into years (presumably 'lifespan' is stored in days -- TODO confirm).
# Floor division maps a negative lifespan to the bucket -1.
active = [repo['lifespan']//365 for repo in rs_repo_table.get_entries(live_query)]
dormant = [repo['lifespan']//365 for repo in rs_repo_table.get_entries(dormant_query)]

# Number of repositories per year bucket.
activeDict = dict(collections.Counter(active))
dormantDict = dict(collections.Counter(dormant))

# Both series need the same year buckets so that the bars pair up.
for key in activeDict.keys() | dormantDict.keys():
    activeDict.setdefault(key, 0)
    dormantDict.setdefault(key, 0)

# Sort by year, then keep only the buckets below 11 years (order is preserved).
sortedActive = collections.OrderedDict(sorted(activeDict.items()))
sortedDormant = collections.OrderedDict(sorted(dormantDict.items()))
sortedActive = {k: v for k, v in sortedActive.items() if k < 11}
sortedDormant = {k: v for k, v in sortedDormant.items() if k < 11}

N = len(sortedActive)
ind = np.arange(N)  # the x locations for the groups
width = 0.27        # the width of the bars

fig = plt.figure()
ax = fig.add_subplot(111)

yvals = list(sortedDormant.values())
rects1 = ax.bar(ind, yvals, width)
zvals = list(sortedActive.values())
rects2 = ax.bar(ind+width, zvals, width)

ax.set_ylabel('Repositories')
ax.set_xlabel('Years')
# Label every bar pair with its year bucket; without explicit labels the axis
# would show bar indices, which differ from the years whenever the buckets do
# not start at 0 or contain gaps.
# Bars are centre-aligned at ind and ind+width, so each pair is centred at ind + width/2.
ax.set_xticks(ind + width/2)
ax.set_xticklabels(list(sortedActive.keys()))
ax.legend( (rects1[0], rects2[0]), ('dormant', 'active') )

#plt.savefig("eval/acmLifetimeAll10.pdf", bbox_inches = "tight")
print('ACM group: lifetime, limited to 10 years, of the research software repositories:')
plt.show()

In [None]:
# acm group, subject
# Description of the subsample: repository lifetime in whole years, split into
# live and dormant repositories (only repositories that have a main subject).
live_query = {'$and': [{'live':True}, {'group': {'$in': ['acm']}}, {'main_subject':{'$exists':True}}]}
dormant_query = {'$and': [{'live':False}, {'group': {'$in': ['acm']}}, {'main_subject':{'$exists':True}}]}

# Bucket every lifespan into years (presumably 'lifespan' is stored in days -- TODO confirm).
# Floor division maps a negative lifespan to the bucket -1.
active = [repo['lifespan']//365 for repo in rs_repo_table.get_entries(live_query)]
dormant = [repo['lifespan']//365 for repo in rs_repo_table.get_entries(dormant_query)]

# Number of repositories per year bucket.
activeDict = dict(collections.Counter(active))
dormantDict = dict(collections.Counter(dormant))

# Both series need the same year buckets so that the bars pair up.
for key in activeDict.keys() | dormantDict.keys():
    activeDict.setdefault(key, 0)
    dormantDict.setdefault(key, 0)

sortedActive = collections.OrderedDict(sorted(activeDict.items()))
sortedDormant = collections.OrderedDict(sorted(dormantDict.items()))

N = len(sortedActive)
ind = np.arange(N)  # the x locations for the groups
width = 0.27        # the width of the bars

fig = plt.figure()
ax = fig.add_subplot(111)

yvals = list(sortedDormant.values())
rects1 = ax.bar(ind, yvals, width)
zvals = list(sortedActive.values())
rects2 = ax.bar(ind+width, zvals, width)

ax.set_ylabel('Repositories')
ax.set_xlabel('Years')
# Bars are centre-aligned at ind and ind+width, so each pair is centred at ind + width/2.
ax.set_xticks(ind + width/2)
ax.set_xticklabels(list(sortedActive.keys()))
ax.legend( (rects1[0], rects2[0]), ('dormant', 'active') )

#plt.savefig("eval/acmLifetimeSubject.pdf", bbox_inches = "tight")
print('ACM group: lifetime of the repositories with a main subject:')
plt.show()

In [None]:
# acm group, subject
# Description of the subsample: repository lifetime in whole years, split into
# live and dormant repositories (only repositories that have a main subject).
# The plot is restricted to the year buckets below 11.
live_query = {'$and': [{'live':True}, {'group': {'$in': ['acm']}}, {'main_subject':{'$exists':True}}]}
dormant_query = {'$and': [{'live':False}, {'group': {'$in': ['acm']}}, {'main_subject':{'$exists':True}}]}

# Bucket every lifespan into years (presumably 'lifespan' is stored in days -- TODO confirm).
# Floor division maps a negative lifespan to the bucket -1.
active = [repo['lifespan']//365 for repo in rs_repo_table.get_entries(live_query)]
dormant = [repo['lifespan']//365 for repo in rs_repo_table.get_entries(dormant_query)]

# Number of repositories per year bucket.
activeDict = dict(collections.Counter(active))
dormantDict = dict(collections.Counter(dormant))

# Both series need the same year buckets so that the bars pair up.
for key in activeDict.keys() | dormantDict.keys():
    activeDict.setdefault(key, 0)
    dormantDict.setdefault(key, 0)

sortedActive = collections.OrderedDict(sorted(activeDict.items()))
sortedDormant = collections.OrderedDict(sorted(dormantDict.items()))

# Keep only the buckets below 11 years; the sorted order is preserved.
sortedAct = {k: v for k, v in sortedActive.items() if k < 11}
sortedDorm = {k: v for k, v in sortedDormant.items() if k < 11}

N = len(sortedAct)
ind = np.arange(N)  # the x locations for the groups
width = 0.27        # the width of the bars

fig = plt.figure()
ax = fig.add_subplot(111)

yvals = list(sortedDorm.values())
rects1 = ax.bar(ind, yvals, width)
zvals = list(sortedAct.values())
rects2 = ax.bar(ind+width, zvals, width)

ax.set_ylabel('Repositories')
ax.set_xlabel('Years')
# Bars are centre-aligned at ind and ind+width, so each pair is centred at ind + width/2.
ax.set_xticks(ind + width/2)
ax.set_xticklabels(list(sortedAct.keys()))
ax.legend( (rects1[0], rects2[0]), ('dormant', 'active') )

#plt.savefig("eval/acmLifetimeSubject10.pdf", bbox_inches = "tight")
print('ACM group: lifetime, limited to 10 years, of the repositories with a main subject')
plt.show()

In [None]:
# acm group, subject, first commit before 2018
# Description of the subsample: repository lifetime in whole years, split into
# live and dormant repositories (only repositories that have a main subject).
# The plot is restricted to the year buckets below 11.

# NOTE(review): the pattern is unanchored, so it matches these digit sequences
# anywhere inside 'first_commit', and it starts at 2008 -- first commits dated
# earlier than 2008 are not selected. Confirm against the stored date format.
first_commit_years = r'2008|2009|2010|2011|2012|2013|2014|2015|2016|2017'

live_query = {'$and': [
    {'live':True},
    {'group': {'$in': ['acm']}},
    {'main_subject':{'$exists':True}},
    {'first_commit':{'$regex': first_commit_years}}
]}
dormant_query = {'$and': [
    {'live':False},
    {'group': {'$in': ['acm']}},
    {'main_subject':{'$exists':True}},
    {'first_commit':{'$regex': first_commit_years}}
]}

# Bucket every lifespan into years (presumably 'lifespan' is stored in days -- TODO confirm).
active = [repo['lifespan']//365 for repo in rs_repo_table.get_entries(live_query)]
dormant = [repo['lifespan']//365 for repo in rs_repo_table.get_entries(dormant_query)]

# Number of repositories per year bucket.
activeDict = dict(collections.Counter(active))
dormantDict = dict(collections.Counter(dormant))

# Both series need the same year buckets so that the bars pair up.
for key in activeDict.keys() | dormantDict.keys():
    activeDict.setdefault(key, 0)
    dormantDict.setdefault(key, 0)

sortedActive = collections.OrderedDict(sorted(activeDict.items()))
sortedDormant = collections.OrderedDict(sorted(dormantDict.items()))

# Keep only the buckets below 11 years; the sorted order is preserved.
sortedAct = {k: v for k, v in sortedActive.items() if k < 11}
sortedDorm = {k: v for k, v in sortedDormant.items() if k < 11}

N = len(sortedAct)
ind = np.arange(N)  # the x locations for the groups
width = 0.27        # the width of the bars

fig = plt.figure()
ax = fig.add_subplot(111)

yvals = list(sortedDorm.values())
rects1 = ax.bar(ind, yvals, width)
zvals = list(sortedAct.values())
rects2 = ax.bar(ind+width, zvals, width)

ax.set_ylabel('Repositories')
ax.set_xlabel('Years')
# Bars are centre-aligned at ind and ind+width, so each pair is centred at ind + width/2.
ax.set_xticks(ind + width/2)
ax.set_xticklabels(list(sortedAct.keys()))
ax.legend( (rects1[0], rects2[0]), ('dormant', 'active') )

#plt.savefig("eval/acmLifetimeSubject2017.pdf", bbox_inches = "tight")
print('ACM group: lifetime, limited to 10 years, of the repositories with main subject and first commit before 2018')
plt.show()

In [None]:
# acm group, first commit before 2018
# Description of the subsample: repository lifetime in whole years, split into
# live and dormant repositories. The plot is restricted to the year buckets
# below 11.

# NOTE(review): the pattern is unanchored, so it matches these digit sequences
# anywhere inside 'first_commit', and it starts at 2008 -- first commits dated
# earlier than 2008 are not selected. Confirm against the stored date format.
first_commit_years = r'2008|2009|2010|2011|2012|2013|2014|2015|2016|2017'

live_query = {'$and': [
    {'live':True},
    {'group': {'$in': ['acm']}},
    {'first_commit':{'$regex': first_commit_years}}
]}
dormant_query = {'$and': [
    {'live':False},
    {'group': {'$in': ['acm']}},
    {'first_commit':{'$regex': first_commit_years}}
]}

# Bucket every lifespan into years (presumably 'lifespan' is stored in days -- TODO confirm).
# Floor division maps a negative lifespan to the bucket -1.
active = [repo['lifespan']//365 for repo in rs_repo_table.get_entries(live_query)]
dormant = [repo['lifespan']//365 for repo in rs_repo_table.get_entries(dormant_query)]

# Number of repositories per year bucket.
activeDict = dict(collections.Counter(active))
dormantDict = dict(collections.Counter(dormant))

# Both series need the same year buckets so that the bars pair up.
for key in activeDict.keys() | dormantDict.keys():
    activeDict.setdefault(key, 0)
    dormantDict.setdefault(key, 0)

sortedActive = collections.OrderedDict(sorted(activeDict.items()))
sortedDormant = collections.OrderedDict(sorted(dormantDict.items()))

# Keep only the buckets below 11 years; the sorted order is preserved.
sortedAct = {k: v for k, v in sortedActive.items() if k < 11}
sortedDorm = {k: v for k, v in sortedDormant.items() if k < 11}

N = len(sortedAct)
ind = np.arange(N)  # the x locations for the groups
width = 0.27        # the width of the bars

fig = plt.figure()
ax = fig.add_subplot(111)

yvals = list(sortedDorm.values())
rects1 = ax.bar(ind, yvals, width)
zvals = list(sortedAct.values())
rects2 = ax.bar(ind+width, zvals, width)

ax.set_ylabel('Repositories')
ax.set_xlabel('Years')
# Bars are centre-aligned at ind and ind+width, so each pair is centred at ind + width/2.
ax.set_xticks(ind + width/2)
ax.set_xticklabels(list(sortedAct.keys()))
ax.legend( (rects1[0], rects2[0]), ('dormant', 'active') )
#plt.savefig("eval/acmLifetimeAll2017.pdf", bbox_inches = "tight")
print('ACM group: lifetime, limited to 10 years, of repositories with first commit before 2018:')
plt.show()

In [None]:
# arxiv group
# Description of the subsample: repository lifetime in whole years, split into
# live and dormant repositories (only repositories that have a main subject).
live_query = {'$and': [{'live':True}, {'group': {'$in': ['arxiv']}}, {'main_subject':{'$exists':True}}]}
dormant_query = {'$and': [{'live':False}, {'group': {'$in': ['arxiv']}}, {'main_subject':{'$exists':True}}]}

# Bucket every lifespan into years (presumably 'lifespan' is stored in days -- TODO confirm).
active = [repo['lifespan']//365 for repo in rs_repo_table.get_entries(live_query)]
dormant = [repo['lifespan']//365 for repo in rs_repo_table.get_entries(dormant_query)]

# Number of repositories per year bucket.
activeDict = dict(collections.Counter(active))
dormantDict = dict(collections.Counter(dormant))

# Both series need the same year buckets so that the bars pair up.
for key in activeDict.keys() | dormantDict.keys():
    activeDict.setdefault(key, 0)
    dormantDict.setdefault(key, 0)

sortedActive = collections.OrderedDict(sorted(activeDict.items()))
sortedDormant = collections.OrderedDict(sorted(dormantDict.items()))

N = len(sortedActive)
ind = np.arange(N)  # the x locations for the groups
width = 0.27        # the width of the bars

fig = plt.figure()
ax = fig.add_subplot(111)

yvals = list(sortedDormant.values())
rects1 = ax.bar(ind, yvals, width)
zvals = list(sortedActive.values())
rects2 = ax.bar(ind+width, zvals, width)

ax.set_ylabel('Repositories')
ax.set_xlabel('Years')
# Bars are centre-aligned at ind and ind+width, so each pair is centred at ind + width/2.
ax.set_xticks(ind + width/2)
ax.set_xticklabels(list(sortedActive.keys()))
ax.legend( (rects1[0], rects2[0]), ('dormant', 'active') )

#plt.savefig("eval/arxivLifetimeSubject.pdf", bbox_inches = "tight")
print('arXiv group: lifetime of the repositories with main subject:')
plt.show()

In [None]:
# arxiv group
# Description of the subsample: repository lifetime in whole years, split into
# live and dormant repositories (only repositories that have a main subject).
# The plot is restricted to the year buckets below 11.
live_query = {'$and': [{'live':True}, {'group': {'$in': ['arxiv']}}, {'main_subject':{'$exists':True}}]}
dormant_query = {'$and': [{'live':False}, {'group': {'$in': ['arxiv']}}, {'main_subject':{'$exists':True}}]}

# Bucket every lifespan into years (presumably 'lifespan' is stored in days -- TODO confirm).
active = [repo['lifespan']//365 for repo in rs_repo_table.get_entries(live_query)]
dormant = [repo['lifespan']//365 for repo in rs_repo_table.get_entries(dormant_query)]

# Number of repositories per year bucket.
activeDict = dict(collections.Counter(active))
dormantDict = dict(collections.Counter(dormant))

# Both series need the same year buckets so that the bars pair up.
for key in activeDict.keys() | dormantDict.keys():
    activeDict.setdefault(key, 0)
    dormantDict.setdefault(key, 0)

# Sort by year, then keep only the buckets below 11 years (order is preserved).
sortedActive = collections.OrderedDict(sorted(activeDict.items()))
sortedDormant = collections.OrderedDict(sorted(dormantDict.items()))
sortedActive = {k: v for k, v in sortedActive.items() if k < 11}
sortedDormant = {k: v for k, v in sortedDormant.items() if k < 11}

N = len(sortedActive)
ind = np.arange(N)  # the x locations for the groups
width = 0.27        # the width of the bars

fig = plt.figure()
ax = fig.add_subplot(111)

yvals = list(sortedDormant.values())
rects1 = ax.bar(ind, yvals, width)
zvals = list(sortedActive.values())
rects2 = ax.bar(ind+width, zvals, width)

ax.set_ylabel('Repositories')
ax.set_xlabel('Years')
# Bars are centre-aligned at ind and ind+width, so each pair is centred at ind + width/2.
ax.set_xticks(ind + width/2)
ax.set_xticklabels(list(sortedActive.keys()))
ax.legend( (rects1[0], rects2[0]), ('dormant', 'active') )

#plt.savefig("eval/arxivLifetimeSubject10.pdf", bbox_inches = "tight")
print('arXiv group: lifetime, limited to 10 years, of the repositories with main subject:')
plt.show()