In [144]:
from collections import Counter
import pandas as pd

In [138]:
# Load the exported git log and split it into individual lines.
# The encoding is pinned so that non-ASCII author names decode the same way on
# every platform instead of depending on the locale default.
# NOTE(review): assumes the log was exported as UTF-8 — confirm.
with open('git_log_20220522.txt', 'r', encoding='utf-8') as file:
    lines = file.read().split("\n")

In [139]:
# Accumulator for the raw author strings pulled out of the log lines.
authors = list()

## Parse the Dataverse git log for the list of contributors

In [140]:
# Collect the text that follows the "Author: " prefix on every author header
# line of the log.
# The list is rebuilt from scratch in a single expression, so re-running this
# cell cannot append the same authors twice and inflate the commit counts.
AUTHOR_PREFIX = 'Author: '
authors = [
    line[len(AUTHOR_PREFIX):]
    for line in lines
    if line.startswith(AUTHOR_PREFIX)
]

In [141]:
# Tally how often each distinct author string occurs in the log.
author_counts = Counter()
author_counts.update(authors)

In [142]:
# Build one record per contributor, keyed by the title-cased name:
#     {"Jane Doe": {"email": ..., "count": ...}, ...}
# When the same name occurs with several e-mail addresses, the address with the
# highest count wins, and "count" is that address's count (not the sum over all
# addresses).
# NOTE: str.title() merges case variants of a name but also lowercases interior
# capitals ("McGregor" -> "Mcgregor").
author_dict = {}
for author, count in author_counts.items():
    # partition() never raises: an author string without " <email>" simply
    # yields an empty remainder instead of an IndexError.
    raw_name, _, remainder = author.partition(' <')
    name = raw_name.title()
    # Drop the closing ">" of the e-mail part when it is present.
    email = remainder[:-1] if remainder.endswith('>') else remainder

    record = author_dict.get(name)
    if record is None:
        author_dict[name] = {"email": email, "count": count}
    elif count > record["count"]:
        record["count"] = count
        record["email"] = email

In [143]:
# Number of distinct title-cased contributor names collected from the log.
len(author_dict)

241

## Clean up

In [176]:
# One row per contributor: the outer dict keys (names) become the index and the
# inner keys ("email", "count") become the columns.
df = pd.DataFrame(author_dict).transpose()

In [177]:
# Move the contributor names out of the index into a regular "name" column,
# leaving a fresh integer index behind.
df = df.rename_axis("name").reset_index()

In [178]:
# Given name = everything before the first space of the full name.
name_parts = df["name"].str.split(' ')
df["first"] = name_parts.str.get(0)

In [179]:
# Family name = everything after the first space of the full name; a name
# without any space yields the empty string.
# A Series-level map over "name" replaces the row-wise DataFrame.apply and
# removes the dependency on the "first" column; the resulting values are the
# same.
df["last"] = df["name"].map(lambda full_name: full_name.partition(' ')[2])

## Keep only contributors with both a given name and a surname

In [180]:
# Discard single-word names, i.e. rows whose family name is empty.
has_family_name = df["last"].ne("")
df = df.loc[has_family_name]

## Remove duplicates

In [181]:
# Show every row whose e-mail address occurs more than once (all occurrences,
# not just the repeats) so the duplicates can be reviewed by hand.
df.loc[df.duplicated(subset="email", keep=False)]

Unnamed: 0,name,email,count,first,last
14,Robert Treacy,rtreacy@hmdc.harvard.edu,53,Robert,Treacy
49,Bob Treacy,rtreacy@hmdc.harvard.edu,5,Bob,Treacy
85,Slava Tykhonov,4tikhonov@gmail.com,1,Slava,Tykhonov
94,Peter Kiraly,peter.kiraly@gwdg.de,104,Peter,Kiraly
95,Ken Mankoff,mankoff@gmail.com,3,Ken,Mankoff
96,Kenneth D. Mankoff,mankoff@gmail.com,1,Kenneth,D. Mankoff
126,Király Péter,peter.kiraly@gwdg.de,2,Király,Péter
131,Carlos Mcgregor,c.mcgregormuro@mail.utoronto.ca,13,Carlos,Mcgregor
145,Carlos Mc Gregor,c.mcgregormuro@mail.utoronto.ca,1,Carlos,Mc Gregor
177,Sarah Ferry,ferrys@bu.edu,112,Sarah,Ferry


In [182]:
# Index labels of the rows to discard, chosen by hand from the duplicated
# e-mail table. The labels are only valid for this exact log export.
duplicates = [
    49,   # Bob Treacy (same e-mail as Robert Treacy, 14)
    85,   # Slava Tykhonov
    126,  # Király Péter (same e-mail as Peter Kiraly, 94)
    95,   # Ken Mankoff (same e-mail as Kenneth D. Mankoff, 96)
    145,  # Carlos Mc Gregor (same e-mail as Carlos Mcgregor, 131)
    201,  # not in the displayed table — presumably the second Sarah Ferry row; verify
]

In [183]:
# Remove the hand-picked duplicate rows by index label.
df = df.drop(index=duplicates)

In [189]:
# The full name and the commit count are not needed any more; only the
# remaining columns are kept.
df = df.drop(["name", "count"], axis="columns")

In [190]:
# Number of contributors left after filtering and de-duplication.
df.shape[0]

127

## Sort by last name

In [191]:
# Order contributors by family name. The given name is used as a tie-breaker so
# that people sharing a family name always come out in the same order (sorting
# on "last" alone leaves the order of equal keys unspecified).
df_sorted = df.sort_values(by=['last', 'first'])

## Export CFF

In [196]:
def _yaml_single_quoted(value):
    """Return ``value`` as a YAML single-quoted scalar.

    Inside single quotes the only character that needs escaping is the quote
    itself, which is written doubled.
    """
    return "'" + str(value).replace("'", "''") + "'"


# Render one CFF "authors" list item per contributor.
# Every value is quoted so that names containing YAML-significant text (": ",
# "#", a leading "@", or words such as "No"/"Null") are still read back as
# plain strings.
author_entries = []
for _, row in df_sorted.iterrows():
    entry_lines = [
        f"  - given-names: {_yaml_single_quoted(row['first'])}",
        f"    family-names: {_yaml_single_quoted(row['last'])}",
    ]
    # Leave the key out entirely rather than emit an empty e-mail value.
    if row['email']:
        entry_lines.append(f"    email: {_yaml_single_quoted(row['email'])}")
    author_entries.append("\n".join(entry_lines) + "\n")

# Every entry ends with a newline, so the text can be spliced directly in front
# of the next top-level key.
formatted_data = "".join(author_entries)

In [199]:
# Static CFF metadata placed before the generated author list.
cff_header = """cff-version: 1.2.0
title: The Dataverse Software
message: Dataverse is an open source software platform for sharing, finding, citing, and preserving research data.
type: software
authors:
"""

# Static CFF metadata placed after the author list.
# The duplicated word in "meet the data data sharing requirements" is fixed here.
cff_footer = """repository-code: 'https://github.com/IQSS/dataverse'
url: 'https://dataverse.org'
abstract: Dataverse is an open-source platform developed by the Data Science and Products team at the Institute for Quantitative Social Science and the Dataverse community. It's designed to promote the sharing, preservation, citation, and analysis of research data. In Dataverse, datasets are organized by flexible containers called "dataverses", which can hold one or more datasets or collections. Each dataset contains metadata that describes the data, as well as the actual deposited data and documentation files. Researchers can use Dataverse to share their data, collaborate with others, and meet the data sharing requirements of their institutions or funders. They can store data in a wide variety of formats, control who has access to the data, and use the API for creating new modules and tools. Libraries and institutions can use Dataverse as a publishing system to provide citation, persistent identifiers (such as DOIs) and long-term access to datasets.
keywords:
  - research data
  - open-source platform
  - data repository
  - open data
  - open science
license: Apache-2.0
"""

# formatted_data is inserted as-is between the two parts; plain concatenation
# replaces the multi-line f-string replacement field.
draft = cff_header + formatted_data + cff_footer

In [200]:
# Write the assembled citation file. The encoding is pinned to UTF-8 so that
# non-ASCII contributor names are written the same way on every platform
# instead of depending on the locale default.
with open('CITATION.cff', 'w', encoding='utf-8') as file:
    file.write(draft)