In [1]:
from datetime import datetime

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

import pandas as pd

In [2]:
# client used by every query in this notebook to reach the local Elasticsearch
# NOTE(review): the endpoint is plain http, so verify_certs=False probably has
# no effect here — confirm before pointing this at an https instance
es = Elasticsearch(
    hosts='http://localhost:9200',
    verify_certs=False,
)

In [3]:
# run p2o.py to generate indices
import subprocess

# Feeds the perceval git repository into the 'ix_perceval_raw' index and the
# enriched 'ix_perceval' index on the local Elasticsearch instance.
# check=True makes this cell fail loudly (subprocess.CalledProcessError) when
# p2o.py exits with a non-zero status, instead of silently letting the cells
# below query stale or missing indices.
subprocess.run(
    [
        'p2o.py', '--enrich',
        '--index', 'ix_perceval_raw',
        '--index-enrich', 'ix_perceval',
        '-e', 'http://localhost:9200',
        '--no_inc', '--debug',
        'git', 'https://github.com/grimoirelab/perceval',
    ],
    check=True,
)

CompletedProcess(args=['p2o.py', '--enrich', '--index', 'ix_perceval_raw', '--index-enrich', 'ix_perceval', '-e', 'http://localhost:9200', '--no_inc', '--debug', 'git', 'https://github.com/grimoirelab/perceval'], returncode=0)

In [4]:
# search the enriched index, grouping commits per author
s = Search(index='ix_perceval', using=es)
# 'by_authors': terms bucket on author_name (up to 10000 buckets), each one
# carrying 'first_commit' = the minimum author_date among its documents
s.aggs.bucket(
    'by_authors', 'terms', field='author_name', size=10000
).metric(
    'first_commit', 'min', field='author_date'
)
# NOTE(review): the sort presumably only orders the returned hits; the
# following cells read just the aggregations — confirm it is still wanted
s = s.sort('author_date')
# send the request to Elasticsearch and keep the response
result = s.execute()

In [5]:
# accumulator for the per-author records built from the buckets
buckets = list()
# per-author buckets produced by the 'by_authors' aggregation
by_authors_agg = result['aggregations']['by_authors']
buckets_result = by_authors_agg['buckets']

In [6]:
# Build one record per author bucket.
# The list is rebuilt from scratch on every execution, so re-running this cell
# no longer appends duplicate rows to `buckets`.
# The 'min' aggregation value is treated as epoch milliseconds (the original
# code divided it by 1000 before converting); pd.to_datetime(..., unit='ms')
# yields the same naive-UTC timestamp without datetime.utcfromtimestamp, which
# is deprecated since Python 3.12. A missing (None) value also no longer fails
# on the division.
buckets = [
    {
        'first_commit': pd.to_datetime(bucket['first_commit']['value'], unit='ms'),
        'author': bucket['key'],
        'total_commits': bucket['doc_count'],
    }
    for bucket in buckets_result
]

In [7]:
# create dataframe: one row per author record, with the columns
# 'first_commit', 'author' and 'total_commits' coming from the record keys
authors = pd.DataFrame.from_records(buckets)
# last expression of the cell, so the notebook renders the frame
authors

Unnamed: 0,author,first_commit,total_commits
0,Santiago Dueñas,2015-08-18 18:08:27,708
1,Valerio Cosentino,2017-09-14 12:14:04,169
2,Alberto Martín,2016-02-09 15:56:45,51
3,Alvaro del Castillo,2015-12-04 18:46:14,45
4,Jesus M. Gonzalez-Barahona,2015-12-31 19:16:25,18
5,valerio cosentino,2017-09-07 14:46:30,6
6,quan,2016-04-01 12:16:29,5
7,Miguel Ángel Fernández,2018-02-12 12:56:11,3
8,camillem,2016-03-28 11:08:04,2
9,valerio,2017-10-10 16:27:29,2


In [8]:
# order the authors by the date of their first commit, newest first
authors = authors.sort_values(by='first_commit', ascending=False)

In [9]:
# display the dataframe, now ordered by most recent first commit
authors

Unnamed: 0,author,first_commit,total_commits
7,Miguel Ángel Fernández,2018-02-12 12:56:11,3
12,Israel Herraiz,2018-01-09 15:40:57,1
16,david,2017-12-07 18:54:53,1
11,David Pose Fernández,2017-11-03 08:23:54,1
10,David Esler,2017-10-17 22:46:36,1
9,valerio,2017-10-10 16:27:29,2
1,Valerio Cosentino,2017-09-14 12:14:04,169
5,valerio cosentino,2017-09-07 14:46:30,6
15,Stephan Barth,2017-01-09 16:52:56,1
14,Luis Cañas Díaz,2016-09-26 12:30:22,1


In [10]:
# keep only the author name and the commit count of each author
commit_columns = ['author', 'total_commits']
no_of_commits = authors.loc[:, commit_columns]
no_of_commits

Unnamed: 0,author,total_commits
7,Miguel Ángel Fernández,3
12,Israel Herraiz,1
16,david,1
11,David Pose Fernández,1
10,David Esler,1
9,valerio,2
1,Valerio Cosentino,169
5,valerio cosentino,6
15,Stephan Barth,1
14,Luis Cañas Díaz,1


In [None]:
# find number of new authors per month
# The group keys are renamed to 'year' and 'month': left as-is, both index
# levels would be called 'first_commit' (same as the counted Series), which
# makes the levels ambiguous and breaks by_month.reset_index().
by_month = authors['first_commit'].groupby(
    [
        authors['first_commit'].dt.year.rename('year'),
        authors['first_commit'].dt.month.rename('month'),
    ]
).agg('count')

In [None]:
# display the count of authors' first commits per (year, month)
by_month