# Microtask 1
---
Produce a listing of the number of new committers per month, and the number of commits for each of them, as a table and as a CSV file. Use the GrimoireLab enriched index for git.

In [None]:
from datetime import datetime
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

import subprocess
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Client for the Elasticsearch instance queried by the cells below.
# verify_certs=False is passed through unchanged; the URL itself is plain http.
es_url = 'http://localhost:9200'
es = Elasticsearch(es_url, verify_certs=False)

In [None]:
# Git repository to analyse (scikit-learn); handed to p2o.py as the git source.
repo = 'https://github.com/scikit-learn/scikit-learn.git'

In [None]:
# Run p2o.py against the repository to get the enriched indices
# (raw index name 'sklearn_raw', enriched index name 'sklearn').
# check=True makes a failed p2o.py run raise CalledProcessError right here,
# instead of letting the following cells query a missing or stale index.
subprocess.run(
    [
        'p2o.py', '--enrich',
        '--index', 'sklearn_raw',
        '--index-enrich', 'sklearn',
        '-e', 'http://localhost:9200',
        '--no_inc', '--debug',
        'git', repo,
    ],
    check=True,
)

In [None]:
# Build the query on the 'sklearn' index: one terms bucket per author_name,
# each bucket carrying the minimum author_date found for that author.
s = Search(using=es, index='sklearn').sort('author_date')
authors_agg = s.aggs.bucket('by_authors', 'terms', field='author_name', size=15000)
authors_agg.metric('first_commit', 'min', field='author_date')

# Send the query to Elasticsearch
result = s.execute()

In [None]:
# Per-author buckets produced by the 'by_authors' terms aggregation.
buckets_result = result['aggregations']['by_authors']['buckets']
# List of flattened per-author records; populated by the next cell.
buckets = []

In [None]:
# Flatten every aggregation bucket into one record per author.
# The list is rebuilt from scratch on each run, so re-executing this cell does
# not append a second copy of every author to an already filled list.
buckets = [
    {
        # The aggregation value is in epoch milliseconds; unit='ms' converts it
        # to a naive UTC timestamp without datetime.utcfromtimestamp, which is
        # deprecated since Python 3.12.
        'first_commit': pd.to_datetime(bucket['first_commit']['value'], unit='ms'),
        'author': bucket['key'],
        'total_commits': bucket['doc_count'],
    }
    for bucket in buckets_result
]

In [None]:
# One row per author: first commit timestamp, author name and total commit count.
authors = pd.DataFrame.from_records(buckets)
authors

In [None]:
# Order authors from most recent to oldest first commit and renumber the rows
# 0..n-1 so the index follows the new order.
authors = (
    authors
    .sort_values(by='first_commit', ascending=False)
    .reset_index(drop=True)
)
authors.head()

In [None]:
# Group authors by the (year, month) of their first commit and summarise
# first_commit and total_commits with min / max / count per group.
commit_dates = authors.first_commit.dt
by_month = (
    authors[['first_commit', 'total_commits']]
    .groupby([commit_dates.year, commit_dates.month])
    .agg(['min', 'max', 'count'])
)

In [None]:
# Show the per-month summary (min / max / count of first_commit and total_commits).
by_month

In [None]:
# Number of authors whose first commit falls in each (year, month) group,
# i.e. the new committers per month.
by_month['first_commit']['count']

In [None]:
# Year and month of every author's first commit, as plain Python lists
# in the same row order as `authors`.
years = authors['first_commit'].dt.year.tolist()
months = authors['first_commit'].dt.month.tolist()

In [None]:
# Build a 'YYYYMM' label (month zero-padded to two digits) for every author.
# The labels are formatted without modifying `months`, so this cell can be
# re-run safely: padding the list entries in place would turn them into strings
# and make a second run fail when comparing them with an integer.
# int() also accepts entries that are already strings such as '03'.
datestr = ['{}{:02d}'.format(year, int(month)) for year, month in zip(years, months)]

In [None]:
# Attach the month labels as a new column, aligned on the existing row index.
datestr_column = pd.Series(datestr, index=authors.index)
authors['datestr'] = datestr_column

In [None]:
# Keep only the reporting columns, in the order they should be displayed.
column_order = ['author', 'datestr', 'first_commit', 'total_commits']
authors = authors[column_order]
authors

In [None]:
# Plot each author's total commits against the month label of their first commit.
fig, ax = plt.subplots(figsize=(20, 15))
ax.plot(authors['datestr'], authors['total_commits'], color='k', linestyle='-')
ax.set(xlabel='Month', ylabel='Total number of commits')
# Blank out the x tick labels
ax.set_xticklabels([])
plt.show()