# Manuscripts: Re-visited

Manuscripts currently provides us mostly with aggregations of data. It isn't flexible enough to let us play with the data, for example by sorting it by different filters and values.

Here, we will experiment with what can be done with the metrics. None of the previously written code will be used here, so that we can look at different approaches and, in effect, redesign the current code.

We'll still be looking at the [GMD metrics](https://github.com/chaoss/metrics/blob/master/2_Growth-Maturity-Decline.md)

In [30]:
import pandas as pd

from pprint import pprint

from elasticsearch import Elasticsearch

from elasticsearch_dsl import A, Q, Search
from elasticsearch_dsl.query import Match, MultiMatch

from datetime import date, timezone
from dateutil import parser, relativedelta

In [2]:
# Index names used by the query cells below
github_index = "perceval_github"
git_index = "perceval_git"

# Client for the local Elasticsearch instance; every Search below is bound to it
es = Elasticsearch("http://localhost:9200/")

In [None]:
# Draft wrapper around an elasticsearch_dsl Search bound to a single index.
# NOTE(review): the class name is missing after `class`, and the trailing
# `def` at the end of this cell is unfinished, so this cell does not run yet.
class (object):
    # client is handed to Search as `using`, es_index as `index`.
    def __init__(self, client, es_index):
        self.s = Search(using=client, index=es_index)
        # Bookkeeping lists; nothing in this cell populates them yet.
        self.filters = []
        self.queries = []
        self.aggregations = []
    
    # Returns the top-level 'filter' entry of the serialized search,
    # or None when the request body has no such key.
    def get_filters(self):
        if "filter" in self.s.to_dict().keys():
            return self.s.to_dict()['filter']
        else:
            return None
        
    # Returns the top-level 'query' entry of the serialized search,
    # or None when no query has been set.
    def get_queries(self):
        if "query" in self.s.to_dict().keys():
            return self.s.to_dict()['query']
        else:
            return None
    
    # Returns the top-level 'aggs' entry of the serialized search,
    # or None when no aggregation has been registered.
    def get_aggs(self):
        if "aggs" in self.s.to_dict().keys():
            return self.s.to_dict()['aggs']
        else:
            return None
    
    def

Let's talk about the kinds of filters we want while looking at the metrics.

Can we look at the metrics by segregating them according to:
- Date?
 - days
 - weeks
 - months
 - years


- organizations?
 - if people from multiple organizations are part of the project, we might need to see how they work together and which organization has the most influence.

### Issue Resolution

In [10]:
# open issues: distinct count of issue ids among items whose state is "open"

q1 = Q("match", item_type="issue")
q2 = Q("match", state="open")
q = q1 & q2  # both conditions must match
agg = A("cardinality", field="id_in_repo")

# size=0: only the aggregation result is read, so no hits are requested
s = Search(using=es, index=github_index).query(q).extra(size=0)
s.aggs.bucket("num_open_issues", agg)

response = s.execute()
response.aggregations.num_open_issues.value

23

In [16]:
# Show the request body that the open-issues search serializes to
s.to_dict()

{'aggs': {'num_open_issues': {'cardinality': {'field': 'id_in_repo'}}},
 'query': {'bool': {'must': [{'match': {'item_type': 'issue'}},
    {'match': {'state': 'open'}}]}},
 'size': 0}

In [5]:
# closed issues: distinct count of issue ids among items whose state is "closed"

s = Search(using=es, index=github_index)
q3 = Q("match", **{"item_type":"issue"})
q4 = Q("match", **{"state": "closed"})
q = q3 & q4  # both conditions must match
s = s.query(q)
agg = A("cardinality", field="id_in_repo")
# Named for what it counts: this cell looks at closed issues, not open ones
s.aggs.bucket("num_closed_issues", agg)
# Only the aggregation result is read, so no hits are requested
s = s.extra(size=0)

response = s.execute()
response.aggregations.num_closed_issues.value

113

In [29]:
# Scratch check: arithmetic mean of nine sample values
sum((34, 2, 44, 32, 124, 678, 432, 32, 1)) / 9

153.22222222222223

In [26]:
# open issue age: fetch time_open_days and id_in_repo for the open issues

q0 = Q("match_all")
q1 = Q("match", item_type="issue")
q2 = Q("match", state="open")
# match_all adds no clause: the serialized query only holds the two matches
q = q0 & q1 & q2
agg = A("cardinality", field="id_in_repo")

# Restrict each returned hit to the two fields needed for the age table.
# NOTE(review): no size is set, so only the default first page of hits comes
# back (10 rows in the table below, although 23 open issues were counted
# earlier) - confirm whether all open issues are wanted here.
s = Search(using=es, index=github_index).query(q).extra(
    _source=['time_open_days', 'id_in_repo']
)
s.aggs.bucket("num_open_issues", agg)

response = s.execute()

In [27]:
# Show the request body that the open-issue-age search serializes to
s.to_dict()

{'_source': ['time_open_days', 'id_in_repo'],
 'aggs': {'num_open_issues': {'cardinality': {'field': 'id_in_repo'}}},
 'query': {'bool': {'must': [{'match': {'item_type': 'issue'}},
    {'match': {'state': 'open'}}]}}}

In [23]:
# One row per returned hit, built from each hit's '_source' document
open_issue_age = pd.DataFrame([hit['_source'] for hit in response.hits.hits])

In [24]:
# Display the frame built from the returned hits
open_issue_age

Unnamed: 0,id_in_repo,time_open_days
0,58,617.82
1,104,507.92
2,319,99.84
3,91,557.9
4,139,374.73
5,217,173.08
6,331,88.27
7,19,795.16
8,28,788.04
9,74,586.77


### Code Development

In [28]:
# Total commits: distinct count of commit hashes in the git index

a = A("cardinality", field="hash", precision_threshold=2000)

# Hits are still fetched here (size=0 is deliberately not applied): each hit
# carries only hash and commit_date, sorted by ascending commit_date.
# Disabled experiment kept from the original cell:
#   q = Q("match", **{"files": 0}); s = s.query(~q)
s = Search(using=es, index=git_index).extra(
    _source=["hash", "commit_date"],
    sort={"commit_date":"asc"},
)
s.aggs.bucket("total_commits", a)
response = s.execute()

In [29]:
# Read the distinct-hash count computed by the total_commits aggregation
response.aggregations.total_commits.value

1186

When you go to the [perceval github repo](https://github.com/chaoss/grimoirelab-perceval), you'll see that there are actually 1182 commits. That may be because of some empty commit messages.

In [8]:
# commits by months
s = Search(using=es, index=git_index)
a = A("date_histogram", field="commit_date", interval="month")
s.aggs.bucket("commits_by_weeks", a)
response = s.execute()

In [9]:
response.aggregations.commits_by_weeks.buckets

[{'key_as_string': '2015-08-01T00:00:00.000Z', 'key': 1438387200000, 'doc_count': 16}, {'key_as_string': '2015-09-01T00:00:00.000Z', 'key': 1441065600000, 'doc_count': 0}, {'key_as_string': '2015-10-01T00:00:00.000Z', 'key': 1443657600000, 'doc_count': 0}, {'key_as_string': '2015-11-01T00:00:00.000Z', 'key': 1446336000000, 'doc_count': 46}, {'key_as_string': '2015-12-01T00:00:00.000Z', 'key': 1448928000000, 'doc_count': 34}, {'key_as_string': '2016-01-01T00:00:00.000Z', 'key': 1451606400000, 'doc_count': 60}, {'key_as_string': '2016-02-01T00:00:00.000Z', 'key': 1454284800000, 'doc_count': 152}, {'key_as_string': '2016-03-01T00:00:00.000Z', 'key': 1456790400000, 'doc_count': 98}, {'key_as_string': '2016-04-01T00:00:00.000Z', 'key': 1459468800000, 'doc_count': 44}, {'key_as_string': '2016-05-01T00:00:00.000Z', 'key': 1462060800000, 'doc_count': 38}, {'key_as_string': '2016-06-01T00:00:00.000Z', 'key': 1464739200000, 'doc_count': 66}, {'key_as_string': '2016-07-01T00:00:00.000Z', 'key': 1

In [24]:
# Lines of code changed: sums of the lines_changed, lines_added and
# lines_removed fields over the whole git index

s = Search(using=es, index=git_index)
# Disabled experiment kept from the original cell:
#   q = Q("match", **{"files": 0}); s = s.query(~q)
a1 = A("sum", field="lines_changed")
a2 = A("sum", field="lines_added")
a3 = A("sum", field="lines_removed")
for _agg_name, _agg in (
    ("total_lines_changed", a1),
    ("total_lines_added", a2),
    ("total_lines_removed", a3),
):
    s.aggs.bucket(_agg_name, _agg)
# Only the aggregation results are read, so no hits are requested
s = s.extra(size=0)
response = s.execute()

totals = response.aggregations
print("Total lines changed: ", totals.total_lines_changed.value)
print("Total lines added: ", totals.total_lines_added.value)
print("Total lines removed: ", totals.total_lines_removed.value)

Total lines changed:  354358.0
Total lines added:  265068.0
Total lines removed:  89290.0


### Community Growth

In [43]:
# Number of contributors: one bucket per author_name, each carrying the
# summed lines_changed and lines_added for that author

# NOTE(review): a terms aggregation returns only its top buckets unless
# `size` is raised (Elasticsearch default is 10) - confirm that this covers
# every contributor. The output below also lists 'Valerio Cosentino' and
# 'valerio cosentino' as separate keys, so names differing only in case are
# counted as different authors.
a = (
    A("terms", field="author_name")
    .metric("lines_changed", "sum", field="lines_changed")
    .metric("lines_added", "sum", field="lines_added")
)
s = Search(using=es, index=git_index)
s.aggs.bucket("contributors", a)
response = s.execute()

In [44]:
# Pretty-print the per-author buckets with their summed line metrics
pprint(response.aggregations.contributors.buckets)

[{'key': 'Santiago Dueñas', 'doc_count': 1494, 'lines_added': {'value': 91122.0}, 'lines_changed': {'value': 123756.0}}, {'key': 'Valerio Cosentino', 'doc_count': 567, 'lines_added': {'value': 86338.0}, 'lines_changed': {'value': 138388.0}}, {'key': 'Alberto Martín', 'doc_count': 102, 'lines_added': {'value': 43562.0}, 'lines_changed': {'value': 45724.0}}, {'key': 'Alvaro del Castillo', 'doc_count': 102, 'lines_added': {'value': 30664.0}, 'lines_changed': {'value': 31438.0}}, {'key': 'Jesus M. Gonzalez-Barahona', 'doc_count': 37, 'lines_added': {'value': 2058.0}, 'lines_changed': {'value': 2176.0}}, {'key': 'valerio cosentino', 'doc_count': 12, 'lines_added': {'value': 2548.0}, 'lines_changed': {'value': 3406.0}}, {'key': 'quan', 'doc_count': 10, 'lines_added': {'value': 7186.0}, 'lines_changed': {'value': 7308.0}}, {'key': 'Miguel Ángel Fernández', 'doc_count': 6, 'lines_added': {'value': 92.0}, 'lines_changed': {'value': 380.0}}, {'key': 'David Pose Fernández', 'doc_count': 4, 'lines